# Global birth rate analysis

Birth rate by country and the global birth rate will be analyzed under the impacts of the following factors: 
- Financial: GDP per Capita
- Social: Happiness index, Social support score, life expectancy, freedom to make life choices, generosity,perception of corruption
- Jobs: Unemployment rate

In [80]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from scipy import stats
import statsmodels.api as sm

## Data Import & Data Preparation

In [51]:
# Data import
df_birth_rate = pd.read_csv("birth_rate.csv")
df_gdp = pd.read_csv("GDP.csv")
df_happiness_index = pd.read_excel("DataForFigure2.1WHR2023.xls")
df_unemployment_rate = pd.read_csv('unemployment_rate.csv')


In [52]:
# Drop duplicates & rename columns
df_birth_rate = df_birth_rate.rename({'country': 'Country'}, axis='columns')
df_gdp = df_gdp.drop_duplicates(subset='country', keep='first')
df_gdp = df_gdp.rename({'country': 'Country'}, axis='columns')
df_happiness_index = df_happiness_index.rename({'Country name': 'Country'},
                                               axis='columns')
df_unemployment_rate = df_unemployment_rate.rename({'country': 'Country'},
                                                    axis='columns')


In [53]:
# Preliminary variable selection
df_birth_rate = df_birth_rate[['birthRate', 'Country']]
df_gdp = df_gdp[['Country', 'gdpPerCapita']]
df_happiness_index = df_happiness_index[['Country', 'Ladder score', 'Social support', 
                                         'Healthy life expectancy',
                                         'Freedom to make life choices', 
                                         'Generosity', 'Perceptions of corruption']]
df_unemployment_rate = df_unemployment_rate[['Country', 'rateWb']]

In [54]:
# Main data frame preparation
df_final = df_birth_rate.merge(df_gdp,
                                on='Country').merge(df_happiness_index,
                                                     on='Country').merge(df_unemployment_rate, on='Country')
df_final.columns

Index(['birthRate', 'Country', 'gdpPerCapita', 'Ladder score',
       'Social support', 'Healthy life expectancy',
       'Freedom to make life choices', 'Generosity',
       'Perceptions of corruption', 'rateWb'],
      dtype='object')

In [55]:
df_final[['gdpPerCapita', 'Social support', 'Freedom to make life choices', 'Generosity', 'Perceptions of corruption']] = df_final[['gdpPerCapita', 'Social support', 'Freedom to make life choices', 'Generosity', 'Perceptions of corruption']]*100

In [56]:
df_final.loc[128,'rateWb'] = 3.56

In [57]:
# Data partition
df_predictors = df_final[['gdpPerCapita', 'Ladder score', 'Social support', 
                          'Healthy life expectancy', 'Freedom to make life choices',
                          'Generosity', 'Perceptions of corruption', 'rateWb']]
df_target = df_final['birthRate']

x_train, x_test, y_train, y_test = train_test_split(df_predictors,df_target,test_size=0.25)


## Model Fit & Model result

In [58]:
model = linear_model.LinearRegression()

In [59]:
model.fit(x_train,y_train)

In [60]:
# birth_rate fitted and birth_rate prediction
y_fitted = model.predict(x_train)
y_predict = model.predict(x_test)

In [61]:
# residuals from train & test
residuals_train = y_train - y_fitted
residuals_test = y_test - y_predict

In [62]:
# Coefficients
coef_gdp = model.coef_[0]
coef_happiness_index = model.coef_[1]
coef_social_support = model.coef_[2]
coef_life_expectancy = model.coef_[3]
coef_freedom = model.coef_[4]
coef_generosity = model.coef_[5]
coef_corruption = model.coef_[6]
coef_unemployment = model.coef_[7]
intercept = model.intercept_
model.coef_

array([ 2.32039765e-07, -8.22943908e-01, -8.99597281e-02, -1.32327391e+00,
       -1.88030135e-02, -1.07109233e-02, -4.58469291e-02, -2.32136034e-01])

In [64]:
print('model = %f + %f GDP %f Happiness index  %f Social support %f life expectancy  %f freedom  %f generosity  %f corruption  %f Unemployment' % (intercept, coef_gdp, coef_happiness_index, coef_social_support, coef_life_expectancy, coef_freedom, coef_generosity, coef_corruption, coef_unemployment))

model = 122.412560 + 0.000000 GDP -0.822944 Happiness index  -0.089960 Social support -1.323274 life expectancy  -0.018803 freedom  -0.010711 generosity  -0.045847 corruption  -0.232136 Unemployment


In [83]:
stats_model = sm.OLS(y_train, x_train).fit()

## Model & Predictors Evaluation

In [86]:
def model_significance(y_train_set, y_pred, p):
    y_mean = np.mean(y_train_set)
    tss = ((y_train_set - y_mean)**2).sum()
    rss = ((y_train_set - y_pred)**2).sum()
    n = len(y_train_set)
    f_test = (tss-rss)/(rss/(n-p-1))
    if f_test > 1:
        print("Reject Ho. At least one of the model coefficients is significant.")
    if f_test <=1: 
        print("Accept Ho. No coefficients are useful.")


def coefficient_testing(coeff, residuals, trainset,p):
    """Perform Hypothesis testing to determine if Coefficient in regression is significant"""
    # Obtain the residual standard deviation
    residual_variance = np.var(residuals)
    x_mean = np.mean(trainset)
    x_squared = ((trainset - x_mean)**2).sum()
    coeff_var = residual_variance/ x_squared
    coeff_std = np.sqrt(coeff_var)
    print("coeff_var, std:",coeff_var, coeff_std)
    # Obtain t test
    print("Hypothesis: Ho: B1 = 0, Ha: B1 !=0")
    t_value = coeff/coeff_std
    print("t_value:", t_value)
    degree_of_freedom = len(trainset) - p -1
    p_value = stats.t.sf(abs(t_value), degree_of_freedom)*2
    print ("p_value: %.24f" % p_value)
    # Conclusion:
    if p_value*2 < 0.05:
        print('Reject Ho. The coefficient is significant, and has a relationship with the respond variable')
    if p_value*2 > 0.05: 
        print('Accept Ho. The coefficient is not significant, and does not have a relationship with respond variable')
    return None


def residual_zero_mean(residuals,p):
    n = len(residuals)
    residual_mean = np.mean(residuals)
    residual_std = np.std(residuals)
    t_test = (residual_mean)/(residual_std/np.sqrt(n))
    p_value = stats.t.sf(t_test, n - p - 1)
    if p_value > 0.05:
        print("As the p_value is larger than 0.05, there is no sufficient evidence to reject Ho. From this, residuals mean is equal to 0. Linear regression assumption on residuals is appropriate.")
    elif p_value <= 0.05: 
        print("As p_value is smaller than 0.95, there is sufficient evidence to reject Ho. Thus zero mean assumption is not appropriate.")

### Model Evaluation

In [66]:
# Model evaluation - F test
model_significance(y_train, y_fitted, 8)

Reject Ho. At least one of the model coefficients is significant.


### Coefficient evaluation


In [85]:
stats_model.summary()

0,1,2,3
Dep. Variable:,birthRate,R-squared (uncentered):,0.844
Model:,OLS,Adj. R-squared (uncentered):,0.83
Method:,Least Squares,F-statistic:,60.67
Date:,"Fri, 05 May 2023",Prob (F-statistic):,5.83e-33
Time:,14:59:57,Log-Likelihood:,-345.94
No. Observations:,98,AIC:,707.9
Df Residuals:,90,BIC:,728.6
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
gdpPerCapita,-1.185e-07,6.65e-07,-0.178,0.859,-1.44e-06,1.2e-06
Ladder score,-2.7697,1.962,-1.412,0.161,-6.667,1.128
Social support,-0.3744,0.142,-2.630,0.010,-0.657,-0.092
Healthy life expectancy,0.3740,0.173,2.168,0.033,0.031,0.717
Freedom to make life choices,0.3082,0.113,2.723,0.008,0.083,0.533
Generosity,0.1270,0.068,1.855,0.067,-0.009,0.263
Perceptions of corruption,0.1992,0.072,2.750,0.007,0.055,0.343
rateWb,0.0256,0.189,0.136,0.892,-0.350,0.401

0,1,2,3
Omnibus:,3.51,Durbin-Watson:,2.034
Prob(Omnibus):,0.173,Jarque-Bera (JB):,3.355
Skew:,0.451,Prob(JB):,0.187
Kurtosis:,2.909,Cond. No.,7330000.0


In [88]:
# GDP coefficient
coefficient_testing(coef_gdp, residuals_train, x_train['gdpPerCapita'],8)

coeff_var, std: 3.552010617084641e-14 1.884677854988656e-07
Hypothesis: Ho: B1 = 0, Ha: B1 !=0
t_value: 1.2311905969703867
p_value: 0.221495165463667043059814
Accept Ho. The coefficient is not significant, and does not have a relationship with respond variable


In [89]:
# Happiness index
coefficient_testing(coef_happiness_index, residuals_train, x_train['Ladder score'], 8)

coeff_var, std: 0.16838506971241882 0.41034749872811316
Hypothesis: Ho: B1 = 0, Ha: B1 !=0
t_value: -2.005480502530947
p_value: 0.047949139965001372210018
Accept Ho. The coefficient is not significant, and does not have a relationship with respond variable


In [90]:
# Social support
coefficient_testing(coef_social_support, residuals_train, x_train['Social support'],8)

coeff_var, std: 0.0012617560464571768 0.03552120558845345
Hypothesis: Ho: B1 = 0, Ha: B1 !=0
t_value: -2.532564044164085
p_value: 0.013075905674820909735390
Reject Ho. The coefficient is significant, and has a relationship with the respond variable


In [91]:
# Healthy life expectancy
coefficient_testing(coef_life_expectancy, residuals_train, x_train['Healthy life expectancy'], 8)

coeff_var, std: 0.0072340563213693825 0.08505325579523328
Hypothesis: Ho: B1 = 0, Ha: B1 !=0
t_value: -15.55818059238335
p_value: 0.000000000000000000000000
Reject Ho. The coefficient is significant, and has a relationship with the respond variable


In [92]:
# Freedom to make life choices
coefficient_testing(coef_freedom, residuals_train, x_train['Freedom to make life choices'], 8)

coeff_var, std: 0.001961363049456264 0.044287278641346475
Hypothesis: Ho: B1 = 0, Ha: B1 !=0
t_value: -0.4245691793203524
p_value: 0.672175370815218498066201
Accept Ho. The coefficient is not significant, and does not have a relationship with respond variable


In [93]:
# Generosity
coefficient_testing(coef_generosity, residuals_train, x_train['Generosity'], 8)

coeff_var, std: 0.001102431530123563 0.03320288436451814
Hypothesis: Ho: B1 = 0, Ha: B1 !=0
t_value: -0.32259014473863584
p_value: 0.747762261755753110392675
Accept Ho. The coefficient is not significant, and does not have a relationship with respond variable


In [94]:
# Perceptions of corruption
coefficient_testing(coef_corruption, residuals_train, x_train['Perceptions of corruption'], 8)

coeff_var, std: 0.0008033870137933444 0.02834408251810851
Hypothesis: Ho: B1 = 0, Ha: B1 !=0
t_value: -1.6175132534334893
p_value: 0.109306518402386684818417
Accept Ho. The coefficient is not significant, and does not have a relationship with respond variable


In [95]:
# Unemployment rate: rateWb
coefficient_testing(coef_unemployment, residuals_train, x_train['rateWb'], 8)

coeff_var, std: 0.008059283285982375 0.08977351104854023
Hypothesis: Ho: B1 = 0, Ha: B1 !=0
t_value: -2.585796535610298
p_value: 0.011340326988791797971357
Reject Ho. The coefficient is significant, and has a relationship with the respond variable


## Accuracy Evaluation

## Residuals Evaluation