In [127]:
import pandas as pd
import numpy as np
# Importing the libraries
from sklearn.model_selection import train_test_split # for data validation

# Models
from sklearn.linear_model import LinearRegression, BayesianRidge, LassoLars
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb
from xgboost import XGBRegressor

# Metrics and Grid Search
from sklearn import model_selection, metrics
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import explained_variance_score

In [113]:
# Read the happiness CSV into a DataFrame and preview its first five rows
hapiness_df = pd.read_csv('hapiness_data.csv')
hapiness_df.head()

Unnamed: 0,Country name,year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect,Confidence in national government
0,Afghanistan,2008,3.72359,7.302574,0.450662,50.5,0.718114,0.173169,0.881686,0.414297,0.258195,0.612072
1,Afghanistan,2009,4.401778,7.472446,0.552308,50.799999,0.678896,0.195469,0.850035,0.481421,0.237092,0.611545
2,Afghanistan,2010,4.758381,7.579183,0.539075,51.099998,0.600127,0.125859,0.706766,0.516907,0.275324,0.299357
3,Afghanistan,2011,3.831719,7.552006,0.521104,51.400002,0.495901,0.167723,0.731109,0.479835,0.267175,0.307386
4,Afghanistan,2012,3.782938,7.637953,0.520637,51.700001,0.530935,0.241247,0.77562,0.613513,0.267919,0.43544


In [114]:
# Total number of missing cells, summed over every column of the dataset
hapiness_df.isna().sum().sum()

579

In [115]:
# Drop every row that contains at least one missing value, then renumber the
# remaining rows 0..n-1.
# `reindex()` with no arguments (the previous code) returns the frame
# unchanged, so the index kept the gaps left by the dropped rows.
# `reset_index(drop=True)` rebuilds the index; `drop=True` prevents the old
# labels from being inserted as an extra column.
hapiness_df = hapiness_df.dropna().reset_index(drop=True)

In [116]:
# (rows, columns) of the DataFrame at this point in the notebook
hapiness_df.shape

(1747, 12)

In [117]:
#Summary Statistics
#from pydoc import describe
#hapiness_df.groupby(by='year')['Life Ladder'].describe()

In [118]:
# Creating the table model: one row per country, with every indicator averaged
# over all of that country's remaining yearly observations.
TARGET_COLUMN = 'Life Ladder'
FEATURE_COLUMNS = [
    'Log GDP per capita',
    'Social support',
    'Healthy life expectancy at birth',
    'Freedom to make life choices',
    'Generosity',
    'Perceptions of corruption',
    'Positive affect',
    'Negative affect',
    'Confidence in national government',
]

# Select the GroupBy columns with a *list*. Indexing with a bare tuple of
# names (`[...]['a', 'b']`) triggers a FutureWarning on pandas 1.x and is an
# error on pandas 2.x.
data_model = (
    hapiness_df
    .groupby(by='Country name')[[TARGET_COLUMN] + FEATURE_COLUMNS]
    .mean()
    .reset_index()
)

# Creating the dependent (y) and independent (X) variables
y = data_model[TARGET_COLUMN]
X = data_model[FEATURE_COLUMNS]

# Splitting the data into train and test data: 80/20 split with a fixed seed
# so the same countries land in each split on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

  data_model = hapiness_df.groupby(by= 'Country name')['Life Ladder', 'Log GDP per capita','Social support', 'Healthy life expectancy at birth','Freedom to make life choices','Generosity','Perceptions of corruption', 'Positive affect', 'Negative affect','Confidence in national government'].mean().reset_index()


In [119]:
# Creating a predefined function to test the models
def modelfit(model):
    """Train ``model`` and report its mean absolute error on the test split.

    The data is not passed in: the function reads the notebook-level
    ``X_train``, ``y_train``, ``X_test`` and ``y_test`` variables, so the
    train/test split cell must have been executed first.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place.

    Returns
    -------
    float
        Mean absolute error on the test split, rounded to 4 decimal places.
    """
    model.fit(X_train, y_train)
    test_error = metrics.mean_absolute_error(y_test, model.predict(X_test))
    return round(test_error, 4)

In [120]:

# Fit each candidate regressor on the training split and collect its test MAE.

# Linear Regression
# The previous `n_jobs=10000` was dropped: it is not a meaningful worker
# count, and the default is sufficient for this single-target fit.
lm = LinearRegression()
model1 = ('Linear Regression', modelfit(lm))

# Random Forest Regressor
# `n_jobs=-1` uses all available cores (the previous value was 1000).
# `random_state` pins the bootstrap/feature sampling so the MAE is
# reproducible from run to run.
rf = RandomForestRegressor(n_jobs=-1, random_state=1)
model2 = ('Random Forest Regressor', modelfit(rf))

# XGBoost
xg = XGBRegressor(learning_rate=0.1, n_estimators=5000)
model3 = ('XGBoost', modelfit(xg))

# Decision Tree
# Seeded for the same reason as the forest: tie-breaking between equally good
# splits is otherwise random.
dt = DecisionTreeRegressor(random_state=1)
model4 = ('Decision Tree', modelfit(dt))

# Bayesian Linear Model
# The explicit iteration cap was removed because its keyword differs between
# scikit-learn releases (`n_iter` was renamed to `max_iter`), so passing either
# name breaks on some versions.
# NOTE(review): with a tolerance this loose the fit is expected to stop well
# before the default cap -- confirm if the tolerance is ever tightened.
br = BayesianRidge(tol=0.5)
model5 = ('Bayesian Linear Model', modelfit(br))

# Create a DataFrame with one row per model and its test-set MAE
models = pd.DataFrame(
    data=[model1, model2, model3, model4, model5],
    columns=['Model', 'MAE'],
)
models

Unnamed: 0,Model,MAE
0,Linear Regression,0.3381
1,Random Forest Regressor,0.3121
2,XGBoost,0.2496
3,Decision Tree,0.336
4,Bayesian Linear Model,0.3439


In [130]:
# Fit a standalone XGBoost regressor and inspect its predictions on the test
# split.
# 'reg:squarederror' is the current name of the squared-error objective;
# 'reg:linear' is its deprecated alias.
xg_reg_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    learning_rate=0.1,
    n_estimators=5000,
)
xg_reg_model.fit(X_train, y_train)
predictions = xg_reg_model.predict(X_test)

# Optional extra metric; note the argument order is (y_true, y_pred):
# print(explained_variance_score(y_test, predictions))

# For a regressor, `score` returns the coefficient of determination (R^2),
# not a classification accuracy, so the output is labelled accordingly.
print("R^2 of Model::", xg_reg_model.score(X_test, y_test))

predictions

0.906972990684741
Accuracy of Model:: 0.919930142515041


array([5.227877 , 5.2211776, 5.3684063, 4.312965 , 4.018866 , 4.7306104,
       5.671484 , 3.953761 , 7.204849 , 4.738239 , 7.348206 , 4.836131 ,
       4.167731 , 5.3687043, 7.375043 , 6.189487 , 5.4327154, 4.522335 ,
       4.6490316, 7.312394 , 6.2902637, 5.8162036, 6.8719883, 6.0077467,
       5.686565 , 4.2055144, 5.324521 , 4.4011164, 5.252072 , 5.150143 ],
      dtype=float32)