In [199]:
import pandas as pd
import numpy as np
# Importing the libraries
from sklearn.model_selection import train_test_split # for data validation

# Models
from sklearn.linear_model import LinearRegression, BayesianRidge, LassoLars
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb
from xgboost import XGBRegressor

# Metrics and Grid Search
from sklearn import model_selection, metrics
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import explained_variance_score, confusion_matrix, accuracy_score, classification_report, r2_score

In [164]:
hapiness_df= pd.read_csv('Resources/hapiness_data.csv')
hapiness_df.head()

Unnamed: 0,Country name,year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect,Confidence in national government
0,Afghanistan,2008,3.72359,7.302574,0.450662,50.5,0.718114,0.173169,0.881686,0.414297,0.258195,0.612072
1,Afghanistan,2009,4.401778,7.472446,0.552308,50.799999,0.678896,0.195469,0.850035,0.481421,0.237092,0.611545
2,Afghanistan,2010,4.758381,7.579183,0.539075,51.099998,0.600127,0.125859,0.706766,0.516907,0.275324,0.299357
3,Afghanistan,2011,3.831719,7.552006,0.521104,51.400002,0.495901,0.167723,0.731109,0.479835,0.267175,0.307386
4,Afghanistan,2012,3.782938,7.637953,0.520637,51.700001,0.530935,0.241247,0.77562,0.613513,0.267919,0.43544


In [146]:
#Check how many missing values there are in the dataset
hapiness_df.isnull().sum().sum()

579

In [147]:
# Drop null values
#hapiness_df=hapiness_df.dropna()
hapiness_df= hapiness_df.reindex()

In [148]:
hapiness_df.shape

(1747, 12)

In [149]:
#Grouping the dataset by Country name and calculate the Mean
data_model = hapiness_df.groupby(by= 'Country name')['Life Ladder', 'Log GDP per capita','Social support', 'Healthy life expectancy at birth','Freedom to make life choices','Generosity','Perceptions of corruption', 'Positive affect', 'Negative affect','Confidence in national government'].mean().reset_index()
data_model

  data_model = hapiness_df.groupby(by= 'Country name')['Life Ladder', 'Log GDP per capita','Social support', 'Healthy life expectancy at birth','Freedom to make life choices','Generosity','Perceptions of corruption', 'Positive affect', 'Negative affect','Confidence in national government'].mean().reset_index()


Unnamed: 0,Country name,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect,Confidence in national government
0,Afghanistan,3.594628,7.583020,0.508245,52.170833,0.518012,0.075328,0.843283,0.473482,0.326684,0.392547
1,Albania,4.898745,9.405987,0.698942,68.691365,0.714779,-0.044853,0.884626,0.555175,0.300746,0.407852
2,Angola,4.420299,8.989725,0.737973,52.150001,0.455957,-0.088896,0.867018,0.625734,0.351173,0.397389
3,Argentina,6.285048,10.031485,0.903041,66.628125,0.771455,-0.152174,0.840388,0.739983,0.287840,0.381224
4,Armenia,4.566087,9.285748,0.721515,65.573333,0.579216,-0.197553,0.837074,0.495732,0.437056,0.349403
...,...,...,...,...,...,...,...,...,...,...,...
144,Venezuela,6.042505,8.533054,0.914680,65.079231,0.657372,-0.096557,0.799767,0.787192,0.251597,0.386531
145,Vietnam,5.400847,8.556147,0.806390,64.540000,0.858053,0.016298,0.783006,0.583961,0.200419,0.836712
146,Yemen,3.899032,8.081041,0.702208,58.420625,0.627088,-0.134543,0.828157,0.465654,0.300970,0.420266
147,Zambia,4.453841,8.066699,0.729828,51.535000,0.761771,0.022134,0.828492,0.678728,0.297978,0.572063


In [189]:
#Creating the table model

# Creating the independent and depend variables
y = data_model['Life Ladder']
X = data_model[['Log GDP per capita','Social support', 'Healthy life expectancy at birth','Freedom to make life choices','Generosity' ,'Perceptions of corruption','Positive affect', 'Negative affect','Confidence in national government']]

# Splitting the data into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.6, random_state=1)

In [190]:
y_test.count()

90

In [191]:
# Creating a predefined function to test the models
def modelfit(model):
   model.fit(X_train, y_train)
   preds = model.predict(X_test)
   mae = metrics.mean_absolute_error(y_test, preds)
   return (round(mae,4))

In [192]:

#Linear Regression
lm = LinearRegression(n_jobs = 10000)
model1=('Linear Regression', modelfit(lm))

# Random Forest Regressor
rf = RandomForestRegressor()
model2=('Random Forest Regressor', modelfit(rf))

# XGBoost
xg = XGBRegressor(learning_rate=0.1, n_estimators=5000)
model3=('XGBoost',modelfit(xg))

# Decision Tree
dt = DecisionTreeRegressor()
model4= ('Decision Tree', modelfit(dt))

# Bayesian Linear Model
br = BayesianRidge(n_iter=1000, tol = 0.5)
model5=('Bayesian Linear Model', modelfit(br))

#Create a DataFrame with Models
models = pd.DataFrame(data= [model1, model2, model3, model4, model5], columns=['Model', 'MAE'])
models
# mmodels.style.apply(highlight_cell, )

Unnamed: 0,Model,MAE
0,Linear Regression,0.3906
1,Random Forest Regressor,0.3783
2,XGBoost,0.3874
3,Decision Tree,0.5198
4,Bayesian Linear Model,0.4194


In [202]:
xg_reg_model = xgb.XGBRegressor(objective ='reg:linear', learning_rate = 0.1, n_estimators=5000)
xg_reg_model.fit(X_train, y_train)
xg_predictions = xg_reg_model.predict(X_test)
xg_predictions



array([5.262018 , 5.4608755, 6.0133257, 4.163924 , 3.915171 , 4.8255253,
       5.9982886, 3.9846919, 6.8619604, 4.849948 , 6.9992366, 5.345094 ,
       4.075257 , 5.3883066, 7.456012 , 6.779226 , 5.3529716, 4.7769494,
       4.7351604, 7.1164837, 6.78809  , 6.0909624, 6.5795765, 6.6198993,
       5.7921634, 4.035946 , 5.8300586, 4.4551444, 5.332234 , 5.4235635,
       5.8538313, 7.110623 , 5.3307858, 4.895803 , 4.0929756, 5.4917903,
       6.181309 , 5.6505475, 5.3609166, 5.4556804, 5.1753197, 4.7559376,
       5.5678   , 5.631397 , 4.863908 , 6.6557913, 6.1342063, 4.8108096,
       4.3620067, 4.1734624, 4.677459 , 4.072746 , 6.6662555, 7.2318926,
       4.0388117, 6.80899  , 4.6861076, 5.6457453, 4.070741 , 5.3476524,
       3.7105572, 4.9170294, 7.1302733, 6.6848335, 7.3285007, 6.759253 ,
       5.413713 , 6.0446796, 6.70063  , 4.4127383, 4.062571 , 6.465551 ,
       4.8414607, 4.8459587, 4.9598327, 5.8157754, 4.7361107, 6.0227637,
       4.570067 , 6.814828 , 6.2832904, 5.912626 , 

In [203]:
r2_score(y_test, xg_predictions)

0.7928957647069097

In [204]:
# Create a random forest regresor.
rf_model = RandomForestRegressor(n_estimators=128, random_state=78) 
rf_model = rf_model.fit(X_train, y_train)
rf_predictions= rf_model.predict(X_test)
rf_predictions

array([5.24339532, 5.22115167, 5.82301676, 4.50335405, 4.16570305,
       4.67768635, 5.56302996, 4.18622058, 7.01520208, 4.7821355 ,
       6.99405451, 5.29720026, 4.18685016, 5.2562597 , 7.12793378,
       6.21760345, 5.31306281, 4.75284224, 4.76048971, 7.02421015,
       6.70869939, 6.01395375, 6.4692332 , 6.31652859, 5.96567863,
       4.11109183, 5.68445745, 4.81161454, 5.41482535, 5.30500049,
       5.6316678 , 6.94334947, 5.18157767, 4.81080868, 4.20061303,
       4.94059947, 6.0862235 , 5.57674178, 5.42009015, 5.19388478,
       5.53502683, 4.77176941, 5.60765356, 5.58866989, 4.75688841,
       6.78233653, 5.71008884, 4.77124838, 4.30662296, 4.2475578 ,
       4.8359778 , 4.04351933, 6.63905638, 6.91982763, 4.0731257 ,
       6.55773674, 4.80851352, 5.76807824, 4.23090092, 5.19092772,
       4.05776509, 4.81363655, 7.02470612, 6.37854647, 7.09973475,
       6.94568077, 5.38386142, 6.02390924, 6.47789114, 4.34081595,
       4.10404161, 5.6848231 , 4.79393805, 4.84663585, 4.72536

In [205]:
r2_score(y_test, rf_predictions)

0.810495830809052