In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoCV
whr = pd.read_csv('whr_NoNA_all_topia.csv', sep=';',index_col = 0)
whr.head()

Unnamed: 0,Country,Year,Life Ladder,LogGDP,SocialSupport,LifeExpectancy,Freedom,Generosity,Corruption,PosAffect,...,Christianisme,Islam,Sans-religion,Hindouisme,Bouddhisme,Religions traditionnelles,Autres,Judaisme,GiniCoeff21,Corrup
0,Afghanistan,2008,3.724,7.37,0.451,50.8,0.718,0.168,0.882,0.518,...,0.001,0.997,0.0,0.0,0.0,0.0,0.0,0.0,29.4,18
1,Afghanistan,2009,4.402,7.54,0.552,51.2,0.679,0.19,0.85,0.584,...,0.001,0.997,0.0,0.0,0.0,0.0,0.0,0.0,29.4,15
2,Afghanistan,2010,4.758,7.647,0.539,51.6,0.6,0.121,0.707,0.618,...,0.001,0.997,0.0,0.0,0.0,0.0,0.0,0.0,29.4,14
3,Afghanistan,2011,3.832,7.62,0.521,51.92,0.496,0.162,0.731,0.611,...,0.001,0.997,0.0,0.0,0.0,0.0,0.0,0.0,29.4,15
4,Afghanistan,2012,3.783,7.705,0.521,52.24,0.531,0.236,0.776,0.71,...,0.001,0.997,0.0,0.0,0.0,0.0,0.0,0.0,29.4,8


In [3]:
whr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1618 entries, 0 to 1617
Data columns (total 28 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Country                    1618 non-null   object 
 1   Year                       1618 non-null   int64  
 2   Life Ladder                1618 non-null   float64
 3   LogGDP                     1618 non-null   float64
 4   SocialSupport              1618 non-null   float64
 5   LifeExpectancy             1618 non-null   float64
 6   Freedom                    1618 non-null   float64
 7   Generosity                 1618 non-null   float64
 8   Corruption                 1618 non-null   float64
 9   PosAffect                  1618 non-null   float64
 10  NegAffect                  1618 non-null   float64
 11  Region                     1618 non-null   object 
 12  Regime                     1618 non-null   object 
 13  AreaInK2                   1618 non-null   int64

In [19]:
## REGRESSION LINEAIRE MULTIPLE DE LASSO
## Modèle retenu : Ensemble des variables

score = whr[['Life Ladder']]
data = whr.drop(columns=['Region','Regime', 'Country', 'Year', 'Life Ladder', 'PopAnnualGrowthRate', 'Population', 'AreaInK2', 'InfantMortalityRate', 'Corruption'])

# TRANSFORMATION DES REGIONS EN DUMMIES
regions = whr[['Region']]
regions = pd.get_dummies(regions)
data = data.join(regions).drop(columns='Region_Topia')

# TRANSFORMATION DES REGIMES POLITIQUES EN DUMMIES
politics = whr[['Regime']]
politics = pd.get_dummies(politics)
data = data.join(politics)

# ENSEMBLES, ENTRAINEMENT ET PERFORMANCES
X_train, X_test, y_train, y_test = train_test_split(data, score, test_size=0.2, random_state=123) 
model_lasso = LassoCV(cv=10, alphas=([10, 1, 0.1, 0.01, 0.001])).fit(X_train, y_train)
pred_test = model_lasso.predict(X_test)

# RESULTATS
print('Score R2 obtenu avec une régression de Lasso expliquant le score Life Ladder :')
r2train2, r2test2 = round(model_lasso.score(X_train, y_train),3), round(model_lasso.score(X_test, y_test),3)
print("Score sur l'ensemble d'entrainement :", r2train2)
print("Score sur l'ensemble de test :",r2test2)
print('\n')

print('Plus de 80% de la variance du score Life Ladder peut être expliqué grâce aux variables,')
print("à savoir que le modèle a retenu uniquement les variables qui étaient pertinentes.")
print('\n')

coeff_table = pd.DataFrame(
    {'Variable': list(X_train.columns),
     'Coeff': list(model_lasso.coef_)
    })

print("Tabe des coefficients associés à chaque variable dans le modèle le plus efficace :")
print(coeff_table.head(31).sort_values(by='Coeff', ascending=False))
#print('\n')
#print('Nombre de variables retenues :', model_lasso.coef_[model_lasso.coef_ != 0].shape)
#print('Nombre de variables éliminées :', model_lasso.coef_[model_lasso.coef_ == 0].shape)

Score R2 obtenu avec une régression de Lasso expliquant le score Life Ladder :
Score sur l'ensemble d'entrainement : 0.835
Score sur l'ensemble de test : 0.851


Plus de 80% de la variance du score Life Ladder peut être expliqué grâce aux variables,
à savoir que le modèle a retenu uniquement les variables qui étaient pertinentes.


Tabe des coefficients associés à chaque variable dans le modèle le plus efficace :
                                     Variable     Coeff
15                                   Judaisme  1.384359
1                               SocialSupport  1.346097
5                                   PosAffect  1.260522
3                                     Freedom  0.788405
21         Region_Latin America and Caribbean  0.531216
4                                  Generosity  0.453898
0                                      LogGDP  0.421360
23               Region_North America and ANZ  0.409551
27                      Region_Western Europe  0.298166
9                      

  return f(*args, **kwargs)


In [20]:
## REGRESSION LINEAIRE MULTIPLE DE LASSO
## Modèle retenu : Retrait des autres réponses au sondage World Happiness Report

score = whr[['Life Ladder']]
data = whr.drop(columns=['Region','Regime', 'Country', 'Year', 'Life Ladder', 'PopAnnualGrowthRate', 'Population', 'AreaInK2', 'InfantMortalityRate', 'PosAffect', 'NegAffect', 'Freedom', 'Generosity', 'Freedom', 'SocialSupport', 'Corruption'])

# TRANSFORMATION DES REGIMES POLITIQUES EN DUMMIES
data = data.join(politics)
# TRANSFORMATION DES REGIONS EN DUMMIES
data = data.join(regions).drop(columns='Region_Topia')

# ENSEMBLES, ENTRAINEMENT ET PERFORMANCES
X_train, X_test, y_train, y_test = train_test_split(data, score, test_size=0.2, random_state=123) 
model_lasso = LassoCV(cv=10, alphas=([10, 1, 0.1, 0.01, 0.001])).fit(X_train, y_train)
pred_test = model_lasso.predict(X_test)

# RESULTATS
print('Score R2 obtenu avec une régression de Lasso expliquant le score Life Ladder :')
r2train2, r2test2 = round(model_lasso.score(X_train, y_train),3), round(model_lasso.score(X_test, y_test),3)
print("Score sur l'ensemble d'entrainement :", r2train2)
print("Score sur l'ensemble de test :",r2test2)
print('\n')

print('Plus de 80% de la variance du score Life Ladder peut être expliqué grâce aux variables,')
print("à savoir que le modèle a retenu uniquement les variables qui étaient pertinentes.")
print('\n')

coeff_table = pd.DataFrame(
    {'Variable': list(X_train.columns),
     'Coeff': list(model_lasso.coef_)
    })

print("Tabe des coefficients associés à chaque variable dans le modèle le plus efficace :")
print(coeff_table.head(31).sort_values(by='Coeff', ascending=False))
#print('\n')
#print('Nombre de variables retenues :', model_lasso.coef_[model_lasso.coef_ != 0].shape)
#print('Nombre de variables éliminées :', model_lasso.coef_[model_lasso.coef_ == 0].shape)

Score R2 obtenu avec une régression de Lasso expliquant le score Life Ladder :
Score sur l'ensemble d'entrainement : 0.793
Score sur l'ensemble de test : 0.815


Plus de 80% de la variance du score Life Ladder peut être expliqué grâce aux variables,
à savoir que le modèle a retenu uniquement les variables qui étaient pertinentes.


Tabe des coefficients associés à chaque variable dans le modèle le plus efficace :
                                     Variable     Coeff
10                                   Judaisme  1.694630
20         Region_Latin America and Caribbean  0.964091
22               Region_North America and ANZ  0.733196
0                                      LogGDP  0.509536
26                      Region_Western Europe  0.455316
24                      Region_Southeast Asia  0.451607
4                                       Islam  0.326301
15                      Regime_Full democracy  0.220733
3                               Christianisme  0.082231
6                      

  return f(*args, **kwargs)


In [14]:
def Previous_Life_ladder(df):
    """
    Description : Add a column life ladder Year-1 to a given dataframe
    Input : Dataframe with the columns 'Country', 'Year', 'Life Ladder'
    Output : Dataframe with the column 'Life Ladder-1' added and other years range dropped.
    """
    df_whr_previous = pd.DataFrame()
    
    for i in df["Country"].unique():
        df_filtered = df[df["Country"]==i]
        df_filtered["Year-1"] = df_filtered["Year"].shift(1)
        df_filtered["Life Ladder-1"] = df_filtered["Life Ladder"].shift(1)
        
        df_whr_previous = df_whr_previous.append(df_filtered)
        df_whr_previous = df_whr_previous.fillna(method="bfill")
    
    df_whr_previous["diff_Y"] = df_whr_previous["Year"]-df_whr_previous["Year-1"]
    df_whr_previous = df_whr_previous[df_whr_previous["diff_Y"]==1]
    df_whr_previous = df_whr_previous.drop(['Year-1','diff_Y'], axis=1)
    
    return df_whr_previous

whrP=Previous_Life_ladder(whr)
whrP.info()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered["Year-1"] = df_filtered["Year"].shift(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered["Life Ladder-1"] = df_filtered["Life Ladder"].shift(1)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1371 entries, 1 to 1617
Data columns (total 29 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Country                    1371 non-null   object 
 1   Year                       1371 non-null   int64  
 2   Life Ladder                1371 non-null   float64
 3   LogGDP                     1371 non-null   float64
 4   SocialSupport              1371 non-null   float64
 5   LifeExpectancy             1371 non-null   float64
 6   Freedom                    1371 non-null   float64
 7   Generosity                 1371 non-null   float64
 8   Corruption                 1371 non-null   float64
 9   PosAffect                  1371 non-null   float64
 10  NegAffect                  1371 non-null   float64
 11  Region                     1371 non-null   object 
 12  Regime                     1371 non-null   object 
 13  AreaInK2                   1371 non-null   int64

In [21]:
## REGRESSION LINEAIRE MULTIPLE DE LASSO
## Modèle alternatif "à inertie" : Ajout de la variable score Life Ladder n-1

LifeLadderP = whrP['Life Ladder']
dataP = whrP.drop(columns=['Region','Regime', 'Country', 'Year', 'Life Ladder', 'PopAnnualGrowthRate', 'Population', 'AreaInK2', 'InfantMortalityRate', 'Corruption', 'PosAffect', 'NegAffect', 'Freedom', 'Generosity', 'SocialSupport'])

## TRANSFORMATION DES REGIONS EN DUMMIES
regions = whrP[['Region']]
regions = pd.get_dummies(regions)
dataP = dataP.join(regions).drop(columns='Region_Topia')

## TRANSFORMATION DES REGIMES POLITIQUES EN DUMMIES
politics = whrP[['Regime']]
politics = pd.get_dummies(politics)
dataP = dataP.join(politics)

# ENSEMBLES, ENTRAINEMENT ET PERFORMANCES
X_train, X_test, y_train, y_test = train_test_split(dataP, LifeLadderP, test_size=0.2, random_state=123) 
model_lasso = LassoCV(cv=10, alphas=([10, 1, 0.1, 0.01, 0.001])).fit(X_train, y_train)
pred_test = model_lasso.predict(X_test)

# RESULTATS
print('Score R2 obtenu avec une régression de Lasso expliquant le score Life Ladder avec le PIB et les régions :')
r2train2, r2test2 = round(model_lasso.score(X_train, y_train),3), round(model_lasso.score(X_test, y_test),3)
print("Score sur l'ensemble d'entrainement :", r2train2)
print("Score sur l'ensemble de test :",r2test2)
print('\n')

print('Plus de 90% de la variance du score Life Ladder peut être expliqué grâce aux variables,')
print("à savoir que le modèle a retenu uniquement les variables qui étaient pertinentes.")
print('\n')

coeff_table = pd.DataFrame(
    {'Variable': list(X_train.columns),
     'Coeff': list(model_lasso.coef_)
    })

print("Tabe des coefficients associés à chaque variable dans le modèle le plus efficace :")
print(coeff_table.head(31).sort_values(by='Coeff', ascending=False))
#print('\n')
#print('Nombre de variables retenues :', model_lasso.coef_[model_lasso.coef_ != 0].shape)
#print('Nombre de variables éliminées :', model_lasso.coef_[model_lasso.coef_ == 0].shape)

Score R2 obtenu avec une régression de Lasso expliquant le score Life Ladder avec le PIB et les régions :
Score sur l'ensemble d'entrainement : 0.9
Score sur l'ensemble de test : 0.911


Plus de 90% de la variance du score Life Ladder peut être expliqué grâce aux variables,
à savoir que le modèle a retenu uniquement les variables qui étaient pertinentes.


Tabe des coefficients associés à chaque variable dans le modèle le plus efficace :
                                     Variable     Coeff
13                              Life Ladder-1  0.715039
10                                   Judaisme  0.433392
17         Region_Latin America and Caribbean  0.243875
0                                      LogGDP  0.136526
19               Region_North America and ANZ  0.128569
23                      Region_Western Europe  0.091746
4                                       Islam  0.072730
26                      Regime_Full democracy  0.057025
21                      Region_Southeast Asia  0.04606

In [2]:
## REGRESSION LINEAIRE MULTIPLE DE LASSO
## Modèle retenu : Prédiction de l'année 2020

score = whr[['Life Ladder', 'Year']]
data = whr.drop(columns=['Region','Regime', 'Country', 'Life Ladder', 'PopAnnualGrowthRate', 'Population', 'AreaInK2', 'InfantMortalityRate', 'Corruption', 'SocialSupport', 'PosAffect', 'NegAffect', 'Freedom', 'Generosity'])

# TRANSFORMATION DES REGIONS EN DUMMIES
regions = whr[['Region']]
regions = pd.get_dummies(regions)
data = data.join(regions).drop(columns='Region_Topia')

# TRANSFORMATION DES REGIMES POLITIQUES EN DUMMIES
politics = whr[['Regime']]
politics = pd.get_dummies(politics)
data = data.join(politics)

# ENSEMBLES
data20 = data.drop(data[data.Year < 2020].index).drop(columns=['Year'])
score20 = score.drop(whr[whr.Year < 2020].index)
score20 = score20['Life Ladder']

data = data.drop(data[data.Year > 2019].index).drop(columns=['Year'])
score = score.drop(whr[whr.Year > 2019].index)
score = score['Life Ladder'] 

X_train, X_test, y_train, y_test = data, data20, score, score20

# ENTRAINEMENT ET PERFORMANCES
model_lasso = LassoCV(cv=10, alphas=([10, 1, 0.1, 0.001])).fit(X_train, y_train)
pred_test = model_lasso.predict(X_test)

# RESULTATS
print('Score R2 obtenu avec une régression de Lasso expliquant le score Life Ladder 2020 :')
r2train2, r2test2 = round(model_lasso.score(X_train, y_train),3), round(model_lasso.score(X_test, y_test),3)
print("Score sur l'ensemble d'entrainement :", r2train2)
print("Score sur l'ensemble de test :",r2test2)
print('\n')

print('Près de 80% de la variance du score Life Ladder peut être expliqué grâce aux variables,')
print("à savoir que le modèle a retenu uniquement les variables qui étaient pertinentes.")
print('\n')

coeff_table = pd.DataFrame(
    {'Variable': list(X_train.columns),
     'Coeff': list(model_lasso.coef_)
    })

print("Tabe des coefficients associés à chaque variable dans le modèle le plus efficace :")
print(coeff_table.head(31).sort_values(by='Coeff', ascending=False))
#print('\n')
#print('Nombre de variables retenues :', model_lasso.coef_[model_lasso.coef_ != 0].shape)
#print('Nombre de variables éliminées :', model_lasso.coef_[model_lasso.coef_ == 0].shape)

Score R2 obtenu avec une régression de Lasso expliquant le score Life Ladder 2020 :
Score sur l'ensemble d'entrainement : 0.799
Score sur l'ensemble de test : 0.772


Près de 80% de la variance du score Life Ladder peut être expliqué grâce aux variables,
à savoir que le modèle a retenu uniquement les variables qui étaient pertinentes.


Tabe des coefficients associés à chaque variable dans le modèle le plus efficace :
                                     Variable     Coeff
10                                   Judaisme  1.910138
16         Region_Latin America and Caribbean  1.005977
18               Region_North America and ANZ  0.802722
20                      Region_Southeast Asia  0.506809
0                                      LogGDP  0.503906
22                      Region_Western Europe  0.490510
4                                       Islam  0.296517
25                      Regime_Full democracy  0.240960
6                                  Hindouisme  0.134282
3                 

In [12]:
pred_test = model_lasso.predict(X_test)
whr_perf = whr.drop(whr[whr.Year < 2020].index)
whr_perf = whr_perf[['Life Ladder', 'Country', 'Region', 'Year']]
whr_perf['Prediction'] = pred_test
whr_perf['Error'] = whr_perf['Prediction'] - whr_perf['Life Ladder']
whr_perf.head()

Unnamed: 0,Life Ladder,Country,Region,Year,Prediction,Error
24,5.365,Albania,Central and Eastern Europe,2020,5.265192,-0.099808
48,5.901,Argentina,Latin America and Caribbean,2020,6.265231,0.364231
75,7.137,Australia,North America and ANZ,2020,7.322074,0.185074
88,7.213,Austria,Western Europe,2020,7.091623,-0.121377
116,5.28,Bangladesh,South Asia,2020,4.745322,-0.534678


In [23]:
from bokeh.io import output_notebook, show
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource,LabelSet, Label, CategoricalColorMapper
from bokeh.models.tools import HoverTool
from bokeh.transform import factor_cmap
from bokeh.palettes import Category10_10
from bokeh.models import Range1d
output_notebook()

source = ColumnDataSource(data={'Error':whr_perf['Error'].values,
                                'Life Ladder':whr_perf['Life Ladder'].values,
                                'Country':whr_perf['Country'].values,
                                'Region_color': whr_perf['Region'].values,
                                'Year':whr_perf['Year'].values})
color_mapper = CategoricalColorMapper(factors=whr_perf['Region'].unique(), palette=Category10_10)
p = figure(plot_width = 1000, plot_height = 600)
p.circle(x = 'Error', y= 'Life Ladder', size = 15, color = {'field': 'Region_color', 'transform': color_mapper},
             source = source,  legend_field='Region_color', fill_alpha = 0.2)
p.add_tools(HoverTool(tooltips=[("Error", "$x"),("Life Ladder", "$y"), ("Country", "@Country"), ("Year", "@Year")]))
p.xaxis.axis_label = 'Error'
p.yaxis.axis_label = 'Life Ladder'
p.add_layout(LabelSet(x='Error', y='Life Ladder', text='Country',text_font_size='6pt',
                  x_offset=0, y_offset=5, source=source, render_mode='canvas'))
p.legend.location = "top_right"
p.x_range = Range1d(-2, 2)
p.y_range = Range1d(1, 10)
from bokeh.models import BoxAnnotation
green_box = BoxAnnotation(left=-0.5, right=0.5, fill_color='green', fill_alpha=0.1)

p.legend.location = "bottom_left"


p.add_layout(green_box)
show(p)

