<a href="https://colab.research.google.com/github/nicolesaade/WorldHappinessReportAnalysis/blob/main/WorldHappinessReport_Time_Series_2023final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#3. Predict 2023-29 Life Ladder

##3(a) Libraries

In [14]:
import numpy as np
import pandas as pd
import plotly.express as px
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, explained_variance_score, r2_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout

##3(b) Load Life Ladder Score



In [None]:
# Load data
data = pd.read_csv('happiness_dataset.csv')

# Normalize data
scaler = StandardScaler()
data['Life Ladder'] = scaler.fit_transform(data[['Life Ladder']])

# Dataframe with life ladder score 2018-2023
ladder_data = data[['Country name', 'year', 'Life Ladder']].loc[data['year'].isin([2018, 2019, 2020, 2021, 2022, 2023])]

# Remove countries missing any of the 2018-2023 life ladder scores
countries = set(ladder_data.loc[ladder_data['year']==2018]['Country name'].unique()).intersection(
    ladder_data.loc[ladder_data['year']==2019]['Country name'].unique(),
    ladder_data.loc[ladder_data['year']==2020]['Country name'].unique(),
    ladder_data.loc[ladder_data['year']==2021]['Country name'].unique(),
    ladder_data.loc[ladder_data['year']==2022]['Country name'].unique(),
    ladder_data.loc[ladder_data['year']==2023]['Country name'].unique()
)
ladder_data = ladder_data.loc[ladder_data['Country name'].isin(countries)]

# Show all the countries using df.head()
ladder_data.head(-1)

Unnamed: 0,Country name,year,Life Ladder
25,Albania,2018,-0.426173
26,Albania,2019,-0.434171
27,Albania,2020,-0.105365
28,Albania,2021,-0.203118
29,Albania,2022,-0.241331
...,...,...,...
2357,Zimbabwe,2018,-1.659641
2358,Zimbabwe,2019,-2.478990
2359,Zimbabwe,2020,-2.064872
2360,Zimbabwe,2021,-2.069315


Visualize Life Ladder Score Trajectory

In [None]:
fig = px.line(ladder_data, x='year', y='Life Ladder', color='Country name', title='Life Ladder Score by Country (Normalized)')
fig.show()

##3(c) LSTM

In [None]:
prediction = {}
for country in countries:
    country_data = ladder_data.loc[ladder_data['Country name']==country]

    X = country_data['Life Ladder'].values.reshape(-1, 1)

    model = Sequential()
    model.add(LSTM(50, activation='relu', return_sequences=True, input_shape=(1, 1)))
    model.add(Dropout(0.2))
    model.add(LSTM(50, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mse')
    model.fit(X[:-1], X[:-1], epochs=100, verbose=0) #train model with 2018-2022 data

    last_ladder = X[-2] #2022 data
    forecast = [] #stores predicted 2023-29 data
    for _ in range(6):
        lstm_preds = model.predict(last_ladder.reshape(-1, 1))
        forecast.append(lstm_preds[0,0])
        last_ladder = lstm_preds[0,0]

    prediction[country] = forecast

#print(prediction)



In [None]:
#Convert dictionary into Dataframe
p1_df = pd.DataFrame.from_dict(prediction, orient='index', columns=['Pred 2023', 'Pred 2024', 'Pred 2025', 'Pred 2026', 'Pred 2027', 'Pred 2028'])

#Store True 2023 values and the squared error (True 2023 vs Predicted 2023) into a list
true_2023 = []
error = []
for country in p1_df.index:
  y_true = ladder_data.loc[(ladder_data['Country name'] == country) & (ladder_data['year'] == 2023), 'Life Ladder'].values[0]
  y_pred = p1_df['Pred 2023'][country]
  true_2023.append(y_true)
  error.append((y_true-y_pred)**2)

#Add new columns 'True 2023' and 'Error' to the DataFrame
p1_df['True 2023'] = true_2023
p1_df['Squared Error'] = error

#Move country names in the index into a column
p1_df = p1_df.reset_index()
p1_df.columns=['Country name', 'Pred 2023', 'Pred 2024', 'Pred 2025', 'Pred 2026', 'Pred 2027', 'Pred 2028', 'True 2023', 'Squared Error']

from google.colab import drive
drive.mount('/content/drive')
path = '/content/drive/My Drive/Colab Notebooks/DS Group Project/'
p1_df.to_csv(path+'prediction_2023final.csv', index=False)

p1_df.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,Country name,Pred 2023,Pred 2024,Pred 2025,Pred 2026,Pred 2027,Pred 2028,True 2023,Squared Error
0,Denmark,1.922651,2.025877,2.147871,2.298412,2.493907,2.764235,1.795489,0.01617
1,Nigeria,-0.353192,-0.435064,-0.473782,-0.492634,-0.501942,-0.506569,-0.546143,0.03723
2,Uganda,-0.949478,-0.955141,-0.958811,-0.961195,-0.962745,-0.963754,-0.903387,0.002124
3,Croatia,0.343286,0.432447,0.465676,0.478401,0.483324,0.485236,0.421613,0.006135
4,Japan,0.486912,0.451594,0.442334,0.439927,0.439304,0.439142,0.378957,0.011654


##3(d) Evaluation

In [16]:
from google.colab import drive
drive.mount('/content/drive')
path = '/content/drive/My Drive/Colab Notebooks/DS Group Project/'
p1_df = pd.read_csv(path+'prediction_2023final.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
#Initialize list of mean squared errors
mse = []

for country in p1_df['Country name']:
  y_true = p1_df.loc[p1_df['Country name']==country]['True 2023']
  y_pred = p1_df.loc[p1_df['Country name']==country]['Pred 2023']
  mse.append(mean_squared_error(y_true, y_pred)) #Store MSEs for each country

p1_df['MSE'] = mse
print('Average MSE: ', sum(mse)/len(mse))

r_squared = r2_score(p1_df['True 2023'], p1_df['Pred 2023'])
print('R Squared: ', r_squared)

p1_df

Average MSE:  0.05717388945530661
R Squared:  0.9261219473306039


Unnamed: 0,Country name,Pred 2023,Pred 2024,Pred 2025,Pred 2026,Pred 2027,Pred 2028,True 2023,Squared Error,MSE
0,Denmark,1.922651,2.025877,2.147871,2.298412,2.493907,2.764235,1.795489,0.016170,0.016170
1,Nigeria,-0.353192,-0.435064,-0.473782,-0.492634,-0.501942,-0.506569,-0.546143,0.037230,0.037230
2,Uganda,-0.949478,-0.955141,-0.958811,-0.961195,-0.962745,-0.963754,-0.903387,0.002124,0.002124
3,Croatia,0.343286,0.432447,0.465676,0.478401,0.483324,0.485236,0.421613,0.006135,0.006135
4,Japan,0.486912,0.451594,0.442334,0.439927,0.439304,0.439142,0.378957,0.011654,0.011654
...,...,...,...,...,...,...,...,...,...,...
99,Pakistan,-0.514387,-0.522254,-0.524924,-0.525832,-0.526142,-0.526247,-0.830516,0.099937,0.099937
100,Indonesia,-0.025775,-0.079970,-0.105888,-0.118378,-0.124419,-0.127347,0.187894,0.045655,0.045655
101,Kyrgyzstan,0.179851,0.188240,0.192667,0.195008,0.196247,0.196903,0.378957,0.039643,0.039643
102,Italy,0.662104,0.650428,0.645254,0.642970,0.641963,0.641520,0.676660,0.000212,0.000212


###3(d).1 Scatter plot of True 2023 vs Predicted 2023

In [9]:
import plotly.graph_objects as go

#Scatter plot of True 2023 vs Predicted 2023 by country
scatter = go.Scatter(
    x=p1_df['True 2023'],
    y=p1_df['Pred 2023'],
    mode='markers',
    marker=dict(color='red', size=8),
    text=p1_df['Country name'])

#Add a layout
layout = go.Layout(
    title='True 2023 vs Predicted 2023',
    xaxis=dict(title='True 2023'),
    yaxis=dict(title='Predicted 2023'),
    width=600,
    height=600)

fig = go.Figure(data=scatter, layout=layout)

#Add a y=x line
fig.add_trace(go.Scatter(
        x=np.linspace(-2, 2, 400),
        y=np.linspace(-2, 2, 400),
        mode='lines',
        line=dict(color='black'),
        name='y=x'))

fig.show()

###3(d).2 Squared Error

In [11]:
line_plot = go.Scatter(
    x=p1_df['Country name'],
    y=p1_df['Squared Error'],
    mode='lines',
    line=dict(color='red'))

layout = go.Layout(
    title='Squared Error',
    xaxis=dict(title='Countries'),
    yaxis=dict(title='Squared Error'))

fig = go.Figure(data=line_plot, layout=layout)

#Add a y=0.5 line
fig.add_shape(type="line",
              x0=0, x1=104,
              y0=0.5, y1=0.5,
              line=dict(color="black", width=2))

#Add an average line
fig.add_shape(type="line",
              x0=0, x1=104,
              y0=0.05717388945530661, y1=0.05717388945530661,
              line=dict(color="red", width=2))

fig.update_xaxes(tickangle=30)
fig.show()

###3(d).3 2024-2029 Forcast #Not final results used in slides, improved in following sections

In [None]:
# Calculate the happiness score increase/decrease and final happiness score for each country
country_stats = []
for country, forecast in prediction.items():
  initial_score = forecast[0]
  final_score = forecast[-1]
  change = final_score - initial_score
  country_stats.append((country, change, final_score))

# Sort the countries based on the happiness score increase/decrease and final happiness score
country_stats.sort(key=lambda x: (x[1], x[2]), reverse=True)

# Create traces for each country's predicted happiness scores
traces = []
for country, _, _ in country_stats:
  predicted = pd.DataFrame({'Year': range(2023, 2029), 'Life Ladder': prediction[country]})
  trace = go.Scatter(x=predicted['Year'], y=predicted['Life Ladder'], mode='lines+markers', name=country)
  traces.append(trace)

# Create the layout for the plot
layout = go.Layout(
  title='Predicted Life Ladder for All Countries',
  xaxis=dict(title='Year'),
  yaxis=dict(title='Life Ladder'),
  hovermode='closest',
  width=800,
  height=600)

# Create the figure and display the plot
fig = go.Figure(data=traces, layout=layout)
fig.show()

# Print the ranking of countries based on happiness score increase/decrease and final happiness score
print("Ranking of Countries:")
for i, (country, change, final_score) in enumerate(country_stats, start=1):
    print(f"{i}. {country}: Increase/Decrease = {change:.4f}, Final Score = {final_score:.4f}")

Ranking of Countries:
1. Finland: Increase/Decrease = 1.2907, Final Score = 3.3963
2. Lebanon: Increase/Decrease = 1.1287, Final Score = -0.5253
3. Denmark: Increase/Decrease = 0.8416, Final Score = 2.7642
4. Bangladesh: Increase/Decrease = 0.6600, Final Score = -0.8472
5. Switzerland: Increase/Decrease = 0.4888, Final Score = 1.8514
6. Sri Lanka: Increase/Decrease = 0.3585, Final Score = -0.5619
7. Tanzania: Increase/Decrease = 0.3364, Final Score = -1.1928
8. India: Increase/Decrease = 0.2810, Final Score = -0.9642
9. Canada: Increase/Decrease = 0.2400, Final Score = 1.5753
10. Ghana: Increase/Decrease = 0.2295, Final Score = -0.6223
11. Benin: Increase/Decrease = 0.2151, Final Score = -0.7447
12. Zambia: Increase/Decrease = 0.1874, Final Score = -1.3000
13. Cambodia: Increase/Decrease = 0.1803, Final Score = -0.7384
14. Belgium: Increase/Decrease = 0.1671, Final Score = 1.4346
15. Kosovo: Increase/Decrease = 0.1651, Final Score = 0.9079
16. Egypt: Increase/Decrease = 0.1650, Final S