<a href="https://colab.research.google.com/github/nicolesaade/WorldHappinessReportAnalysis/blob/main/WorldHappinessReport_Time_Series_2023final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#3. Predict 2023-29 Life Ladder

##3(a) Libraries

In [3]:
import numpy as np
import pandas as pd
import plotly.express as px
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout

##3(b) Load Life Ladder Score



In [2]:
# Load data
data = pd.read_csv('happiness_dataset.csv')

# Normalize data
scaler = StandardScaler()
data['Life Ladder'] = scaler.fit_transform(data[['Life Ladder']])

# Dataframe with life ladder score 2018-2023
ladder_data = data[['Country name', 'year', 'Life Ladder']].loc[data['year'].isin([2018, 2019, 2020, 2021, 2022, 2023])]

# Remove countries missing any of the 2018-2023 life ladder scores
countries = set(ladder_data.loc[ladder_data['year']==2018]['Country name'].unique()).intersection(
    ladder_data.loc[ladder_data['year']==2019]['Country name'].unique(),
    ladder_data.loc[ladder_data['year']==2020]['Country name'].unique(),
    ladder_data.loc[ladder_data['year']==2021]['Country name'].unique(),
    ladder_data.loc[ladder_data['year']==2022]['Country name'].unique(),
    ladder_data.loc[ladder_data['year']==2023]['Country name'].unique()
)
ladder_data = ladder_data.loc[ladder_data['Country name'].isin(countries)]

# Show all the countries using df.head()
ladder_data.head(-1)

Unnamed: 0,Country name,year,Life Ladder
25,Albania,2018,-0.426173
26,Albania,2019,-0.434171
27,Albania,2020,-0.105365
28,Albania,2021,-0.203118
29,Albania,2022,-0.241331
...,...,...,...
2357,Zimbabwe,2018,-1.659641
2358,Zimbabwe,2019,-2.478990
2359,Zimbabwe,2020,-2.064872
2360,Zimbabwe,2021,-2.069315


Visualize Life Ladder Score Trajectory

In [4]:
fig = px.line(data, x='year', y='Life Ladder', color='Country name', title='Life Ladder by Country')
fig.show()

##3(c) LSTM

In [24]:
prediction = {}
for country in countries:
    country_data = ladder_data.loc[ladder_data['Country name']==country]

    X = country_data['Life Ladder'].values.reshape(-1, 1)

    model = Sequential()
    model.add(LSTM(50, activation='relu', return_sequences=True, input_shape=(1, 1)))
    model.add(Dropout(0.2))
    model.add(LSTM(50, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mse')
    model.fit(X[:-1], X[:-1], epochs=100, verbose=0) #train model with 2018-2022 data

    last_ladder = X[-2] #2022 data
    forecast = [] #stores Predicted 2023-29 data
    for _ in range(6):
        lstm_preds = model.predict(last_ladder.reshape(-1, 1))
        forecast.append(lstm_preds[0,0])
        last_ladder = lstm_preds[0,0]

    prediction[country] = forecast

print(prediction)

{'Denmark': [1.634913, 1.4608586, 1.3188651, 1.2107959, 1.1330155, 1.079339], 'Nigeria': [-0.3404407, -0.42714566, -0.4736426, -0.49937832, -0.51386833, -0.52210456], 'Uganda': [-0.94767976, -0.95172495, -0.95407844, -0.95544994, -0.95624965, -0.9567163], 'Croatia': [0.33108756, 0.41514766, 0.44617257, 0.45793763, 0.46244416, 0.46417692], 'Japan': [0.53472376, 0.5095928, 0.50207597, 0.49984095, 0.49917752, 0.49898076], 'Benin': [-0.8143719, -0.6696739, -0.60880136, -0.58369386, -0.57267505, -0.567688], 'Saudi Arabia': [0.81343377, 0.8218773, 0.8266467, 0.8293486, 0.83088195, 0.83175284], 'Serbia': [0.7649859, 0.68594795, 0.65278894, 0.63917834, 0.6336423, 0.63139904], 'Senegal': [-0.49090755, -0.4814999, -0.47735846, -0.47554123, -0.47474486, -0.4743961], 'Tanzania': [-0.91789174, -0.61558574, -0.51701295, -0.48759392, -0.4790746, -0.4766281], 'Australia': [1.3671359, 1.3569976, 1.3481572, 1.3404814, 1.3338419, 1.3281177], 'Iran': [-0.57194, -0.62604934, -0.6510715, -0.6628514, -0.6684

In [35]:
#Convert dictionary into Dataframe
p1_df = pd.DataFrame.from_dict(prediction, orient='index', columns=['Pred 2023', 'Pred 2024', 'Pred 2025', 'Pred 2026', 'Pred 2027', 'Pred 2028'])

#Store True 2023 values and the squared error (True 2023 vs Predicted 2023) into a list
true_2023 = []
error = []
for country in p1_df.index:
  y_true = ladder_data.loc[(ladder_data['Country name'] == country) & (ladder_data['year'] == 2023), 'Life Ladder'].values[0]
  y_pred = p1_df['Pred 2023'][country]
  true_2023.append(y_true)
  error.append((y_true-y_pred)**2)

#Add new columns 'True 2023' and 'Error' to the DataFrame
p1_df['True 2023'] = true_2023
p1_df['Error'] = error

#Move country names in the index into a column
p1_df = p1_df.reset_index()
p1_df.columns=['Country name', 'Pred 2023', 'Pred 2024', 'Pred 2025', 'Pred 2026', 'Pred 2027', 'Pred 2028', 'True 2023', 'Error']

from google.colab import drive
#!pip install pandas==2.0
drive.mount('/content/drive')
path = '/content/drive/My Drive/Colab Notebooks/DS Group Project/'
p1_df.to_csv(path+'prediction_2023final.csv', index=False)

p1_df.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,Country name,Pred 2023,Pred 2024,Pred 2025,Pred 2026,Pred 2027,Pred 2028,True 2023,Error
0,Denmark,1.634913,1.460859,1.318865,1.210796,1.133016,1.079339,1.795489,0.025785
1,Nigeria,-0.340441,-0.427146,-0.473643,-0.499378,-0.513868,-0.522105,-0.546143,0.042314
2,Uganda,-0.94768,-0.951725,-0.954078,-0.95545,-0.95625,-0.956716,-0.903387,0.001962
3,Croatia,0.331088,0.415148,0.446173,0.457938,0.462444,0.464177,0.421613,0.008195
4,Japan,0.534724,0.509593,0.502076,0.499841,0.499178,0.498981,0.378957,0.024263


#3(d) Evaluation

In [36]:
p1_df = pd.read_csv(path+'prediction_2023final.csv')

###3(d).1 Scatter plot of True 2023 vs Predicted 2023

In [38]:
import plotly.graph_objects as go

#Scatter plot of True 2023 vs Predicted 2023 by country
scatter = go.Scatter(
    x=p1_df['True 2023'],
    y=p1_df['Pred 2023'],
    mode='markers',
    marker=dict(color='red', size=5),
    text=p1_df['Country name'])

#Add a layout
layout = go.Layout(
    title='True 2023 vs Predicted 2023',
    xaxis=dict(title='True 2023'),
    yaxis=dict(title='Predicted 2023'),
    width=400,
    height=400)

fig = go.Figure(data=scatter, layout=layout)

#Add a y=x line
fig.add_trace(go.Scatter(
        x=np.linspace(-2, 2, 400),
        y=np.linspace(-2, 2, 400),
        mode='lines',
        line=dict(color='black'),
        name='y=x'))

fig.show()

###3(d).2 Squared Error

In [39]:
line_plot = go.Scatter(
    x=p1_df['Country name'],
    y=p1_df['Error'],
    mode='lines',
    line=dict(color='red'))

layout = go.Layout(
    title='R^2 Score',
    xaxis=dict(title='Index'),
    yaxis=dict(title='Squared Error'))

fig = go.Figure(data=line_plot, layout=layout)
fig.update_xaxes(tickangle=30)
fig.show()

###3(d).3 2024-2029 Forcast

In [40]:
# Calculate the happiness score increase/decrease and final happiness score for each country
country_stats = []
for country, forecast in prediction.items():
  initial_score = forecast[0]
  final_score = forecast[-1]
  change = final_score - initial_score
  country_stats.append((country, change, final_score))

# Sort the countries based on the happiness score increase/decrease and final happiness score
country_stats.sort(key=lambda x: (x[1], x[2]), reverse=True)

# Create traces for each country's predicted happiness scores
traces = []
for country, _, _ in country_stats:
  predicted = pd.DataFrame({'Year': range(2023, 2029), 'Life Ladder': prediction[country]})
  trace = go.Scatter(x=predicted['Year'], y=predicted['Life Ladder'], mode='lines+markers', name=country)
  traces.append(trace)

# Create the layout for the plot
layout = go.Layout(
  title='Predicted Life Ladder for All Countries',
  xaxis=dict(title='Year'),
  yaxis=dict(title='Life Ladder'),
  hovermode='closest',
  width=800,
  height=600)

# Create the figure and display the plot
fig = go.Figure(data=traces, layout=layout)
fig.show()

# Print the ranking of countries based on happiness score increase/decrease and final happiness score
print("Ranking of Countries:")
for i, (country, change, final_score) in enumerate(country_stats, start=1):
    print(f"{i}. {country}: Increase/Decrease = {change:.4f}, Final Score = {final_score:.4f}")

Ranking of Countries:
1. Lebanon: Increase/Decrease = 1.3596, Final Score = -0.6009
2. Norway: Increase/Decrease = 1.2253, Final Score = 2.9839
3. Bangladesh: Increase/Decrease = 0.6302, Final Score = -0.9070
4. Tanzania: Increase/Decrease = 0.4413, Final Score = -0.4766
5. Zambia: Increase/Decrease = 0.4307, Final Score = -0.5188
6. Sri Lanka: Increase/Decrease = 0.3011, Final Score = -0.5799
7. United States: Increase/Decrease = 0.2822, Final Score = 1.4522
8. Ghana: Increase/Decrease = 0.2707, Final Score = -0.5697
9. Benin: Increase/Decrease = 0.2467, Final Score = -0.5677
10. Malta: Increase/Decrease = 0.1934, Final Score = 1.0382
11. Tunisia: Increase/Decrease = 0.1824, Final Score = -0.7648
12. Cambodia: Increase/Decrease = 0.1763, Final Score = -0.7456
13. Croatia: Increase/Decrease = 0.1331, Final Score = 0.4642
14. Kosovo: Increase/Decrease = 0.1287, Final Score = 0.8393
15. Slovakia: Increase/Decrease = 0.1207, Final Score = 0.7707
16. Cyprus: Increase/Decrease = 0.1014, Fin