In [679]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
import matplotlib.pylab as plt
%matplotlib inline
from sklearn import preprocessing
from sklearn import model_selection
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from scikeras.wrappers import KerasClassifier, KerasRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.utils import resample
import pickle

# Multiclass classification models

In this notebook I will create classification models to label the years as per their economic activity. According to bibliography, the world has experience 4 recessions (decrease of GDP) and 4 downturns (low GDP growth) in the last 70 years. I will label them accordingly:

1. Recession-peak: 1975, 1982, 1991, 2009, 2020
2. Recovery: 2 years after each crisis
3. Global-downturn: 1958, 1998, 2001, 2012
4. Expansion: all other years

## IMF dataset

This is the dataset containing global data for 1980-2027.

In [683]:
orignal_imf = pd.read_csv(r'C:\Users\luana\Ironhack DA\Unit 9\final_bootcamp_project\csv_files\imf_clean.csv')
orignal_imf

Unnamed: 0,WEO Subject Code,Country,Subject Descriptor,Subject Notes,Units,Scale,Country/Series-specific Notes,1980,1981,1982,...,2019,2020,2021,2022,2023,2024,2025,2026,2027,Estimates Start After
0,NGDP_R,Albania,"Gross domestic product, constant prices",Expressed in billions of national currency uni...,National currency,Billions,Source: IMF Staff Estimates. Official national...,311.514,329.270,338.819,...,837.786,808.617,877.475,912.574,935.388,965.321000,996.211000,1030.080000,1065.110000,2020.0
1,NGDP_RPCH,Albania,"Gross domestic product, constant prices",Annual percentages of constant price GDP are y...,Percent change,,"See notes for: Gross domestic product, consta...",2.684,5.700,2.900,...,2.088,-3.482,8.516,4.000,2.500,3.200000,3.200000,3.400000,3.400000,2020.0
2,NGDP,Albania,"Gross domestic product, current prices",Expressed in billions of national currency uni...,National currency,Billions,Source: IMF Staff Estimates. Official national...,18.489,19.126,19.698,...,1691.900,1644.080,1889.840,2059.020,2176.660,2287.470000,2403.260000,2530.790000,2664.390000,2020.0
3,NGDPD,Albania,"Gross domestic product, current prices",Values are based upon GDP in national currency...,U.S. dollars,Billions,"See notes for: Gross domestic product, curren...",1.946,2.229,2.296,...,15.399,15.161,18.310,18.256,18.842,19.858000,21.219000,22.528000,23.889000,2020.0
4,PPPGDP,Albania,"Gross domestic product, current prices",These data form the basis for the country weig...,Purchasing power parity; international dollars,Billions,"See notes for: Gross domestic product, curren...",5.759,6.663,7.280,...,41.623,40.658,45.953,51.189,54.338,57.254000,60.184000,63.415000,66.840000,2020.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4040,D_NGDPD,Sub-Saharan Africa,"External debt, total",,Percent of GDP,,,18.911,21.069,21.791,...,41.042,43.811,40.804,39.774,37.853,37.758550,37.758550,37.758550,37.758550,
4041,DS,Sub-Saharan Africa,"External debt, total debt service",,U.S. dollars,Billions,,10.493,10.300,11.741,...,113.329,109.146,127.688,149.875,129.985,42.604425,42.604425,42.604425,42.604425,
4042,DS_NGDPD,Sub-Saharan Africa,"External debt, total debt service",,Percent of GDP,,,3.350,3.426,3.512,...,6.637,6.380,7.232,7.655,6.163,6.297775,6.297775,6.297775,6.297775,
4043,DSI,Sub-Saharan Africa,"External debt, total debt service, interest",,U.S. dollars,Billions,,3.301,3.541,3.144,...,18.733,18.035,18.286,18.511,20.489,7.451600,7.451600,7.451600,7.451600,


In [681]:
orignal_imf['Scale'].unique()

array(['Billions', 'None', 'Units', 'Millions'], dtype=object)

### Preparing the dataset

In [684]:
# I need to rearrange the dataset to make it suitable for the classification
# I can drop all the categorical columns because they only give information about the indicators
# I will use data until 2020 for the model and then try to label the years 2021-2027 

imf = orignal_imf.select_dtypes(np.number)
imf

Unnamed: 0,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,...,2018,2019,2020,2021,2022,2023,2024,2025,2026,2027
0,311.514,329.270,338.819,342.546,349.397,344.156,363.428,360.521,355.474,390.310,...,820.653,837.786,808.617,877.475,912.574,935.388,965.321000,996.211000,1030.080000,1065.110000
1,2.684,5.700,2.900,1.100,2.000,-1.500,5.600,-0.800,-1.400,9.800,...,4.019,2.088,-3.482,8.516,4.000,2.500,3.200000,3.200000,3.400000,3.400000
2,18.489,19.126,19.698,19.900,19.645,20.065,20.692,20.531,20.238,22.228,...,1636.730,1691.900,1644.080,1889.840,2059.020,2176.660,2287.470000,2403.260000,2530.790000,2664.390000
3,1.946,2.229,2.296,2.319,2.290,2.339,2.587,2.566,2.530,2.779,...,15.157,15.399,15.161,18.310,18.256,18.842,19.858000,21.219000,22.528000,23.889000
4,5.759,6.663,7.280,7.649,8.083,8.214,8.848,8.994,9.181,10.476,...,40.055,41.623,40.658,45.953,51.189,54.338,57.254000,60.184000,63.415000,66.840000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4040,18.911,21.069,21.791,29.217,38.232,41.154,41.957,38.565,39.362,40.063,...,39.422,41.042,43.811,40.804,39.774,37.853,37.758550,37.758550,37.758550,37.758550
4041,10.493,10.300,11.741,13.934,15.031,16.869,17.096,17.030,17.986,16.911,...,114.102,113.329,109.146,127.688,149.875,129.985,42.604425,42.604425,42.604425,42.604425
4042,3.350,3.426,3.512,5.457,7.972,8.582,7.109,6.116,6.621,5.700,...,7.230,6.637,6.380,7.232,7.655,6.163,6.297775,6.297775,6.297775,6.297775
4043,3.301,3.541,3.144,3.457,3.234,3.887,4.041,4.385,4.650,4.894,...,17.880,18.733,18.035,18.286,18.511,20.489,7.451600,7.451600,7.451600,7.451600


In [685]:
# I need each year to be a row
imf = imf.transpose().reset_index()
imf

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,4035,4036,4037,4038,4039,4040,4041,4042,4043,4044
0,1980,311.514,2.684,18.489,1.946,5.759,5.935,116584.54,5557.56,6919.41,...,0.39000,0.70600,-6.29800,4.9720,60.0180,18.91100,10.493000,3.350000,3.3010,1.040000
1,1981,329.270,5.700,19.126,2.229,6.663,5.809,120786.15,5757.84,7016.15,...,-0.52400,0.19700,-9.89200,-8.5170,66.1260,21.06900,10.300000,3.426000,3.5410,1.055000
2,1982,338.819,2.900,19.698,2.296,7.280,5.814,121689.95,5800.93,7074.57,...,-1.24300,-0.55800,-11.62700,-2.0610,76.5560,21.79100,11.741000,3.512000,3.1440,0.929000
3,1983,342.546,1.100,19.900,2.319,7.649,5.809,120446.72,5741.66,6997.24,...,-0.88400,0.62700,-9.15100,-0.2070,80.0050,29.21700,13.934000,5.457000,3.4570,1.109000
4,1984,349.397,2.000,19.645,2.290,8.083,5.623,120297.85,5734.57,6763.89,...,-0.54900,-1.06200,-1.78900,-0.0870,83.2400,38.23200,15.031000,7.972000,3.2340,1.307000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43,2023,935.388,2.500,2176.660,18.842,54.338,232.701,327239.88,15599.44,761490.02,...,-40.27200,4.48500,-9.66100,1.0360,803.0300,37.85300,129.985000,6.163000,20.4890,0.938000
44,2024,965.321,3.200,2287.470,19.858,57.254,236.964,338749.82,16148.12,802716.63,...,-13.42365,-4.79135,1.80165,4.4091,246.9388,37.75855,42.604425,6.297775,7.4516,1.191625
45,2025,996.211,3.200,2403.260,21.219,60.184,241.240,350720.75,16718.77,846078.15,...,-13.42365,-4.79135,1.80165,4.4091,246.9388,37.75855,42.604425,6.297775,7.4516,1.191625
46,2026,1030.080,3.400,2530.790,22.528,63.415,245.688,363876.33,17345.89,894000.51,...,-13.42365,-4.79135,1.80165,4.4091,246.9388,37.75855,42.604425,6.297775,7.4516,1.191625


In [686]:
#Column names from the indicators
columns = ['year']

for i in range(len(orignal_imf)):
    column_name = (orignal_imf['Country'][i]+'_'+orignal_imf['Subject Descriptor'][i])
    column_name = str(column_name).lower().replace(' ','_')
    columns.append(column_name)

len(columns)

4046

In [687]:
imf.columns= columns
imf.head()

Unnamed: 0,year,"albania_gross_domestic_product,_constant_prices","albania_gross_domestic_product,_constant_prices.1","albania_gross_domestic_product,_current_prices","albania_gross_domestic_product,_current_prices.1","albania_gross_domestic_product,_current_prices.2","albania_gross_domestic_product,_deflator","albania_gross_domestic_product_per_capita,_constant_prices","albania_gross_domestic_product_per_capita,_constant_prices.1","albania_gross_domestic_product_per_capita,_current_prices",...,"sub-saharan_africa_direct_investment,_net","sub-saharan_africa_portfolio_investment,_net","sub-saharan_africa_other_investment,_net",sub-saharan_africa_change_in_reserves,"sub-saharan_africa_external_debt,_total","sub-saharan_africa_external_debt,_total.1","sub-saharan_africa_external_debt,_total_debt_service","sub-saharan_africa_external_debt,_total_debt_service.1","sub-saharan_africa_external_debt,_total_debt_service,_interest","sub-saharan_africa_external_debt,_total_debt_service,_interest.1"
0,1980,311.514,2.684,18.489,1.946,5.759,5.935,116584.54,5557.56,6919.41,...,0.39,0.706,-6.298,4.972,60.018,18.911,10.493,3.35,3.301,1.04
1,1981,329.27,5.7,19.126,2.229,6.663,5.809,120786.15,5757.84,7016.15,...,-0.524,0.197,-9.892,-8.517,66.126,21.069,10.3,3.426,3.541,1.055
2,1982,338.819,2.9,19.698,2.296,7.28,5.814,121689.95,5800.93,7074.57,...,-1.243,-0.558,-11.627,-2.061,76.556,21.791,11.741,3.512,3.144,0.929
3,1983,342.546,1.1,19.9,2.319,7.649,5.809,120446.72,5741.66,6997.24,...,-0.884,0.627,-9.151,-0.207,80.005,29.217,13.934,5.457,3.457,1.109
4,1984,349.397,2.0,19.645,2.29,8.083,5.623,120297.85,5734.57,6763.89,...,-0.549,-1.062,-1.789,-0.087,83.24,38.232,15.031,7.972,3.234,1.307


In [688]:
# Now I can label the years
imf['year'] = imf['year'].astype(np.number)

crisis_years = [1975, 1982, 1991, 2009, 2020]
recovery_years = [1976, 1977, 1983, 1984, 1992, 1993, 2010, 2011]
downturn_years = [1958, 1998, 2001, 2012]

labels = []
for i in range(len(imf)):
    if imf['year'][i] in crisis_years:
        labels.append('recession')

    elif imf.loc[i]['year'] in recovery_years:
        labels.append('recovery')

    elif imf.loc[i]['year'] in downturn_years:
        labels.append('downturn')
        
    else:
        labels.append('expansion')

imf['year_label'] = labels

In [None]:
imf[['year', 'year_label']]

In [None]:
imf

### Model

In [None]:
X = imf.drop(['year', 'year_label'],axis=1)

In [None]:
Y = imf['year_label']

encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = pd.DataFrame(encoder.transform(Y))

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, encoded_Y, test_size=0.3, shuffle=None)

scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
y_test

In [None]:
y_train

In [None]:
# Build the model.
model = Sequential([ # as far as we know, all networks are sequential
    Dense(128, activation='sigmoid', input_shape=(3724,)),
    Dense(64, activation='sigmoid'),      # choosing relu instead of sigmoid, this is somewhat common
    Dense(32, activation='sigmoid'),
    Dense(16, activation='sigmoid'),
    Dense(8, activation='sigmoid'), 
    Dense(4, activation='softmax'),   # the softmax actiavation is the last one to compensate for the high volume additions
])

# Compile the model.
model.compile(
  optimizer='adam', #here we could use stochastic gradient descent, but adam is a de facto standard
  loss='categorical_crossentropy', #this is how we create the original blame to play the blame game
  metrics=['accuracy'],
)

# Train the model.
history = model.fit(
  X_train,
  to_categorical(y_train), # just to make sure the outputs are not considered numeric (because, ya know, they are numbers...)
  epochs=10,
  batch_size=6, # send 32 images at a time before you tweak the network again, to make it faster
)
 
    # Evaluate the model.
model.evaluate(
  X_test,
  to_categorical(y_test)
)

In [None]:
predictions = model.predict(X_test)
print(np.argmax(predictions, axis=1))

predictions

In [None]:
# The model accuracy is acceptable, however the high accuracy is only achieved because of the imbalanced data.
# The model is predicting every year as expansion
# I will try up- and downsampling

In [None]:
all_data = imf
all_data

### Downsampling

In [None]:
other_categories = all_data[all_data['year_label']!='expansion']
other_categories['year_label'].value_counts()

In [None]:
category_expansion = all_data[all_data['year_label']=='expansion']
category_expansion_undersampled = resample(category_expansion, replace=False, n_samples = 6)
category_expansion_undersampled

In [None]:
imf_undersampled = pd.concat([other_categories, category_expansion_undersampled], axis=0)
imf_undersampled.shape

In [None]:
X_undersampled = imf_undersampled.drop(['year', 'year_label'],axis=1)

Y_undersampled = imf_undersampled['year_label']

encoder = LabelEncoder()
encoder.fit(Y_undersampled)
encoded_Y_undersampled = pd.DataFrame(encoder.transform(Y_undersampled))

X_train_under, X_test_under, y_train_under, y_test_under = train_test_split(X_undersampled, encoded_Y_undersampled,
                                                                            test_size=0.3)


scaler = preprocessing.StandardScaler()
X_train_under = scaler.fit_transform(X_train_under)
X_test_under = scaler.transform(X_test_under)

In [None]:
# Build the model.
model_under = Sequential([ # as far as we know, all networks are sequential
    Dense(32, activation='sigmoid', input_shape=(3724,)),
    Dense(32, activation='sigmoid'),      # choosing relu instead of sigmoid, this is somewhat common
    Dense(4, activation='softmax'),   # the softmax actiavation is the last one to compensate for the high volume additions
])

# Compile the model.
model_under.compile(
  optimizer='adam', #here we could use stochastic gradient descent, but adam is a de facto standard
  loss='categorical_crossentropy', #this is how we create the original blame to play the blame game
  metrics=['accuracy'],
)

# Train the model.
history_under = model_under.fit(
  X_train_under,
  to_categorical(y_train_under), # just to make sure the outputs are not considered numeric (because, ya know, they are numbers...)
  epochs=20,
  batch_size=6, # send 32 images at a time before you tweak the network again, to make it faster
)
 
    # Evaluate the model.
model_under.evaluate(
  X_test_under,
  to_categorical(y_test_under)
)

In [None]:
predictions = model_under.predict(X_test_under)
print(np.argmax(predictions, axis=1))

predictions

In [None]:
y_test_under

In [None]:
# Much better!

## Upsampling

In [None]:
category_expansion = all_data[all_data['year_label']=='expansion']
category_recession = all_data[all_data['year_label']=='recession']
category_recovery = all_data[all_data['year_label']=='recovery']
category_downturn = all_data[all_data['year_label']=='downturn']

category_recession_oversampled = resample(category_recession,replace=True, n_samples = len(category_expansion))
category_recovery_oversampled = resample(category_recovery,replace=True, n_samples = len(category_expansion))
category_downturn_oversampled = resample(category_downturn,replace=True, n_samples = len(category_expansion))

imf_upsampled= pd.concat([category_expansion,
                          category_recession_oversampled,
                          category_recovery_oversampled,
                          category_downturn_oversampled], axis=0)
imf_upsampled.shape

In [None]:
X_updersampled = imf_upsampled.drop(['year', 'year_label'],axis=1)

Y_upsampled = imf_upsampled['year_label']

encoder = LabelEncoder()
encoder.fit(Y_upsampled)
encoded_Y_upsampled = pd.DataFrame(encoder.transform(Y_upsampled))

X_train_up, X_test_up, y_train_up, y_test_up = train_test_split(X_updersampled, encoded_Y_upsampled,
                                                                            test_size=0.15)


scaler = preprocessing.StandardScaler()
X_train_under = scaler.fit_transform(X_train_up)
X_test_under = scaler.transform(X_test_up)

In [None]:
# Build the model.
model_up = Sequential([ # as far as we know, all networks are sequential
    Dense(128, activation='sigmoid', input_shape=(3724,)),
    Dense(64, activation='sigmoid'),      # choosing relu instead of sigmoid, this is somewhat common
    Dense(32, activation='sigmoid'),
    Dense(16, activation='sigmoid'),
    Dense(8, activation='sigmoid'), 
    Dense(4, activation='softmax'),   # the softmax actiavation is the last one to compensate for the high volume additions
])

# Compile the model.
model_up.compile(
  optimizer='adam', #here we could use stochastic gradient descent, but adam is a de facto standard
  loss='categorical_crossentropy', #this is how we create the original blame to play the blame game
  metrics=['accuracy'],
)

# Train the model.
history_up = model_up.fit(
  X_train_up,
  to_categorical(y_train_up), # just to make sure the outputs are not considered numeric (because, ya know, they are numbers...)
  epochs=20,
  batch_size=6, # send 32 images at a time before you tweak the network again, to make it faster
)
 
    # Evaluate the model.
model_up.evaluate(
  X_test_up,
  to_categorical(y_test_up)
)

In [None]:
predictions = model_up.predict(X_test_up)
print(np.argmax(predictions, axis=1))

predictions

In [None]:
y_test_up

### Conclusions

None of the models was able to predict any of the recession years. Even with upsampling, almost every other category is being missed.

# World Bank Data

In [561]:
wb_gdp = pd.read_csv(r'C:\Users\luana\Ironhack DA\Unit 9\final_bootcamp_project\csv_files\wb_gdp_predictions.csv')
wb_gdp

Unnamed: 0,WLD_gdp_per_capita_us$,WLD_gdp_per_capita_%_growth,ARG_gdp_per_capita_us$,ARG_gdp_per_capita_%_growth,AUS_gdp_per_capita_us$,AUS_gdp_per_capita_%_growth,BRA_gdp_per_capita_us$,BRA_gdp_per_capita_%_growth,CAN_gdp_per_capita_us$,CAN_gdp_per_capita_%_growth,...,ZAF_gdp_per_capita_%_growth,KOR_gdp_per_capita_us$,KOR_gdp_per_capita_%_growth,TUR_gdp_per_capita_us$,TUR_gdp_per_capita_%_growth,GBR_gdp_per_capita_us$,GBR_gdp_per_capita_%_growth,USA_gdp_per_capita_us$,USA_gdp_per_capita_%_growth,time
0,471.544201,2.457848,1184.627932,3.728779,1877.616638,0.464273,232.480546,5.484104,2240.433039,1.119387,...,1.328117,93.828649,3.809555,283.828284,-1.268241,1472.385714,1.899262,3066.562869,0.618121,YR1961
1,496.165533,3.531222,1155.890717,-2.425843,1854.656834,-1.147839,251.348317,3.530786,2268.585346,5.445858,...,3.570428,106.148506,0.966642,309.446624,3.059925,1525.775853,0.248519,3243.843078,4.480669,YR1962
2,523.989741,3.038246,850.304579,-6.78824,1967.116537,4.19735,295.631971,-2.284082,2374.498448,3.405213,...,4.707754,146.314342,6.026426,350.662985,6.486465,1613.456884,4.092469,3374.515171,2.908272,YR1963
3,562.186364,4.414647,1173.238105,8.437244,2131.377948,4.899706,258.605371,0.476589,2555.111146,4.650755,...,5.2369,123.603495,6.64777,369.583469,2.973251,1748.288118,4.849645,3573.941185,4.340549,YR1964
4,600.107959,3.423587,1279.113778,8.896809,2281.098797,3.924186,269.45629,-0.435883,2770.361804,4.409518,...,3.448301,108.722131,4.624622,386.358061,0.40428,1873.567774,1.488048,3827.52711,5.078098,YR1965
5,637.822013,3.529408,1272.803204,-2.136478,2343.99529,0.070881,330.040586,3.807691,3047.106147,4.735676,...,1.794792,133.474849,9.213102,444.549483,8.602306,1986.747159,1.014665,4146.316646,5.277114,YR1966
6,665.055206,2.064721,1062.543412,1.678435,2580.271096,4.970237,353.110711,1.431168,3217.159294,1.252817,...,4.472315,161.159931,6.562089,481.69368,2.280141,2058.781882,2.234412,4336.426587,1.389951,YR1967
7,703.361657,3.807017,1141.080432,3.286938,2724.372519,3.257317,375.390574,6.943742,3462.678872,3.31525,...,1.491199,198.431298,10.570155,526.213475,4.278525,1951.758596,4.929077,4695.92339,3.758819,YR1968
8,759.933925,3.651471,1329.058546,8.050279,2991.607198,4.826364,400.787069,6.714256,3763.953379,3.626733,...,2.0175,243.422413,11.997471,571.61777,1.639443,2100.667869,1.501173,5032.144743,2.09737,YR1969
9,813.897741,1.799655,1322.590638,1.479494,3305.126394,5.084881,445.023781,7.652969,4121.932814,9.51073,...,2.512703,279.304969,7.675053,489.930368,0.803638,2347.544318,5.894837,5234.296666,-1.438451,YR1970


In [562]:
wb_gdp ['time'] = wb_gdp ['time'].str.replace('YR', '')

In [563]:
# Dropping 2021,2022 and 2023. I will label them later
wb_gdp ['time'] = wb_gdp ['time'].str.replace('YR', '')
wb_gdp_to_predict = wb_gdp[-3:]
wb_gdp = wb_gdp[:-3]
wb_gdp

Unnamed: 0,WLD_gdp_per_capita_us$,WLD_gdp_per_capita_%_growth,ARG_gdp_per_capita_us$,ARG_gdp_per_capita_%_growth,AUS_gdp_per_capita_us$,AUS_gdp_per_capita_%_growth,BRA_gdp_per_capita_us$,BRA_gdp_per_capita_%_growth,CAN_gdp_per_capita_us$,CAN_gdp_per_capita_%_growth,...,ZAF_gdp_per_capita_%_growth,KOR_gdp_per_capita_us$,KOR_gdp_per_capita_%_growth,TUR_gdp_per_capita_us$,TUR_gdp_per_capita_%_growth,GBR_gdp_per_capita_us$,GBR_gdp_per_capita_%_growth,USA_gdp_per_capita_us$,USA_gdp_per_capita_%_growth,time
0,471.544201,2.457848,1184.627932,3.728779,1877.616638,0.464273,232.480546,5.484104,2240.433039,1.119387,...,1.328117,93.828649,3.809555,283.828284,-1.268241,1472.385714,1.899262,3066.562869,0.618121,1961
1,496.165533,3.531222,1155.890717,-2.425843,1854.656834,-1.147839,251.348317,3.530786,2268.585346,5.445858,...,3.570428,106.148506,0.966642,309.446624,3.059925,1525.775853,0.248519,3243.843078,4.480669,1962
2,523.989741,3.038246,850.304579,-6.78824,1967.116537,4.19735,295.631971,-2.284082,2374.498448,3.405213,...,4.707754,146.314342,6.026426,350.662985,6.486465,1613.456884,4.092469,3374.515171,2.908272,1963
3,562.186364,4.414647,1173.238105,8.437244,2131.377948,4.899706,258.605371,0.476589,2555.111146,4.650755,...,5.2369,123.603495,6.64777,369.583469,2.973251,1748.288118,4.849645,3573.941185,4.340549,1964
4,600.107959,3.423587,1279.113778,8.896809,2281.098797,3.924186,269.45629,-0.435883,2770.361804,4.409518,...,3.448301,108.722131,4.624622,386.358061,0.40428,1873.567774,1.488048,3827.52711,5.078098,1965
5,637.822013,3.529408,1272.803204,-2.136478,2343.99529,0.070881,330.040586,3.807691,3047.106147,4.735676,...,1.794792,133.474849,9.213102,444.549483,8.602306,1986.747159,1.014665,4146.316646,5.277114,1966
6,665.055206,2.064721,1062.543412,1.678435,2580.271096,4.970237,353.110711,1.431168,3217.159294,1.252817,...,4.472315,161.159931,6.562089,481.69368,2.280141,2058.781882,2.234412,4336.426587,1.389951,1967
7,703.361657,3.807017,1141.080432,3.286938,2724.372519,3.257317,375.390574,6.943742,3462.678872,3.31525,...,1.491199,198.431298,10.570155,526.213475,4.278525,1951.758596,4.929077,4695.92339,3.758819,1968
8,759.933925,3.651471,1329.058546,8.050279,2991.607198,4.826364,400.787069,6.714256,3763.953379,3.626733,...,2.0175,243.422413,11.997471,571.61777,1.639443,2100.667869,1.501173,5032.144743,2.09737,1969
9,813.897741,1.799655,1322.590638,1.479494,3305.126394,5.084881,445.023781,7.652969,4121.932814,9.51073,...,2.512703,279.304969,7.675053,489.930368,0.803638,2347.544318,5.894837,5234.296666,-1.438451,1970


In [564]:
# Label the years

wb_gdp ['time'] = wb_gdp ['time'].str.replace('YR', '')


crisis_years = ['1975', '1982', '1991', '2009', '2020']
recovery_years = ['1976', '1977', '1983', '1984', '1992', '1993', '2010', '2011']
downturn_years = ['1958', '1998', '2001', '2012']

labels = []
for i in range(len(wb_gdp)):
    if wb_gdp ['time'][i] in crisis_years:
        labels.append('recession')

    elif wb_gdp['time'][i] in recovery_years:
        labels.append('recovery')

    elif wb_gdp ['time'][i] in downturn_years:
        labels.append('downturn')
        
    else:
        labels.append('expansion')

wb_gdp['year_label'] = labels

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wb_gdp ['time'] = wb_gdp ['time'].str.replace('YR', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wb_gdp['year_label'] = labels


In [565]:
X = wb_gdp.drop(['time', 'year_label'],axis=1)

Y = wb_gdp['year_label']

encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = pd.DataFrame(encoder.transform(Y))
X_train, X_test, y_train, y_test = train_test_split(X, encoded_Y, test_size=0.3, shuffle=None)

scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print(X.shape)

(60, 40)


In [566]:
# Build the model.
model = Sequential([ # as far as we know, all networks are sequential
    Dense(128, activation='sigmoid', input_shape=(40,)),
    Dense(64, activation='sigmoid'),      # choosing relu instead of sigmoid, this is somewhat common
    Dense(32, activation='sigmoid'),
    Dense(16, activation='sigmoid'),
    Dense(8, activation='sigmoid'), 
    Dense(4, activation='softmax'),   # the softmax actiavation is the last one to compensate for the high volume additions
])

# Compile the model.
model.compile(
  optimizer='adam', #here we could use stochastic gradient descent, but adam is a de facto standard
  loss='categorical_crossentropy', #this is how we create the original blame to play the blame game
  metrics=['accuracy'],
)

# Train the model.
history = model.fit(
  X_train,
  to_categorical(y_train), # just to make sure the outputs are not considered numeric (because, ya know, they are numbers...)
  epochs=20,
  batch_size=6, # send 32 images at a time before you tweak the network again, to make it faster
)
 
    # Evaluate the model.
model.evaluate(
  X_test,
  to_categorical(y_test)
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


[0.8894979953765869, 0.7222222089767456]

In [567]:
predictions = model.predict(X_test)
print(np.argmax(predictions, axis=1))

predictions

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


array([[0.06654615, 0.711251  , 0.08572263, 0.13648023],
       [0.06642244, 0.7111225 , 0.08580007, 0.13665502],
       [0.0664174 , 0.71134126, 0.0856978 , 0.13654351],
       [0.06638524, 0.7117159 , 0.08551022, 0.13638866],
       [0.06633606, 0.71054554, 0.08607472, 0.1370437 ],
       [0.06640004, 0.71084595, 0.08593155, 0.13682245],
       [0.06640695, 0.7109955 , 0.08586323, 0.13673432],
       [0.06632764, 0.71038276, 0.08615166, 0.13713793],
       [0.0662938 , 0.7097566 , 0.08644325, 0.13750629],
       [0.06647029, 0.71137875, 0.08567846, 0.13647251],
       [0.06634283, 0.7100383 , 0.08630884, 0.13730998],
       [0.06642808, 0.7118866 , 0.08542362, 0.13626175],
       [0.06655991, 0.7115682 , 0.08557009, 0.13630182],
       [0.06645274, 0.7109825 , 0.08587421, 0.13669053],
       [0.06643219, 0.71177804, 0.08548436, 0.13630542],
       [0.06646612, 0.70877635, 0.08688901, 0.13786845],
       [0.06655757, 0.7116024 , 0.08555486, 0.13628514],
       [0.06653945, 0.7112392 ,

In [568]:
# Again, it can only predict the expansion years

In [569]:
columns_percentage = ['time', 'year_label']

for column in wb_gdp.columns:
    if '%' in column:
        columns_percentage.append(column)

        
columns_percentage

['time',
 'year_label',
 'WLD_gdp_per_capita_%_growth',
 'ARG_gdp_per_capita_%_growth',
 'AUS_gdp_per_capita_%_growth',
 'BRA_gdp_per_capita_%_growth',
 'CAN_gdp_per_capita_%_growth',
 'CHN_gdp_per_capita_%_growth',
 'FRA_gdp_per_capita_%_growth',
 'DEU_gdp_per_capita_%_growth',
 'IND_gdp_per_capita_%_growth',
 'IDN_gdp_per_capita_%_growth',
 'ITA_gdp_per_capita_%_growth',
 'JPN_gdp_per_capita_%_growth',
 'MEX_gdp_per_capita_%_growth',
 'RUS_gdp_per_capita_%_growth',
 'SAU_gdp_per_capita_%_growth',
 'ZAF_gdp_per_capita_%_growth',
 'KOR_gdp_per_capita_%_growth',
 'TUR_gdp_per_capita_%_growth',
 'GBR_gdp_per_capita_%_growth',
 'USA_gdp_per_capita_%_growth']

In [570]:
gdp_percentage = wb_gdp[columns_percentage]
gdp_percentage

Unnamed: 0,time,year_label,WLD_gdp_per_capita_%_growth,ARG_gdp_per_capita_%_growth,AUS_gdp_per_capita_%_growth,BRA_gdp_per_capita_%_growth,CAN_gdp_per_capita_%_growth,CHN_gdp_per_capita_%_growth,FRA_gdp_per_capita_%_growth,DEU_gdp_per_capita_%_growth,...,ITA_gdp_per_capita_%_growth,JPN_gdp_per_capita_%_growth,MEX_gdp_per_capita_%_growth,RUS_gdp_per_capita_%_growth,SAU_gdp_per_capita_%_growth,ZAF_gdp_per_capita_%_growth,KOR_gdp_per_capita_%_growth,TUR_gdp_per_capita_%_growth,GBR_gdp_per_capita_%_growth,USA_gdp_per_capita_%_growth
0,1961,expansion,2.457848,3.728779,0.464273,5.484104,1.119387,-26.527644,3.604901,1.710386,...,7.486419,11.044073,1.782077,0.786128,1.169525,1.328117,3.809555,-1.268241,1.899262,0.618121
1,1962,expansion,3.531222,-2.425843,-1.147839,3.530786,5.445858,-6.351505,5.361747,1.710386,...,5.487478,7.901714,1.463526,0.786128,1.169525,3.570428,0.966642,3.059925,0.248519,4.480669
2,1963,expansion,3.038246,-6.78824,4.19735,-2.284082,3.405213,7.622254,4.751682,1.710386,...,4.842052,7.379857,4.803232,0.786128,1.169525,4.707754,6.026426,6.486465,4.092469,2.908272
3,1964,expansion,4.414647,8.437244,4.899706,0.476589,4.650755,15.468995,5.24837,1.710386,...,1.955533,10.520792,8.481411,0.786128,1.169525,5.2369,6.64777,2.973251,4.849645,4.340549
4,1965,expansion,3.423587,8.896809,3.924186,-0.435883,4.409518,14.197889,3.625023,1.710386,...,2.402046,4.68645,3.816439,0.786128,1.169525,3.448301,4.624622,0.40428,1.488048,5.078098
5,1966,expansion,3.529408,-2.136478,0.070881,3.807691,4.735676,7.608404,4.18061,1.710386,...,5.164163,9.63236,2.842655,0.786128,1.169525,1.794792,9.213102,8.602306,1.014665,5.277114
6,1967,expansion,2.064721,1.678435,4.970237,1.431168,1.252817,-8.161498,4.000543,1.710386,...,6.405678,9.938834,2.616661,0.786128,1.169525,4.472315,6.562089,2.280141,2.234412,1.389951
7,1968,expansion,3.807017,3.286938,3.257317,6.943742,3.31525,-6.571452,3.675154,1.710386,...,5.873595,11.617428,6.089059,0.786128,1.169525,1.491199,10.570155,4.278525,4.929077,3.758819
8,1969,expansion,3.651471,8.050279,4.826364,6.714256,3.626733,13.77934,6.30696,1.710386,...,5.49918,11.148657,0.285108,0.786128,2.021882,2.0175,11.997471,1.639443,1.501173,2.09737
9,1970,expansion,1.799655,1.479494,5.084881,7.652969,9.51073,16.050399,5.30014,1.710386,...,4.713421,1.281814,3.300212,0.786128,52.219252,2.512703,7.675053,0.803638,5.894837,-1.438451


In [571]:
X = gdp_percentage.drop(['time', 'year_label'],axis=1)

Y = gdp_percentage['year_label']

encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = pd.DataFrame(encoder.transform(Y))
X_train, X_test, y_train, y_test = train_test_split(X, encoded_Y, test_size=0.35, shuffle=None)

scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print(X.shape)

(60, 20)


In [572]:
# Build the model.
model = Sequential([ # as far as we know, all networks are sequential
    Dense(128, activation='sigmoid', input_shape=(20,)),
    Dense(128, activation='sigmoid'),      # choosing relu instead of sigmoid, this is somewhat common
#     Dense(32, activation='sigmoid'),
#     Dense(16, activation='sigmoid'),
#     Dense(8, activation='sigmoid'), 
    Dense(4, activation='softmax'),   # the softmax actiavation is the last one to compensate for the high volume additions
])

# Compile the model.
model.compile(
  optimizer='adam', #here we could use stochastic gradient descent, but adam is a de facto standard
  loss='categorical_crossentropy', #this is how we create the original blame to play the blame game
  metrics=['accuracy'],
)

# Train the model.
history = model.fit(
  X_train,
  to_categorical(y_train), # just to make sure the outputs are not considered numeric (because, ya know, they are numbers...)
  epochs=20,
  batch_size=6, # send 32 images at a time before you tweak the network again, to make it faster
)
 
    # Evaluate the model.
model.evaluate(
  X_test,
  to_categorical(y_test)
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


[0.5681012272834778, 0.8571428656578064]

In [573]:
predictions = model.predict(X_test)
print(np.argmax(predictions, axis=1))

predictions

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


array([[0.03369792, 0.85397696, 0.01222778, 0.10009731],
       [0.07253859, 0.5277977 , 0.08827052, 0.3113932 ],
       [0.08760554, 0.5767098 , 0.1844024 , 0.15128224],
       [0.03519737, 0.845736  , 0.01496629, 0.1041003 ],
       [0.03657202, 0.7478079 , 0.02180177, 0.19381823],
       [0.03139187, 0.83028996, 0.01045119, 0.12786694],
       [0.06262567, 0.51145554, 0.08750664, 0.3384122 ],
       [0.04941919, 0.7554308 , 0.02714007, 0.16800989],
       [0.04451082, 0.7752033 , 0.0306844 , 0.1496015 ],
       [0.06371427, 0.6797378 , 0.06055267, 0.19599526],
       [0.02554888, 0.9023786 , 0.00696037, 0.0651122 ],
       [0.02622475, 0.8855088 , 0.00567699, 0.08258945],
       [0.03748483, 0.85600215, 0.01205161, 0.09446139],
       [0.04753204, 0.76656044, 0.03072873, 0.1551788 ],
       [0.07784919, 0.512681  , 0.0969847 , 0.31248507],
       [0.03903766, 0.7970745 , 0.02003297, 0.14385487],
       [0.06714635, 0.65822035, 0.09402805, 0.1806053 ],
       [0.04509513, 0.72049856,

In [574]:
y_test

Unnamed: 0,0
43,1
33,1
40,0
45,1
5,1
27,1
34,1
4,1
28,1
56,1


In [597]:
# Removing the columns in dolar seems to have improved the predictions
# I will upsample again and downsample again

category_expansion = gdp_percentage[gdp_percentage['year_label']=='expansion']
category_recession = gdp_percentage[gdp_percentage['year_label']=='recession']
category_recovery = gdp_percentage[gdp_percentage['year_label']=='recovery']
category_downturn = gdp_percentage[gdp_percentage['year_label']=='downturn']

category_recession_oversampled = resample(category_recession,replace=True, n_samples = len(category_expansion))
category_recovery_oversampled = resample(category_recovery,replace=True, n_samples = len(category_expansion))
category_downturn_oversampled = resample(category_downturn,replace=True, n_samples = len(category_expansion))

gdp_percentage_upsampled = pd.concat([category_expansion,
                          category_recession_oversampled,
                          category_recovery_oversampled,
                          category_downturn_oversampled], axis=0)

gdp_percentage_upsampled.shape

(176, 22)

In [623]:
X = gdp_percentage_upsampled.drop(['time', 'year_label'],axis=1)

Y = gdp_percentage_upsampled['year_label']

encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = pd.DataFrame(encoder.transform(Y))
X_train, X_test, y_train, y_test = train_test_split(X, encoded_Y, test_size=0.35, shuffle=None)

scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print(X.shape)

(176, 20)


In [628]:
# 1: Expansion, 2: recession, 3: recovery, 0: downturn

In [599]:
# Build the model.
model = Sequential([ # as far as we know, all networks are sequential
    Dense(128, activation='sigmoid', input_shape=(20,)),
    Dense(128, activation='sigmoid'),      # choosing relu instead of sigmoid, this is somewhat common
#     Dense(32, activation='sigmoid'),
#     Dense(16, activation='sigmoid'),
#     Dense(8, activation='sigmoid'), 
    Dense(4, activation='softmax'),   # the softmax actiavation is the last one to compensate for the high volume additions
])

# Compile the model.
model.compile(
  optimizer='adam', #here we could use stochastic gradient descent, but adam is a de facto standard
  loss='categorical_crossentropy', #this is how we create the original blame to play the blame game
  metrics=['accuracy'],
)

# Train the model.
history = model.fit(
  X_train,
  to_categorical(y_train), # just to make sure the outputs are not considered numeric (because, ya know, they are numbers...)
  epochs=20,
  batch_size=6, # send 32 images at a time before you tweak the network again, to make it faster
)
 
    # Evaluate the model.
model.evaluate(
  X_test,
  to_categorical(y_test)
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


[0.2806028127670288, 0.9354838728904724]

In [600]:
predictions = model.predict(X_test)
print(np.argmax(predictions, axis=1))

[2 3 2 2 2 3 3 1 1 3 2 0 3 0 2 0 3 0 0 0 2 1 2 0 0 2 3 3 2 1 0 2 0 3 0 3 3
 0 1 0 1 1 3 2 3 3 0 3 0 3 1 3 2 1 3 3 3 0 3 2 3 2]


In [617]:
predictions_array =np.argmax(predictions, axis=1)
y_test_df = pd.DataFrame(y_test).reset_index(drop=True)
y_test_df.columns= ['y_test']
predictions_df = pd.DataFrame(predictions_array)
predictions_df.columns= ['predicted_y']
predictions_df = pd.concat([predictions_df,y_test_df], axis=1)
pd.set_option('display.max_rows',None)
predictions_df

Unnamed: 0,predicted_y,y_test
0,2,2
1,3,3
2,2,2
3,2,2
4,2,2
5,3,3
6,3,3
7,1,1
8,1,1
9,3,3


In [584]:
# This model has a much better accuracy! Will be used for predictions
model_upsampled = model 
scaler_upsampled  = scaler

filename = 'model_gdp_upsampled.sav'
pickle.dump(model, open(filename, 'wb'))

filename= 'scaler_gdp_upsample.sav'
pickle.dump(scaler, open(filename, 'wb'))

INFO:tensorflow:Assets written to: C:\Users\luana\AppData\Local\Temp\tmpdemxf2sj\assets


In [585]:
# Just in case I will also downsample
gdp_percentage['year_label'].value_counts()

expansion    44
recovery      8
recession     5
downturn      3
Name: year_label, dtype: int64

In [586]:
category_expansion = gdp_percentage[gdp_percentage['year_label']=='expansion']
other_categories = gdp_percentage[gdp_percentage['year_label']!='expansion']
category_category_expansion_undersampled = resample(category_expansion,replace=True, n_samples = 8)

gdp_percentage_undersampled = pd.concat([other_categories, category_category_expansion_undersampled], axis=0)

gdp_percentage_undersampled.shape

(24, 22)

In [587]:
X = gdp_percentage_undersampled.drop(['time', 'year_label'],axis=1)

Y = gdp_percentage_undersampled['year_label']

encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = pd.DataFrame(encoder.transform(Y))
X_train, X_test, y_train, y_test = train_test_split(X, encoded_Y, test_size=0.15, shuffle=None)

scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print(X.shape)

(24, 20)


In [588]:
# Build the model.
model = Sequential([ # as far as we know, all networks are sequential
    Dense(128, activation='sigmoid', input_shape=(20,)),
    Dense(128, activation='sigmoid'),      # choosing relu instead of sigmoid, this is somewhat common
    Dense(32, activation='sigmoid'),
    Dense(16, activation='sigmoid'),
    Dense(8, activation='sigmoid'), 
    Dense(4, activation='softmax'),   # the softmax actiavation is the last one to compensate for the high volume additions
])

# Compile the model.
model.compile(
  optimizer='adam', #here we could use stochastic gradient descent, but adam is a de facto standard
  loss='categorical_crossentropy', #this is how we create the original blame to play the blame game
  metrics=['accuracy'],
)

# Train the model.
history = model.fit(
  X_train,
  to_categorical(y_train), # just to make sure the outputs are not considered numeric (because, ya know, they are numbers...)
  epochs=20,
  batch_size=6, # send 32 images at a time before you tweak the network again, to make it faster
)
 
    # Evaluate the model.
model.evaluate(
  X_test,
  to_categorical(y_test)
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


[1.5776002407073975, 0.0]

In [589]:
predictions = model.predict(X_test)
print(np.argmax(predictions, axis=1))

[1 1 1 1]


In [590]:
y_test

Unnamed: 0,0
4,3
10,0
1,3
3,2


In [591]:
y_train

Unnamed: 0,0
9,0
11,2
14,0
15,2
12,3
20,1
22,1
7,3
21,1
5,3


In [592]:
# Not great... moving on with the upsampled model

In [593]:
wb_gdp_to_predict

Unnamed: 0,WLD_gdp_per_capita_us$,WLD_gdp_per_capita_%_growth,ARG_gdp_per_capita_us$,ARG_gdp_per_capita_%_growth,AUS_gdp_per_capita_us$,AUS_gdp_per_capita_%_growth,BRA_gdp_per_capita_us$,BRA_gdp_per_capita_%_growth,CAN_gdp_per_capita_us$,CAN_gdp_per_capita_%_growth,...,ZAF_gdp_per_capita_%_growth,KOR_gdp_per_capita_us$,KOR_gdp_per_capita_%_growth,TUR_gdp_per_capita_us$,TUR_gdp_per_capita_%_growth,GBR_gdp_per_capita_us$,GBR_gdp_per_capita_%_growth,USA_gdp_per_capita_us$,USA_gdp_per_capita_%_growth,time
60,11169.974582,-0.334936,8570.634519,-0.991484,52389.355728,1.22423,6837.643155,-1.241252,43804.96692,-2.114828,...,-4.075358,32529.782043,3.344573,8625.57201,2.785851,41604.094252,-2.305666,64639.616426,0.129421,2021
61,11406.017489,1.029786,8556.235071,0.504945,53095.141956,1.708921,6859.729819,0.342835,44347.911525,0.382067,...,-2.08389,33479.340388,4.958773,8713.704291,2.786096,42106.029471,0.284902,66277.833117,1.385264,2022
62,11644.205505,1.503147,8542.467406,0.734062,53797.690131,1.802788,6881.155986,1.098611,44887.123525,1.864915,...,-0.96373,34446.500011,5.560093,8800.841526,2.786097,42603.917211,1.204767,67942.758047,1.739038,2023


In [594]:
X = wb_gdp_to_predict.drop('time', axis=1)
X = wb_gdp_to_predict.drop([column for column in wb_gdp_to_predict.columns if '%' not in column], axis=1)
X

Unnamed: 0,WLD_gdp_per_capita_%_growth,ARG_gdp_per_capita_%_growth,AUS_gdp_per_capita_%_growth,BRA_gdp_per_capita_%_growth,CAN_gdp_per_capita_%_growth,CHN_gdp_per_capita_%_growth,FRA_gdp_per_capita_%_growth,DEU_gdp_per_capita_%_growth,IND_gdp_per_capita_%_growth,IDN_gdp_per_capita_%_growth,ITA_gdp_per_capita_%_growth,JPN_gdp_per_capita_%_growth,MEX_gdp_per_capita_%_growth,RUS_gdp_per_capita_%_growth,SAU_gdp_per_capita_%_growth,ZAF_gdp_per_capita_%_growth,KOR_gdp_per_capita_%_growth,TUR_gdp_per_capita_%_growth,GBR_gdp_per_capita_%_growth,USA_gdp_per_capita_%_growth
60,-0.334936,-0.991484,1.22423,-1.241252,-2.114828,5.462254,-4.279322,0.556327,1.199434,0.988048,-3.443038,-1.697046,-1.676989,-1.357894,-0.912741,-4.075358,3.344573,2.785851,-2.305666,0.129421
61,1.029786,0.504945,1.708921,0.342835,0.382067,6.77398,-1.980928,1.485164,2.744545,2.383816,-0.907248,-0.134418,0.519137,-0.633264,0.508637,-2.08389,4.958773,2.786096,0.284902,1.385264
62,1.503147,0.734062,1.802788,1.098611,1.864915,7.270462,-0.574128,1.650952,3.018479,2.860059,0.340169,0.831293,1.165903,-0.16562,0.936077,-0.96373,5.560093,2.786097,1.204767,1.739038


In [595]:
# load the model from disk

filename = 'model_gdp_upsampled.sav'
loaded_model = pickle.load(open(filename, 'rb'))

filename= 'scaler_gdp_upsample.sav'
loaded_scaler = pickle.load(open(filename, 'rb'))

In [596]:

X_scaled = loaded_scaler.transform(X)

predictions = loaded_model.predict(X_scaled)
print(np.argmax(predictions, axis=1))

[2 2 1]


In [None]:
# 1: Expansion, 2: recession, 3: recovery, 0: downturn

# Conclusion:

#### This model has predicted recession for 2021 and 2022, but expansion for 2023

# World Bank: data from 1981

In [675]:
pd.set_option('display.max_rows', 10)
wb_1981 = pd.read_csv(r'C:\Users\luana\Ironhack DA\Unit 9\final_bootcamp_project\csv_files\wb_from_1981_predictions.csv')
wb_1981

Unnamed: 0,time,WLD_gdp_per_capita_us$,WLD_gdp_per_capita_%_growth,WLD_consumer_price_index_%,WLD_unemployment_%_of_total_labor_force,WLD_government_expense_%_gdp,WLD_industry_value_added_us$,ARG_gdp_per_capita_us$,ARG_gdp_per_capita_%_growth,ARG_unemployment_%_of_total_labor_force,...,USA_gdp_per_capita_%_growth,USA_consumer_price_index_%,USA_unemployment_%_of_total_labor_force,USA_government_debt_total_local_currency,USA_government_debt_total_%_of_gdp,USA_government_expense_local_currency,USA_government_expense_%_gdp,USA_current_account_balance_us$,USA_industry_value_added_us$,USA_net_trade_goods_services_us$
0,YR1981,2599.297941,0.166623,12.471612,5.681880,26.023013,26.023013,2776.322088,-6.668498,10.980400,...,1.536320,10.334715,5.928000,9.779907e+12,67.388391,6.736300e+11,21.004721,4.810000e+09,21.004721,-1.568000e+10
1,YR1982,2527.674030,-1.381191,10.240268,5.681880,26.023013,26.023013,2927.897357,-2.307766,10.980400,...,-2.734570,6.131427,5.928000,9.779907e+12,67.388391,7.511200e+11,22.463140,-1.160700e+10,22.463140,-2.353700e+10
2,YR1983,2532.623191,0.850482,8.771147,5.681880,26.023013,26.023013,3553.377509,2.681982,10.980400,...,3.631979,3.212435,5.928000,9.779907e+12,67.388391,8.286600e+11,22.802733,-4.422200e+10,22.802733,-5.713500e+10
3,YR1984,2579.857213,2.878301,8.116398,5.681880,26.023013,26.023013,2659.708242,-0.051847,10.980400,...,6.312168,4.300535,5.928000,9.779907e+12,67.388391,8.668900e+11,21.470359,-9.900800e+10,21.470359,-1.082770e+11
4,YR1985,2657.496341,1.917044,6.856812,5.681880,26.023013,26.023013,2926.126485,-6.692591,10.980400,...,3.250656,3.545644,5.928000,9.779907e+12,67.388391,9.621600e+11,22.174802,-1.244550e+11,22.174802,-1.211020e+11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38,YR2019,11407.479334,1.531947,2.186902,5.356915,26.956431,26.956431,10076.355241,-2.994388,9.840000,...,1.824124,1.812210,3.670000,2.157670e+13,100.955077,4.860451e+12,22.741536,-4.721450e+11,22.741536,-5.763380e+11
39,YR2020,10936.057466,-4.269491,1.920968,6.573234,34.063097,34.063097,8585.694742,-10.765108,11.460000,...,-4.328618,1.233584,8.050000,3.377096e+13,161.631918,6.881506e+12,32.935727,-6.160870e+11,32.935727,-6.766790e+11
40,YR2021,11127.421453,0.046068,2.440144,6.204328,33.670439,33.670439,8619.597289,-2.626874,11.386141,...,-0.276751,1.925820,7.541917,3.829610e+13,176.148250,7.494930e+12,32.960833,-6.135865e+11,32.960833,-6.788526e+11
41,YR2022,11317.806416,1.111767,2.843239,5.993507,33.306294,33.306294,8650.472253,-0.347035,11.325219,...,0.968701,2.246906,7.165148,4.354030e+13,192.481725,8.170006e+12,32.985316,-6.112341e+11,32.985316,-6.809354e+11


In [676]:
wb_1981 ['time'] = wb_1981 ['time'].str.replace('YR', '')
percentage_columns_1981 = ['time']
for column in wb_1981.columns:
    if "%" in column:
        percentage_columns_1981.append(column)

wb_1981  = wb_1981[percentage_columns_1981]
wb_1981_to_predict = wb_1981[-3:]
wb_1981 = wb_1981[:-3]
wb_1981

Unnamed: 0,time,WLD_gdp_per_capita_%_growth,WLD_consumer_price_index_%,WLD_unemployment_%_of_total_labor_force,WLD_government_expense_%_gdp,ARG_gdp_per_capita_%_growth,ARG_unemployment_%_of_total_labor_force,ARG_industry_value_added_%_growth,AUS_gdp_per_capita_%_growth,AUS_consumer_price_index_%,...,GBR_consumer_price_index_%,GBR_unemployment_%_of_total_labor_force,GBR_government_debt_total_%_of_gdp,GBR_government_expense_%_gdp,GBR_industry_value_added_%_growth,USA_gdp_per_capita_%_growth,USA_consumer_price_index_%,USA_unemployment_%_of_total_labor_force,USA_government_debt_total_%_of_gdp,USA_government_expense_%_gdp
0,1981,0.166623,12.471612,5.681880,26.023013,-6.668498,10.9804,-10.663334,1.711915,9.487666,...,11.876627,6.391067,103.084945,37.484438,0.331610,1.536320,10.334715,5.928,67.388391,21.004721
1,1982,-1.381191,10.240268,5.681880,26.023013,-2.307766,10.9804,-3.209381,1.614208,11.351820,...,8.598864,6.391067,103.084945,37.671549,0.331610,-2.734570,6.131427,5.928,67.388391,22.463140
2,1983,0.850482,8.771147,5.681880,26.023013,2.681982,10.9804,5.377688,-3.437117,10.038911,...,4.609303,6.391067,103.084945,36.881620,0.331610,3.631979,3.212435,5.928,67.388391,22.802733
3,1984,2.878301,8.116398,5.681880,26.023013,-0.051847,10.9804,0.674286,3.418345,3.960396,...,4.960711,6.391067,103.084945,36.819563,0.331610,6.312168,4.300535,5.928,67.388391,21.470359
4,1985,1.917044,6.856812,5.681880,26.023013,-6.692591,10.9804,-8.651012,3.822749,6.734694,...,6.071394,6.391067,103.084945,35.932758,0.331610,3.250656,3.545644,5.928,67.388391,22.174802
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35,2016,1.610004,1.550016,5.656542,26.537560,-3.110064,8.1110,-5.825159,1.149347,1.276991,...,1.008417,4.810000,153.825030,36.747498,-0.380297,0.933375,1.261583,4.870,98.504102,22.565094
36,2017,2.200971,2.192010,5.557650,26.433629,1.757648,8.3500,2.302833,0.587161,1.948647,...,2.557756,4.330000,159.196774,36.800352,1.258502,1.610808,2.130110,4.360,97.763473,22.312344
37,2018,2.144471,2.438737,5.389680,26.296561,-3.601610,9.2200,-3.049315,1.302841,1.911401,...,2.292840,4.000000,155.535877,37.101933,1.769630,2.378479,2.442583,3.900,99.178089,22.336230
38,2019,1.531947,2.186902,5.356915,26.956431,-2.994388,9.8400,-4.790141,0.571103,1.610768,...,1.738105,3.740000,157.742820,36.322360,2.443486,1.824124,1.812210,3.670,100.955077,22.741536


In [641]:
wb_1981_to_predict

Unnamed: 0,time,WLD_gdp_per_capita_%_growth,WLD_consumer_price_index_%,WLD_unemployment_%_of_total_labor_force,WLD_government_expense_%_gdp,ARG_gdp_per_capita_%_growth,ARG_unemployment_%_of_total_labor_force,ARG_industry_value_added_%_growth,AUS_gdp_per_capita_%_growth,AUS_consumer_price_index_%,...,GBR_consumer_price_index_%,GBR_unemployment_%_of_total_labor_force,GBR_government_debt_total_%_of_gdp,GBR_government_expense_%_gdp,GBR_industry_value_added_%_growth,USA_gdp_per_capita_%_growth,USA_consumer_price_index_%,USA_unemployment_%_of_total_labor_force,USA_government_debt_total_%_of_gdp,USA_government_expense_%_gdp
40,2021,0.046068,2.440144,6.204328,33.670439,-2.626874,11.386141,-1.386956,1.009692,1.353727,...,1.45707,4.580937,191.56666,46.260029,-2.003782,-0.276751,1.92582,7.541917,176.14825,32.960833
41,2022,1.111767,2.843239,5.993507,33.306294,-0.347035,11.325219,0.512524,1.470736,1.73929,...,1.775478,4.681121,192.347959,45.422538,-0.203435,0.968701,2.246906,7.165148,192.481725,32.985316
42,2023,1.374935,3.156206,5.873028,32.968592,0.291637,11.274967,0.964538,1.56363,2.032605,...,1.992303,4.773256,193.116626,44.682401,0.164021,1.351526,2.395839,6.885755,210.859812,33.009191


In [677]:
# Label the years

crisis_years = ['1975', '1982', '1991', '2009', '2020']
recovery_years = ['1976', '1977', '1983', '1984', '1992', '1993', '2010', '2011']
downturn_years = ['1958', '1998', '2001', '2012']

labels = []
for i in range(len(wb_1981)):
    if wb_1981 ['time'][i] in crisis_years:
        labels.append('recession')

    elif wb_1981['time'][i] in recovery_years:
        labels.append('recovery')

    elif wb_1981 ['time'][i] in downturn_years:
        labels.append('downturn')
        
    else:
        labels.append('expansion')

wb_1981['year_label'] = labels

In [678]:
category_expansion = wb_1981[wb_1981['year_label']=='expansion']
category_recession = wb_1981[wb_1981['year_label']=='recession']
category_recovery = wb_1981[wb_1981['year_label']=='recovery']
category_downturn = wb_1981[wb_1981['year_label']=='downturn']

category_recession_oversampled = resample(category_recession,replace=True, n_samples = len(category_expansion))
category_recovery_oversampled = resample(category_recovery,replace=True, n_samples = len(category_expansion))
category_downturn_oversampled = resample(category_downturn,replace=True, n_samples = len(category_expansion))

wb_1981_upsampled = pd.concat([category_expansion,
                          category_recession_oversampled,
                          category_recovery_oversampled,
                          category_downturn_oversampled], axis=0)

X = wb_1981_upsampled.drop(['time', 'year_label'],axis=1)

Y = wb_1981_upsampled['year_label']

encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = pd.DataFrame(encoder.transform(Y))
X_train, X_test, y_train, y_test = train_test_split(X, encoded_Y, test_size=0.35, shuffle=None)

scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print(X.shape)

(108, 91)


In [None]:
# 1: Expansion, 2: recession , 3:recovery , 4: downturn

In [667]:
# Build the model.
model = Sequential([ # as far as we know, all networks are sequential
    Dense(128, activation='sigmoid', input_shape=(91,)),
    Dense(64, activation='sigmoid'),      # choosing relu instead of sigmoid, this is somewhat common
    Dense(32, activation='sigmoid'),
    Dense(16, activation='sigmoid'),
    Dense(8, activation='sigmoid'), 
    Dense(4, activation='softmax'),   # the softmax actiavation is the last one to compensate for the high volume additions
])

# Compile the model.
model.compile(
  optimizer='adam', #here we could use stochastic gradient descent, but adam is a de facto standard
  loss='categorical_crossentropy', #this is how we create the original blame to play the blame game
  metrics=['accuracy'],
)

# Train the model.
history = model.fit(
  X_train,
  to_categorical(y_train), # just to make sure the outputs are not considered numeric (because, ya know, they are numbers...)
  epochs=50,
  batch_size=6, # send 32 images at a time before you tweak the network again, to make it faster
)
 
    # Evaluate the model.
model.evaluate(
  X_test,
  to_categorical(y_test)
)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


[0.863950252532959, 0.8157894611358643]

In [669]:
predictions = model.predict(X_test)
print(np.argmax(predictions, axis=1))

[0 0 1 2 0 2 2 3 0 1 1 0 0 2 3 2 2 3 3 3 0 1 2 0 3 2 0 2 0 3 3 0 3 0 2 1 0
 1]


In [666]:
y_test

Unnamed: 0,0
84,0
5,1
9,1
53,2
102,0
42,2
35,2
80,3
104,0
24,1


In [670]:
# Also here the accuracy is quite good!


# This model has a much better accuracy! Will be used for predictions
model_1981_upsampled = model 
scaler_1981_upsampled  = scaler

filename = 'model_1981_upsampled.sav'
pickle.dump(model_1981_upsampled, open(filename, 'wb'))

filename= 'scaler_1981_upsampled.sav'
pickle.dump(scaler_1981_upsampled, open(filename, 'wb'))


INFO:tensorflow:Assets written to: C:\Users\luana\AppData\Local\Temp\tmpe53ejhh_\assets


In [672]:
X = wb_1981_to_predict.drop(['time'], axis=1)
X

Unnamed: 0,WLD_gdp_per_capita_%_growth,WLD_consumer_price_index_%,WLD_unemployment_%_of_total_labor_force,WLD_government_expense_%_gdp,ARG_gdp_per_capita_%_growth,ARG_unemployment_%_of_total_labor_force,ARG_industry_value_added_%_growth,AUS_gdp_per_capita_%_growth,AUS_consumer_price_index_%,AUS_unemployment_%_of_total_labor_force,...,GBR_consumer_price_index_%,GBR_unemployment_%_of_total_labor_force,GBR_government_debt_total_%_of_gdp,GBR_government_expense_%_gdp,GBR_industry_value_added_%_growth,USA_gdp_per_capita_%_growth,USA_consumer_price_index_%,USA_unemployment_%_of_total_labor_force,USA_government_debt_total_%_of_gdp,USA_government_expense_%_gdp
40,0.046068,2.440144,6.204328,33.670439,-2.626874,11.386141,-1.386956,1.009692,1.353727,6.468546,...,1.45707,4.580937,191.56666,46.260029,-2.003782,-0.276751,1.92582,7.541917,176.14825,32.960833
41,1.111767,2.843239,5.993507,33.306294,-0.347035,11.325219,0.512524,1.470736,1.73929,6.47621,...,1.775478,4.681121,192.347959,45.422538,-0.203435,0.968701,2.246906,7.165148,192.481725,32.985316
42,1.374935,3.156206,5.873028,32.968592,0.291637,11.274967,0.964538,1.56363,2.032605,6.483084,...,1.992303,4.773256,193.116626,44.682401,0.164021,1.351526,2.395839,6.885755,210.859812,33.009191


In [674]:
# load the model from disk

filename = 'model_1981_upsampled.sav'
loaded_model = pickle.load(open(filename, 'rb'))

filename= 'scaler_1981_upsampled.sav'
loaded_scaler = pickle.load(open(filename, 'rb'))

X_scaled = loaded_scaler.transform(X)

predictions = loaded_model.predict(X_scaled)
print(np.argmax(predictions, axis=1))

[2 3 3]


In [None]:
# 1: Expansion, 2: recession , 3:recovery , 4: downturn

# Conclusion

### The first  model (accuracy 93%) predicted recession for 2021 and 2022 and expansion for 2023.

### This second model (accuracy 81%) predicted recession in 2021 and recovery in 2022 and 2023.