In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
import matplotlib.pylab as plt
%matplotlib inline
from sklearn import preprocessing
from sklearn import model_selection
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from scikeras.wrappers import KerasClassifier, KerasRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.utils import resample
import pickle

# Multiclass classification models

In this notebook I will create classification models to label the years as per their economic activity. According to bibliography, the world has experience 4 recessions (decrease of GDP) and 4 downturns (low GDP growth) in the last 70 years. I will label them accordingly:

1. Pre-recession: 1 year before each recession
2. Recession-peak: 1975, 1982, 1991, 2009, 2020
3. Recovery: 2 years after each crisis
4. Global-downturn: 1958, 1998, 2001, 2012
5. Expansion: all other years

## IMF dataset

This is the dataset containing global data for 1980-2027.

In [5]:
pd.reset_option('all')
orignal_imf = pd.read_csv(r'C:\Users\luana\Ironhack DA\Unit 9\final_bootcamp_project\csv_files\imf_clean.csv')
orignal_imf

: boolean
    use_inf_as_null had been deprecated and will be removed in a future
    version. Use `use_inf_as_na` instead.



Unnamed: 0,WEO Subject Code,Country,Subject Descriptor,Subject Notes,Units,Scale,Country/Series-specific Notes,1980,1981,1982,...,2019,2020,2021,2022,2023,2024,2025,2026,2027,Estimates Start After
0,NGDP_R,Albania,"Gross domestic product, constant prices",Expressed in billions of national currency uni...,National currency,Billions,Source: IMF Staff Estimates. Official national...,311.514,329.270,338.819,...,837.786,808.617,877.475,912.574,935.388,965.321000,996.211000,1030.080000,1065.110000,2020.0
1,NGDP_RPCH,Albania,"Gross domestic product, constant prices",Annual percentages of constant price GDP are y...,Percent change,,"See notes for: Gross domestic product, consta...",2.684,5.700,2.900,...,2.088,-3.482,8.516,4.000,2.500,3.200000,3.200000,3.400000,3.400000,2020.0
2,NGDP,Albania,"Gross domestic product, current prices",Expressed in billions of national currency uni...,National currency,Billions,Source: IMF Staff Estimates. Official national...,18.489,19.126,19.698,...,1691.900,1644.080,1889.840,2059.020,2176.660,2287.470000,2403.260000,2530.790000,2664.390000,2020.0
3,NGDPD,Albania,"Gross domestic product, current prices",Values are based upon GDP in national currency...,U.S. dollars,Billions,"See notes for: Gross domestic product, curren...",1.946,2.229,2.296,...,15.399,15.161,18.310,18.256,18.842,19.858000,21.219000,22.528000,23.889000,2020.0
4,PPPGDP,Albania,"Gross domestic product, current prices",These data form the basis for the country weig...,Purchasing power parity; international dollars,Billions,"See notes for: Gross domestic product, curren...",5.759,6.663,7.280,...,41.623,40.658,45.953,51.189,54.338,57.254000,60.184000,63.415000,66.840000,2020.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4040,D_NGDPD,Sub-Saharan Africa,"External debt, total",,Percent of GDP,,,18.911,21.069,21.791,...,41.042,43.811,40.804,39.774,37.853,37.758550,37.758550,37.758550,37.758550,
4041,DS,Sub-Saharan Africa,"External debt, total debt service",,U.S. dollars,Billions,,10.493,10.300,11.741,...,113.329,109.146,127.688,149.875,129.985,42.604425,42.604425,42.604425,42.604425,
4042,DS_NGDPD,Sub-Saharan Africa,"External debt, total debt service",,Percent of GDP,,,3.350,3.426,3.512,...,6.637,6.380,7.232,7.655,6.163,6.297775,6.297775,6.297775,6.297775,
4043,DSI,Sub-Saharan Africa,"External debt, total debt service, interest",,U.S. dollars,Billions,,3.301,3.541,3.144,...,18.733,18.035,18.286,18.511,20.489,7.451600,7.451600,7.451600,7.451600,


### Preparing the dataset

In [261]:
# I need to rearrange the dataset to make it suitable for the classification
# I can drop all the categorical columns because they only give information about the indicators
# I will use data until 2020 for the model and then try to label the years 2021-2027 

imf = orignal_imf.select_dtypes(np.number)
imf

Unnamed: 0,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,...,2018,2019,2020,2021,2022,2023,2024,2025,2026,2027
0,311.514,329.270,338.819,342.546,349.397,344.156,363.428,360.521,355.474,390.310,...,820.653,837.786,808.617,877.475,912.574,935.388,965.321000,996.211000,1030.080000,1065.110000
1,2.684,5.700,2.900,1.100,2.000,-1.500,5.600,-0.800,-1.400,9.800,...,4.019,2.088,-3.482,8.516,4.000,2.500,3.200000,3.200000,3.400000,3.400000
2,18.489,19.126,19.698,19.900,19.645,20.065,20.692,20.531,20.238,22.228,...,1636.730,1691.900,1644.080,1889.840,2059.020,2176.660,2287.470000,2403.260000,2530.790000,2664.390000
3,1.946,2.229,2.296,2.319,2.290,2.339,2.587,2.566,2.530,2.779,...,15.157,15.399,15.161,18.310,18.256,18.842,19.858000,21.219000,22.528000,23.889000
4,5.759,6.663,7.280,7.649,8.083,8.214,8.848,8.994,9.181,10.476,...,40.055,41.623,40.658,45.953,51.189,54.338,57.254000,60.184000,63.415000,66.840000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4040,18.911,21.069,21.791,29.217,38.232,41.154,41.957,38.565,39.362,40.063,...,39.422,41.042,43.811,40.804,39.774,37.853,37.758550,37.758550,37.758550,37.758550
4041,10.493,10.300,11.741,13.934,15.031,16.869,17.096,17.030,17.986,16.911,...,114.102,113.329,109.146,127.688,149.875,129.985,42.604425,42.604425,42.604425,42.604425
4042,3.350,3.426,3.512,5.457,7.972,8.582,7.109,6.116,6.621,5.700,...,7.230,6.637,6.380,7.232,7.655,6.163,6.297775,6.297775,6.297775,6.297775
4043,3.301,3.541,3.144,3.457,3.234,3.887,4.041,4.385,4.650,4.894,...,17.880,18.733,18.035,18.286,18.511,20.489,7.451600,7.451600,7.451600,7.451600


In [262]:
# I need each year to be a row
imf = imf.transpose().reset_index()
imf

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,4035,4036,4037,4038,4039,4040,4041,4042,4043,4044
0,1980,311.514,2.684,18.489,1.946,5.759,5.935,116584.54,5557.56,6919.41,...,0.39,0.706,-6.298,4.972,60.018,18.911,10.493,3.35,3.301,1.04
1,1981,329.27,5.7,19.126,2.229,6.663,5.809,120786.15,5757.84,7016.15,...,-0.524,0.197,-9.892,-8.517,66.126,21.069,10.3,3.426,3.541,1.055
2,1982,338.819,2.9,19.698,2.296,7.28,5.814,121689.95,5800.93,7074.57,...,-1.243,-0.558,-11.627,-2.061,76.556,21.791,11.741,3.512,3.144,0.929
3,1983,342.546,1.1,19.9,2.319,7.649,5.809,120446.72,5741.66,6997.24,...,-0.884,0.627,-9.151,-0.207,80.005,29.217,13.934,5.457,3.457,1.109
4,1984,349.397,2.0,19.645,2.29,8.083,5.623,120297.85,5734.57,6763.89,...,-0.549,-1.062,-1.789,-0.087,83.24,38.232,15.031,7.972,3.234,1.307
5,1985,344.156,-1.5,20.065,2.339,8.214,5.83,116082.04,5533.6,6767.92,...,-0.534,-0.585,2.562,0.082,88.998,41.154,16.869,8.582,3.887,1.721
6,1986,363.428,5.6,20.692,2.587,8.848,5.694,120235.6,5731.6,6845.79,...,-0.354,0.477,-1.595,0.665,99.509,41.957,17.096,7.109,4.041,1.569
7,1987,360.521,-0.8,20.531,2.566,8.994,5.695,116915.4,5573.33,6657.96,...,-0.856,-0.387,-3.681,3.869,111.007,38.565,17.03,6.116,4.385,1.567
8,1988,355.474,-1.4,20.238,2.53,9.181,5.693,113124.0,5392.59,6440.37,...,-0.712,-0.08,-2.942,0.105,110.67,39.362,17.986,6.621,4.65,1.542
9,1989,390.31,9.8,22.228,2.779,10.476,5.695,120916.02,5764.04,6886.27,...,-2.175,0.353,-2.026,2.634,119.238,40.063,16.911,5.7,4.894,1.593


In [263]:
#Column names from the indicators
columns = ['year']

for i in range(len(orignal_imf)):
    column_name = (orignal_imf['Country'][i]+'_'+orignal_imf['Subject Descriptor'][i])
    column_name = str(column_name).lower().replace(' ','_')
    columns.append(column_name)

len(columns)

4046

In [264]:
imf.columns= columns
imf.head()

Unnamed: 0,year,"albania_gross_domestic_product,_constant_prices","albania_gross_domestic_product,_constant_prices.1","albania_gross_domestic_product,_current_prices","albania_gross_domestic_product,_current_prices.1","albania_gross_domestic_product,_current_prices.2","albania_gross_domestic_product,_deflator","albania_gross_domestic_product_per_capita,_constant_prices","albania_gross_domestic_product_per_capita,_constant_prices.1","albania_gross_domestic_product_per_capita,_current_prices",...,"sub-saharan_africa_direct_investment,_net","sub-saharan_africa_portfolio_investment,_net","sub-saharan_africa_other_investment,_net",sub-saharan_africa_change_in_reserves,"sub-saharan_africa_external_debt,_total","sub-saharan_africa_external_debt,_total.1","sub-saharan_africa_external_debt,_total_debt_service","sub-saharan_africa_external_debt,_total_debt_service.1","sub-saharan_africa_external_debt,_total_debt_service,_interest","sub-saharan_africa_external_debt,_total_debt_service,_interest.1"
0,1980,311.514,2.684,18.489,1.946,5.759,5.935,116584.54,5557.56,6919.41,...,0.39,0.706,-6.298,4.972,60.018,18.911,10.493,3.35,3.301,1.04
1,1981,329.27,5.7,19.126,2.229,6.663,5.809,120786.15,5757.84,7016.15,...,-0.524,0.197,-9.892,-8.517,66.126,21.069,10.3,3.426,3.541,1.055
2,1982,338.819,2.9,19.698,2.296,7.28,5.814,121689.95,5800.93,7074.57,...,-1.243,-0.558,-11.627,-2.061,76.556,21.791,11.741,3.512,3.144,0.929
3,1983,342.546,1.1,19.9,2.319,7.649,5.809,120446.72,5741.66,6997.24,...,-0.884,0.627,-9.151,-0.207,80.005,29.217,13.934,5.457,3.457,1.109
4,1984,349.397,2.0,19.645,2.29,8.083,5.623,120297.85,5734.57,6763.89,...,-0.549,-1.062,-1.789,-0.087,83.24,38.232,15.031,7.972,3.234,1.307


In [265]:
imf.tail(10)

Unnamed: 0,year,"albania_gross_domestic_product,_constant_prices","albania_gross_domestic_product,_constant_prices.1","albania_gross_domestic_product,_current_prices","albania_gross_domestic_product,_current_prices.1","albania_gross_domestic_product,_current_prices.2","albania_gross_domestic_product,_deflator","albania_gross_domestic_product_per_capita,_constant_prices","albania_gross_domestic_product_per_capita,_constant_prices.1","albania_gross_domestic_product_per_capita,_current_prices",...,"sub-saharan_africa_direct_investment,_net","sub-saharan_africa_portfolio_investment,_net","sub-saharan_africa_other_investment,_net",sub-saharan_africa_change_in_reserves,"sub-saharan_africa_external_debt,_total","sub-saharan_africa_external_debt,_total.1","sub-saharan_africa_external_debt,_total_debt_service","sub-saharan_africa_external_debt,_total_debt_service.1","sub-saharan_africa_external_debt,_total_debt_service,_interest","sub-saharan_africa_external_debt,_total_debt_service,_interest.1"
38,2018,820.653,4.019,1636.73,15.157,40.055,199.442,284678.59,13570.55,567769.71,...,-20.946,-4.538,-19.838,4.652,646.938,39.422,114.102,7.23,17.88,1.021
39,2019,837.786,2.088,1691.9,15.399,41.623,201.949,290805.66,13862.63,587280.29,...,-30.505,-18.702,-11.275,6.185,693.725,41.042,113.329,6.637,18.733,1.063
40,2020,808.617,-3.482,1644.08,15.161,40.658,203.32,280984.5,13394.46,571296.57,...,-10.633,2.399,-2.45,-9.596,717.253,43.811,109.146,6.38,18.035,1.083
41,2021,877.475,8.516,1889.84,18.31,45.953,215.372,305428.18,14559.68,657808.05,...,-69.534,44.701,-11.709,26.585,741.108,40.804,127.688,7.232,18.286,0.975
42,2022,912.574,4.0,2059.02,18.256,51.189,225.628,318371.83,15176.7,718335.97,...,-31.837,5.456,13.071,-8.999,766.929,39.774,149.875,7.655,18.511,0.909
43,2023,935.388,2.5,2176.66,18.842,54.338,232.701,327239.88,15599.44,761490.02,...,-40.272,4.485,-9.661,1.036,803.03,37.853,129.985,6.163,20.489,0.938
44,2024,965.321,3.2,2287.47,19.858,57.254,236.964,338749.82,16148.12,802716.63,...,-13.42365,-4.79135,1.80165,4.4091,246.9388,37.75855,42.604425,6.297775,7.4516,1.191625
45,2025,996.211,3.2,2403.26,21.219,60.184,241.24,350720.75,16718.77,846078.15,...,-13.42365,-4.79135,1.80165,4.4091,246.9388,37.75855,42.604425,6.297775,7.4516,1.191625
46,2026,1030.08,3.4,2530.79,22.528,63.415,245.688,363876.33,17345.89,894000.51,...,-13.42365,-4.79135,1.80165,4.4091,246.9388,37.75855,42.604425,6.297775,7.4516,1.191625
47,2027,1065.11,3.4,2664.39,23.889,66.84,250.153,377613.26,18000.73,944610.78,...,-13.42365,-4.79135,1.80165,4.4091,246.9388,37.75855,42.604425,6.297775,7.4516,1.191625


In [266]:
imf.to_csv('imf_formatted.csv')

In [210]:
imf_to_predict = imf[42:48]
imf = imf[:42]
imf.tail()

Unnamed: 0,year,"albania_gross_domestic_product,_constant_prices","albania_gross_domestic_product,_constant_prices.1","albania_gross_domestic_product,_current_prices","albania_gross_domestic_product,_current_prices.1","albania_gross_domestic_product,_current_prices.2","albania_gross_domestic_product,_deflator","albania_gross_domestic_product_per_capita,_constant_prices","albania_gross_domestic_product_per_capita,_constant_prices.1","albania_gross_domestic_product_per_capita,_current_prices",...,"sub-saharan_africa_direct_investment,_net","sub-saharan_africa_portfolio_investment,_net","sub-saharan_africa_other_investment,_net",sub-saharan_africa_change_in_reserves,"sub-saharan_africa_external_debt,_total","sub-saharan_africa_external_debt,_total.1","sub-saharan_africa_external_debt,_total_debt_service","sub-saharan_africa_external_debt,_total_debt_service.1","sub-saharan_africa_external_debt,_total_debt_service,_interest","sub-saharan_africa_external_debt,_total_debt_service,_interest.1"
37,2017,788.943,3.802,1550.65,13.053,37.609,196.547,273542.43,13039.7,537640.3,...,-37.375,-23.977,0.493,15.856,602.733,36.549,83.895,5.05,14.005,0.868
38,2018,820.653,4.019,1636.73,15.157,40.055,199.442,284678.59,13570.55,567769.71,...,-20.946,-4.538,-19.838,4.652,646.938,39.422,114.102,7.23,17.88,1.021
39,2019,837.786,2.088,1691.9,15.399,41.623,201.949,290805.66,13862.63,587280.29,...,-30.505,-18.702,-11.275,6.185,693.725,41.042,113.329,6.637,18.733,1.063
40,2020,808.617,-3.482,1644.08,15.161,40.658,203.32,280984.5,13394.46,571296.57,...,-10.633,2.399,-2.45,-9.596,717.253,43.811,109.146,6.38,18.035,1.083
41,2021,877.475,8.516,1889.84,18.31,45.953,215.372,305428.18,14559.68,657808.05,...,-69.534,44.701,-11.709,26.585,741.108,40.804,127.688,7.232,18.286,0.975


In [211]:
# Now I can label the years
imf['year'] = imf['year'].astype(np.number)

crisis_years = [1975, 1982, 1991, 2009, 2020]
recovery_years = [1976, 1977, 1983, 1984, 1992, 1993, 2010, 2011]
downturn_years = [1958, 1998, 2001, 2012]

labels = []
for i in range(len(imf)):
    if imf['year'][i] in crisis_years:
        labels.append('recession')

    elif imf.loc[i]['year'] in recovery_years:
        labels.append('recovery')

    elif imf.loc[i]['year'] in recovery_years:
        labels.append('recovery')

    elif imf.loc[i]['year'] in downturn_years:
        labels.append('downturn')
        
    else:
        labels.append('expansion')

imf['year_label'] = labels

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imf['year'] = imf['year'].astype(np.number)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imf['year_label'] = labels


In [212]:
imf[['year', 'year_label']]

Unnamed: 0,year,year_label
0,1980.0,expansion
1,1981.0,expansion
2,1982.0,recession
3,1983.0,recovery
4,1984.0,recovery
5,1985.0,expansion
6,1986.0,expansion
7,1987.0,expansion
8,1988.0,expansion
9,1989.0,expansion


In [213]:
imf

Unnamed: 0,year,"albania_gross_domestic_product,_constant_prices","albania_gross_domestic_product,_constant_prices.1","albania_gross_domestic_product,_current_prices","albania_gross_domestic_product,_current_prices.1","albania_gross_domestic_product,_current_prices.2","albania_gross_domestic_product,_deflator","albania_gross_domestic_product_per_capita,_constant_prices","albania_gross_domestic_product_per_capita,_constant_prices.1","albania_gross_domestic_product_per_capita,_current_prices",...,"sub-saharan_africa_portfolio_investment,_net","sub-saharan_africa_other_investment,_net",sub-saharan_africa_change_in_reserves,"sub-saharan_africa_external_debt,_total","sub-saharan_africa_external_debt,_total.1","sub-saharan_africa_external_debt,_total_debt_service","sub-saharan_africa_external_debt,_total_debt_service.1","sub-saharan_africa_external_debt,_total_debt_service,_interest","sub-saharan_africa_external_debt,_total_debt_service,_interest.1",year_label
0,1980.0,311.514,2.684,18.489,1.946,5.759,5.935,116584.54,5557.56,6919.41,...,0.706,-6.298,4.972,60.018,18.911,10.493,3.35,3.301,1.04,expansion
1,1981.0,329.27,5.7,19.126,2.229,6.663,5.809,120786.15,5757.84,7016.15,...,0.197,-9.892,-8.517,66.126,21.069,10.3,3.426,3.541,1.055,expansion
2,1982.0,338.819,2.9,19.698,2.296,7.28,5.814,121689.95,5800.93,7074.57,...,-0.558,-11.627,-2.061,76.556,21.791,11.741,3.512,3.144,0.929,recession
3,1983.0,342.546,1.1,19.9,2.319,7.649,5.809,120446.72,5741.66,6997.24,...,0.627,-9.151,-0.207,80.005,29.217,13.934,5.457,3.457,1.109,recovery
4,1984.0,349.397,2.0,19.645,2.29,8.083,5.623,120297.85,5734.57,6763.89,...,-1.062,-1.789,-0.087,83.24,38.232,15.031,7.972,3.234,1.307,recovery
5,1985.0,344.156,-1.5,20.065,2.339,8.214,5.83,116082.04,5533.6,6767.92,...,-0.585,2.562,0.082,88.998,41.154,16.869,8.582,3.887,1.721,expansion
6,1986.0,363.428,5.6,20.692,2.587,8.848,5.694,120235.6,5731.6,6845.79,...,0.477,-1.595,0.665,99.509,41.957,17.096,7.109,4.041,1.569,expansion
7,1987.0,360.521,-0.8,20.531,2.566,8.994,5.695,116915.4,5573.33,6657.96,...,-0.387,-3.681,3.869,111.007,38.565,17.03,6.116,4.385,1.567,expansion
8,1988.0,355.474,-1.4,20.238,2.53,9.181,5.693,113124.0,5392.59,6440.37,...,-0.08,-2.942,0.105,110.67,39.362,17.986,6.621,4.65,1.542,expansion
9,1989.0,390.31,9.8,22.228,2.779,10.476,5.695,120916.02,5764.04,6886.27,...,0.353,-2.026,2.634,119.238,40.063,16.911,5.7,4.894,1.593,expansion


### Model

In [222]:
X = imf.drop(['year', 'year_label'],axis=1)
X.shape

(42, 4045)

In [215]:
Y = imf['year_label']

encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = pd.DataFrame(encoder.transform(Y))

In [216]:
encoded_Y.value_counts()

1    29
3     6
2     4
0     3
dtype: int64

In [217]:
X_train, X_test, y_train, y_test = train_test_split(X, encoded_Y, test_size=0.3, shuffle=None)

scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [218]:
y_test

Unnamed: 0,0
25,1
16,1
41,1
34,1
14,1
1,1
24,1
26,1
20,1
11,2


In [219]:
y_train

Unnamed: 0,0
15,1
2,2
29,2
19,1
40,2
27,1
9,1
36,1
22,1
38,1


In [225]:
# Build the model.
model = Sequential([ # as far as we know, all networks are sequential
    Dense(128, activation='sigmoid', input_shape=(4045,)),
    Dense(64, activation='sigmoid'),      # choosing relu instead of sigmoid, this is somewhat common
    Dense(32, activation='sigmoid'),
#     Dense(16, activation='sigmoid'),
#     Dense(8, activation='sigmoid'), 
    Dense(4, activation='softmax'),   # the softmax actiavation is the last one to compensate for the high volume additions
])

# Compile the model.
model.compile(
  optimizer='adam', #here we could use stochastic gradient descent, but adam is a de facto standard
  loss='categorical_crossentropy', #this is how we create the original blame to play the blame game
  metrics=['accuracy'],
)

# Train the model.
history = model.fit(
  X_train,
  to_categorical(y_train), # just to make sure the outputs are not considered numeric (because, ya know, they are numbers...)
  epochs=20,
  batch_size=6, # send 32 images at a time before you tweak the network again, to make it faster
)
 
    # Evaluate the model.
model.evaluate(
  X_test,
  to_categorical(y_test)
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


[0.5591939091682434, 0.692307710647583]

In [226]:
predictions = model.predict(X_test)
print(np.argmax(predictions, axis=1))

predictions

[1 1 1 1 3 3 1 1 1 3 1 1 1]


array([[0.05155224, 0.87512183, 0.03038181, 0.04294412],
       [0.08629248, 0.7897549 , 0.04027272, 0.08367993],
       [0.05549894, 0.84445983, 0.05618579, 0.04385544],
       [0.0632197 , 0.8503297 , 0.05064053, 0.03581013],
       [0.09099057, 0.32534903, 0.11636287, 0.46729755],
       [0.10156737, 0.35486552, 0.15753584, 0.38603127],
       [0.05397688, 0.8632169 , 0.02983608, 0.05297014],
       [0.04646463, 0.88156253, 0.03102816, 0.04094474],
       [0.05714579, 0.8667754 , 0.02773906, 0.0483398 ],
       [0.10372864, 0.3388023 , 0.14246902, 0.41500002],
       [0.0630362 , 0.7721534 , 0.04529841, 0.11951201],
       [0.12782349, 0.53951585, 0.13569811, 0.19696252],
       [0.06480353, 0.8504706 , 0.03434389, 0.0503821 ]], dtype=float32)

In [227]:
# The model accuracy is acceptable, however the high accuracy is only achieved because of the imbalanced data.
# The model is predicting every year as expansion
# I will try up- and downsampling

In [228]:
all_data = imf
all_data

Unnamed: 0,year,"albania_gross_domestic_product,_constant_prices","albania_gross_domestic_product,_constant_prices.1","albania_gross_domestic_product,_current_prices","albania_gross_domestic_product,_current_prices.1","albania_gross_domestic_product,_current_prices.2","albania_gross_domestic_product,_deflator","albania_gross_domestic_product_per_capita,_constant_prices","albania_gross_domestic_product_per_capita,_constant_prices.1","albania_gross_domestic_product_per_capita,_current_prices",...,"sub-saharan_africa_portfolio_investment,_net","sub-saharan_africa_other_investment,_net",sub-saharan_africa_change_in_reserves,"sub-saharan_africa_external_debt,_total","sub-saharan_africa_external_debt,_total.1","sub-saharan_africa_external_debt,_total_debt_service","sub-saharan_africa_external_debt,_total_debt_service.1","sub-saharan_africa_external_debt,_total_debt_service,_interest","sub-saharan_africa_external_debt,_total_debt_service,_interest.1",year_label
0,1980.0,311.514,2.684,18.489,1.946,5.759,5.935,116584.54,5557.56,6919.41,...,0.706,-6.298,4.972,60.018,18.911,10.493,3.35,3.301,1.04,expansion
1,1981.0,329.27,5.7,19.126,2.229,6.663,5.809,120786.15,5757.84,7016.15,...,0.197,-9.892,-8.517,66.126,21.069,10.3,3.426,3.541,1.055,expansion
2,1982.0,338.819,2.9,19.698,2.296,7.28,5.814,121689.95,5800.93,7074.57,...,-0.558,-11.627,-2.061,76.556,21.791,11.741,3.512,3.144,0.929,recession
3,1983.0,342.546,1.1,19.9,2.319,7.649,5.809,120446.72,5741.66,6997.24,...,0.627,-9.151,-0.207,80.005,29.217,13.934,5.457,3.457,1.109,recovery
4,1984.0,349.397,2.0,19.645,2.29,8.083,5.623,120297.85,5734.57,6763.89,...,-1.062,-1.789,-0.087,83.24,38.232,15.031,7.972,3.234,1.307,recovery
5,1985.0,344.156,-1.5,20.065,2.339,8.214,5.83,116082.04,5533.6,6767.92,...,-0.585,2.562,0.082,88.998,41.154,16.869,8.582,3.887,1.721,expansion
6,1986.0,363.428,5.6,20.692,2.587,8.848,5.694,120235.6,5731.6,6845.79,...,0.477,-1.595,0.665,99.509,41.957,17.096,7.109,4.041,1.569,expansion
7,1987.0,360.521,-0.8,20.531,2.566,8.994,5.695,116915.4,5573.33,6657.96,...,-0.387,-3.681,3.869,111.007,38.565,17.03,6.116,4.385,1.567,expansion
8,1988.0,355.474,-1.4,20.238,2.53,9.181,5.693,113124.0,5392.59,6440.37,...,-0.08,-2.942,0.105,110.67,39.362,17.986,6.621,4.65,1.542,expansion
9,1989.0,390.31,9.8,22.228,2.779,10.476,5.695,120916.02,5764.04,6886.27,...,0.353,-2.026,2.634,119.238,40.063,16.911,5.7,4.894,1.593,expansion


### Downsampling

In [229]:
other_categories = all_data[all_data['year_label']!='expansion']
other_categories['year_label'].value_counts()

recovery     6
recession    4
downturn     3
Name: year_label, dtype: int64

In [230]:
category_expansion = all_data[all_data['year_label']=='expansion']
category_expansion_undersampled = resample(category_expansion, replace=False, n_samples = 6)
category_expansion_undersampled

Unnamed: 0,year,"albania_gross_domestic_product,_constant_prices","albania_gross_domestic_product,_constant_prices.1","albania_gross_domestic_product,_current_prices","albania_gross_domestic_product,_current_prices.1","albania_gross_domestic_product,_current_prices.2","albania_gross_domestic_product,_deflator","albania_gross_domestic_product_per_capita,_constant_prices","albania_gross_domestic_product_per_capita,_constant_prices.1","albania_gross_domestic_product_per_capita,_current_prices",...,"sub-saharan_africa_portfolio_investment,_net","sub-saharan_africa_other_investment,_net",sub-saharan_africa_change_in_reserves,"sub-saharan_africa_external_debt,_total","sub-saharan_africa_external_debt,_total.1","sub-saharan_africa_external_debt,_total_debt_service","sub-saharan_africa_external_debt,_total_debt_service.1","sub-saharan_africa_external_debt,_total_debt_service,_interest","sub-saharan_africa_external_debt,_total_debt_service,_interest.1",year_label
10,1990.0,351.279,-10.0,20.006,2.221,9.782,5.695,106884.08,5095.14,6087.18,...,0.422,-1.089,4.333,173.01,47.315,19.507,5.452,4.41,1.071,expansion
27,2007.0,584.254,5.983,965.528,10.677,23.686,165.258,196717.32,9377.46,325091.66,...,-5.898,0.488,28.006,230.022,20.968,60.796,5.547,5.533,0.526,expansion
35,2015.0,735.657,2.219,1434.31,11.389,33.595,194.969,255374.21,12173.62,497901.56,...,-21.485,8.579,-16.484,476.029,32.607,83.004,6.14,13.256,0.817,expansion
8,1988.0,355.474,-1.4,20.238,2.53,9.181,5.693,113124.0,5392.59,6440.37,...,-0.08,-2.942,0.105,110.67,39.362,17.986,6.621,4.65,1.542,expansion
34,2014.0,719.688,1.774,1395.31,13.246,32.529,193.876,249104.37,11874.74,482954.1,...,-8.625,-10.517,-9.102,445.566,26.221,70.988,4.24,11.04,0.608,expansion
14,1994.0,281.424,9.4,223.571,2.361,8.664,79.443,87738.26,4182.46,69701.86,...,-2.598,-2.177,3.536,200.323,62.662,32.446,9.468,7.562,1.882,expansion


In [231]:
imf_undersampled = pd.concat([other_categories, category_expansion_undersampled], axis=0)
imf_undersampled.shape

(19, 4047)

In [232]:
X_undersampled = imf_undersampled.drop(['year', 'year_label'],axis=1)

Y_undersampled = imf_undersampled['year_label']

encoder = LabelEncoder()
encoder.fit(Y_undersampled)
encoded_Y_undersampled = pd.DataFrame(encoder.transform(Y_undersampled))

X_train_under, X_test_under, y_train_under, y_test_under = train_test_split(X_undersampled, encoded_Y_undersampled,
                                                                            test_size=0.3)


scaler = preprocessing.StandardScaler()
X_train_under = scaler.fit_transform(X_train_under)
X_test_under = scaler.transform(X_test_under)

In [237]:
X_undersampled.shape

(19, 4045)

In [238]:
# Build the model.
model_under = Sequential([ # as far as we know, all networks are sequential
    Dense(128, activation='sigmoid', input_shape=(4045,)),
    Dense(64, activation='sigmoid'),      # choosing relu instead of sigmoid, this is somewhat common
    Dense(4, activation='softmax'),   # the softmax actiavation is the last one to compensate for the high volume additions
])

# Compile the model.
model_under.compile(
  optimizer='adam', #here we could use stochastic gradient descent, but adam is a de facto standard
  loss='categorical_crossentropy', #this is how we create the original blame to play the blame game
  metrics=['accuracy'],
)

# Train the model.
history_under = model_under.fit(
  X_train_under,
  to_categorical(y_train_under), # just to make sure the outputs are not considered numeric (because, ya know, they are numbers...)
  epochs=20,
  batch_size=6, # send 32 images at a time before you tweak the network again, to make it faster
)
 
    # Evaluate the model.
model_under.evaluate(
  X_test_under,
  to_categorical(y_test_under)
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


[2.0892231464385986, 0.1666666716337204]

In [239]:
predictions = model_under.predict(X_test_under)
print(np.argmax(predictions, axis=1))

predictions

[3 3 3 1 3 1]


array([[0.02643569, 0.2696185 , 0.20412655, 0.49981922],
       [0.03921694, 0.1328332 , 0.04001822, 0.78793174],
       [0.02647947, 0.06395682, 0.03171552, 0.8778482 ],
       [0.04423366, 0.7004931 , 0.1406303 , 0.11464298],
       [0.06270721, 0.41904297, 0.03698951, 0.48126033],
       [0.03149334, 0.4239604 , 0.17929727, 0.3652491 ]], dtype=float32)

In [240]:
y_test_under

Unnamed: 0,0
11,0
3,2
0,2
17,1
13,1
9,3


In [None]:
# Not much better!

## Upsampling

In [241]:
category_expansion = all_data[all_data['year_label']=='expansion']
category_recession = all_data[all_data['year_label']=='recession']
category_recovery = all_data[all_data['year_label']=='recovery']
category_downturn = all_data[all_data['year_label']=='downturn']

category_recession_oversampled = resample(category_recession,replace=True, n_samples = len(category_expansion))
category_recovery_oversampled = resample(category_recovery,replace=True, n_samples = len(category_expansion))
category_downturn_oversampled = resample(category_downturn,replace=True, n_samples = len(category_expansion))

imf_upsampled= pd.concat([category_expansion,
                          category_recession_oversampled,
                          category_recovery_oversampled,
                          category_downturn_oversampled], axis=0)
imf_upsampled.shape

(116, 4047)

In [242]:
X_updersampled = imf_upsampled.drop(['year', 'year_label'],axis=1)

Y_upsampled = imf_upsampled['year_label']

encoder = LabelEncoder()
encoder.fit(Y_upsampled)
encoded_Y_upsampled = pd.DataFrame(encoder.transform(Y_upsampled))

X_train_up, X_test_up, y_train_up, y_test_up = train_test_split(X_updersampled, encoded_Y_upsampled,
                                                                            test_size=0.3)


scaler = preprocessing.StandardScaler()
X_train_under = scaler.fit_transform(X_train_up)
X_test_under = scaler.transform(X_test_up)

In [246]:
X_updersampled.shape

(116, 4045)

In [243]:
Y_upsampled

0     expansion
1     expansion
5     expansion
6     expansion
7     expansion
        ...    
32     downturn
32     downturn
18     downturn
18     downturn
18     downturn
Name: year_label, Length: 116, dtype: object

In [244]:
# 1: expansion, 2: recession, 3: recovery, 0: downturn

In [245]:
encoded_Y_upsampled

Unnamed: 0,0
0,1
1,1
2,1
3,1
4,1
...,...
111,0
112,0
113,0
114,0


In [247]:
# Build the model.
model_up = Sequential([ # as far as we know, all networks are sequential
    Dense(128, activation='sigmoid', input_shape=(4045,)),
    Dense(128, activation='sigmoid'),      # choosing relu instead of sigmoid, this is somewhat common
    Dense(32, activation='sigmoid'),
    Dense(16, activation='sigmoid'),
    Dense(8, activation='sigmoid'), 
    Dense(4, activation='softmax'),   # the softmax actiavation is the last one to compensate for the high volume additions
])

# Compile the model.
model_up.compile(
  optimizer='adam', #here we could use stochastic gradient descent, but adam is a de facto standard
  loss='categorical_crossentropy', #this is how we create the original blame to play the blame game
  metrics=['accuracy'],
)

# Train the model.
history_up = model_up.fit(
  X_train_up,
  to_categorical(y_train_up), # just to make sure the outputs are not considered numeric (because, ya know, they are numbers...)
  epochs=20,
  batch_size=6, # send 32 images at a time before you tweak the network again, to make it faster
)
 
    # Evaluate the model.
model_up.evaluate(
  X_test_up,
  to_categorical(y_test_up)
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


[1.4027689695358276, 0.20000000298023224]

In [248]:
predictions = model_up.predict(X_test_up)
print(np.argmax(predictions, axis=1))

predictions

[1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 3 1]


array([[0.25177568, 0.27790913, 0.21697238, 0.25334278],
       [0.25177568, 0.27790913, 0.21697238, 0.25334278],
       [0.25177568, 0.27790913, 0.21697238, 0.25334278],
       [0.24196844, 0.27123812, 0.21822278, 0.26857066],
       [0.25177568, 0.27790913, 0.21697238, 0.25334278],
       [0.25177568, 0.27790913, 0.21697238, 0.25334278],
       [0.25177568, 0.27790913, 0.21697238, 0.25334278],
       [0.25177568, 0.27790913, 0.21697238, 0.25334278],
       [0.24196844, 0.27123812, 0.21822278, 0.26857066],
       [0.24352106, 0.26686624, 0.22220705, 0.26740566],
       [0.25177568, 0.27790913, 0.21697238, 0.25334278],
       [0.25177568, 0.27790913, 0.21697238, 0.25334278],
       [0.24196844, 0.27123812, 0.21822278, 0.26857066],
       [0.25177568, 0.27790913, 0.21697238, 0.25334278],
       [0.25177568, 0.27790913, 0.21697238, 0.25334278],
       [0.24196844, 0.27123812, 0.21822278, 0.26857066],
       [0.24196844, 0.27123812, 0.21822278, 0.26857066],
       [0.24548447, 0.2692928 ,

In [250]:
# 1: expansion, 2: recession, 3: recovery, 0: downturn

### Conclusions

None of the models was able to predict the categories with accuracy. Even with upsampling, almost  every category is being missed.

I believe the next database can be better if I focus on % data instead of data in dollars will have better results, because the effect of growth trend and currency change throughout the years.

In [258]:
orignal_imf['Scale'].unique()

array(['Billions', 'None', 'Units', 'Millions'], dtype=object)

# World Bank Data

In [69]:
wb_gdp = pd.read_csv(r'C:\Users\luana\Ironhack DA\Unit 9\final_bootcamp_project\csv_files\wb_gdp_predictions.csv')
wb_gdp

Unnamed: 0,WLD_gdp_per_capita_constant_us$,WLD_gdp_per_capita_constant_%_growth,ARG_gdp_per_capita_constant_us$,ARG_gdp_per_capita_constant_%_growth,AUS_gdp_per_capita_constant_us$,AUS_gdp_per_capita_constant_%_growth,BRA_gdp_per_capita_constant_us$,CAN_gdp_per_capita_constant_us$,CAN_gdp_per_capita_constant_%_growth,CHN_gdp_per_capita_constant_%_growth,...,IDN_gdp_%_growth,ITA_gdp_%_growth,JPN_gdp_%_growth,MEX_gdp_%_growth,ZAF_gdp_%_growth,KOR_gdp_%_growth,TUR_gdp_%_growth,GBR_gdp_%_growth,USA_gdp_%_growth,time
0,3676.575517,2.457848,7637.066652,3.728779,20045.843657,0.464273,2754.246341,13131.899295,1.119387,-26.527644,...,5.740646,8.207246,12.043536,5.0,3.844734,6.935993,1.156069,2.677119,2.3,YR1961
1,3806.403544,3.531222,7451.803394,-2.425843,19815.749592,-1.147839,2851.492888,13847.043915,5.445858,-6.351505,...,1.841978,6.20365,8.908973,4.664415,6.177931,3.895273,5.571429,1.10291,6.1,YR1962
2,3922.051459,3.038246,6945.957125,-6.78824,20647.485947,4.19735,2786.362446,14318.565243,3.405213,7.622254,...,-2.23703,5.609728,8.473642,8.106887,7.373709,9.020568,9.066306,4.874384,4.4,YR1963
3,4095.196204,4.414647,7532.004475,8.437244,21659.152058,4.899706,2799.641949,14984.486701,4.650755,15.468995,...,3.529698,2.797702,11.676708,11.905481,7.939609,9.473825,5.459057,5.533659,5.8,YR1964
4,4235.39882,3.423587,8202.112548,8.896809,22509.097404,3.924186,2787.438778,15645.230386,4.409518,14.197889,...,1.081589,3.268024,5.819708,7.1,6.122798,7.318434,2.82353,2.142177,6.4,YR1965
5,4384.883315,3.529408,8026.876237,-2.136478,22525.051994,0.070881,2893.575825,16386.137772,4.735676,7.608404,...,2.791347,5.984794,10.638562,6.096139,4.438386,11.993957,11.212815,1.5731,6.5,YR1966
6,4475.418941,2.064721,8161.602114,1.678435,23644.600491,4.970237,2934.987745,16591.426063,1.252817,-8.161498,...,1.380403,7.178612,11.082142,5.854925,7.196523,9.079607,4.73251,2.786475,2.5,YR1967
7,4645.798899,3.807017,8429.868888,3.286938,24414.780136,3.257317,3138.785732,17141.473322,3.31525,-6.571452,...,10.915179,6.544555,12.882468,9.423279,4.153373,13.165768,6.777996,5.441083,4.8,YR1968
8,4815.438894,3.651471,9108.496877,8.050279,25593.126397,4.826364,3349.531837,17763.148835,3.626733,13.77934,...,6.822298,6.09806,12.477895,3.41862,4.715903,14.561367,4.081146,1.924097,3.1,YR1969
9,4902.100198,1.799655,9243.256579,1.479494,26894.506478,5.084881,3605.870486,18124.59034,2.034783,16.050399,...,7.554635,5.268693,2.454958,6.502484,5.248661,10.052735,3.233509,6.317907,-0.283491,YR1970


In [70]:
wb_gdp ['time'] = wb_gdp ['time'].str.replace('YR', '')

In [73]:
# Dropping 2021,2022 and 2023. I will label them later
wb_gdp ['time'] = wb_gdp ['time'].str.replace('YR', '')
wb_gdp_to_predict = wb_gdp[-5:]
wb_gdp = wb_gdp[:-2]
wb_gdp

Unnamed: 0,WLD_gdp_per_capita_constant_us$,WLD_gdp_per_capita_constant_%_growth,ARG_gdp_per_capita_constant_us$,ARG_gdp_per_capita_constant_%_growth,AUS_gdp_per_capita_constant_us$,AUS_gdp_per_capita_constant_%_growth,BRA_gdp_per_capita_constant_us$,CAN_gdp_per_capita_constant_us$,CAN_gdp_per_capita_constant_%_growth,CHN_gdp_per_capita_constant_%_growth,...,ITA_gdp_%_growth,JPN_gdp_%_growth,MEX_gdp_%_growth,ZAF_gdp_%_growth,KOR_gdp_%_growth,TUR_gdp_%_growth,GBR_gdp_%_growth,USA_gdp_%_growth,time,year_label
0,3676.575517,2.457848,7637.066652,3.728779,20045.843657,0.464273,2754.246341,13131.899295,1.119387,-26.527644,...,8.207246,12.043536,5.0,3.844734,6.935993,1.156069,2.677119,2.3,1961,expansion
1,3806.403544,3.531222,7451.803394,-2.425843,19815.749592,-1.147839,2851.492888,13847.043915,5.445858,-6.351505,...,6.20365,8.908973,4.664415,6.177931,3.895273,5.571429,1.10291,6.1,1962,expansion
2,3922.051459,3.038246,6945.957125,-6.78824,20647.485947,4.19735,2786.362446,14318.565243,3.405213,7.622254,...,5.609728,8.473642,8.106887,7.373709,9.020568,9.066306,4.874384,4.4,1963,expansion
3,4095.196204,4.414647,7532.004475,8.437244,21659.152058,4.899706,2799.641949,14984.486701,4.650755,15.468995,...,2.797702,11.676708,11.905481,7.939609,9.473825,5.459057,5.533659,5.8,1964,expansion
4,4235.39882,3.423587,8202.112548,8.896809,22509.097404,3.924186,2787.438778,15645.230386,4.409518,14.197889,...,3.268024,5.819708,7.1,6.122798,7.318434,2.82353,2.142177,6.4,1965,expansion
5,4384.883315,3.529408,8026.876237,-2.136478,22525.051994,0.070881,2893.575825,16386.137772,4.735676,7.608404,...,5.984794,10.638562,6.096139,4.438386,11.993957,11.212815,1.5731,6.5,1966,expansion
6,4475.418941,2.064721,8161.602114,1.678435,23644.600491,4.970237,2934.987745,16591.426063,1.252817,-8.161498,...,7.178612,11.082142,5.854925,7.196523,9.079607,4.73251,2.786475,2.5,1967,expansion
7,4645.798899,3.807017,8429.868888,3.286938,24414.780136,3.257317,3138.785732,17141.473322,3.31525,-6.571452,...,6.544555,12.882468,9.423279,4.153373,13.165768,6.777996,5.441083,4.8,1968,expansion
8,4815.438894,3.651471,9108.496877,8.050279,25593.126397,4.826364,3349.531837,17763.148835,3.626733,13.77934,...,6.09806,12.477895,3.41862,4.715903,14.561367,4.081146,1.924097,3.1,1969,expansion
9,4902.100198,1.799655,9243.256579,1.479494,26894.506478,5.084881,3605.870486,18124.59034,2.034783,16.050399,...,5.268693,2.454958,6.502484,5.248661,10.052735,3.233509,6.317907,-0.283491,1970,expansion


In [74]:
wb_gdp_to_predict

Unnamed: 0,WLD_gdp_per_capita_constant_us$,WLD_gdp_per_capita_constant_%_growth,ARG_gdp_per_capita_constant_us$,ARG_gdp_per_capita_constant_%_growth,AUS_gdp_per_capita_constant_us$,AUS_gdp_per_capita_constant_%_growth,BRA_gdp_per_capita_constant_us$,CAN_gdp_per_capita_constant_us$,CAN_gdp_per_capita_constant_%_growth,CHN_gdp_per_capita_constant_%_growth,...,ITA_gdp_%_growth,JPN_gdp_%_growth,MEX_gdp_%_growth,ZAF_gdp_%_growth,KOR_gdp_%_growth,TUR_gdp_%_growth,GBR_gdp_%_growth,USA_gdp_%_growth,time,year_label
58,11019.375428,1.531947,12712.970738,-2.994388,58781.046657,0.571103,8622.066599,45109.244486,0.426918,5.575317,...,0.500234,-0.240351,-0.185907,0.113054,2.243978,0.889585,1.671944,2.28887,2019,expansion
59,10548.904223,-4.269491,11344.405742,-10.765108,58029.515526,-1.278526,8228.774263,42258.691017,-6.319222,1.996619,...,-9.025669,-4.506905,-8.167358,-6.431975,-0.852031,1.793551,-9.270411,-3.40459,2020,recession
60,11057.420754,4.820563,12390.808688,9.223956,58780.333061,1.293855,8551.205336,43945.55699,3.991761,8.013345,...,6.64379,1.620796,4.797192,4.914603,4.021158,10.986181,7.441273,5.671107,2021,expansion
61,11203.033754,2.270873,12361.024482,1.667638,59435.221629,1.724197,8592.317906,44377.128153,2.401821,7.775952,...,3.622954,2.103044,4.025119,3.694968,5.689092,4.744824,2.632161,3.605779,2022,expansion
62,11349.467439,1.925467,12332.858371,1.027657,60090.414653,1.807011,8632.453242,44806.346223,2.120703,7.687219,...,2.636593,2.41544,3.760614,3.186334,6.46644,4.820301,2.342422,3.136603,2023,expansion


In [75]:
# Label the years

wb_gdp ['time'] = wb_gdp ['time'].str.replace('YR', '')

pre_recession_years = ['1974', '1981','1990', '2008' ]
crisis_years = ['1975', '1982', '1991', '2009', '2020']
recovery_years = ['1976', '1977', '1983', '1984', '1992', '1993', '2010', '2011']
downturn_years = ['1958', '1998', '2001', '2012']

labels = []
for i in range(len(wb_gdp)):
    if wb_gdp['time'][i] in pre_recession_years:
        labels.append('pre_recession')
    
    elif wb_gdp ['time'][i] in crisis_years:
        labels.append('recession')

    elif wb_gdp['time'][i] in recovery_years:
        labels.append('recovery')

    elif wb_gdp ['time'][i] in downturn_years:
        labels.append('downturn')
        
    else:
        labels.append('expansion')

wb_gdp['year_label'] = labels

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wb_gdp ['time'] = wb_gdp ['time'].str.replace('YR', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wb_gdp['year_label'] = labels


In [76]:
X = wb_gdp.drop(['time', 'year_label'],axis=1)

Y = wb_gdp['year_label']

encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = pd.DataFrame(encoder.transform(Y))
X_train, X_test, y_train, y_test = train_test_split(X, encoded_Y, test_size=0.3, shuffle=None)

scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print(X.shape)

(61, 55)


In [77]:
# Build the model.
model = Sequential([ # as far as we know, all networks are sequential
    Dense(64, activation='sigmoid', input_shape=(55,)),
    Dense(32, activation='relu'),      # choosing relu instead of sigmoid, this is somewhat common
#     Dense(32, activation='sigmoid'),
#     Dense(16, activation='sigmoid'),
#     Dense(8, activation='sigmoid'), 
    Dense(5, activation='softmax'),   # the softmax actiavation is the last one to compensate for the high volume additions
])

# Compile the model.
model.compile(
  optimizer='adam', #here we could use stochastic gradient descent, but adam is a de facto standard
  loss='categorical_crossentropy', #this is how we create the original blame to play the blame game
  metrics=['accuracy'],
)

# Train the model.
history = model.fit(
  X_train,
  to_categorical(y_train), # just to make sure the outputs are not considered numeric (because, ya know, they are numbers...)
  epochs=20,
  batch_size=12, # send 32 images at a time before you tweak the network again, to make it faster
)
 
    # Evaluate the model.
model.evaluate(
  X_test,
  to_categorical(y_test)
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


[0.8855825662612915, 0.7368420958518982]

In [78]:
predictions = model.predict(X_test)
print(np.argmax(predictions, axis=1))

predictions

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


array([[0.09214434, 0.6225214 , 0.09952252, 0.10269777, 0.08311396],
       [0.00250312, 0.9091983 , 0.02466353, 0.0146426 , 0.04899244],
       [0.02651972, 0.7270423 , 0.04141045, 0.10000681, 0.1050206 ],
       [0.00490177, 0.897357  , 0.02519205, 0.02363337, 0.04891578],
       [0.01996132, 0.6932931 , 0.05005115, 0.05566149, 0.18103294],
       [0.01597294, 0.73842496, 0.06390724, 0.03317683, 0.14851801],
       [0.01673852, 0.83165425, 0.05266203, 0.02407875, 0.07486647],
       [0.01300393, 0.7377654 , 0.04233677, 0.05033065, 0.15656328],
       [0.003906  , 0.8943987 , 0.02509134, 0.01548908, 0.06111495],
       [0.09689139, 0.5303621 , 0.10184642, 0.1591352 , 0.11176495],
       [0.00333629, 0.89837474, 0.02164334, 0.01256252, 0.06408311],
       [0.00683217, 0.907285  , 0.02256753, 0.0291938 , 0.03412158],
       [0.04498187, 0.76585734, 0.02225717, 0.11117001, 0.05573365],
       [0.09715847, 0.6263015 , 0.09883775, 0.10475771, 0.0729446 ],
       [0.02963341, 0.5203088 , 0.

In [None]:
# Again, it can only predict mostly the expansion years

In [79]:
# I will drop currency values and keep only percentages

columns_percentage = ['time', 'year_label']

for column in wb_gdp.columns:
    if '%' in column:
        columns_percentage.append(column)

        
columns_percentage

['time',
 'year_label',
 'WLD_gdp_per_capita_constant_%_growth',
 'ARG_gdp_per_capita_constant_%_growth',
 'AUS_gdp_per_capita_constant_%_growth',
 'CAN_gdp_per_capita_constant_%_growth',
 'CHN_gdp_per_capita_constant_%_growth',
 'FRA_gdp_per_capita_constant_%_growth',
 'DEU_gdp_per_capita_constant_%_growth',
 'IND_gdp_per_capita_constant_%_growth',
 'IDN_gdp_per_capita_constant_%_growth',
 'ITA_gdp_per_capita_constant_%_growth',
 'JPN_gdp_per_capita_constant_%_growth',
 'MEX_gdp_per_capita_constant_%_growth',
 'RUS_gdp_per_capita_constant_%_growth',
 'ZAF_gdp_per_capita_constant_%_growth',
 'KOR_gdp_per_capita_constant_%_growth',
 'TUR_gdp_per_capita_constant_%_growth',
 'GBR_gdp_per_capita_constant_%_growth',
 'USA_gdp_per_capita_constant_%_growth',
 'SAU_gdp_per_capita_%_growth',
 'WLD_gdp_%_growth',
 'ARG_gdp_%_growth',
 'AUS_gdp_%_growth',
 'BRA_gdp_%_growth',
 'CHN_gdp_%_growth',
 'FRA_gdp_%_growth',
 'IND_gdp_%_growth',
 'IDN_gdp_%_growth',
 'ITA_gdp_%_growth',
 'JPN_gdp_%_growt

In [80]:
gdp_percentage = wb_gdp[columns_percentage]
gdp_percentage

Unnamed: 0,time,year_label,WLD_gdp_per_capita_constant_%_growth,ARG_gdp_per_capita_constant_%_growth,AUS_gdp_per_capita_constant_%_growth,CAN_gdp_per_capita_constant_%_growth,CHN_gdp_per_capita_constant_%_growth,FRA_gdp_per_capita_constant_%_growth,DEU_gdp_per_capita_constant_%_growth,IND_gdp_per_capita_constant_%_growth,...,IND_gdp_%_growth,IDN_gdp_%_growth,ITA_gdp_%_growth,JPN_gdp_%_growth,MEX_gdp_%_growth,ZAF_gdp_%_growth,KOR_gdp_%_growth,TUR_gdp_%_growth,GBR_gdp_%_growth,USA_gdp_%_growth
0,1961,expansion,2.457848,3.728779,0.464273,1.119387,-26.527644,3.604901,1.734333,1.670482,...,3.722743,5.740646,8.207246,12.043536,5.0,3.844734,6.935993,1.156069,2.677119,2.3
1,1962,expansion,3.531222,-2.425843,-1.147839,5.445858,-6.351505,5.361747,1.734333,0.86077,...,2.931128,1.841978,6.20365,8.908973,4.664415,6.177931,3.895273,5.571429,1.10291,6.1
2,1963,expansion,3.038246,-6.78824,4.19735,3.405213,7.622254,4.751682,1.734333,3.836414,...,5.994353,-2.23703,5.609728,8.473642,8.106887,7.373709,9.020568,9.066306,4.874384,4.4
3,1964,expansion,4.414647,8.437244,4.899706,4.650755,15.468995,5.24837,1.734333,5.249912,...,7.45295,3.529698,2.797702,11.676708,11.905481,7.939609,9.473825,5.459057,5.533659,5.8
4,1965,expansion,3.423587,8.896809,3.924186,4.409518,14.197889,3.625023,1.734333,-4.640156,...,-2.63577,1.081589,3.268024,5.819708,7.1,6.122798,7.318434,2.82353,2.142177,6.4
5,1966,expansion,3.529408,-2.136478,0.070881,4.735676,7.608404,4.18061,1.734333,-2.116105,...,-0.055329,2.791347,5.984794,10.638562,6.096139,4.438386,11.993957,11.212815,1.5731,6.5
6,1967,expansion,2.064721,1.678435,4.970237,1.252817,-8.161498,4.000543,1.734333,5.594634,...,7.825963,1.380403,7.178612,11.082142,5.854925,7.196523,9.079607,4.73251,2.786475,2.5
7,1968,expansion,3.807017,3.286938,3.257317,3.31525,-6.571452,3.675154,1.734333,1.226223,...,3.387929,10.915179,6.544555,12.882468,9.423279,4.153373,13.165768,6.777996,5.441083,4.8
8,1969,expansion,3.651471,8.050279,4.826364,3.626733,13.77934,6.30696,1.734333,4.269855,...,6.5397,6.822298,6.09806,12.477895,3.41862,4.715903,14.561367,4.081146,1.924097,3.1
9,1970,expansion,1.799655,1.479494,5.084881,2.034783,16.050399,5.30014,1.734333,2.864361,...,5.15723,7.554635,5.268693,2.454958,6.502484,5.248661,10.052735,3.233509,6.317907,-0.283491


In [81]:
X = gdp_percentage.drop(['time', 'year_label'],axis=1)

Y = gdp_percentage['year_label']

encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = pd.DataFrame(encoder.transform(Y))
X_train, X_test, y_train, y_test = train_test_split(X, encoded_Y, test_size=0.35, shuffle=None)

scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print(X.shape)

(61, 35)


In [84]:
# Build the model.
model = Sequential([ # as far as we know, all networks are sequential
    Dense(128, activation='sigmoid', input_shape=(35,)),
    Dense(128, activation='sigmoid'),      # choosing relu instead of sigmoid, this is somewhat common
#     Dense(32, activation='sigmoid'),
#     Dense(16, activation='sigmoid'),
#     Dense(8, activation='sigmoid'), 
    Dense(5, activation='softmax'),   # the softmax actiavation is the last one to compensate for the high volume additions
])

# Compile the model.
model.compile(
  optimizer='adam', #here we could use stochastic gradient descent, but adam is a de facto standard
  loss='categorical_crossentropy', #this is how we create the original blame to play the blame game
  metrics=['accuracy'],
)

# Train the model.
history = model.fit(
  X_train,
  to_categorical(y_train), # just to make sure the outputs are not considered numeric (because, ya know, they are numbers...)
  epochs=20,
  batch_size=6, # send 32 images at a time before you tweak the network again, to make it faster
)
 
    # Evaluate the model.
model.evaluate(
  X_test,
  to_categorical(y_test)
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


[1.166574478149414, 0.5909090638160706]

In [85]:
predictions = model.predict(X_test)
print(np.argmax(predictions, axis=1))

predictions

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


array([[0.05332763, 0.7425    , 0.04033038, 0.04890316, 0.11493876],
       [0.11058173, 0.37563038, 0.0580908 , 0.30101356, 0.15468352],
       [0.09086144, 0.3464437 , 0.06109138, 0.27309737, 0.22850615],
       [0.017377  , 0.93066925, 0.01481876, 0.0052945 , 0.03184048],
       [0.06718933, 0.626393  , 0.05982855, 0.12848726, 0.11810175],
       [0.02207004, 0.86682206, 0.01847106, 0.00866187, 0.08397492],
       [0.07721409, 0.70190847, 0.04397579, 0.08552225, 0.09137934],
       [0.05553359, 0.66005754, 0.03392054, 0.04522375, 0.2052646 ],
       [0.09021376, 0.61306405, 0.05068747, 0.09260514, 0.15342961],
       [0.01752708, 0.88927126, 0.01460871, 0.00570449, 0.07288846],
       [0.09734059, 0.45910618, 0.05660617, 0.2503517 , 0.1365954 ],
       [0.03499655, 0.8321994 , 0.02225516, 0.01090044, 0.09964842],
       [0.04269451, 0.8003602 , 0.03159103, 0.02401378, 0.10134054],
       [0.02230938, 0.91136694, 0.01843221, 0.00708404, 0.04080731],
       [0.02519186, 0.86635673, 0.

In [86]:
y_test

Unnamed: 0,0
18,1
47,2
14,3
1,1
13,2
49,4
19,1
33,1
55,1
23,4


In [87]:
# Removing the columns in dolar seems to have improved the predictions
# I will upsample again and downsample again


category_expansion = gdp_percentage[gdp_percentage['year_label']=='expansion']
category_pre_recession = gdp_percentage[gdp_percentage['year_label']=='pre_recession']
category_recession = gdp_percentage[gdp_percentage['year_label']=='recession']
category_recovery = gdp_percentage[gdp_percentage['year_label']=='recovery']
category_downturn = gdp_percentage[gdp_percentage['year_label']=='downturn']

category_pre_recession_oversampled = resample(category_pre_recession,replace=True, n_samples = len(category_expansion))
category_recession_oversampled = resample(category_recession,replace=True, n_samples = len(category_expansion))
category_recovery_oversampled = resample(category_recovery,replace=True, n_samples = len(category_expansion))
category_downturn_oversampled = resample(category_downturn,replace=True, n_samples = len(category_expansion))

gdp_percentage_upsampled = pd.concat([category_expansion,
                                      category_pre_recession,
                                      category_recession_oversampled,
                                      category_recovery_oversampled,
                                      category_downturn_oversampled], axis=0)

gdp_percentage_upsampled.shape

(168, 37)

In [88]:
X = gdp_percentage_upsampled.drop(['time', 'year_label'],axis=1)

Y = gdp_percentage_upsampled['year_label']

encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = pd.DataFrame(encoder.transform(Y))
X_train, X_test, y_train, y_test = train_test_split(X, encoded_Y, test_size=0.35, shuffle=None)

scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print(X.shape)

(168, 35)


In [22]:
# 1: Expansion, 2: pre_recession , 3:recession, 4:recovery , 0: downturn

In [89]:
# Build the model.
model = Sequential([ # as far as we know, all networks are sequential
    Dense(128, activation='sigmoid', input_shape=(35,)),
    Dense(128, activation='sigmoid'),      # choosing relu instead of sigmoid, this is somewhat common
#     Dense(32, activation='sigmoid'),
#     Dense(16, activation='sigmoid'),
#     Dense(8, activation='sigmoid'), 
    Dense(5, activation='softmax'),   # the softmax actiavation is the last one to compensate for the high volume additions
])

# Compile the model.
model.compile(
  optimizer='adam', #here we could use stochastic gradient descent, but adam is a de facto standard
  loss='categorical_crossentropy', #this is how we create the original blame to play the blame game
  metrics=['accuracy'],
)

# Train the model.
history = model.fit(
  X_train,
  to_categorical(y_train), # just to make sure the outputs are not considered numeric (because, ya know, they are numbers...)
  epochs=20,
  batch_size=6, # send 32 images at a time before you tweak the network again, to make it faster
)
 
    # Evaluate the model.
model.evaluate(
  X_test,
  to_categorical(y_test)
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


[0.45452171564102173, 0.8644067645072937]

In [90]:
predictions = model.predict(X_test)
print(np.argmax(predictions, axis=1))

[0 3 4 3 0 0 4 3 3 4 1 3 4 0 3 0 0 1 0 1 3 1 3 0 4 3 0 3 3 3 4 4 1 1 3 0 4
 4 0 0 4 3 0 0 1 3 0 0 3 0 3 0 1 0 0 4 0 1 1]


In [91]:
predictions_array =np.argmax(predictions, axis=1)
y_test_df = pd.DataFrame(y_test).reset_index(drop=True)
y_test_df.columns= ['y_test']
predictions_df = pd.DataFrame(predictions_array)
predictions_df.columns= ['predicted_y']
predictions_df = pd.concat([predictions_df,y_test_df], axis=1)
pd.set_option('display.max_rows',None)
predictions_df

Unnamed: 0,predicted_y,y_test
0,0,0
1,3,3
2,4,4
3,3,3
4,0,0
5,0,2
6,4,4
7,3,3
8,3,3
9,4,4


In [92]:
# This model has a much better accuracy! Will be used for predictions
model_upsampled = model 
scaler_upsampled  = scaler

filename = 'model_gdp_upsampled.sav'
pickle.dump(model, open(filename, 'wb'))

filename= 'scaler_gdp_upsample.sav'
pickle.dump(scaler, open(filename, 'wb'))

INFO:tensorflow:Assets written to: C:\Users\luana\AppData\Local\Temp\tmp2hm_vh3x\assets


In [93]:
# Just in case I will also downsample
gdp_percentage['year_label'].value_counts()

expansion        41
recovery          8
recession         5
pre_recession     4
downturn          3
Name: year_label, dtype: int64

In [94]:
category_expansion = gdp_percentage[gdp_percentage['year_label']=='expansion']
other_categories = gdp_percentage[gdp_percentage['year_label']!='expansion']
category_category_expansion_undersampled = resample(category_expansion,replace=True, n_samples = 8)

gdp_percentage_undersampled = pd.concat([other_categories, category_category_expansion_undersampled], axis=0)

gdp_percentage_undersampled.shape

(28, 37)

In [95]:
X = gdp_percentage_undersampled.drop(['time', 'year_label'],axis=1)

Y = gdp_percentage_undersampled['year_label']

encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = pd.DataFrame(encoder.transform(Y))
X_train, X_test, y_train, y_test = train_test_split(X, encoded_Y, test_size=0.15, shuffle=None)

scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print(X.shape)

(28, 35)


In [96]:
# Build the model.
model = Sequential([ # as far as we know, all networks are sequential
    Dense(128, activation='sigmoid', input_shape=(35,)),
    Dense(128, activation='sigmoid'),      # choosing relu instead of sigmoid, this is somewhat common
    Dense(32, activation='sigmoid'),
    Dense(16, activation='sigmoid'),
    Dense(8, activation='sigmoid'), 
    Dense(5, activation='softmax'),   # the softmax actiavation is the last one to compensate for the high volume additions
])

# Compile the model.
model.compile(
  optimizer='adam', #here we could use stochastic gradient descent, but adam is a de facto standard
  loss='categorical_crossentropy', #this is how we create the original blame to play the blame game
  metrics=['accuracy'],
)

# Train the model.
history = model.fit(
  X_train,
  to_categorical(y_train), # just to make sure the outputs are not considered numeric (because, ya know, they are numbers...)
  epochs=20,
  batch_size=6, # send 32 images at a time before you tweak the network again, to make it faster
)
 
    # Evaluate the model.
model.evaluate(
  X_test,
  to_categorical(y_test)
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


[1.5696659088134766, 0.4000000059604645]

In [97]:
predictions = model.predict(X_test)
print(np.argmax(predictions, axis=1))

[1 1 1 1 1]


In [98]:
y_test

Unnamed: 0,0
17,4
20,1
2,4
26,1
18,0


In [32]:
y_train

Unnamed: 0,0
21,1
0,2
26,1
8,2
2,4
20,1
12,0
19,3
14,2
25,1


In [None]:
# Not great... moving on with the upsampled model

In [99]:
wb_gdp_to_predict

Unnamed: 0,WLD_gdp_per_capita_constant_us$,WLD_gdp_per_capita_constant_%_growth,ARG_gdp_per_capita_constant_us$,ARG_gdp_per_capita_constant_%_growth,AUS_gdp_per_capita_constant_us$,AUS_gdp_per_capita_constant_%_growth,BRA_gdp_per_capita_constant_us$,CAN_gdp_per_capita_constant_us$,CAN_gdp_per_capita_constant_%_growth,CHN_gdp_per_capita_constant_%_growth,...,ITA_gdp_%_growth,JPN_gdp_%_growth,MEX_gdp_%_growth,ZAF_gdp_%_growth,KOR_gdp_%_growth,TUR_gdp_%_growth,GBR_gdp_%_growth,USA_gdp_%_growth,time,year_label
58,11019.375428,1.531947,12712.970738,-2.994388,58781.046657,0.571103,8622.066599,45109.244486,0.426918,5.575317,...,0.500234,-0.240351,-0.185907,0.113054,2.243978,0.889585,1.671944,2.28887,2019,expansion
59,10548.904223,-4.269491,11344.405742,-10.765108,58029.515526,-1.278526,8228.774263,42258.691017,-6.319222,1.996619,...,-9.025669,-4.506905,-8.167358,-6.431975,-0.852031,1.793551,-9.270411,-3.40459,2020,recession
60,11057.420754,4.820563,12390.808688,9.223956,58780.333061,1.293855,8551.205336,43945.55699,3.991761,8.013345,...,6.64379,1.620796,4.797192,4.914603,4.021158,10.986181,7.441273,5.671107,2021,expansion
61,11203.033754,2.270873,12361.024482,1.667638,59435.221629,1.724197,8592.317906,44377.128153,2.401821,7.775952,...,3.622954,2.103044,4.025119,3.694968,5.689092,4.744824,2.632161,3.605779,2022,expansion
62,11349.467439,1.925467,12332.858371,1.027657,60090.414653,1.807011,8632.453242,44806.346223,2.120703,7.687219,...,2.636593,2.41544,3.760614,3.186334,6.46644,4.820301,2.342422,3.136603,2023,expansion


In [100]:
X = wb_gdp_to_predict.drop('time', axis=1)
X = wb_gdp_to_predict.drop([column for column in wb_gdp_to_predict.columns if '%' not in column], axis=1)
X

Unnamed: 0,WLD_gdp_per_capita_constant_%_growth,ARG_gdp_per_capita_constant_%_growth,AUS_gdp_per_capita_constant_%_growth,CAN_gdp_per_capita_constant_%_growth,CHN_gdp_per_capita_constant_%_growth,FRA_gdp_per_capita_constant_%_growth,DEU_gdp_per_capita_constant_%_growth,IND_gdp_per_capita_constant_%_growth,IDN_gdp_per_capita_constant_%_growth,ITA_gdp_per_capita_constant_%_growth,...,IND_gdp_%_growth,IDN_gdp_%_growth,ITA_gdp_%_growth,JPN_gdp_%_growth,MEX_gdp_%_growth,ZAF_gdp_%_growth,KOR_gdp_%_growth,TUR_gdp_%_growth,GBR_gdp_%_growth,USA_gdp_%_growth
58,1.531947,-2.994388,0.571103,0.426918,5.575317,1.620358,0.827865,2.69209,3.87256,1.665737,...,3.737919,5.019288,0.500234,-0.240351,-0.185907,0.113054,2.243978,0.889585,1.671944,2.28887
59,-4.269491,-10.765108,-1.278526,-6.319222,1.996619,-8.034379,-4.647545,-7.515675,-3.102652,-8.597873,...,-6.596081,-2.065005,-9.025669,-4.506905,-8.167358,-6.431975,-0.852031,1.793551,-9.270411,-3.40459
60,4.820563,9.223956,1.293855,3.991761,8.013345,6.774307,2.931698,7.899879,2.626357,7.33584,...,8.947963,3.69124,6.64379,1.620796,4.797192,4.914603,4.021158,10.986181,7.441273,5.671107
61,2.270873,1.667638,1.724197,2.401821,7.775952,3.252823,1.871355,3.517629,2.978407,3.338313,...,5.110389,4.610545,3.622954,2.103044,4.025119,3.694968,5.689092,4.744824,2.632161,3.605779
62,1.925467,1.027657,1.807011,2.120703,7.687219,2.359089,1.752003,3.22011,3.093034,2.282223,...,5.112045,4.921215,2.636593,2.41544,3.760614,3.186334,6.46644,4.820301,2.342422,3.136603


In [101]:
# Loading the model

filename = 'model_gdp_upsampled.sav'
loaded_model  = pickle.load(open(filename, 'rb'))


filename= 'scaler_gdp_upsample.sav'
loaded_scaler = pickle.load(open(filename, 'rb'))




X_scaled = loaded_scaler.transform(X)

predictions = loaded_model.predict(X_scaled)
print(np.argmax(predictions, axis=1))



[0 3 1 1 1]


In [68]:
# 1: Expansion, 2: pre_recession , 3:recession, 4:recovery , 0: downturn

# Conclusion:

#### This model has predicted expansion for 2022 and 2023

# World Bank: data from 1981

In [107]:
pd.set_option('display.max_rows', 10)
wb_1981 = pd.read_csv(r'C:\Users\luana\Ironhack DA\Unit 9\final_bootcamp_project\csv_files\wb_from_1981_predictions.csv')
wb_1981

Unnamed: 0,time,WLD_gdp_per_capita_constant_us$,WLD_gdp_per_capita_constant_%_growth,WLD_consumer_price_index_%,WLD_unemployment_%_of_total_labor_force,ARG_gdp_per_capita_constant_us$,ARG_gdp_per_capita_constant_%_growth,ARG_unemployment_%_of_total_labor_force,ARG_current_account_balance_us$,ARG_industry_value_added_%_growth,...,IND_gdp_%_growth,IDN_gdp_%_growth,ITA_gdp_%_growth,JPN_gdp_%_growth,MEX_gdp_%_growth,ZAF_gdp_%_growth,KOR_gdp_%_growth,TUR_gdp_%_growth,GBR_gdp_%_growth,USA_gdp_%_growth
0,YR1981,5931.823617,0.166623,12.471612,5.697886,9630.115178,-6.668498,10.977871,-4.712000e+09,-10.663334,...,6.006204,7.878788,0.844228,4.260624,8.525607,5.360791,7.246176,4.856649,-0.787744,2.537719
1,YR1982,5849.893784,-1.381191,10.240268,5.697886,9407.874641,-2.307766,10.977871,-2.353000e+09,-3.209381,...,3.475733,2.247191,0.413586,3.279743,-0.520808,-0.383419,8.338078,3.563228,1.994891,-1.802887
2,YR1983,5899.646053,0.850482,8.771147,5.697886,9660.192130,2.681982,10.977871,-2.436000e+09,5.377688,...,7.288893,4.395604,1.169203,3.630199,-3.486422,-1.846558,13.376174,4.971081,4.221856,4.583913
3,YR1984,6069.455639,2.878301,8.116398,5.697886,9655.183649,-0.051847,10.977871,-2.495000e+09,0.674286,...,3.820738,6.842105,3.225852,4.410880,3.410814,5.099152,10.551640,6.712016,2.269105,7.236633
4,YR1985,6185.809777,1.917044,6.856812,5.697886,9009.001729,-6.692591,10.977871,-9.520000e+08,-8.651012,...,5.254299,2.463054,2.798086,5.159808,2.187693,-1.211541,7.838864,4.241336,4.147415,4.169656
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38,YR2019,11019.375428,1.531947,2.186902,5.356915,12712.970738,-2.994388,9.840000,-3.709999e+09,-4.790141,...,3.737919,5.019288,0.500234,-0.240351,-0.185907,0.113054,2.243978,0.889585,1.671944,2.288870
39,YR2020,10548.904223,-4.269491,1.920968,6.573234,11344.405742,-10.765108,11.460000,3.312751e+09,-9.369053,...,-6.596081,-2.065005,-9.025669,-4.506905,-8.167358,-6.431975,-0.852031,1.793551,-9.270411,-3.404590
40,YR2021,11057.420754,4.820563,3.423629,6.178060,12390.808688,9.223956,10.902000,6.800104e+09,15.153015,...,8.947963,3.691240,6.643790,1.620796,4.797192,4.914603,4.021158,10.986181,7.441273,5.671107
41,YR2022,11218.990570,1.396958,3.652210,5.973768,12379.876780,2.364294,10.913762,4.343408e+09,3.556215,...,5.806968,4.459982,1.220023,1.594004,1.796818,2.424469,5.108022,4.139784,2.094987,3.099033


In [108]:
wb_1981 ['time'] = wb_1981 ['time'].str.replace('YR', '')
percentage_columns_1981 = ['time']
for column in wb_1981.columns:
    if "%" in column:
        percentage_columns_1981.append(column)

wb_1981  = wb_1981[percentage_columns_1981]
wb_1981_to_predict = wb_1981[-5:]
wb_1981 = wb_1981[:-2]

In [105]:
wb_1981

Unnamed: 0,time,WLD_gdp_per_capita_constant_%_growth,WLD_consumer_price_index_%,WLD_unemployment_%_of_total_labor_force,ARG_gdp_per_capita_constant_%_growth,ARG_unemployment_%_of_total_labor_force,ARG_industry_value_added_%_growth,AUS_gdp_per_capita_constant_%_growth,AUS_consumer_price_index_%,AUS_unemployment_%_of_total_labor_force,...,IND_gdp_%_growth,IDN_gdp_%_growth,ITA_gdp_%_growth,JPN_gdp_%_growth,MEX_gdp_%_growth,ZAF_gdp_%_growth,KOR_gdp_%_growth,TUR_gdp_%_growth,GBR_gdp_%_growth,USA_gdp_%_growth
0,1981,0.166623,12.471612,5.697886,-6.668498,10.977871,-10.663334,1.711915,9.487666,6.519742,...,6.006204,7.878788,0.844228,4.260624,8.525607,5.360791,7.246176,4.856649,-0.787744,2.537719
1,1982,-1.381191,10.240268,5.697886,-2.307766,10.977871,-3.209381,1.614208,11.351820,6.519742,...,3.475733,2.247191,0.413586,3.279743,-0.520808,-0.383419,8.338078,3.563228,1.994891,-1.802887
2,1983,0.850482,8.771147,5.697886,2.681982,10.977871,5.377688,-3.437117,10.038911,6.519742,...,7.288893,4.395604,1.169203,3.630199,-3.486422,-1.846558,13.376174,4.971081,4.221856,4.583913
3,1984,2.878301,8.116398,5.697886,-0.051847,10.977871,0.674286,3.418345,3.960396,6.519742,...,3.820738,6.842105,3.225852,4.410880,3.410814,5.099152,10.551640,6.712016,2.269105,7.236633
4,1985,1.917044,6.856812,5.697886,-6.692591,10.977871,-8.651012,3.822749,6.734694,6.519742,...,5.254299,2.463054,2.798086,5.159808,2.187693,-1.211541,7.838864,4.241336,4.147415,4.169656
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36,2017,2.200971,2.192010,5.557650,1.757648,8.350000,2.302833,0.587161,1.948647,5.590000,...,6.795383,5.069786,1.667859,1.675332,2.113129,1.157947,3.159636,7.501997,2.134453,2.255680
37,2018,2.144471,2.438737,5.389680,-3.601610,9.220000,-3.049315,1.302841,1.911401,5.300000,...,6.453851,5.174292,0.925811,0.584068,2.194995,1.487617,2.907404,2.979885,1.650925,2.918857
38,2019,1.531947,2.186902,5.356915,-2.994388,9.840000,-4.790141,0.571103,1.610768,5.160000,...,3.737919,5.019288,0.500234,-0.240351,-0.185907,0.113054,2.243978,0.889585,1.671944,2.288870
39,2020,-4.269491,1.920968,6.573234,-10.765108,11.460000,-9.369053,-1.278526,0.846906,6.460000,...,-6.596081,-2.065005,-9.025669,-4.506905,-8.167358,-6.431975,-0.852031,1.793551,-9.270411,-3.404590


In [109]:
wb_1981_to_predict

Unnamed: 0,time,WLD_gdp_per_capita_constant_%_growth,WLD_consumer_price_index_%,WLD_unemployment_%_of_total_labor_force,ARG_gdp_per_capita_constant_%_growth,ARG_unemployment_%_of_total_labor_force,ARG_industry_value_added_%_growth,AUS_gdp_per_capita_constant_%_growth,AUS_consumer_price_index_%,AUS_unemployment_%_of_total_labor_force,...,IND_gdp_%_growth,IDN_gdp_%_growth,ITA_gdp_%_growth,JPN_gdp_%_growth,MEX_gdp_%_growth,ZAF_gdp_%_growth,KOR_gdp_%_growth,TUR_gdp_%_growth,GBR_gdp_%_growth,USA_gdp_%_growth
38,2019,1.531947,2.186902,5.356915,-2.994388,9.84,-4.790141,0.571103,1.610768,5.16,...,3.737919,5.019288,0.500234,-0.240351,-0.185907,0.113054,2.243978,0.889585,1.671944,2.28887
39,2020,-4.269491,1.920968,6.573234,-10.765108,11.46,-9.369053,-1.278526,0.846906,6.46,...,-6.596081,-2.065005,-9.025669,-4.506905,-8.167358,-6.431975,-0.852031,1.793551,-9.270411,-3.40459
40,2021,4.820563,3.423629,6.17806,9.223956,10.902,15.153015,1.293855,2.86391,5.112,...,8.947963,3.69124,6.64379,1.620796,4.797192,4.914603,4.021158,10.986181,7.441273,5.671107
41,2022,1.396958,3.65221,5.973768,2.364294,10.913762,3.556215,1.537749,2.939112,5.225794,...,5.806968,4.459982,1.220023,1.594004,1.796818,2.424469,5.108022,4.139784,2.094987,3.099033
42,2023,1.585341,3.827234,5.859256,1.146021,10.923457,1.851687,1.584836,2.995278,5.327838,...,5.831424,4.650495,1.061703,1.582481,1.986996,1.988976,5.592555,4.797864,2.161667,2.71173


In [111]:
# Label the years

pre_crisis_years = ['1974', '1981', '1990', '2008','2019']
crisis_years = ['1975', '1982', '1991', '2009', '2020']
recovery_years = ['1976', '1977', '1983', '1984', '1992', '1993', '2010', '2011']
downturn_years = ['1958', '1998', '2001', '2012']

labels = []

for i in range(len(wb_1981)):
    
    if wb_1981['time'][i] in pre_crisis_years:
        labels.append('pre_recession')
        
    elif wb_1981['time'][i] in crisis_years:
        labels.append('recession')

    elif wb_1981['time'][i] in recovery_years:
        labels.append('recovery')

    elif wb_1981['time'][i] in downturn_years:
        labels.append('downturn')
        
    else:
        labels.append('expansion')

wb_1981['year_label']= labels
wb_1981

Unnamed: 0,time,WLD_gdp_per_capita_constant_%_growth,WLD_consumer_price_index_%,WLD_unemployment_%_of_total_labor_force,ARG_gdp_per_capita_constant_%_growth,ARG_unemployment_%_of_total_labor_force,ARG_industry_value_added_%_growth,AUS_gdp_per_capita_constant_%_growth,AUS_consumer_price_index_%,AUS_unemployment_%_of_total_labor_force,...,IDN_gdp_%_growth,ITA_gdp_%_growth,JPN_gdp_%_growth,MEX_gdp_%_growth,ZAF_gdp_%_growth,KOR_gdp_%_growth,TUR_gdp_%_growth,GBR_gdp_%_growth,USA_gdp_%_growth,year_label
0,1981,0.166623,12.471612,5.697886,-6.668498,10.977871,-10.663334,1.711915,9.487666,6.519742,...,7.878788,0.844228,4.260624,8.525607,5.360791,7.246176,4.856649,-0.787744,2.537719,pre_recession
1,1982,-1.381191,10.240268,5.697886,-2.307766,10.977871,-3.209381,1.614208,11.351820,6.519742,...,2.247191,0.413586,3.279743,-0.520808,-0.383419,8.338078,3.563228,1.994891,-1.802887,recession
2,1983,0.850482,8.771147,5.697886,2.681982,10.977871,5.377688,-3.437117,10.038911,6.519742,...,4.395604,1.169203,3.630199,-3.486422,-1.846558,13.376174,4.971081,4.221856,4.583913,recovery
3,1984,2.878301,8.116398,5.697886,-0.051847,10.977871,0.674286,3.418345,3.960396,6.519742,...,6.842105,3.225852,4.410880,3.410814,5.099152,10.551640,6.712016,2.269105,7.236633,recovery
4,1985,1.917044,6.856812,5.697886,-6.692591,10.977871,-8.651012,3.822749,6.734694,6.519742,...,2.463054,2.798086,5.159808,2.187693,-1.211541,7.838864,4.241336,4.147415,4.169656,expansion
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36,2017,2.200971,2.192010,5.557650,1.757648,8.350000,2.302833,0.587161,1.948647,5.590000,...,5.069786,1.667859,1.675332,2.113129,1.157947,3.159636,7.501997,2.134453,2.255680,expansion
37,2018,2.144471,2.438737,5.389680,-3.601610,9.220000,-3.049315,1.302841,1.911401,5.300000,...,5.174292,0.925811,0.584068,2.194995,1.487617,2.907404,2.979885,1.650925,2.918857,expansion
38,2019,1.531947,2.186902,5.356915,-2.994388,9.840000,-4.790141,0.571103,1.610768,5.160000,...,5.019288,0.500234,-0.240351,-0.185907,0.113054,2.243978,0.889585,1.671944,2.288870,pre_recession
39,2020,-4.269491,1.920968,6.573234,-10.765108,11.460000,-9.369053,-1.278526,0.846906,6.460000,...,-2.065005,-9.025669,-4.506905,-8.167358,-6.431975,-0.852031,1.793551,-9.270411,-3.404590,recession


In [112]:
category_expansion = wb_1981[wb_1981['year_label']=='expansion']
category_pre_recession = wb_1981[wb_1981['year_label']=='pre_recession']
category_recession = wb_1981[wb_1981['year_label']=='recession']
category_recovery = wb_1981[wb_1981['year_label']=='recovery']
category_downturn = wb_1981[wb_1981['year_label']=='downturn']

category_pre_recession_oversampled = resample(category_pre_recession,replace=True, n_samples = len(category_expansion))
category_recession_oversampled = resample(category_recession,replace=True, n_samples = len(category_expansion))
category_recovery_oversampled = resample(category_recovery,replace=True, n_samples = len(category_expansion))
category_downturn_oversampled = resample(category_downturn,replace=True, n_samples = len(category_expansion))

wb_1981_upsampled = pd.concat([category_expansion,
                               category_pre_recession_oversampled,
                              category_recession_oversampled,
                              category_recovery_oversampled,
                              category_downturn_oversampled], axis=0)

X = wb_1981_upsampled.drop(['time', 'year_label'],axis=1)

Y = wb_1981_upsampled['year_label']

encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = pd.DataFrame(encoder.transform(Y))
X_train, X_test, y_train, y_test = train_test_split(X, encoded_Y, test_size=0.35, shuffle=None)

scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print(X.shape)

(120, 104)


In [113]:
encoded_Y

Unnamed: 0,0
0,1
1,1
2,1
3,1
4,1
...,...
115,0
116,0
117,0
118,0


In [95]:
# 1: Expansion, 2: pre_recession , 3:recession, 4:recovery , 0: downturn

In [114]:
# Build the model.
model = Sequential([ # as far as we know, all networks are sequential
    Dense(128, activation='sigmoid', input_shape=(104,)),
    Dense(64, activation='sigmoid'),      # choosing relu instead of sigmoid, this is somewhat common
    Dense(32, activation='sigmoid'),
    Dense(16, activation='sigmoid'),
    Dense(5, activation='softmax'),   # the softmax actiavation is the last one to compensate for the high volume additions
])

# Compile the model.
model.compile(
  optimizer='adam', #here we could use stochastic gradient descent, but adam is a de facto standard
  loss='categorical_crossentropy', #this is how we create the original blame to play the blame game
  metrics=['accuracy'],
)

# Train the model.
history = model.fit(
  X_train,
  to_categorical(y_train), # just to make sure the outputs are not considered numeric (because, ya know, they are numbers...)
  epochs=50,
  batch_size=6, # send 32 images at a time before you tweak the network again, to make it faster
)
 
    # Evaluate the model.
model.evaluate(
  X_test,
  to_categorical(y_test)
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


[0.2720673084259033, 0.9523809552192688]

In [115]:
predictions = model.predict(X_test)
print(np.argmax(predictions, axis=1))

[2 3 3 3 1 4 4 1 2 0 0 2 0 2 3 2 0 1 4 0 1 1 0 3 3 3 3 3 4 1 4 0 3 4 0 2 0
 0 4 0 4 2]


In [116]:
y_test

Unnamed: 0,0
45,2
60,3
93,4
66,3
15,1
...,...
109,0
74,4
111,0
85,4


In [117]:
# Also here the accuracy is very good!


model_1981_upsampled = model 
scaler_1981_upsampled  = scaler

filename = 'model_1981_upsampled.sav'
pickle.dump(model_1981_upsampled, open(filename, 'wb'))

filename= 'scaler_1981_upsampled.sav'
pickle.dump(scaler_1981_upsampled, open(filename, 'wb'))


INFO:tensorflow:Assets written to: C:\Users\luana\AppData\Local\Temp\tmpv6ubu74l\assets


In [118]:
X = wb_1981_to_predict.drop(['time'], axis=1)
X

Unnamed: 0,WLD_gdp_per_capita_constant_%_growth,WLD_consumer_price_index_%,WLD_unemployment_%_of_total_labor_force,ARG_gdp_per_capita_constant_%_growth,ARG_unemployment_%_of_total_labor_force,ARG_industry_value_added_%_growth,AUS_gdp_per_capita_constant_%_growth,AUS_consumer_price_index_%,AUS_unemployment_%_of_total_labor_force,AUS_government_debt_total_%_of_gdp,...,IND_gdp_%_growth,IDN_gdp_%_growth,ITA_gdp_%_growth,JPN_gdp_%_growth,MEX_gdp_%_growth,ZAF_gdp_%_growth,KOR_gdp_%_growth,TUR_gdp_%_growth,GBR_gdp_%_growth,USA_gdp_%_growth
38,1.531947,2.186902,5.356915,-2.994388,9.84,-4.790141,0.571103,1.610768,5.16,60.259846,...,3.737919,5.019288,0.500234,-0.240351,-0.185907,0.113054,2.243978,0.889585,1.671944,2.28887
39,-4.269491,1.920968,6.573234,-10.765108,11.46,-9.369053,-1.278526,0.846906,6.46,69.364914,...,-6.596081,-2.065005,-9.025669,-4.506905,-8.167358,-6.431975,-0.852031,1.793551,-9.270411,-3.40459
40,4.820563,3.423629,6.17806,9.223956,10.902,15.153015,1.293855,2.86391,5.112,30.393233,...,8.947963,3.69124,6.64379,1.620796,4.797192,4.914603,4.021158,10.986181,7.441273,5.671107
41,1.396958,3.65221,5.973768,2.364294,10.913762,3.556215,1.537749,2.939112,5.225794,30.393233,...,5.806968,4.459982,1.220023,1.594004,1.796818,2.424469,5.108022,4.139784,2.094987,3.099033
42,1.585341,3.827234,5.859256,1.146021,10.923457,1.851687,1.584836,2.995278,5.327838,30.393233,...,5.831424,4.650495,1.061703,1.582481,1.986996,1.988976,5.592555,4.797864,2.161667,2.71173


In [119]:
# load the model from disk

filename = 'model_1981_upsampled.sav'
loaded_model = pickle.load(open(filename, 'rb'))

filename= 'scaler_1981_upsampled.sav'
loaded_scaler = pickle.load(open(filename, 'rb'))

X_scaled = loaded_scaler.transform(X)

predictions = loaded_model.predict(X_scaled)
print(np.argmax(predictions, axis=1))

[2 3 1 1 1]


In [None]:
# 1: Expansion, 2: pre_recession , 3:recession, 4:recovery , 0: downturn

# Conclusion

### Both models predicted expansion for 2022 and 2022.