In [27]:
#coding utf8
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import tensorflow as tf
import keras_tuner as kt
from scipy.io import loadmat
from tensorflow import keras
from tensorflow.keras import layers, metrics
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn import svm
import statsmodels.api as sm
from pylab import rcParams
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA
from sklearn.feature_selection import mutual_info_regression, f_regression, r_regression
import sklearn
from mpl_toolkits.mplot3d import Axes3D
#from PyIF import te_compute as te
#from neuralprophet import NeuralProphet

In [76]:
'''
Functions for loading in and combining data
'''

def load_df(file_name, sheet = 'Sheet 1'):
    '''
    Load a file into a data frame that is quarterly from the Eurostat website
    '''
    df = pd.read_excel(file_name, sheet_name=sheet)
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
    df.replace(':', np.nan, inplace=True)
    df.index = df.iloc[:, 0]
    df.drop(df.columns[[0]], axis=1, inplace=True)
    for i in df.columns[:]:
        df[i] = df[i].apply(pd.to_numeric, errors="coerce")
    df.interpolate(method='linear', inplace=True, axis = 1)
    df.sort_index(inplace=True)
    df = df.rename({'Germany (until 1990 former territory of the FRG)': 'Germany'})
    return df

def create_per_employeer(GDP_df, HW_df, employees_df):
    ''' 
    Inputs: GDP, hours worked, and employees dataframes
    Output: dataframe with per-employee GDP and per hour worked GDP
    '''
    cols =(list(set(GDP_df.columns) and set(employees_df.columns) and set(HW_df.columns)))
    cols.sort()
    idx = cols.pop()
    per_employee_df = pd.DataFrame(index=GDP_df.index, columns=cols)
    per_HW_df = pd.DataFrame(index=GDP_df.index, columns=cols)
    for i in cols:
        per_employee_df[i] = GDP_df[i]/employees_df[i]
        per_HW_df[i] = per_employee_df[i]/HW_df[i]
    return per_employee_df, per_HW_df

def country_code_to_name(df):
    '''
    Renames country codes to country names
    '''
    df = df.rename({'AUT': 'Austria', 'BEL': 'Belgium', 'DEU': 'Germany','EST': 'Estonia','FIN': 'Finland','ESP': 'Spain','FRA': 'France','GRC': 'Greece','IRL': 'Ireland','ITA': 'Italy','LTU': 'Lithuania','LUX': 'Luxembourg','LVA': 'Latvia','PRT': 'Portugal','NLD': 'Netherlands','SVK': 'Slovakia','SVN': 'Slovenia','ISL': 'Iceland','GBR': 'United Kingdom','CHE': 'Switzerland','CZE': 'Cezch Republic','DNK': 'Denmark','HUN': 'Hungary','NOR': 'Norway','POL': 'Poland','SWE': 'Sweden'})
    return df

def yearly_to_quarterly(df):
    '''
    Inputs: df
    Converts the data from yearly to quarterly
    Outputs: quarterly_df
    '''
    data = df.to_numpy(dtype=np.float64)
    data = np.repeat(data, 4, axis = 1)
    quarterly_df = pd.DataFrame(data)
    quarterly_df.index = df.index
    quarterly_df.columns = GDP_df.columns[(int(df.columns[0]) - 1975)*4:(int(df.columns[-1]) - 2021)*4]
    return quarterly_df

def match_df(dfs):
    ''' 
    Input : dataframes
    This function matches dataframes by their index and column 
    Output: matched dataframes
    '''
    idx = set(dfs[0].index)
    cols = set(dfs[0].columns)
    for i in dfs:
        idx = idx.intersection(set(i.index))
        cols = cols.intersection(set(i.columns))
    cols = list(cols)
    idx = list(idx)
    cols.sort()
    idx.sort()
    for i in range(len(dfs)):
        dfs[i] = dfs[i].loc[idx]
        dfs[i] = dfs[i][cols]
    return dfs

def to_datetime(df):
    df.columns = [w.replace('-Q1','-01') for w in df.columns]
    df.columns = [w.replace('-Q2','-04') for w in df.columns]
    df.columns = [w.replace('-Q3', '-07') for w in df.columns]
    df.columns = [w.replace('-Q4', '-10') for w in df.columns]
    for i in df.columns[:]:
        i = np.datetime64(i)
    df.columns = pd.to_datetime(df.columns)
    return df

def extend_df(dfs):
    idx = set(dfs[0].index)
    cols = set(dfs[0].columns)
    for i in dfs:
        idx = idx.union(set(i.index))
        cols = cols.union(set(i.columns))
    cols = list(cols)
    idx = list(idx)
    cols.sort()
    idx.sort()
    for i in range(len(dfs)):
        dfs[i] = dfs[i].reindex(index=idx, columns=cols)
        dfs[i] = dfs[i].loc[idx]
        dfs[i] = dfs[i][cols]
        dfs[i] = to_datetime(dfs[i])
        dfs[i].interpolate(method='polynomial', order = 5, inplace = True, limit = 100, limit_direction = 'both', axis = 1)
        dfs[i].fillna(0)
    return dfs


'''
Functions for RNN time series prediction
'''

def process_data(X, y, split_point, time_step, data_memory):
    '''
    Inputs: X, y, split_point, time_step, data_memory
    Splits data into training and testing data
    Outputs: X_train, X_test, y_train, y_test
    '''
    X_norm, X_attrs, y_norm, y_attrs = normalise(X, y)
    split_point = int(X_norm.shape[0] *split_point)
    y_norm = np.roll(y_norm, -time_step)
    nrows = X_norm.shape[0]
    samples = X_norm.shape[1]
    X_norm = np.repeat(X_norm, data_memory, 0).reshape(nrows, data_memory, samples)
    x_train, x_test, y_train, y_test = train_test_split(X_norm, y_norm, split_point)
    return x_train, x_test, y_train, y_test, nrows, samples, X_norm, y_norm, X_attrs, y_attrs

def normalise(X, y):
    '''
    Inputs: X, y
    Normalises the data by subtracting the mean and dividing by the standard deviation
    Outputs : X_norm, y_norm, X_attrs, y_attrs
    '''
    X_attrs = np.zeros((X.shape[-1], 2))
    y_attrs = np.zeros((y.shape[-1], 2))
    X_norm = np.zeros(X.shape)
    y_norm = np.zeros(y.shape)
    for i in range(X.shape[-1]):
        X_attrs[i, :] = [np.mean(X[:, i]), np.var(X[:, i])]
        X_norm[:, i] = (X[:, i] - np.mean(X[:, i]))/np.var(X[:, i])**0.5
    for i in range(y.shape[-1]):
        y_attrs[i, :] = [np.mean(y[:, i]), np.var(y[:, i])]
        y_norm[:, i] = (y[:, i] - np.mean(y[:, i]))/np.var(y[:, i])**0.5 
    return X_norm, X_attrs, y_norm, y_attrs

def train_test_split(X, y, split_point):
    '''
    Splits the data into training and testing data
    '''
    x_train = X[:split_point, :, :]
    x_test = X[split_point:, :, :]
    y_train = y[:split_point]
    y_test = y[split_point:]
    return x_train, x_test, y_train, y_test
        
def create_model(layers, input_shape, print_summary):
    '''
    Creates a model with the specified layers
    '''
    model = keras.Sequential(layers)
    model.build(input_shape=input_shape)
    model.compile(loss='mse', optimizer='adam', metrics = [tf.keras.metrics.MeanSquaredError()])
    if print_summary:
        model.summary()
    return model

def run_model(X, y, time_step, data_memory, epochs, batch_size, model_layer, split_point):
    '''
    Inputs: X, y, time_step, data_memory, epochs, batch_size, model_layer, split_point
    Runs the model on the data
    Outputs: model, history
    '''
    x_train, x_test, y_train, y_test, nrows, samples, X_norm, y_norm, X_attrs, y_attrs = process_data(X, y, split_point, time_step, data_memory)
    input_shape = (x_train.shape[0], data_memory, samples)
    model = create_model(model_layer, input_shape, print_summary)
    history = model.fit(x_train, y_train, validation_split = 0.1, epochs= epochs , batch_size=batch_size)
    y_pred_norm = np.concatenate((model.predict(x_train[:, :, :]), model.predict(x_test[:, :, :])))
    y_pred_norm = np.roll(y_pred_norm, 1, axis = 1)
    y_pred = y_pred_norm *y_attrs[:, 1]**0.5 + y_attrs[:, 0]
    return y_pred, history

'''
Regression stuff
'''

def run_regr(X, y, t, regr, colours):
    regr.fit(X, y)
    nrows = X.shape[0]
    y_reg = np.zeros(nrows)  
    for i in range(nrows):
        y_reg[i] = regr.predict(np.array([X[i, :]]))
    total_loss, loss = cal_loss(y, y_reg, nrows)
    plt.plot(t, y, colours[0], t, y_reg, colours[1])    
    return regr, total_loss, loss

def cal_loss(y, y_reg, nrows):
    abs_loss, per_loss = np.zeros(nrows), np.zeros(nrows)
    for i in range(nrows):
        abs_loss[i] = np.abs(y[i] - y_reg[i])
        if y[i] > 0:
            per_loss[i] = abs_loss[i] / y[i]
    total_loss, loss = np.sum(abs_loss)/ nrows, np.sum(per_loss)/ nrows
    return total_loss, loss

def scatter2d(X, y):

    import plotly.express as px
    fig = px.scatter(X, x=X[:,0], y=X[:,1],
              color=y)
    fig.show()
        
def scatter3d(X, y):

    import plotly.express as px
    fig = px.scatter_3d(X, x=X[:,0], y=X[:,1], z=X[:,2],
              color=y)
    fig.show()

In [77]:
'''
Loads in gross domestic product, hours worked and employees data
Returns GDP per employee and GDP per hour worked
'''
GDP_df =  load_df('./data/GDP_per_quarter_2.xlsx') * 1e6
HW_df = load_df('./data/hours_worked.xlsx')
employees_df = load_df('./data/Employees.xlsx') * 1e3
per_employee_df, per_HW_df = create_per_employeer(GDP_df, HW_df, employees_df)

#per_HW_df.iloc[4, :].T.plot(legend = False)

'''
Depression data
'''
dep_df = pd.read_csv('data/depression_by_age.csv') 
dep_df = dep_df.drop(['Prevalence - Depressive disorders - Sex: Both - Age: 10 to 14 (Percent)','Prevalence - Depressive disorders - Sex: Both - Age: All Ages (Percent)','Prevalence - Depressive disorders - Sex: Both - Age: 70+ years (Percent)','Prevalence - Depressive disorders - Sex: Both - Age: Age-standardized (Percent)', 'Prevalence - Depressive disorders - Sex: Both - Age: 15-49 years (Percent)'],axis=1)
a = ['Belgium','Bulgaria','Denmark','Germany','Estonia','Ireland','Greece','Spain','France','Croatia','Italy','Cyprus','Latvia','Lithuania','Luxemburg','Hungary','Malta','Netherlands','Austria','Poland','Portugal','Romania','Slovenia','Slovakia','Finland','Sweden','Iceland','Norway','Switzerland','United Kingdom','Montenegro','North Macedonia','Serbia','Turkey']
dep_df = dep_df[dep_df['Entity'].isin(a)]
dep_df['Age:15-69_depression_average']=dep_df.iloc[:,3:7].mean(axis=1,skipna=True)
dep_df = dep_df.pivot_table('Age:15-69_depression_average', ['Entity', 'Code'], 'Year')
dep_df = dep_df.reset_index('Code')
del dep_df['Code']
dep_quarterly_df = yearly_to_quarterly(dep_df)

'''
Education data
'''
education_dfs = [None] * 4
for i in range(4):
    education_dfs[i] = load_df('./data/Education.xlsx', 'Sheet ' + str(4*i + 1))
    education_dfs[i] = yearly_to_quarterly(education_dfs[i])
    education_dfs[i] = education_dfs[i].rename({'Germany (until 1990 former territory of the FRG)': 'Germany'})
    

'''
Inflation data
'''
inf_df = pd.read_csv('data/Quarterly_infilation.csv')
inf_df =inf_df.drop(['SUBJECT', 'MEASURE','FREQUENCY','Flag Codes'], axis=1)
inf_df = inf_df.pivot_table('Value', ['LOCATION', 'INDICATOR'], 'TIME')
inf_df = country_code_to_name(inf_df)
inf_df = inf_df.reset_index('INDICATOR')
del inf_df['INDICATOR']
inf_df = inf_df.rename({'Germany (until 1990 former territory of the FRG)': 'Germany'})

'''
Unemployment data
'''
unem_df = pd.read_csv('./data/unem.csv')
unem_df.fillna(0)
unem_df = unem_df[['LOCATION', 'TIME', 'Value']]
unem_df = unem_df.pivot_table('Value',  ['LOCATION'],'TIME')
unem_df = country_code_to_name(unem_df)

'''
Combine all dataframes and create a numpy array of the data
Data = [Country, Time, Feature]
'''
dfs = [per_employee_df, per_HW_df] + education_dfs + [inf_df, unem_df, dep_quarterly_df]
matched_dfs = match_df(dfs)
data = np.zeros((len(matched_dfs[0].index), len(matched_dfs[0].columns),  len(matched_dfs)))
for i in range(len(matched_dfs)):
    data[:, :, i] = matched_dfs[i].to_numpy(dtype=np.float64)
# matched_dfs[1]

In [78]:
depression = matched_dfs[8]
depression

Unnamed: 0_level_0,2008-Q1,2008-Q2,2008-Q3,2008-Q4,2009-Q1,2009-Q2,2009-Q3,2009-Q4,2010-Q1,2010-Q2,...,2017-Q3,2017-Q4,2018-Q1,2018-Q2,2018-Q3,2018-Q4,2019-Q1,2019-Q2,2019-Q3,2019-Q4
Entity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Austria,3.932948,3.932948,3.932948,3.932948,3.894398,3.894398,3.894398,3.894398,3.874706,3.874706,...,3.904541,3.904541,3.909877,3.909877,3.909877,3.909877,3.919968,3.919968,3.919968,3.919968
Belgium,4.470518,4.470518,4.470518,4.470518,4.5292,4.5292,4.5292,4.5292,4.556212,4.556212,...,4.373711,4.373711,4.381408,4.381408,4.381408,4.381408,4.395594,4.395594,4.395594,4.395594
Estonia,4.093089,4.093089,4.093089,4.093089,4.002988,4.002988,4.002988,4.002988,3.930319,3.930319,...,3.690929,3.690929,3.701515,3.701515,3.701515,3.701515,3.718065,3.718065,3.718065,3.718065
Finland,6.216123,6.216123,6.216123,6.216123,6.213358,6.213358,6.213358,6.213358,6.196769,6.196769,...,5.759037,5.759037,5.772166,5.772166,5.772166,5.772166,5.793236,5.793236,5.793236,5.793236
France,4.720823,4.720823,4.720823,4.720823,4.656159,4.656159,4.656159,4.656159,4.618401,4.618401,...,4.559817,4.559817,4.572124,4.572124,4.572124,4.572124,4.604383,4.604383,4.604383,4.604383
Germany,4.634948,4.634948,4.634948,4.634948,4.626955,4.626955,4.626955,4.626955,4.622184,4.622184,...,4.432185,4.432185,4.322971,4.322971,4.322971,4.322971,4.201171,4.201171,4.201171,4.201171
Greece,6.787959,6.787959,6.787959,6.787959,6.777206,6.777206,6.777206,6.777206,6.763624,6.763624,...,5.990507,5.990507,6.029512,6.029512,6.029512,6.029512,6.113405,6.113405,6.113405,6.113405
Ireland,5.243573,5.243573,5.243573,5.243573,5.240653,5.240653,5.240653,5.240653,5.23553,5.23553,...,5.725124,5.725124,5.726337,5.726337,5.726337,5.726337,5.68851,5.68851,5.68851,5.68851
Italy,4.267409,4.267409,4.267409,4.267409,4.156614,4.156614,4.156614,4.156614,4.111459,4.111459,...,4.263645,4.263645,4.297771,4.297771,4.297771,4.297771,4.368157,4.368157,4.368157,4.368157
Latvia,3.953395,3.953395,3.953395,3.953395,3.887468,3.887468,3.887468,3.887468,3.854642,3.854642,...,3.838283,3.838283,3.844814,3.844814,3.844814,3.844814,3.847041,3.847041,3.847041,3.847041


In [103]:
# GDP_per_employee = matched_dfs[0].transpose()
# GDP_per_HW = matched_dfs[1].transpose()
# inflation = matched_dfs[6].transpose()
# unemployment = matched_dfs[7].transpose()
# depression = matched_dfs[8].transpose()

from sklearn.preprocessing import MaxAbsScaler
scaler = MaxAbsScaler()

scaler.fit(matched_dfs[0])
scaled = scaler.fit_transform(matched_dfs[0])
GDP_per_employee = pd.DataFrame(scaled, columns=matched_dfs[0].columns)

scaler.fit(matched_dfs[1])
scaled = scaler.fit_transform(matched_dfs[1])
GDP_per_HW = pd.DataFrame(scaled, columns=matched_dfs[1].columns)

scaler.fit(matched_dfs[6])
scaled = scaler.fit_transform(matched_dfs[6])
inflation = pd.DataFrame(scaled, columns=matched_dfs[6].columns)

scaler.fit(matched_dfs[7])
scaled = scaler.fit_transform(matched_dfs[7])
unemployment = pd.DataFrame(scaled, columns=matched_dfs[7].columns)

scaler.fit(matched_dfs[8])
scaled = scaler.fit_transform(matched_dfs[8])
depression = pd.DataFrame(scaled, columns=matched_dfs[8].columns)


depression

Unnamed: 0,2008-Q1,2008-Q2,2008-Q3,2008-Q4,2009-Q1,2009-Q2,2009-Q3,2009-Q4,2010-Q1,2010-Q2,...,2017-Q3,2017-Q4,2018-Q1,2018-Q2,2018-Q3,2018-Q4,2019-Q1,2019-Q2,2019-Q3,2019-Q4
0,0.579401,0.579401,0.579401,0.579401,0.574632,0.574632,0.574632,0.574632,0.572874,0.572874,...,0.651788,0.651788,0.648457,0.648457,0.648457,0.648457,0.641209,0.641209,0.641209,0.641209
1,0.658595,0.658595,0.658595,0.658595,0.668299,0.668299,0.668299,0.668299,0.673635,0.673635,...,0.730107,0.730107,0.72666,0.72666,0.72666,0.72666,0.719009,0.719009,0.719009,0.719009
2,0.602993,0.602993,0.602993,0.602993,0.590655,0.590655,0.590655,0.590655,0.581097,0.581097,...,0.61613,0.61613,0.613899,0.613899,0.613899,0.613899,0.608182,0.608182,0.608182,0.608182
3,0.915757,0.915757,0.915757,0.915757,0.916802,0.916802,0.916802,0.916802,0.916191,0.916191,...,0.961361,0.961361,0.957319,0.957319,0.957319,0.957319,0.947628,0.947628,0.947628,0.947628
4,0.69547,0.69547,0.69547,0.69547,0.687032,0.687032,0.687032,0.687032,0.682829,0.682829,...,0.761174,0.761174,0.758291,0.758291,0.758291,0.758291,0.753162,0.753162,0.753162,0.753162
5,0.682819,0.682819,0.682819,0.682819,0.682723,0.682723,0.682723,0.682723,0.683389,0.683389,...,0.739868,0.739868,0.716969,0.716969,0.716969,0.716969,0.687206,0.687206,0.687206,0.687206
6,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
7,0.772481,0.772481,0.772481,0.772481,0.773276,0.773276,0.773276,0.773276,0.774072,0.774072,...,0.955699,0.955699,0.949718,0.949718,0.949718,0.949718,0.930498,0.930498,0.930498,0.930498
8,0.628673,0.628673,0.628673,0.628673,0.613323,0.613323,0.613323,0.613323,0.607878,0.607878,...,0.711734,0.711734,0.712789,0.712789,0.712789,0.712789,0.714521,0.714521,0.714521,0.714521
9,0.582413,0.582413,0.582413,0.582413,0.573609,0.573609,0.573609,0.573609,0.569908,0.569908,...,0.640728,0.640728,0.637666,0.637666,0.637666,0.637666,0.62928,0.62928,0.62928,0.62928


In [104]:
# GDP_per_employee['Average'] = GDP_per_employee.mean(axis=1, skipna=True)
# GDP_per_HW['Average'] = GDP_per_HW.mean(axis=1, skipna=True)
# inflation['Average'] = inflation.mean(axis=1, skipna=True)
# unemployment['Average'] = unemployment.mean(axis=1, skipna=True)
# depression['Average'] = depression.mean(axis=1, skipna=True)

country_labels = list(matched_dfs[0].index.values)
years = list(matched_dfs[0].columns)
features = ['GDP per employee', 'GDP per hour worked', 'Inflation', 'Unemployment', 'Depression']
length = len(country_labels)

GDP_per_employee['Label'] = 0
GDP_per_HW['Label'] = 1
inflation['Label'] = 2
unemployment['Label'] = 3
depression['Label'] = 4

# GDP_per_employee['Label'] = list(range(0,length))
# GDP_per_HW['Label'] = list(range(0,length))
# inflation['Label'] = list(range(0,length))
# unemployment['Label'] = list(range(0,length))
# depression['Label'] = list(range(0,length))
depression

Unnamed: 0,2008-Q1,2008-Q2,2008-Q3,2008-Q4,2009-Q1,2009-Q2,2009-Q3,2009-Q4,2010-Q1,2010-Q2,...,2017-Q4,2018-Q1,2018-Q2,2018-Q3,2018-Q4,2019-Q1,2019-Q2,2019-Q3,2019-Q4,Label
0,0.579401,0.579401,0.579401,0.579401,0.574632,0.574632,0.574632,0.574632,0.572874,0.572874,...,0.651788,0.648457,0.648457,0.648457,0.648457,0.641209,0.641209,0.641209,0.641209,4
1,0.658595,0.658595,0.658595,0.658595,0.668299,0.668299,0.668299,0.668299,0.673635,0.673635,...,0.730107,0.72666,0.72666,0.72666,0.72666,0.719009,0.719009,0.719009,0.719009,4
2,0.602993,0.602993,0.602993,0.602993,0.590655,0.590655,0.590655,0.590655,0.581097,0.581097,...,0.61613,0.613899,0.613899,0.613899,0.613899,0.608182,0.608182,0.608182,0.608182,4
3,0.915757,0.915757,0.915757,0.915757,0.916802,0.916802,0.916802,0.916802,0.916191,0.916191,...,0.961361,0.957319,0.957319,0.957319,0.957319,0.947628,0.947628,0.947628,0.947628,4
4,0.69547,0.69547,0.69547,0.69547,0.687032,0.687032,0.687032,0.687032,0.682829,0.682829,...,0.761174,0.758291,0.758291,0.758291,0.758291,0.753162,0.753162,0.753162,0.753162,4
5,0.682819,0.682819,0.682819,0.682819,0.682723,0.682723,0.682723,0.682723,0.683389,0.683389,...,0.739868,0.716969,0.716969,0.716969,0.716969,0.687206,0.687206,0.687206,0.687206,4
6,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4
7,0.772481,0.772481,0.772481,0.772481,0.773276,0.773276,0.773276,0.773276,0.774072,0.774072,...,0.955699,0.949718,0.949718,0.949718,0.949718,0.930498,0.930498,0.930498,0.930498,4
8,0.628673,0.628673,0.628673,0.628673,0.613323,0.613323,0.613323,0.613323,0.607878,0.607878,...,0.711734,0.712789,0.712789,0.712789,0.712789,0.714521,0.714521,0.714521,0.714521,4
9,0.582413,0.582413,0.582413,0.582413,0.573609,0.573609,0.573609,0.573609,0.569908,0.569908,...,0.640728,0.637666,0.637666,0.637666,0.637666,0.62928,0.62928,0.62928,0.62928,4


In [105]:
dict_list = [GDP_per_employee, GDP_per_HW, inflation, unemployment, depression]
big_data = pd.concat(dict_list, axis=0, ignore_index=True)
labels = big_data['Label'].values
big_data.drop('Label', inplace=True, axis=1)
big_data

Unnamed: 0,2008-Q1,2008-Q2,2008-Q3,2008-Q4,2009-Q1,2009-Q2,2009-Q3,2009-Q4,2010-Q1,2010-Q2,...,2017-Q3,2017-Q4,2018-Q1,2018-Q2,2018-Q3,2018-Q4,2019-Q1,2019-Q2,2019-Q3,2019-Q4
0,0.786304,0.842673,0.828414,0.854582,0.790291,0.805609,0.810285,0.867817,0.778429,0.817886,...,0.585752,0.598465,0.582335,0.611389,0.564960,0.611889,0.576130,0.594685,0.542125,0.581042
1,0.857492,0.930144,0.878160,0.919059,0.870809,0.881049,0.873145,0.958633,0.865263,0.913415,...,0.649852,0.667997,0.632730,0.693692,0.613902,0.681019,0.634039,0.666111,0.587524,0.653528
2,0.256474,0.293725,0.281509,0.273696,0.248475,0.261878,0.256486,0.279738,0.258815,0.281966,...,0.254958,0.258959,0.247906,0.280765,0.264535,0.278926,0.258857,0.288612,0.263438,0.275332
3,0.819144,0.861186,0.847083,0.904648,0.810818,0.818665,0.816464,0.905608,0.812137,0.841754,...,0.623019,0.640268,0.611257,0.644701,0.593371,0.642578,0.597634,0.624859,0.563486,0.617595
4,0.819546,0.849017,0.819133,0.852733,0.820456,0.813203,0.801041,0.869181,0.820739,0.834626,...,0.569376,0.578512,0.574310,0.598025,0.552157,0.592265,0.573589,0.589052,0.535025,0.564748
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,0.647267,0.647267,0.647267,0.647267,0.640986,0.640986,0.640986,0.640986,0.636559,0.636559,...,0.704488,0.704488,0.699529,0.699529,0.699529,0.699529,0.688157,0.688157,0.688157,0.688157
71,0.667783,0.667783,0.667783,0.667783,0.665985,0.665985,0.665985,0.665985,0.664165,0.664165,...,0.722617,0.722617,0.722165,0.722165,0.722165,0.722165,0.717986,0.717986,0.717986,0.717986
72,0.828689,0.828689,0.828689,0.828689,0.827359,0.827359,0.827359,0.827359,0.824988,0.824988,...,0.834348,0.834348,0.829797,0.829797,0.829797,0.829797,0.821140,0.821140,0.821140,0.821140
73,0.538367,0.538367,0.538367,0.538367,0.529239,0.529239,0.529239,0.529239,0.523118,0.523118,...,0.568449,0.568449,0.565438,0.565438,0.565438,0.565438,0.559205,0.559205,0.559205,0.559205


In [109]:
# scaler.fit(big_data)
# scaled = scaler.fit_transform(big_data)
# big_data = pd.DataFrame(scaled, columns=big_data.columns)
# big_data

In [108]:
'''
Clustering of features using K-means

X = [Country, Time, Features]
Plot the clusters against properties
'''
X = big_data.values
kmeans = KMeans(n_clusters=5, random_state=0).fit(X)



scatter2d(X, kmeans.labels_)

In [98]:
'''
Clustering of features using DBSCAN
'''
X = big_data.values
y = labels
# X, X_attrs, y, y_attrs = normalise(X, y)
dbs = DBSCAN(eps = 2.5, min_samples= 2).fit(X)
# plt.scatter(X[:, 1], X[:, 4], c= dbs.labels_)
scatter2d(X, dbs.labels_)

In [99]:
'''
PCA
'''
X = big_data.values
y = labels

pca = PCA(n_components=2, svd_solver='full')
X_r = pca.fit_transform(X)

scatter2d(X_r, labels)

In [100]:
X = big_data.values
y = labels
tsne = TSNE()
X_embedded = tsne.fit_transform(X)
scatter2d(X_embedded, labels)
# fig2 = plt.scatter(X_embedded[:, 0], X_embedded[:, 1])


The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.

