In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 19 11:55:13 2020
@author: nabeelhussain
"""

import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

# Load the input tables. Every CSV is read with its first column as the row
# index. Original note: commented out files have missing data points.


def _read_indexed_csv(csv_path):
    """Read *csv_path* into a DataFrame whose index is the file's first column."""
    return pd.read_csv(csv_path, index_col=0)


# OECD - inflation rates (FER.csv is loaded alongside them)
Infl_Rate_data = _read_indexed_csv('/Users/nabeelhussain/Desktop/NEU/CS5100/Global Indicator Data/INFL_RATE.csv')
FER_data = _read_indexed_csv('/Users/nabeelhussain/Desktop/NEU/CS5100/Global Indicator Data/FER.csv')

# OECD - PPP
PPP_data = _read_indexed_csv('/Users/nabeelhussain/Desktop/NEU/CS5100/Global Indicator Data/PPP.csv')

# International Monetary Fund, World Economic Outlook Database - imports/exports
Imports_data = _read_indexed_csv('/Users/nabeelhussain/Desktop/NEU/CS5100/Global Indicator Data/IMPORTS.csv')
Exports_data = _read_indexed_csv('/Users/nabeelhussain/Desktop/NEU/CS5100/Global Indicator Data/EXPORTS.csv')
GDP_Percent_data = _read_indexed_csv('/Users/nabeelhussain/Desktop/NEU/CS5100/Global Indicator Data/GDP_Percent.csv')

# Per-document topic table; the aggregation code below reads a date string
# from column 3 and the per-topic values from column 8 onwards.
Topic_data = _read_indexed_csv('/Users/nabeelhussain/Desktop/NEU/CS5100/documents_to_topics.csv')

#Must select number of topics from LDA here
num_topics = 6

#first year that the data starts
start_year = prev_date = '1981'

# Aggregate the per-article topic values into one list per calendar year and
# then turn each year's totals into shares that sum to 1.
#
# Articles are expected in chronological order: an article whose year sorts
# before the year currently being accumulated is skipped, and rows whose date
# is not a string are skipped as well.
#
# Fix over the earlier version: a year's slot is now registered on its very
# first article. Previously the slot was only assigned when a second article
# of the same year arrived, so a year with exactly one article was dropped and
# every later year moved up one position. The slot list also grows on demand
# instead of being pre-sized for 1981-2019.
# NOTE(review): a calendar year with no articles at all still gets no entry,
# so positions only line up with calendar years when every year is present.

#initialize
year_count = 0
topics = [0]*num_topics
#slot 0 belongs to start_year and is filled by its first article
topic_count_per_year = [[]]

for i in range(len(Topic_data)):
    #column 3 holds the article date; only its first four characters (the year) are used
    date = Topic_data.iloc[i,3]
    if not isinstance(date, str):
        continue

    date = date[:4]
    #string comparison of the 4-character year; earlier years are ignored
    if date < prev_date:
        continue

    if date != prev_date:
        #a new year starts: report the year just finished, then open a new slot
        print("article:",i)
        print("topic count:",topic_count_per_year[year_count])
        print("next year:",date)
        print("previous year:",prev_date)

        topics = [0]*num_topics
        topic_count_per_year.append([])
        year_count += 1
        prev_date = date

    for t in range(num_topics):
        #topics columns start at 8 from csv file
        topics[t] += Topic_data.iloc[i,8+t]

    #same list object as `topics`, so later additions for this year show up here
    topic_count_per_year[year_count] = topics

#convert yearly totals to shares; slots that never received an article are dropped
#NOTE(review): a year whose totals sum to zero is not handled specially
year_shares = []
for year_totals in topic_count_per_year:
    if not year_totals:
        continue
    total = sum(year_totals)
    year_shares.append([x / total for x in year_totals])

topic_count_per_year = year_shares
       
# Countries to model; each one is paired with "USA" when features are built.
countries = ['INDIA','JAPAN','UK','SWITZERLAND','CANADA','AUSTRALIA']


# Number of trailing years to predict the FER for, ending at end_year.
# Example: num_years = 5 predicts 2015, 2016, 2017, 2018 and 2019.
num_years = 1
end_year = 2019
# Last year included in the first training window.
test_year = end_year - num_years

# Per-country containers. Each name gets its own list holding one independent
# empty list per country; the slots are overwritten later by country index.

# training / testing data
x_train = [[] for _ in countries]
y_train = [[] for _ in countries]
x_test = [[] for _ in countries]
y_true = [[] for _ in countries]

# predictions and fitted models, one pair per regression type
lin_regr_predictedFER = [[] for _ in countries]
lin_regr = [[] for _ in countries]
ridge_predictedFER = [[] for _ in countries]
ridge = [[] for _ in countries]
RandomForest_predictedFER = [[] for _ in countries]
RandomForest = [[] for _ in countries]
Lasso_predictedFER = [[] for _ in countries]
Lasso = [[] for _ in countries]

# percent-error matrices, shape (num_years, number of countries), zero-filled
error_shape = (num_years, len(countries))
lin_regr_error_country = np.zeros(error_shape)
ridge_error_country = np.zeros(error_shape)
RandomForest_error_country = np.zeros(error_shape)
Lasso_error_country = np.zeros(error_shape)

# feature-importance results and per-country error spread
importances = [[] for _ in countries]
indices = [[] for _ in countries]
std_lin_regr = [[] for _ in countries]
std_ridge = [[] for _ in countries]
std_RandomForest = [[] for _ in countries]
std_Lasso = [[] for _ in countries]

# Walk-forward evaluation: for every forecast step j and every country, fit
# four regressors on all years up to test_year+j and predict the FER of the
# following year, recording the absolute percent error per (step, country).
for j in range(num_years):

    last_train_year = test_year + j
    predict_year = str(last_train_year + 1)
    #label slice over the year columns; .loc slices include both end points
    train_years = slice(str(start_year), str(last_train_year))
    #topic_count_per_year is positional with entry 0 used as start_year, so the
    #forecast year sits at this offset and the training years are the entries before it
    predict_offset = last_train_year - int(start_year) + 1

    #indicator tables in feature order (Investment_data stays excluded, as before)
    indicator_frames = [Infl_Rate_data, PPP_data, Imports_data, Exports_data, GDP_Percent_data]

    for i in range(len(countries)):

        #each indicator contributes two rows: the USA series and the country series
        pair = ["USA", countries[i]]

        #collect training data: every year from start_year up to last_train_year
        a = pd.concat([frame.loc[pair, train_years] for frame in indicator_frames]).to_numpy()
        X7 = topic_count_per_year[0:predict_offset]
        X7 = pd.DataFrame(X7).transpose().to_numpy()
        #stack indicator rows on top of topic rows, then flip to (years x features)
        x_train[i] = np.concatenate((a, X7)).transpose()

        #the y target data is the FER data for the same range of years
        Y1 = FER_data.loc[[countries[i]], train_years]
        y_train[i] = Y1.transpose().to_numpy()
        #sklearn expects a 1-D target; the stored (n, 1) column is flattened for
        #fitting so every model returns a prediction of the same shape
        target = y_train[i].ravel()

        #set up regression models
        lin_regr[i] = linear_model.LinearRegression()
        ridge[i] = Ridge(alpha=1.0)
        RandomForest[i] = RandomForestRegressor(max_depth=10, random_state=0)
        Lasso[i] = linear_model.Lasso(alpha=0.1)

        #fit training data to models
        lin_regr[i].fit(x_train[i], target)
        ridge[i].fit(x_train[i], target)
        RandomForest[i].fit(x_train[i], target)
        Lasso[i].fit(x_train[i], target)

        #set up testing data: the same features for the single year after the window
        b = pd.concat([frame.loc[pair, predict_year] for frame in indicator_frames]).to_numpy()
        X_7 = topic_count_per_year[predict_offset]
        x_test[i] = np.concatenate((b, X_7))

        #predict FER values for next year (one sample, so one value per model)
        sample = [x_test[i]]
        lin_regr_predictedFER[i] = lin_regr[i].predict(sample)
        ridge_predictedFER[i] = ridge[i].predict(sample)
        RandomForest_predictedFER[i] = RandomForest[i].predict(sample)
        Lasso_predictedFER[i] = Lasso[i].predict(sample)

        #obtain true FER value for this country
        y_true[i] = FER_data.loc[countries[i], predict_year]

        #percent error in matrix form: (year x country) for each model.
        #.item() pulls the scalar out of the size-1 prediction array; writing
        #the array itself into a single cell is deprecated in recent NumPy.
        lin_regr_error_country[j][i] = np.abs((lin_regr_predictedFER[i].item() - y_true[i]) / y_true[i]) * 100
        ridge_error_country[j][i] = np.abs((ridge_predictedFER[i].item() - y_true[i]) / y_true[i]) * 100
        RandomForest_error_country[j][i] = np.abs((RandomForest_predictedFER[i].item() - y_true[i]) / y_true[i]) * 100
        Lasso_error_country[j][i] = np.abs((Lasso_predictedFER[i].item() - y_true[i]) / y_true[i]) * 100
    
    
# Summarise each model's percent-error matrix (rows = forecast years,
# columns = countries): first average across countries for every year, then
# average those yearly figures into one overall number and print it.

# linear regression
avg_lin_regr_error_yearly = np.mean(lin_regr_error_country, axis=1)
overall_lin_regr_error = avg_lin_regr_error_yearly.mean()
print("Linear Regression Error:", overall_lin_regr_error)

# ridge regression
avg_ridge_error_yearly = np.mean(ridge_error_country, axis=1)
overall_ridge_error = avg_ridge_error_yearly.mean()
print("Ridge Regression Error:", overall_ridge_error)

# random forest regression
avg_RandomForest_error_yearly = np.mean(RandomForest_error_country, axis=1)
overall_RandomForest_error = avg_RandomForest_error_yearly.mean()
print("Random Forest Regression Error:", overall_RandomForest_error)

# lasso regression
avg_Lasso_error_yearly = np.mean(Lasso_error_country, axis=1)
overall_Lasso_error = avg_Lasso_error_yearly.mean()
print("Lasso Regression Error:", overall_Lasso_error)

# Plot the feature importances of the forest, one figure per country, and
# record the spread of each model's percent error for that country.
#
# The per-country results are kept in importances[i] / indices[i]. Earlier the
# loop rebound both names to the current country's flat values, which threw
# away the other countries' entries.
for i in range(len(countries)):
    importances[i] = RandomForest[i].feature_importances_
    # feature positions ordered from most to least important
    indices[i] = np.argsort(importances[i])[::-1]

    # labels follow the column order used when the training matrix was built:
    # a USA/country pair per indicator, then one column per topic cluster
    indicator_labels = ['Infl rate-US','Infl rate-'+countries[i],'PPP-US','PPP-'+countries[i],
    'Imports-US','Imports-'+countries[i],'Exports-US','Exports-'+countries[i],
    'GDP-US','GDP-'+countries[i]]
    # every remaining feature is a cluster column, so the labels follow the
    # fitted model instead of assuming exactly six clusters
    cluster_total = len(importances[i]) - len(indicator_labels)
    feature_list = indicator_labels + ['Cluster ' + str(k + 1) for k in range(cluster_total)]

    sorted_features = [feature_list[k] for k in indices[i]]
    sorted_importances = [importances[i][k] for k in indices[i]]
    bar_positions = list(range(len(sorted_importances)))

    plt.figure()
    plt.title("Feature Importances" + "- " + countries[i])
    plt.bar(bar_positions, sorted_importances, color="r", align="center")
    plt.xticks(bar_positions, sorted_features, rotation='vertical')
    plt.show()

    # standard deviation of this country's percent error across forecast years
    std_lin_regr[i] = np.std(lin_regr_error_country[:,i])
    std_ridge[i] = np.std(ridge_error_country[:,i])
    std_RandomForest[i] = np.std(RandomForest_error_country[:,i])
    std_Lasso[i] = np.std(Lasso_error_country[:,i])
    
# One bar chart per model: the percent error of every country averaged over
# the forecast years, with the across-year standard deviation as error bars.
#
# The bars used to show only the first forecast year (row 0) under an
# "Average Error" title. Averaging over axis 0 gives the same picture when
# num_years is 1 and the real mean when it is larger.
country_positions = list(range(len(countries)))

error_charts = (
    ("Linear Regression", lin_regr_error_country, std_lin_regr),
    ("Ridge Regression", ridge_error_country, std_ridge),
    ("Random Forest Regression", RandomForest_error_country, std_RandomForest),
    ("Lasso Regression", Lasso_error_country, std_Lasso),
)

for model_label, error_matrix, error_spread in error_charts:
    plt.figure()
    plt.title("Average Error - " + model_label)
    plt.bar(country_positions, error_matrix.mean(axis=0), yerr=error_spread)
    plt.xticks(country_positions, countries, rotation='vertical')
    # fixed y-range so the four charts are directly comparable
    plt.ylim(0, 40)
    plt.show()


article: 8000
topic count: [4826, 4477, 5553, 4686, 3945, 4610]
next year: 1982
previous year: 1981
article: 13197
topic count: [3253, 3119, 3747, 3163, 2861, 3180]
next year: 1983
previous year: 1982
article: 21197
topic count: [5335, 5246, 6118, 5224, 5012, 5256]
next year: 1984
previous year: 1983
article: 29197
topic count: [4928, 4798, 5803, 4745, 4474, 5063]
next year: 1985
previous year: 1984
article: 37197
topic count: [4701, 4721, 5487, 4595, 4221, 5083]
next year: 1986
previous year: 1985
article: 45197
topic count: [4830, 4943, 6052, 4689, 4975, 5274]
next year: 1987
previous year: 1986
article: 53197
topic count: [4930, 4834, 6446, 4736, 5320, 5319]
next year: 1988
previous year: 1987
article: 61187
topic count: [4524, 4397, 5783, 4130, 4540, 4973]
next year: 1989
previous year: 1988
article: 69186
topic count: [4884, 4702, 6451, 4589, 5351, 5275]
next year: 1990
previous year: 1989
article: 77186
topic count: [4651, 4451, 5559, 4141, 4198, 4710]
next year: 1991
previous ye

<Figure size 640x480 with 1 Axes>

<Figure size 640x480 with 1 Axes>

<Figure size 640x480 with 1 Axes>

<Figure size 640x480 with 1 Axes>

<Figure size 640x480 with 1 Axes>

<Figure size 640x480 with 1 Axes>

<Figure size 640x480 with 1 Axes>

<Figure size 640x480 with 1 Axes>

<Figure size 640x480 with 1 Axes>

<Figure size 640x480 with 1 Axes>