In [3]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 19 11:55:13 2020
@author: nabeelhussain
"""

import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
import warnings
warnings.filterwarnings("ignore")

#import data
#
#Every table is read with its first CSV column as the row index. The
#training loop further down selects rows by country label (e.g. "USA") and
#columns by year label (e.g. "1980").
#
#Data sets left out because their files have missing data points:
#  World Bank - GDP (COUNTRY_GDP.csv)
#  OECD - Interest Rates (INT_RATES.csv)
#  IMF - General Government Debt (GOV_DEBT.csv)
#  World Bank - Governance Indicators (GOVERNANCE_IND.csv)

#single place to edit when the CSV files are moved
DATA_DIR = '/Users/nabeelhussain/Desktop/NEU/CS5100'


def _read_indicator(file_name):
    """Load one CSV from DATA_DIR, using its first column as the row index."""
    return pd.read_csv(DATA_DIR + '/' + file_name, index_col=0)


#OECD - Inflation Rates
Infl_Rate_data = _read_indicator('INFL_RATE.csv')
#FER table: the regression target used by the training loop
FER_data = _read_indicator('FER.csv')

#OECD - PPP
PPP_data = _read_indicator('PPP.csv')

#International Monetary Fund, World Economic Outlook Database - Imports/Exports
Imports_data = _read_indicator('IMPORTS.csv')
Exports_data = _read_indicator('EXPORTS.csv')
GDP_Percent_data = _read_indicator('GDP_PERCENT.csv')
Investment_data = _read_indicator('INVEST.csv')


#select countries to use here
#each name is used as a row label when slicing the data tables; the training
#loop always pairs it with the "USA" row
countries = ['INDIA', 'JAPAN', 'UK', 'SWITZERLAND', 'CANADA', 'AUSTRALIA']

#last year for which an FER prediction is made
final_year = 2019

#select the number of years you want to predict the FER for here:
#i.e. if num_years = 5, predict the FER for 2015,2016,2017,2018,2019
num_years = 1
if num_years < 1:
    #with zero test years the error matrices are empty and every reported
    #average would silently become NaN
    raise ValueError("num_years must be at least 1")

#last year of training data for the first prediction; each later prediction
#extends the training window by one year
start_year = final_year - num_years

#per-country slots for the training/testing data; every name starts out as a
#list holding one independent empty list per entry in `countries`
x_train = [[] for _ in countries]
y_train = [[] for _ in countries]
x_test = [[] for _ in countries]
y_true = [[] for _ in countries]

#per-country slots for each model's latest prediction and its fitted estimator
lin_regr_predictedFER = [[] for _ in countries]
lin_regr = [[] for _ in countries]
ridge_predictedFER = [[] for _ in countries]
ridge = [[] for _ in countries]
RandomForest_predictedFER = [[] for _ in countries]
RandomForest = [[] for _ in countries]
Lasso_predictedFER = [[] for _ in countries]
Lasso = [[] for _ in countries]

#percent-error matrices, one per model: rows are predicted years, columns are
#countries, all initialised to zero
lin_regr_error_country = np.zeros((num_years, len(countries)))
ridge_error_country = np.zeros_like(lin_regr_error_country)
RandomForest_error_country = np.zeros_like(lin_regr_error_country)
Lasso_error_country = np.zeros_like(lin_regr_error_country)


    
#Walk-forward evaluation: for every test year, refit each model per country
#on all data from 1980 up to the preceding year, predict the FER of the test
#year and store the absolute percent error in that model's error matrix.

#indicator tables that make up the feature vector; the order fixes the column
#order of the feature matrix for both training and testing
indicator_tables = (Infl_Rate_data, PPP_data, Imports_data, Exports_data,
                    GDP_Percent_data, Investment_data)

for j in range(num_years):
    last_train_year = str(start_year + j)
    test_year = str(start_year + j + 1)

    for i, country in enumerate(countries):
        #each indicator contributes the USA row and the modelled country's row
        pair = ["USA", country]

        #training features: stack the selected rows of every indicator, then
        #transpose so that each row of the matrix is one year
        train_frames = [table.loc[pair, "1980":last_train_year]
                        for table in indicator_tables]
        x_train[i] = pd.concat(train_frames).transpose().to_numpy()

        #the y target data is the FER data for the same range of years,
        #flattened to 1-D: a column vector makes LinearRegression/Ridge return
        #2-D predictions and makes the random forest emit a column-vector
        #warning (hidden by the global warnings filter)
        y_train[i] = FER_data.loc[[country], "1980":last_train_year].to_numpy().ravel()

        #set up regression models (fresh estimators for every year/country)
        lin_regr[i] = linear_model.LinearRegression()
        ridge[i] = Ridge(alpha=1.0)
        RandomForest[i] = RandomForestRegressor(max_depth=10, random_state=0)
        Lasso[i] = linear_model.Lasso(alpha=0.1)

        #testing data: the same indicators for the single year after the
        #training window, in the same order as the training columns
        test_columns = [table.loc[pair, test_year] for table in indicator_tables]
        x_test[i] = pd.concat(test_columns)
        #estimators expect a 2-D input: one sample, one column per feature
        test_row = x_test[i].to_numpy().reshape(1, -1)

        #obtain true FER value for this country and test year
        y_true[i] = FER_data.loc[country, test_year]

        #(estimator, prediction slots, error matrix) for every model
        model_runs = (
            (lin_regr[i], lin_regr_predictedFER, lin_regr_error_country),
            (ridge[i], ridge_predictedFER, ridge_error_country),
            (RandomForest[i], RandomForest_predictedFER, RandomForest_error_country),
            (Lasso[i], Lasso_predictedFER, Lasso_error_country),
        )
        for model, predictions, errors in model_runs:
            #fit training data to the model and predict FER for the test year
            model.fit(x_train[i], y_train[i])
            predictions[i] = model.predict(test_row)

            #percent error in matrix form: (year x country). The single
            #prediction is taken out of its array explicitly; storing a
            #size-1 array into a scalar slot relies on an implicit conversion
            #that NumPy has deprecated.
            errors[j, i] = np.abs((predictions[i][0] - y_true[i]) / y_true[i]) * 100
    
    
#Summarise each model's percent-error matrix (year x country): first average
#across countries to get one figure per year, then average those yearly
#figures into a single overall error.

#per-year averages (mean over the country axis)
avg_lin_regr_error_yearly = np.mean(lin_regr_error_country, axis=1)
avg_ridge_error_yearly = np.mean(ridge_error_country, axis=1)
avg_RandomForest_error_yearly = np.mean(RandomForest_error_country, axis=1)
avg_Lasso_error_yearly = np.mean(Lasso_error_country, axis=1)

#overall averages (mean over the years)
overall_lin_regr_error = avg_lin_regr_error_yearly.mean()
overall_ridge_error = avg_ridge_error_yearly.mean()
overall_RandomForest_error = avg_RandomForest_error_yearly.mean()
overall_Lasso_error = avg_Lasso_error_yearly.mean()

#report one line per model, in the order linear, ridge, random forest, lasso
for report_label, report_value in (
    ("Linear Regression Error:", overall_lin_regr_error),
    ("Ridge Regression Error:", overall_ridge_error),
    ("Random Forest Regression Error:", overall_RandomForest_error),
    ("Lasso Regression Error:", overall_Lasso_error),
):
    print(report_label, report_value)

Linear Regression Error: 9.226867551803581
Ridge Regression Error: 9.30548763865343
Random Forest Regression Error: 6.517838131064643
Lasso Regression Error: 9.94892011485277
