In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats as st
import plotly.graph_objects as go
import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score, f1_score, roc_curve, roc_auc_score 

from sklearn.utils import shuffle

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

import eli5

In [15]:
df = pd.read_csv('data11_filtered.csv')

In [16]:
df.head()

Unnamed: 0.1,Unnamed: 0,X,year,iso3_o,iso3_d,contig,dist,comlang_off,comrelig,gdp_o,...,rta,rta_coverage,rta_type,iso3num_o_y,iso3num_d_y,destTotImp,HHI_value,tr_flows,MS,olig_cor
0,0,7,2007,ALB,AUT,0,812.933,0,0.001,10701010.0,...,1,1,5,8,40,4404946520,0.34,38736.0,8.79375e-06,-1.19595e-05
1,4,11,2007,ALB,DEU,0,1493.096,0,0.0,10701010.0,...,1,1,5,8,276,38729776000,0.34,1000.0,2.581993e-08,-3.51151e-08
2,5,12,2007,ALB,GRC,1,500.124,0,0.003,10701010.0,...,1,1,5,8,300,4411954035,0.34,11904.0,2.698124e-06,-3.669449e-06
3,7,14,2007,ALB,ITA,0,611.763,0,0.0,10701010.0,...,1,1,5,8,381,15399984966,0.34,7921.0,5.143512e-07,-6.995176e-07
4,11,19,2007,DZA,CAN,0,6747.007,1,0.008,134977900.0,...,0,0,6,12,124,10123052238,0.14,1830.0,1.807755e-07,-1.012343e-07


In [12]:
df.columns

Index(['Unnamed: 0', 'X', 'year', 'iso3_o', 'iso3_d', 'contig', 'dist',
       'comlang_off', 'comrelig', 'gdp_o', 'gdp_d', 'gatt_o', 'gatt_d',
       'wto_o', 'wto_d', 'eu_o', 'eu_d', 'rta', 'rta_coverage', 'rta_type',
       'iso3num_o_y', 'iso3num_d_y', 'destTotImp', 'HHI_value', 'tr_flows',
       'MS', 'olig_cor'],
      dtype='object')

In [17]:
df['gravity_controls'] = df['rta'] + df['contig'] + df['comlang_off']

In [18]:
df.drop(['year',
         'rta_type',
         'Unnamed: 0',
         'X',
         'gatt_o',
         'gatt_d',
         'rta_coverage',
         'iso3num_d_y',
         'iso3num_o_y',
         'comrelig',
         'wto_o',
         'wto_d',
         'eu_o',
         'eu_d',
         'destTotImp',
         'HHI_value',
         'MS',
         'rta',
         'contig',
         'comlang_off'], 
         axis =1, 
         inplace = True)

In [19]:
df.columns

Index(['iso3_o', 'iso3_d', 'dist', 'gdp_o', 'gdp_d', 'tr_flows', 'olig_cor',
       'gravity_controls'],
      dtype='object')

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58291 entries, 0 to 58290
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   iso3_o            58291 non-null  object 
 1   iso3_d            58291 non-null  object 
 2   dist              58291 non-null  float64
 3   gdp_o             58291 non-null  float64
 4   gdp_d             58291 non-null  float64
 5   tr_flows          58291 non-null  float64
 6   olig_cor          58291 non-null  float64
 7   gravity_controls  58291 non-null  int64  
dtypes: float64(5), int64(1), object(2)
memory usage: 3.6+ MB


In [21]:
df_transformed = pd.get_dummies(df, drop_first=True)

In [24]:
numeric = ['dist', 'gdp_o', 'gdp_d', 'tr_flows']

In [25]:
scaler = StandardScaler()
scaler.fit(df_transformed[numeric])
df_transformed[numeric] = scaler.transform(df_transformed[numeric])

In [26]:
features = df_transformed.drop(['tr_flows'], axis=1)
target = df_transformed['tr_flows']

In [27]:
features_train, features_valid, target_train, target_valid = train_test_split(
    features, 
    target, 
    train_size=0.75, 
    test_size=0.25, 
    random_state=12345)

In [28]:
model = LinearRegression()
model.fit(features_train, target_train)
predicted_valid = model.predict(features_valid)
MAE = mean_absolute_error(target_valid,predicted_valid)
RMSE = mean_squared_error(target_valid,predicted_valid)

print('model MAE: ', MAE)
print('model RMSE: ', RMSE)
print('predicted mean: ', predicted_valid.mean())

model MAE:  0.3300659440645271
model RMSE:  0.6623037502646072
predicted mean:  -0.0012892917888148638


In [29]:
feature_names = [col for col in list(features.columns)]

In [30]:
eli5.explain_weights(model,top = 50, feature_names = feature_names)

Weight?,Feature
+1.463,iso3_d_BEL
+1.191,iso3_o_DEU
+0.933,iso3_d_DEU
+0.840,iso3_d_CHE
+0.777,iso3_d_NLD
+0.777,iso3_o_IRL
+0.725,iso3_o_USA
+0.706,iso3_o_CHE
+0.686,iso3_d_GBR
+0.679,iso3_d_ITA
