## Import dataset from locally stored CEPII database

In [None]:
import pandas as pd

# Load the locally stored gravity table. low_memory=False makes pandas infer
# each column's dtype from the whole file instead of chunk by chunk.
GRAVITY_CSV = 'gravity.csv'
df = pd.read_csv(GRAVITY_CSV, low_memory=False)

## __Data cleaning procedures:__
- Isolate the columns
- Eliminate all rows with missing data for key variables
- Create columns needed for regression


In [2]:
# Restrict the frame to the columns used in the rest of the notebook.
# The five tradeflow_* series are kept here so they can be averaged later.
columns = [
    'year', 'iso3_o', 'iso3_d',
    'distw',
    'comlang_off', 'comlang_ethno', 'comcol', 'comrelig', 'col45',
    'comleg_pretrans', 'comleg_posttrans',
    'heg_o', 'heg_d',
    'col_dep_ever', 'sibling_ever',
    'gdp_o', 'gdp_d',
    'eu_o', 'eu_d',
    'tradeflow_comtrade_o', 'tradeflow_comtrade_d',
    'tradeflow_baci',
    'tradeflow_imf_o', 'tradeflow_imf_d',
]
df = df[columns]

In [3]:
# Trade flow is reported by five different sources. Average them row-wise to
# reduce the missing-value problem: mean(axis=1) skips NaNs, so a row only
# ends up missing when none of the five sources reports a value.
pd.options.mode.chained_assignment = None
tradeflow_sources = [
    'tradeflow_comtrade_o',
    'tradeflow_comtrade_d',
    'tradeflow_baci',
    'tradeflow_imf_o',
    'tradeflow_imf_d',
]
df = df.assign(tradeflow=df[tradeflow_sources].mean(axis=1))
# The individual source columns are no longer needed once averaged.
df = df.drop(columns=tradeflow_sources)

In [4]:
# Drop every row in which any fundamental gravity-equation variable
# (origin GDP, destination GDP, trade flow or distance) is missing.
# dropna(subset=...) filters the rows themselves rather than dropping by
# index label, so it stays correct even if index labels are not unique.
df = df.dropna(subset=['gdp_o', 'gdp_d', 'tradeflow', 'distw'])

In [5]:
# Dummy: 1 when origin and destination are both flagged as EU countries,
# 0 otherwise (a missing flag compares unequal to 1 and therefore gives 0).
origin_in_eu = df['eu_o'].eq(1)
dest_in_eu = df['eu_d'].eq(1)
df['both_eu'] = (origin_in_eu & dest_in_eu).astype(int)

In [6]:
# Dummy: 1 when exactly one side of the pair is flagged as an EU country,
# i.e. at least one flag is set but the both_eu dummy is 0.
either_in_eu = df['eu_o'].eq(1) | df['eu_d'].eq(1)
not_both_in_eu = df['both_eu'].eq(0)
df['one_eu'] = (either_in_eu & not_both_in_eu).astype(int)

In [7]:
# Remove the raw per-country EU flags from the frame.
df = df.drop(columns=['eu_o', 'eu_d'])

## We require four subsets for the regression analysis
- All data
- Countries with above-average trade with the UK
- OECD countries
- EU countries

In [14]:
# Subset 1: all data, as an independent (deep) copy of the cleaned frame.
df1 = df.copy(deep=True)

In [62]:
# Subset 2: rank origin countries by their mean trade flow into the UK
# (descending) and keep the rows of the top half of that ranking.
import math

uk_rows = df[df['iso3_d'] == 'GBR']

# Select 'tradeflow' *before* aggregating. A frame-wide .mean() would also try
# to average the string column iso3_d, which raises TypeError on pandas >= 2.0
# (numeric_only defaults to False there).
tradeflows = (
    uk_rows.groupby('iso3_o')['tradeflow']
    .mean()
    .sort_values(ascending=False)
)

# NOTE(review): this keeps the top half of the ranking (a median split), not
# the countries whose mean flow exceeds the arithmetic average - confirm which
# of the two is intended.
top_half_count = math.floor(len(tradeflows) / 2)
above_average = list(tradeflows.iloc[:top_half_count].index)

df2 = uk_rows[uk_rows['iso3_o'].isin(above_average)]

In [26]:
# Subset 3: pairs whose origin and destination both appear in the list of
# OECD countries below (ISO3 codes).
OECD = [
    'AUS', 'AUT', 'BEL', 'CAN', 'CHE', 'CHL', 'COL', 'CRI', 'CZE', 'DEU',
    'DNK', 'ESP', 'EST', 'FIN', 'FRA', 'GBR', 'GRC', 'HUN', 'IRL', 'ISL',
    'ISR', 'ITA', 'JPN', 'KOR', 'LTU', 'LUX', 'LVA', 'MEX', 'NLD', 'NOR',
    'NZL', 'POL', 'PRT', 'SVK', 'SVN', 'SWE', 'TUR', 'USA',
]

origin_is_oecd = df['iso3_o'].isin(OECD)
dest_is_oecd = df['iso3_d'].isin(OECD)
df3 = df[origin_is_oecd & dest_is_oecd]

In [63]:
# Subset 4: pairs for which the both_eu dummy equals 1.
both_in_eu = df['both_eu'].eq(1)
df4 = df[both_in_eu]

In [None]:
import statsmodels.api as sm

