In [151]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from IPython.display import display, HTML
from scipy.spatial import distance
from sklearn.preprocessing import StandardScaler

In [152]:
# Read the dataset CSV into `df`; the path is relative to the notebook's working directory.
DATA_PATH = "../data/df_final.csv"
df = pd.read_csv(DATA_PATH)

#### Variables that define geopolitical distance

##### 1. IdealPointDistance from UNGA voting records

In [153]:
# Summary statistics of the raw IdealPointDistance column (before any rescaling).
df.loc[:, 'IdealPointDistance'].describe()

count    7400.000000
mean        0.754700
std         0.670854
min         0.000241
25%         0.234837
50%         0.521664
75%         1.194996
max         3.642625
Name: IdealPointDistance, dtype: float64

##### 2. Dummy variables
##### fta_wto: whether they have a free trade agreement in place
##### arms: whether they conduct military arms trade for that particular year (small dataset)
##### comlang_off/comlang_ethno: whether the two countries share a common official language / a language spoken by at least 9% of the population, respectively

##### Issue: fta_wto is too extreme, best if we can obtain preferential trade agreement data

In [154]:
# Frequency of each value per indicator column: rows are the observed values,
# columns are the indicators.
df[
    ['fta_wto', 'arms', 'comlang_off', 'comlang_ethno']
].apply(pd.Series.value_counts)

Unnamed: 0,fta_wto,arms,comlang_off,comlang_ethno
0,6843,7092,5092,4963
1,557,308,2308,2437


##### 3. comrelig: Religious proximity index - obtained by summing the products of the shares of Catholics, Protestants and Muslims in the origin and destination countries. Varies between 0 and 1, increases when the country pair shares a common religion practised by a large share of the population

In [155]:
# Summary statistics of the religious proximity index.
df.loc[:, 'comrelig'].describe()

count    7400.000000
mean        0.060152
std         0.054896
min         0.000000
25%         0.024000
50%         0.042000
75%         0.073000
max         0.174000
Name: comrelig, dtype: float64

##### 4. V-Dem variables
##### v2x_polyarchy: captures electoral competitiveness, inclusiveness, institutional strength
##### v2x_libdem: measures how well a country upholds the principles of a liberal democracy
##### v2x_partipdem: captures how open and inclusive the political process is for the general populace
##### v2x_delibdem: extent to which political decisions are made through informed discussion and mutual respect rather than through coercion or purely strategic bargaining
##### v2x_egaldem: evaluates the degree to which a democracy ensures equality in political power and representation

##### Generally, higher = more favorable/stronger democratic quality, lower = weaknesses

In [156]:
# V-Dem indices for which an origin/destination gap is computed. Keeping the
# names in one list means the diff columns and the dropped raw columns are
# always derived from the same source and cannot drift apart.
vdem_indices = ['v2x_polyarchy', 'v2x_libdem', 'v2x_partipdem', 'v2x_delibdem', 'v2x_egaldem']

# Absolute origin-minus-destination difference per index, stored on `df`
# as `<index>_diff`.
for index_name in vdem_indices:
    df[f'{index_name}_diff'] = (df[f'{index_name}_o'] - df[f'{index_name}_d']).abs()

# `df_final` keeps the diff columns but drops the raw `_o` / `_d` inputs.
raw_vdem_columns = [
    f'{index_name}{suffix}'
    for index_name in vdem_indices
    for suffix in ('_o', '_d')
]
df_final = df.drop(columns=raw_vdem_columns)

# Only four of the five diff columns are summarised here (v2x_libdem_diff is omitted).
df_final[['v2x_polyarchy_diff', 'v2x_partipdem_diff', 'v2x_delibdem_diff', 'v2x_egaldem_diff']].describe()

Unnamed: 0,v2x_polyarchy_diff,v2x_partipdem_diff,v2x_delibdem_diff,v2x_egaldem_diff
count,7400.0,7400.0,7400.0,7400.0
mean,0.272338,0.222147,0.240834,0.222306
std,0.154567,0.193483,0.152869,0.13736
min,0.0,0.0,0.0,0.0
25%,0.146,0.05,0.118,0.115
50%,0.253,0.147,0.22,0.2
75%,0.4,0.387,0.342,0.326
max,0.599,0.701,0.665,0.58


#### Goal: Define a new geopolitical distance/closeness index metric

#### 1. Metric

##### Distance metric, so higher = more distant

In [157]:
# Build the index on an independent copy so columns added to df_2 do not appear on df_final.
df_2 = df_final.copy(deep=True)

##### Invert binary variables

In [158]:
# Each closeness measure becomes a distance component via `1 - value`,
# so that higher always means "more distant".
# Mapping: new distance column -> source closeness column.
inverted_columns = {
    'lack_fta': 'fta_wto',
    'lack_arms': 'arms',
    'no_comlang_off': 'comlang_off',
    'no_comlang_ethno': 'comlang_ethno',
    'religion_distance': 'comrelig',
}
for distance_col, closeness_col in inverted_columns.items():
    df_2[distance_col] = 1 - df_2[closeness_col]

In [159]:
# Sanity check of the inverted columns; fta_wto is included so lack_fta can be
# compared against its source column.
df_2.loc[
    :,
    ['lack_fta', 'lack_arms', 'no_comlang_off', 'no_comlang_ethno', 'religion_distance', 'fta_wto'],
].describe()

Unnamed: 0,lack_fta,lack_arms,no_comlang_off,no_comlang_ethno,religion_distance,fta_wto
count,7400.0,7400.0,7400.0,7400.0,7400.0,7400.0
mean,0.92473,0.958378,0.688108,0.670676,0.939848,0.07527
std,0.263845,0.199736,0.463297,0.47,0.054896,0.263845
min,0.0,0.0,0.0,0.0,0.826,0.0
25%,1.0,1.0,0.0,0.0,0.927,0.0
50%,1.0,1.0,1.0,1.0,0.958,0.0
75%,1.0,1.0,1.0,1.0,0.976,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


##### Rescale IdealPointDistance so it's from 0-1

In [160]:
# Divide by the largest observed value so the most distant pair in the sample maps to 1.
# This is max-scaling, not min-max scaling: the smallest value is not shifted to 0.
max_ipd = df_2['IdealPointDistance'].max()
df_2['IdealPointDistance_norm'] = df_2['IdealPointDistance'].div(max_ipd)

##### Collapse v2x democracy differences into one metric

In [161]:
# Unweighted mean of the V-Dem absolute-difference columns listed below.
# NOTE(review): v2x_libdem_diff exists on the frame but is not part of this
# average - confirm the omission is intentional.
gap_components = [
    'v2x_polyarchy_diff',
    'v2x_partipdem_diff',
    'v2x_delibdem_diff',
    'v2x_egaldem_diff',
]
df_2['democracy_gap'] = sum(df_2[component] for component in gap_components) / len(gap_components)

##### GeoDistance = 1.3 * lack_fta + 0.5 * lack_arms + no_comlang_off + no_comlang_ethno + religion_distance + 1.2 * IdealPointDistance_norm + democracy_gap
##### Weightage of each variable is self-defined
##### Since we only use SIPRI arms database, which is a small dataset, to represent lack_arms, it should not have equal weightage since absence of arms transfer data may not represent "no arms ties".
##### Additionally, since free trade agreements are the most preferential type of trade agreements, it is given a higher weightage
##### We then standardize the scale from 0 to 100
##### The goal is to allow users to input their desired weightage for each variable, so they can see how geodistance varies

In [162]:
# Default weight of each distance component in the index. Edit these (or pass a
# different mapping to compute_geodistance) to see how the index responds.
GEODISTANCE_WEIGHTS = {
    'lack_fta': 1.3,
    'lack_arms': 0.5,
    'no_comlang_off': 1.0,
    'no_comlang_ethno': 1.0,
    'religion_distance': 1.0,
    'IdealPointDistance_norm': 1.2,
    'democracy_gap': 1.0,
}


def compute_geodistance(frame, weights=None):
    """Return the weighted geopolitical distance index for every row of `frame`.

    Parameters
    ----------
    frame : DataFrame containing one column per key of `weights`.
    weights : mapping of column name -> weight; defaults to GEODISTANCE_WEIGHTS.

    Returns
    -------
    Series: weighted sum of the component columns, multiplied by 100 and divided
    by the sum of the weights. Assuming every component lies in [0, 1], the
    result lies in [0, 100] whatever weights are chosen.

    Raises
    ------
    ValueError if the weights do not sum to a positive number.
    """
    if weights is None:
        weights = GEODISTANCE_WEIGHTS
    # The normaliser is derived from the weights instead of being a hard-coded 7,
    # so the scale stays correct when a weight is changed.
    total_weight = sum(weights.values())
    if total_weight <= 0:
        raise ValueError("geodistance weights must sum to a positive number")
    # Plain left-to-right addition: a missing component makes the row's index NaN.
    weighted_sum = sum(weight * frame[column] for column, weight in weights.items())
    return weighted_sum * 100 / total_weight


df_2['geodistance'] = compute_geodistance(df_2)

##### We will now evaluate the accuracy of this geodistance measure
##### The table shows the geodistance and allexports values for every country in 2020
##### We see that Philippines, India, China, Malaysia and Malta are the 5 closest countries (lowest geodistance), and New Zealand/USA/Australia are within the 10 closest
##### Note that allexports need not have high correlation with geodistance as there are unaccounted factors like physical distance and population.

In [163]:
# 2020 rows only, restricted to the three columns shown and ordered from
# lowest to highest geodistance.
geodistance_model = (
    df_2.loc[df_2['year'] == 2020, ['country_d', 'geodistance', 'allexports']]
    .sort_values(by='geodistance', ascending=True)
)

# Wrap the rendered table in a fixed-height scrollable container so the full
# list can be shown without flooding the cell output.
html_table = f"""
<div style="max-height:400px; overflow-y:auto;">
{geodistance_model.to_html(index=False)}
</div>
"""

display(HTML(html_table))

country_d,geodistance,allexports
Philippines,22.435521,7432171000.0
India,23.475034,9168108000.0
China,25.621392,51339110000.0
Malaysia,25.98596,33263820000.0
Malta,30.556924,1366019000.0
Ireland,32.590417,546995200.0
New Zealand,32.881804,1837100000.0
United States of America,33.272365,40225650000.0
Canada,34.944442,1107142000.0
Australia,35.510252,8549909000.0


##### We can at least see a negative correlation with logallexports, which is a good thing according to our assumptions

In [164]:
# Pearson correlation between the index and log exports over all rows of df_2.
df_2['geodistance'].corr(other=df_2['logallexports'], method='pearson')

-0.21674420655958582

#### Numerical meaning
##### The maximum possible score is 100, which represents a country with no shared traits/ties with Singapore that is maximally different in politics and UN votes

##### Lacking an FTA adds about 18.6 index points (1.3 × 100/7).
##### Not having arms ties adds about 7.1 index points (0.5 × 100/7).
##### Not sharing an official language adds 14.3 points.
##### Lacking an ethnolinguistic commonality adds about 14.3 index points.
##### IdealPointDistance/religion_distance/democracy_gap are continuous, so their contribution scales with the value: a country adds up to 17.1/14.3/14.3 index points respectively when it has the largest possible difference from Singapore on that metric.

##### A country with a score of 30/100, therefore, might be very similar to Singapore on most aspects, only missing one or two important linkages (e.g., lacking an FTA or having a moderate difference in political alignment).

#### Testing regression model using this metric + control variables
##### Results are bad...

In [170]:
# Regress log exports on the geodistance index plus control variables.
regressors = ['geodistance', 'pop_d', 'distcap', 'gdpcap_d']

# Keep complete cases only: rows missing the outcome or any regressor are
# dropped. dropna returns a new frame, so df_2 is left untouched without an
# explicit copy.
df_3 = df_2.dropna(subset=['logallexports', *regressors])

# add_constant supplies the intercept term for the OLS design matrix.
X = sm.add_constant(df_3[regressors])
y = df_3['logallexports']

model = sm.OLS(y, X).fit()
print(model.summary())
print("In-sample R²:", model.rsquared)

                            OLS Regression Results                            
Dep. Variable:          logallexports   R-squared:                       0.392
Model:                            OLS   Adj. R-squared:                  0.391
Method:                 Least Squares   F-statistic:                     687.7
Date:                Mon, 07 Apr 2025   Prob (F-statistic):               0.00
Time:                        23:30:43   Log-Likelihood:                -9412.5
No. Observations:                4272   AIC:                         1.884e+04
Df Residuals:                    4267   BIC:                         1.887e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
const          20.2201      0.162    124.493      