In [182]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from IPython.display import display, HTML
from scipy.spatial import distance
from sklearn.decomposition import FactorAnalysis
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [183]:
# Read the input CSV into `df`, the frame that the following cells build on.
DATA_PATH = "../data/df_final.csv"
df = pd.read_csv(DATA_PATH)

### Variables that define geopolitical distance

##### 1. IdealPointDistance from UNGA voting records

In [184]:
# Summary statistics of the UNGA-based ideal point distance column.
ideal_point_distance = df['IdealPointDistance']
ideal_point_distance.describe()

count    7400.000000
mean        0.754700
std         0.670854
min         0.000241
25%         0.234837
50%         0.521664
75%         1.194996
max         3.642625
Name: IdealPointDistance, dtype: float64

##### 2. Dummy variables
##### fta_wto: whether they have a free trade agreement in place
##### arms: whether they conduct military arms trade for that particular year (small dataset)
##### comlang_off / comlang_ethno: whether the two countries share a common official language / a common language spoken by at least 9% of the population, respectively

##### Issue: fta_wto is highly imbalanced (only 557 of the 7,400 observations have an FTA in place); it would be better to obtain preferential trade agreement data

In [185]:
# Tabulate how often each value occurs in every dummy variable
# (one output column per variable, one row per distinct value).
dummy_cols = ['fta_wto', 'arms', 'comlang_off', 'comlang_ethno']
df[dummy_cols].apply(pd.Series.value_counts)

Unnamed: 0,fta_wto,arms,comlang_off,comlang_ethno
0,6843,7092,5092,4963
1,557,308,2308,2437


##### 3. comrelig: Religious proximity index - obtained by summing the products of the shares of Catholics, Protestants and Muslims in the origin and destination countries. Varies between 0 and 1, increases when the country pair shares a common religion practised by a large share of the population

In [186]:
# Summary statistics of the religious proximity index.
religious_proximity = df['comrelig']
religious_proximity.describe()

count    7400.000000
mean        0.060152
std         0.054896
min         0.000000
25%         0.024000
50%         0.042000
75%         0.073000
max         0.174000
Name: comrelig, dtype: float64

##### 4. V-Dem variables
##### v2x_polyarchy: captures electoral competitiveness, inclusiveness, institutional strength
##### v2x_libdem: measures how well a country upholds the principles of a liberal democracy
##### v2x_partipdem: captures how open and inclusive the political process is for the general populace
##### v2x_delibdem: extent to which political decisions are made through informed discussion and mutual respect rather than through coercion or purely strategic bargaining
##### v2x_egaldem: evaluates the degree to which a democracy ensures equality in political power and representation

##### Generally, higher = more favorable/stronger democratic quality, lower = weaknesses

In [187]:
# For each V-Dem index, store the absolute origin/destination gap on `df`,
# then build `df_final` without the raw origin/destination columns.
# Each entry is (new gap column, origin column, destination column).
vdem_pairs = [
    ('v2x_polyarchy_diff', 'v2x_polyarchy_o', 'v2x_polyarchy_d'),
    ('v2x_libdem_diff', 'v2x_libdem_o', 'v2x_libdem_d'),
    ('v2x_partipdem_diff', 'v2x_partipdem_o', 'v2x_partipdem_d'),
    ('v2x_delibdem_diff', 'v2x_delibdem_o', 'v2x_delibdem_d'),
    ('v2x_egaldem_diff', 'v2x_egaldem_o', 'v2x_egaldem_d'),
]

raw_vdem_cols = []
for diff_col, origin_col, dest_col in vdem_pairs:
    df[diff_col] = (df[origin_col] - df[dest_col]).abs()
    raw_vdem_cols.extend([origin_col, dest_col])

df_final = df.drop(columns=raw_vdem_cols)

# NOTE(review): v2x_libdem_diff is computed above but is not part of this
# summary, nor of the democracy_gap / PCA / FA inputs further down -- confirm
# that leaving it out is intentional.
summary_cols = ['v2x_polyarchy_diff', 'v2x_partipdem_diff', 'v2x_delibdem_diff', 'v2x_egaldem_diff']
df_final[summary_cols].describe()

Unnamed: 0,v2x_polyarchy_diff,v2x_partipdem_diff,v2x_delibdem_diff,v2x_egaldem_diff
count,7400.0,7400.0,7400.0,7400.0
mean,0.272338,0.222147,0.240834,0.222306
std,0.154567,0.193483,0.152869,0.13736
min,0.0,0.0,0.0,0.0
25%,0.146,0.05,0.118,0.115
50%,0.253,0.147,0.22,0.2
75%,0.4,0.387,0.342,0.326
max,0.599,0.701,0.665,0.58


### Goal: Define a new geopolitical distance index metric (i.e. lower means more geopolitically aligned)

### 1. Self defined weightage
##### Distance metric, so higher = more distant

In [188]:
# Work on an independent copy so the self-weighted index columns do not touch df_final.
df_2 = df_final.copy(deep=True)

##### Invert binary variables

In [189]:
# Flip each similarity-type column into a distance-type column (1 - value),
# so that a larger number always means "more distant".
# Mapping: new distance column -> source similarity column.
inverted_sources = {
    'lack_fta': 'fta_wto',
    'lack_arms': 'arms',
    'no_comlang_off': 'comlang_off',
    'no_comlang_ethno': 'comlang_ethno',
    'religion_distance': 'comrelig',
}
for distance_col, similarity_col in inverted_sources.items():
    df_2[distance_col] = 1 - df_2[similarity_col]

In [190]:
# Check the inverted columns; fta_wto is included next to lack_fta for comparison.
inverted_preview_cols = ['lack_fta', 'lack_arms', 'no_comlang_off', 'no_comlang_ethno', 'religion_distance', 'fta_wto']
df_2[inverted_preview_cols].describe()

Unnamed: 0,lack_fta,lack_arms,no_comlang_off,no_comlang_ethno,religion_distance,fta_wto
count,7400.0,7400.0,7400.0,7400.0,7400.0,7400.0
mean,0.92473,0.958378,0.688108,0.670676,0.939848,0.07527
std,0.263845,0.199736,0.463297,0.47,0.054896,0.263845
min,0.0,0.0,0.0,0.0,0.826,0.0
25%,1.0,1.0,0.0,0.0,0.927,0.0
50%,1.0,1.0,1.0,1.0,0.958,0.0
75%,1.0,1.0,1.0,1.0,0.976,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


##### Rescale IdealPointDistance so it's from 0-1

In [191]:
# Min-max scale IdealPointDistance into a new column so it lies on a 0-1 range.
mmScaler = MinMaxScaler()
ideal_point_raw = df_2[['IdealPointDistance']]
mmScaler.fit(ideal_point_raw)
df_2['IdealPointDistance_norm'] = mmScaler.transform(ideal_point_raw)

##### Collapse v2x democracy differences into one metric

In [192]:
# Average the four V-Dem gap columns into a single democracy_gap score.
# The columns are added one at a time, in this order, then divided by 4.
gap_total = df_2['v2x_polyarchy_diff']
for gap_col in ['v2x_partipdem_diff', 'v2x_delibdem_diff', 'v2x_egaldem_diff']:
    gap_total = gap_total + df_2[gap_col]
df_2['democracy_gap'] = gap_total / 4

##### GeoDistance = 1.3 * lack_fta + 0.5 * lack_arms + no_comlang_off + no_comlang_ethno + religion_distance + 1.2 * IdealPointDistance_norm + democracy_gap
##### Weightage of each variable is self-defined
##### lack_arms is based only on the SIPRI arms database, which is a small dataset, so it is given a lower weight than the other variables: the absence of arms-transfer data may not mean "no arms ties".
##### Additionally, since free trade agreements are the most preferential type of trade agreements, it is given a higher weightage
##### We then standardize the scale from 0 to 100
##### The goal is to allow users to input their desired weightage for each variable, so they can see how geodistance varies

In [193]:
# Self-defined weight of every component of the weighted geodistance index.
# Edit the values here to see how the index reacts to a different weighting.
GEODISTANCE_WEIGHTS = {
    'lack_fta': 1.3,
    'lack_arms': 0.5,
    'no_comlang_off': 1.0,
    'no_comlang_ethno': 1.0,
    'religion_distance': 1.0,
    'IdealPointDistance_norm': 1.2,
    'democracy_gap': 1.0,
}

# Weighted sum of the components.
weighted_sum = sum(weight * df_2[col] for col, weight in GEODISTANCE_WEIGHTS.items())

# Rescale so the index tops out at 100 when every component equals 1.
# The normaliser is derived from the weights (7.0 for the values above) rather
# than hard-coded, so changing a weight keeps the 0-100 scale intact.
total_weight = sum(GEODISTANCE_WEIGHTS.values())
df_2['geodistance_weighted'] = weighted_sum * 100 / total_weight

#### Numerical meaning
##### The maximum possible score is 100, which represents a country that shares no traits/ties with Singapore and is maximally different from it in politics and UN votes

##### Lacking an FTA adds about 18.6 index points (1.3 × 100/7).
##### Not having arms ties adds about 7.1 index points (0.5 × 100/7).
##### Not sharing an official language adds about 14.3 index points (1 × 100/7).
##### Lacking an ethnolinguistic commonality adds about 14.3 index points.
##### IdealPointDistance_norm, religion_distance and democracy_gap are continuous: a country adds up to 17.1 / 14.3 / 14.3 index points respectively when it has the highest possible difference with Singapore on that metric, and proportionally fewer points otherwise.

##### A country with a score of 30/100 is therefore likely to be very similar to Singapore on most aspects, missing only one or two important linkages (e.g., lacking an FTA or having a moderate difference in political alignment).

### 2. PCA
##### Distance-based metric, so higher = worse

In [194]:
# Independent copy of df_final for the PCA-based index.
df_3 = df_final.copy(deep=True)

##### Normalise variables so they have the same scale, similar to earlier

In [195]:
# Turn the similarity-type columns into distance-type columns (1 - value).
# Mapping: new distance column -> source similarity column.
inverted_sources = {
    'lack_fta': 'fta_wto',
    'lack_arms': 'arms',
    'no_comlang_off': 'comlang_off',
    'no_comlang_ethno': 'comlang_ethno',
    'religion_distance': 'comrelig',
}
for distance_col, similarity_col in inverted_sources.items():
    df_3[distance_col] = 1 - df_3[similarity_col]

# Min-max scale IdealPointDistance into a new *_norm column.
mmScaler = MinMaxScaler()
ideal_point_raw = df_3[['IdealPointDistance']]
mmScaler.fit(ideal_point_raw)
df_3['IdealPointDistance_norm'] = mmScaler.transform(ideal_point_raw)

##### Standardization

In [196]:
# Continuous inputs: these are z-scored in place below.
non_binary_cols = [
    'religion_distance',
    'IdealPointDistance_norm',
    'v2x_polyarchy_diff',
    'v2x_partipdem_diff',
    'v2x_delibdem_diff',
    'v2x_egaldem_diff',
]
# Full model input: the four 0/1 columns followed by the continuous ones.
features = ['lack_fta', 'lack_arms', 'no_comlang_off', 'no_comlang_ethno'] + non_binary_cols

# NOTE(review): only the continuous columns are standardised; the binary
# columns keep their 0/1 scale when they enter the model -- confirm that this
# difference in scale is intended.
scaler = StandardScaler()
scaler.fit(df_3[non_binary_cols])
df_3[non_binary_cols] = scaler.transform(df_3[non_binary_cols])

# Plain array of all model inputs, in `features` order.
X_std = df_3[features].to_numpy()

##### Metric modeling

In [197]:
# Reduce the inputs to a single principal component and keep its scores.
pca = PCA(n_components=1)
X_pca = pca.fit_transform(X_std)
first_pc_scores = X_pca[:, 0]

# Store the raw scores, then min-max rescale them onto a 0-100 range.
scaler_0_100 = MinMaxScaler(feature_range=(0, 100))
df_3['geodistance_pca'] = first_pc_scores
df_3['geodistance_pca'] = scaler_0_100.fit_transform(df_3[['geodistance_pca']])

In [198]:
# Report how strongly each input feature loads on the first principal
# component, followed by the share of variance that component explains.
loadings = pca.components_[0]
explained_variance = pca.explained_variance_ratio_[0]

print("Loadings (first principal component):")
for crit, loading in zip(features, loadings):
    print(f"{crit}: {loading}")

print("\nExplained Variance Ratio for the first PC:", explained_variance)

Loadings (first principal component):
lack_fta: -0.018693148743366538
lack_arms: -0.029752891001782023
no_comlang_off: -0.01195865824288821
no_comlang_ethno: -0.006073961401492287
religion_distance: 0.2018043847043678
IdealPointDistance_norm: 0.3144712986890842
v2x_polyarchy_diff: 0.48373966190138046
v2x_partipdem_diff: 0.43258458458033133
v2x_delibdem_diff: 0.48172870318364275
v2x_egaldem_diff: 0.4536212020881696

Explained Variance Ratio for the first PC: 0.5741573360217382


### 3. Factor Analysis
##### Distance-based metric, so higher = worse

In [199]:
# Independent copy of df_final for the factor-analysis-based index.
df_4 = df_final.copy(deep=True)

##### Normalise variables so they have the same scale, similar to earlier

In [200]:
# Turn the similarity-type columns into distance-type columns (1 - value).
# Mapping: new distance column -> source similarity column.
inverted_sources = {
    'lack_fta': 'fta_wto',
    'lack_arms': 'arms',
    'no_comlang_off': 'comlang_off',
    'no_comlang_ethno': 'comlang_ethno',
    'religion_distance': 'comrelig',
}
for distance_col, similarity_col in inverted_sources.items():
    df_4[distance_col] = 1 - df_4[similarity_col]

# Min-max scale IdealPointDistance into a new *_norm column.
mmScaler = MinMaxScaler()
ideal_point_raw = df_4[['IdealPointDistance']]
mmScaler.fit(ideal_point_raw)
df_4['IdealPointDistance_norm'] = mmScaler.transform(ideal_point_raw)

In [201]:
# Continuous inputs: these are z-scored in place below.
non_binary_cols = [
    'religion_distance',
    'IdealPointDistance_norm',
    'v2x_polyarchy_diff',
    'v2x_partipdem_diff',
    'v2x_delibdem_diff',
    'v2x_egaldem_diff',
]
# Full model input: the four 0/1 columns followed by the continuous ones.
features = ['lack_fta', 'lack_arms', 'no_comlang_off', 'no_comlang_ethno'] + non_binary_cols

# Only the continuous columns are standardised; the binary columns stay 0/1.
scaler = StandardScaler()
scaler.fit(df_4[non_binary_cols])
df_4[non_binary_cols] = scaler.transform(df_4[non_binary_cols])

# Plain array of all model inputs, in `features` order.
X_std = df_4[features].to_numpy()

##### Metric Modeling

In [202]:
# Fit a one-factor model (fixed random_state) and keep the factor scores.
fa = FactorAnalysis(n_components=1, random_state=222)
X_fa = fa.fit_transform(X_std)
factor_scores = X_fa[:, 0]

# Store the raw scores, then min-max rescale them onto a 0-100 range.
scaler_0_100 = MinMaxScaler(feature_range=(0, 100))
df_4['geodistance_fa'] = factor_scores
df_4['geodistance_fa'] = scaler_0_100.fit_transform(df_4[['geodistance_fa']])

# One loading per input feature for the single factor.
loadings = fa.components_
loading_df = pd.DataFrame({'Factor Loading': loadings[0]}, index=features)
print("Factor Loadings:")
print(loading_df)

Factor Loadings:
                         Factor Loading
lack_fta                      -0.037986
lack_arms                     -0.049428
no_comlang_off                -0.014331
no_comlang_ethno              -0.006583
religion_distance              0.255736
IdealPointDistance_norm        0.482538
v2x_polyarchy_diff             0.949754
v2x_partipdem_diff             0.746940
v2x_delibdem_diff              0.961511
v2x_egaldem_diff               0.886888


### Metric Comparison & Evaluation

In [203]:
# Gather the three index variants (weighted, PCA, FA) side by side on df_final.
metric_sources = (
    ('geodistance_weighted', df_2),
    ('geodistance_pca', df_3),
    ('geodistance_fa', df_4),
)
for metric_col, source_frame in metric_sources:
    df_final[metric_col] = source_frame[metric_col]

#### Eyeball comparison

In [207]:
# Rows for year 2020 only, ordered from the smallest to the largest weighted index.
is_2020 = df_final['year'] == 2020
display_cols = ['country_d', 'allexports', 'geodistance_weighted', 'geodistance_pca', 'geodistance_fa']
geodistance_model = df_final.loc[is_2020, display_cols].sort_values(by='geodistance_weighted')

# Wrap the table in a fixed-height, scrollable container.
html_table = f"""
<div style="max-height:400px; overflow-y:auto;">
{geodistance_model.to_html(index=False)}
</div>
"""

display(HTML(html_table))

country_d,allexports,geodistance_weighted,geodistance_pca,geodistance_fa
Philippines,7432171000.0,22.434407,12.583194,6.721874
India,9168108000.0,23.473969,13.944992,9.249732
China,51339110000.0,25.620331,35.838157,40.727287
Malaysia,33263820000.0,25.984928,44.647384,55.855032
Malta,1366019000.0,30.556104,54.691618,51.109239
Ireland,546995200.0,32.58961,73.960915,75.204809
New Zealand,1837100000.0,32.881,73.486169,71.563957
United States of America,40225650000.0,33.272183,66.698213,53.465876
Canada,1107142000.0,34.943834,69.449791,65.833236
Australia,8549909000.0,35.50966,71.904125,68.483771


#### Correlation Check

In [205]:
# Keep only rows with an observed trade flow, then correlate each index
# variant with that trade flow.
df_test = df_final.dropna(subset=['tradeflow_comtrade_o'])
trade_flow = df_test['tradeflow_comtrade_o']

columns_to_corr = ['geodistance_weighted', 'geodistance_fa', 'geodistance_pca']
corr_dict = {}
for col in columns_to_corr:
    corr_dict[col] = df_test[col].corr(trade_flow)

corr_df = pd.DataFrame({
    'Geodistance Metric': list(corr_dict.keys()),
    'Correlation': list(corr_dict.values()),
})

print(corr_df)

     Geodistance Metric  Correlation
0  geodistance_weighted    -0.310615
1        geodistance_fa     0.211020
2       geodistance_pca     0.222855


#### Linear Regression Check

In [206]:
# Compare the three index variants as predictors of log trade flow. Each model
# uses one index plus the same controls (distcap, gdpcap_d, pop_d) and is
# scored on the data it was fitted on (in-sample RMSE and R²).
# LinearRegression, mean_squared_error and r2_score come from the notebook's
# top import cell.


def _fit_and_score(X, y):
    """Fit a linear regression on (X, y) and score it on the same data.

    Returns a tuple (fitted model, predictions, RMSE, R²).
    """
    model = LinearRegression().fit(X, y)
    y_pred = model.predict(X)
    rmse = np.sqrt(mean_squared_error(y, y_pred))
    return model, y_pred, rmse, r2_score(y, y_pred)


# The target is log-transformed, so a zero or negative trade flow would turn
# into -inf / NaN; fail early with an explicit message instead.
trade_flow = df_test['tradeflow_comtrade_o']
if not (trade_flow > 0).all():
    raise ValueError("tradeflow_comtrade_o must be strictly positive to take its log")
y = np.log(trade_flow)

control_cols = ['distcap', 'gdpcap_d', 'pop_d']
X_weighted = df_test[['geodistance_weighted'] + control_cols]
X_pca = df_test[['geodistance_pca'] + control_cols]
X_fa = df_test[['geodistance_fa'] + control_cols]

model_weighted, y_pred_weighted, rmse_weighted, r2_weighted = _fit_and_score(X_weighted, y)
model_pca, y_pred_pca, rmse_pca, r2_pca = _fit_and_score(X_pca, y)
model_fa, y_pred_fa, rmse_fa, r2_fa = _fit_and_score(X_fa, y)

print("Model using weighted composite:")
print("RMSE:", rmse_weighted, "R²:", r2_weighted)
print("Model using PCA composite:")
print("RMSE:", rmse_pca, "R²:", r2_pca)
print("Model using FA composite:")
print("RMSE:", rmse_fa, "R²:", r2_fa)

Model using weighted composite:
RMSE: 2.158818715773443 R²: 0.36357001048210913
Model using PCA composite:
RMSE: 2.1066479595617973 R²: 0.393958700015682
Model using FA composite:
RMSE: 2.1137825374261183 R²: 0.3898467926544591
