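This notebook imports data on countries, their conflicts, and their alliances, and builds one dyad for every unordered pair of countries. Each dyad becomes a feature row (both countries' features plus relational flags) with a binary label indicating whether the two countries appear together in the war data.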

In [57]:
import os
from itertools import combinations

import numpy as np
import pandas as pd
from scipy.io import savemat
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

The following block imports the .csv files.

In [44]:
# Locations of the three UTF-8 encoded source tables (absolute paths under /content).
country_path, wars_path, alliances_path = (
    '/content/utf8-country_data.csv',
    '/content/utf8-war.csv',
    '/content/utf8-alliance.csv',
)

# Read each table into its own DataFrame, in the same order as the paths above.
countries, wars, alliances = (
    pd.read_csv(csv_path) for csv_path in (country_path, wars_path, alliances_path)
)

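Now, we define the column names, clean and min-max scale the numeric columns, and encode the remaining columns: Yes/No flags become binary values and categorical columns are one-hot encoded.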

In [50]:
# Continuous features; these are rescaled with MinMaxScaler below.
numeric_cols = ["GDP per Capita (USD)", "Military Expenditure (Percent of GDP)", "Political Stability (GPI Rank)"]

# Clean the 'GDP per Capita (USD)' column by removing commas and converting to numeric
countries["GDP per Capita (USD)"] = countries["GDP per Capita (USD)"].astype(str).str.replace(",", "").astype(float)

scaler = MinMaxScaler()
countries[numeric_cols] = scaler.fit_transform(countries[numeric_cols])

# binary flags
binary_map = {"Yes": 1, "No": 0}
binary_cols = ["Nuclear Armed", "Recent Conflict (2003–2025)"]
for col in binary_cols:
    # Only map columns that still hold the raw "Yes"/"No" text. This cell
    # mutates `countries` in place, so on a re-run the columns are already
    # numeric and mapping them again would turn every value into NaN.
    if not pd.api.types.is_numeric_dtype(countries[col]):
        countries[col] = countries[col].map(binary_map)

# categorical flags
categorical_cols = ["Economic Ideology", "Religion", "Ethnic Majority", "Religious Majority"]
# Convert categorical columns to string type to handle potential non-string values
# (missing values become the literal category "nan").
for col in categorical_cols:
    countries[col] = countries[col].astype(str)

# One-hot encode the categorical columns; the result is densified and indexed
# like `countries` so the two frames can be aligned row by row.
encoder = OneHotEncoder()
encoded = encoder.fit_transform(countries[categorical_cols])
encoded_df = pd.DataFrame(encoded.toarray(), index=countries.index, columns=encoder.get_feature_names_out(categorical_cols))

Now, we merge the processed features and create a lookup dictionary.

In [51]:
# Columns taken straight from the cleaned frame, in the order they appear in
# each feature vector: identifier, scaled numerics, then the two binary flags.
base_cols = ["Country", *numeric_cols, "Nuclear Armed", "Recent Conflict (2003–2025)"]
base_features = countries[base_cols]

# Place the one-hot columns to the right of the base columns.
countries_proc = pd.concat([base_features, encoded_df], axis=1)

# Lookup: country name -> {feature name: value}.
indexed_by_country = countries_proc.set_index("Country")
country_dict = indexed_by_country.to_dict(orient="index")

Now, we build all the dyads which will comprise our training data.

In [55]:
def _non_null_values(frame):
    """Return the set of non-missing cell values found anywhere in ``frame``.

    Cells are flattened *before* missing values are removed. Calling
    ``DataFrame.dropna()`` first would discard an entire row as soon as one
    of its cells is empty, which loses every entry of a partially filled row.
    """
    flat = frame.to_numpy().ravel()
    return set(flat[pd.notna(flat)])


country_names = list(countries["Country"])
dyads = list(combinations(country_names, 2))  # unordered pairs

# Everything below depends on a single country (or on the war table only),
# so it is computed once here instead of once per dyad inside the loop.

# Feature vector per country, in the column order stored in country_dict.
feature_vec = {name: np.array(list(country_dict[name].values())) for name in country_names}

# Allies listed for each country: every column after the first in `alliances`
# (assumes the first column is "Country" -- TODO confirm against the CSV).
allies_of = {
    name: _non_null_values(alliances.loc[alliances["Country"] == name].iloc[:, 1:])
    for name in country_names
}

# Countries named in either land-dispute column of each country's row.
disputes_of = {
    name: _non_null_values(
        countries.loc[countries["Country"] == name, ["Land Disputes", "Land Disputes 2"]]
    )
    for name in country_names
}

# Wars as unordered pairs, so (A, B) and (B, A) match the same dyad.
war_pairs = {frozenset(pair) for pair in zip(wars["Country A"], wars["Country B"])}

X_rows, y_rows = [], []

for a, b in dyads:
    # alliances
    a_lists_b = b in allies_of[a]
    b_lists_a = a in allies_of[b]
    direct_ally = int(a_lists_b or b_lists_a)
    shared_ally = int(not allies_of[a].isdisjoint(allies_of[b]))
    # exactly one side lists the other as an ally
    asymmetric_ally = int(a_lists_b != b_lists_a)

    # land disputes
    land_dispute = int((b in disputes_of[a]) or (a in disputes_of[b]))

    relational = np.array([direct_ally, shared_ally, asymmetric_ally, land_dispute])

    # label is 1 if the war table contains this pair in either order, else 0
    label = int(frozenset((a, b)) in war_pairs)

    # row layout: features of a, features of b, then the four relational flags
    X_rows.append(np.concatenate([feature_vec[a], feature_vec[b], relational]))
    y_rows.append(label)

Finally, we stack the rows into the arrays X and y, save them as .mat files, and print a confirmation message along with the shapes of both arrays.

In [56]:
# Stack the per-dyad rows into a 2-D feature matrix and a column vector of labels.
X = np.vstack(X_rows)
y = np.array(y_rows).reshape(-1, 1)

# Save to .mat
# savemat does not create missing directories, so make sure the relative
# output folder exists (e.g. on a fresh runtime) before writing.
os.makedirs("data", exist_ok=True)
savemat("data/X.mat", {"X": X})
savemat("data/y.mat", {"y": y})

print("✅ Saved X.mat and y.mat")
print("X shape:", X.shape)
print("y shape:", y.shape)

✅ Saved X.mat and y.mat
X shape: (1378, 152)
y shape: (1378, 1)
