# Even Splice Formatter

This notebook is the final formatter. It splits the data so that the training set contains an equal number of war and no-war dyads, while the test set keeps the original imbalance: a small number of wars and a large majority of no-wars.

In [5]:
import pandas as pd
import numpy as np
from scipy.io import savemat
from itertools import product
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

Now, we load the preformatted data from the .csv files.

In [6]:
# Read the three preformatted CSV files into `wars`, `countries`, and `alliances`
# (in that order).
wars, countries, alliances = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/utf8-war.csv',
        '/content/utf8-country_data_new.csv',
        '/content/utf8-alliance.csv',
    )
)

Next, we strip stray leading and trailing whitespace from the column names and from the country-name values, so that later lookups between the tables match exactly.

In [7]:
# Trim surrounding whitespace from every column header in the three frames.
for frame in (wars, countries, alliances):
    frame.columns = frame.columns.str.strip()

# Trim surrounding whitespace from the country-name values used for matching.
for frame, name_col in (
    (wars, 'Country A'),
    (wars, 'Country B'),
    (countries, 'Country'),
):
    frame[name_col] = frame[name_col].str.strip()

Now, we list the base numeric columns that will be converted to numeric values in the next step.


In [8]:
# Names of the country-level columns that are treated as numeric features.
# The order is unchanged; comments only group neighbouring entries by theme.
num_cols = [
    # headline indicators
    'GDP per Capita (USD)',
    'Military Expenditure (Percent of GDP)',
    'Political Stability (GPI Rank)',
    # macro-economy
    'Inflation Rate (%)',
    'Trade Openness (% of GDP)',
    'Debt-to-GDP (%)',
    # military
    'Active Personnel (per 1k pop)',
    'Global Firepower Rank',
    # governance
    'Democracy Index (0-10)',
    'Corruption Index (0-100)',
    # geography and demography
    'Bordering Countries Count',
    'Area (1000 km²)',
    'Population (millions)',
    'Urbanization (%)',
    # diplomacy
    'UN Voting Alignment (US=1)',
    'Treaty Participation',
    # conflict history
    'History of Interstate War (count)',
    'Terrorism Index (0-10)',
]

We will remove all commas and other symbols from the data in the numeric columns, so they become pure `float` values. We will also convert the two binary flags to 0/1 integers (`'Yes'` → 1, `'No'` → 0).

In [9]:
# Character class matching everything that is NOT a digit, '.', or '-'.
# Written as a raw string so the `\-` reaches the regex engine unchanged; in a
# plain string literal `\-` is an invalid escape sequence and Python warns about it.
NON_NUMERIC_CHARS = r'[^0-9.\-]'

# Coerce each listed numeric column that exists in `countries` to float:
# strip the non-numeric characters, turn emptied cells into NaN, then cast.
for col in num_cols:
    if col in countries.columns:
        countries[col] = (
            countries[col]
            .replace(NON_NUMERIC_CHARS, '', regex=True)
            .replace('', np.nan)
            .astype(float)
        )

# Encode the two Yes/No flags as 1/0. Any value other than exactly 'Yes' or 'No'
# becomes NaN under `.map`.
# NOTE(review): re-running this cell on already-encoded flags maps 1/0 to NaN,
# so reload the CSVs before executing it again.
countries['Nuclear Armed'] = countries['Nuclear Armed'].map({'Yes': 1, 'No': 0})
countries['Recent Conflict (2003–2025)'] = countries['Recent Conflict (2003–2025)'].map({'Yes': 1, 'No': 0})

  .replace('[^0-9.\-]', '', regex=True)


We then handle any missing (`NaN`) values. These values arose from a lack of data in public databases.

In [10]:
# Replace NaN with the column mean for the numeric columns. Only columns that
# actually exist in the frame are touched, matching the
# `if col in countries.columns` guard used when these columns were converted to
# float; indexing with the full `num_cols` list would raise KeyError if one
# were missing.
present_num_cols = [col for col in num_cols if col in countries.columns]
if present_num_cols:
    countries[present_num_cols] = countries[present_num_cols].apply(
        lambda col: col.fillna(col.mean()), axis=0
    )

# For categorical (object-dtype) columns other than the country name,
# fill NaN with 'Unknown'.
cat_cols = countries.select_dtypes(include='object').columns.drop('Country')
countries[cat_cols] = countries[cat_cols].fillna('Unknown')

# One-hot encode the categorical columns as integer 0/1 indicator columns.
countries = pd.get_dummies(countries, columns=cat_cols, dtype=int)

# Standardize every numeric column except the two binary flags. Because the
# indicator columns created above are integer-typed, they are selected here
# and standardized as well.
numeric_cols = countries.select_dtypes(include=[np.number]).columns.drop(['Nuclear Armed', 'Recent Conflict (2003–2025)'])
scaler = StandardScaler()
countries[numeric_cols] = scaler.fit_transform(countries[numeric_cols])

Next, we create dyads for each pair of countries and convert them to arrays.

In [11]:
# Ordered dyads: every (a, b) with a != b, so each unordered pair of countries
# appears twice, once in each direction, with the same label.
# NOTE(review): a later random train/test split can place (a, b) in training
# and its mirror (b, a) in testing — confirm this is intended.
dyads = [(a, b) for a, b in product(countries['Country'], countries['Country']) if a != b]

# Build the lookups once instead of re-scanning `countries` and `wars` with a
# boolean mask for every dyad. Keeping the first row per country name matches
# the original `.iloc[0]` lookup.
feature_table = countries.drop_duplicates('Country').set_index('Country')
country_features = dict(zip(feature_table.index, feature_table.to_numpy(dtype=float)))

# A war between A and B counts for both orderings, so store each war as an
# unordered pair.
war_pairs = {frozenset(pair) for pair in zip(wars['Country A'], wars['Country B'])}

# One (features, label) tuple per dyad: the first country's features followed
# by the second's, and label 1 when the pair appears in `wars`.
data = [
    (
        np.concatenate([country_features[a], country_features[b]]),
        int(frozenset((a, b)) in war_pairs),
    )
    for a, b in dyads
]

# convert to arrays
X = np.array([features for features, _ in data], dtype=float)
y = np.array([label for _, label in data], dtype=int)

Next, we split the formatted data into training (evenly spliced) and testing (imbalanced, 99% no war 1% war).

In [12]:
# One seed drives both the stratified split and the undersampling below, so the
# balanced training set is reproducible from a fresh kernel.
SEED = 42

# Stratified 80/20 split: the test set keeps the class ratio of `y`.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)

# Balance the training set by undersampling the negative class.
rng = np.random.default_rng(SEED)
ones = np.flatnonzero(y_train == 1)
zeros = np.flatnonzero(y_train == 0)
n = len(ones)

if n > 0:  # Avoid crash if no 1s
    # Never request more negatives than exist; sampling without replacement
    # would raise otherwise.
    selected_zeros = rng.choice(zeros, min(n, len(zeros)), replace=False)
    balanced_idx = np.concatenate([ones, selected_zeros])
else:
    # Fallback (no positives): keep every negative. Copy so the shuffle below
    # does not reorder `zeros` through an alias.
    balanced_idx = zeros.copy()

rng.shuffle(balanced_idx)

X_train_bal = X_train[balanced_idx]
y_train_bal = y_train[balanced_idx]

Finally, we print our stats to ensure we've done everything right, standardize the features, and export the data into `.mat` files.

In [13]:
# Class counts, as a sanity check on the balanced training set and the
# untouched test set.
print("Train set (balanced):")
print("  y_train=1:", np.sum(y_train_bal == 1))
print("  y_train=0:", np.sum(y_train_bal == 0))
print("Test set (original imbalance):")
print("  y_test=1:", np.sum(y_test == 1))
print("  y_test=0:", np.sum(y_test == 0))
print("Total dyads:", len(y))

# Fit the scaler on the balanced training features only, then apply the same
# transform to the test features. `StandardScaler` comes from the import cell
# at the top of the notebook, so it is not re-imported here.
scaler = StandardScaler()
X_train_bal = scaler.fit_transform(X_train_bal)
X_test = scaler.transform(X_test)

# export: one .mat file per array, with the variable name matching the file stem.
# NOTE(review): the label vectors are 1-D; savemat's default `oned_as='row'`
# stores them as row vectors — confirm the consumer expects that orientation.
exports = {
    'X_train': X_train_bal,
    'y_train': y_train_bal,
    'X_test': X_test,
    'y_test': y_test,
}
for var_name, array in exports.items():
    savemat(f'{var_name}.mat', {var_name: array})

print("Saved X_train.mat, y_train.mat, X_test.mat, y_test.mat")

Train set (balanced):
  y_train=1: 21
  y_train=0: 21
Test set (original imbalance):
  y_test=1: 5
  y_test=0: 547
Total dyads: 2756
Saved X_train.mat, y_train.mat, X_test.mat, y_test.mat
