In [2]:
!pip install pandas
!pip install numpy

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [3]:
import pandas as pd

# Load the dataset (make sure the 'countries.csv' file is in the same folder as your code)
df = pd.read_csv('countries.csv')


In [8]:
df.head(5)

Unnamed: 0,Country,url,Introduction: Background,Geography: Location,Geography: Geographic coordinates,Geography: Map references,Geography: Area - total,Geography: Area - land,Geography: Area - water,Geography: Area - comparative,...,Transportation: Waterways - note 1,Transportation: Waterways - note 2,Transportation: Waterways - top ten largest natural lakes (by surface area),Transportation: Waterways - note 3,Space: Space launch site(s) - note,Transnational Issues: Refugees and internally displaced persons,Transnational Issues: Trafficking in persons - Tier 2 Watch List,Transnational Issues: Trafficking in persons - Tier 3,Transnational Issues: Illicit drugs - cocaine,Transnational Issues: Illicit drugs - opiates
0,Afghanistan,https://www.cia.gov/the-world-factbook/countri...,Ahmad Shah DURRANI unified the Pashtun tribes ...,"Southern Asia, north and west of Pakistan, eas...","33 00 N, 65 00 E",Asia,"652,230 sq km","652,230 sq km",0 sq km,almost six times the size of Virginia; slightl...,...,,,,,,,,,,
1,Akrotiri,https://www.cia.gov/the-world-factbook/countri...,,,,,,,,,...,,,,,,,,,,
2,Albania,https://www.cia.gov/the-world-factbook/countri...,After declaring independence from the Ottoman ...,"Southeastern Europe, bordering the Adriatic Se...","41 00 N, 20 00 E",Europe,"28,748 sq km","27,398 sq km","1,350 sq km",slightly smaller than Maryland,...,,,,,,,,,,
3,Algeria,https://www.cia.gov/the-world-factbook/countri...,"Algeria has known many empires and dynasties, ...","Northern Africa, bordering the Mediterranean S...","28 00 N, 3 00 E",Africa,"2,381,740 sq km","2,381,740 sq km",0 sq km,slightly less than 3.5 times the size of Texas,...,,,,,,,,,,
4,American Samoa,https://www.cia.gov/the-world-factbook/countri...,Tutuila -- the largest island in American Samo...,"Oceania, group of islands in the South Pacific...","14 20 S, 170 00 W",Oceania,224 sq km,224 sq km,0 sq km,"slightly larger than Washington, DC",...,,,,,,,,,,


In [9]:
# Select and rename only the useful columns
columns_to_clean = {
    "Economy: Real GDP (purchasing power parity)": "gdp_ppp",
    "Military and Security: Military expenditures": "military_gdp_percent",
    "People and Society: Population - total": "population",
    "People and Society: Literacy - total population": "literacy_percent",
    "People and Society: Birth rate": "birth_rate",
    "People and Society: Death rate": "death_rate"
}

# Create cleaned dataframe
clean_df = df[['Country']].copy()

# Extract numbers from string fields
for original_col, new_col in columns_to_clean.items():
    if original_col in df.columns:
        clean_df[new_col] = df[original_col].astype(str).str.extract(r'([\d\.]+)').astype(float)
    else:
        clean_df[new_col] = None

# Show cleaned data
print(clean_df.head(10))


               Country  gdp_ppp  military_gdp_percent  population  \
0          Afghanistan   82.595                   3.3        40.0   
1             Akrotiri      NaN                   NaN         NaN   
2              Albania   50.098                   2.0         3.0   
3              Algeria  699.818                   9.0        47.0   
4       American Samoa  658.000                   NaN        43.0   
5              Andorra    5.226                   NaN        85.0   
6               Angola  266.249                   1.3        37.0   
7             Anguilla      NaN                   NaN        19.0   
8           Antarctica      NaN                   NaN         NaN   
9  Antigua and Barbuda    2.703                   NaN       102.0   

   literacy_percent  birth_rate  death_rate  
0              37.3        34.2        11.8  
1               NaN         NaN         NaN  
2              98.4        12.3         7.4  
3              81.4        20.2         4.4  
4         

In [11]:
import numpy as np

# Drop rows with any missing values
clean_df = clean_df.dropna()

# Function to calculate a simple strength score
def calculate_strength(row):
    return (
        row['gdp_ppp'] +
        row['military_gdp_percent'] +
        row['literacy_percent'] +
        row['birth_rate'] -
        row['death_rate']
    )

# Generate pairs
matchups = []

for _ in range(500):  # generate 500 battles
    a, b = clean_df.sample(2, replace=False).to_dict('records')
    
    # Calculate features as difference A - B
    features = {
        'gdp_diff': a['gdp_ppp'] - b['gdp_ppp'],
        'military_diff': a['military_gdp_percent'] - b['military_gdp_percent'],
        'literacy_diff': a['literacy_percent'] - b['literacy_percent'],
        'birthrate_diff': a['birth_rate'] - b['birth_rate'],
        'deathrate_diff': a['death_rate'] - b['death_rate'],
        'label': int(calculate_strength(a) > calculate_strength(b)),
        'country_a': a['Country'],
        'country_b': b['Country']
    }
    matchups.append(features)

# Create DataFrame
battle_df = pd.DataFrame(matchups)

# Show first few battles
print(battle_df.head())


   gdp_diff  military_diff  literacy_diff  birthrate_diff  deathrate_diff  \
0   224.587            2.3           -6.1            -2.8           -11.9   
1   102.223            0.4            3.6            -1.7             2.6   
2   177.129           -0.1           33.2           -12.7            -1.7   
3   823.995            1.7           15.6           -24.3             0.4   
4    55.644            3.3          -26.5             6.2            -0.3   

   label country_a                          country_b  
0      1     Qatar                            Georgia  
1      1   Uruguay                             Mexico  
2      1     Ghana                              Benin  
3      1  Colombia  Congo, Democratic Republic of the  
4      1      Mali                            Nigeria  


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Features and label
X = battle_df[['gdp_diff', 'military_diff', 'literacy_diff', 'birthrate_diff', 'deathrate_diff']]
y = battle_df['label']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.98
Report:
               precision    recall  f1-score   support

           0       1.00      0.96      0.98        45
           1       0.96      1.00      0.98        55

    accuracy                           0.98       100
   macro avg       0.98      0.98      0.98       100
weighted avg       0.98      0.98      0.98       100

