In [2]:
!pip install pandas
!pip install numpy

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [3]:
import pandas as pd

# Load the dataset (make sure the 'countries.csv' file is in the same folder as your code)
df = pd.read_csv('countries.csv')


In [4]:
df.head(5)

Unnamed: 0,Country,url,Introduction: Background,Geography: Location,Geography: Geographic coordinates,Geography: Map references,Geography: Area - total,Geography: Area - land,Geography: Area - water,Geography: Area - comparative,...,Transportation: Waterways - note 1,Transportation: Waterways - note 2,Transportation: Waterways - top ten largest natural lakes (by surface area),Transportation: Waterways - note 3,Space: Space launch site(s) - note,Transnational Issues: Refugees and internally displaced persons,Transnational Issues: Trafficking in persons - Tier 2 Watch List,Transnational Issues: Trafficking in persons - Tier 3,Transnational Issues: Illicit drugs - cocaine,Transnational Issues: Illicit drugs - opiates
0,Afghanistan,https://www.cia.gov/the-world-factbook/countri...,Ahmad Shah DURRANI unified the Pashtun tribes ...,"Southern Asia, north and west of Pakistan, eas...","33 00 N, 65 00 E",Asia,"652,230 sq km","652,230 sq km",0 sq km,almost six times the size of Virginia; slightl...,...,,,,,,,,,,
1,Akrotiri,https://www.cia.gov/the-world-factbook/countri...,,,,,,,,,...,,,,,,,,,,
2,Albania,https://www.cia.gov/the-world-factbook/countri...,After declaring independence from the Ottoman ...,"Southeastern Europe, bordering the Adriatic Se...","41 00 N, 20 00 E",Europe,"28,748 sq km","27,398 sq km","1,350 sq km",slightly smaller than Maryland,...,,,,,,,,,,
3,Algeria,https://www.cia.gov/the-world-factbook/countri...,"Algeria has known many empires and dynasties, ...","Northern Africa, bordering the Mediterranean S...","28 00 N, 3 00 E",Africa,"2,381,740 sq km","2,381,740 sq km",0 sq km,slightly less than 3.5 times the size of Texas,...,,,,,,,,,,
4,American Samoa,https://www.cia.gov/the-world-factbook/countri...,Tutuila -- the largest island in American Samo...,"Oceania, group of islands in the South Pacific...","14 20 S, 170 00 W",Oceania,224 sq km,224 sq km,0 sq km,"slightly larger than Washington, DC",...,,,,,,,,,,


In [5]:
# Select and rename only the useful columns
columns_to_clean = {
    "Economy: Real GDP (purchasing power parity)": "gdp_ppp",
    "Military and Security: Military expenditures": "military_gdp_percent",
    "People and Society: Population - total": "population",
    "People and Society: Literacy - total population": "literacy_percent",
    "People and Society: Birth rate": "birth_rate",
    "People and Society: Death rate": "death_rate"
}

# Create cleaned dataframe
clean_df = df[['Country']].copy()

# Extract numbers from string fields
for original_col, new_col in columns_to_clean.items():
    if original_col in df.columns:
        clean_df[new_col] = df[original_col].astype(str).str.extract(r'([\d\.]+)').astype(float)
    else:
        clean_df[new_col] = None

# Show cleaned data
print(clean_df.head(10))


               Country  gdp_ppp  military_gdp_percent  population  \
0          Afghanistan   82.595                   3.3        40.0   
1             Akrotiri      NaN                   NaN         NaN   
2              Albania   50.098                   2.0         3.0   
3              Algeria  699.818                   9.0        47.0   
4       American Samoa  658.000                   NaN        43.0   
5              Andorra    5.226                   NaN        85.0   
6               Angola  266.249                   1.3        37.0   
7             Anguilla      NaN                   NaN        19.0   
8           Antarctica      NaN                   NaN         NaN   
9  Antigua and Barbuda    2.703                   NaN       102.0   

   literacy_percent  birth_rate  death_rate  
0              37.3        34.2        11.8  
1               NaN         NaN         NaN  
2              98.4        12.3         7.4  
3              81.4        20.2         4.4  
4         

In [7]:
import numpy as np

# Drop rows with any missing values
clean_df = clean_df.dropna()

# Function to calculate a simple strength score
def calculate_strength(row):
    return (
        row['gdp_ppp'] +
        row['military_gdp_percent'] +
        row['literacy_percent'] +
        row['birth_rate'] -
        row['death_rate']
    )

# Generate pairs
matchups = []

for _ in range(500):  # generate 500 battles
    a, b = clean_df.sample(2, replace=False).to_dict('records')
    
    # Calculate features as difference A - B
    features = {
        'gdp_diff': a['gdp_ppp'] - b['gdp_ppp'],
        'military_diff': a['military_gdp_percent'] - b['military_gdp_percent'],
        'literacy_diff': a['literacy_percent'] - b['literacy_percent'],
        'birthrate_diff': a['birth_rate'] - b['birth_rate'],
        'deathrate_diff': a['death_rate'] - b['death_rate'],
        'label': int(calculate_strength(a) > calculate_strength(b)),
        'country_a': a['Country'],
        'country_b': b['Country']
    }
    matchups.append(features)

# Create DataFrame
battle_df = pd.DataFrame(matchups)

# Show first few battles
print(battle_df.head())


   gdp_diff  military_diff  literacy_diff  birthrate_diff  deathrate_diff  \
0   -85.742           -0.5          -25.2             9.0            -2.1   
1   -32.847           -1.1          -17.0             4.4            -8.2   
2    86.632           -0.7           -8.4            -6.1            -1.4   
3   -97.843            0.2           28.6             8.8            -0.9   
4   -80.384            1.9           -5.6             0.1           -13.1   

   label   country_a country_b  
0      0      Malawi   Bolivia  
1      0   Nicaragua   Georgia  
2      1    Cambodia   Namibia  
3      0  Tajikistan     Nepal  
4      0       Qatar   Hungary  


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Features and label
X = battle_df[['gdp_diff', 'military_diff', 'literacy_diff', 'birthrate_diff', 'deathrate_diff']]
y = battle_df['label']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.97
Report:
               precision    recall  f1-score   support

           0       1.00      0.94      0.97        52
           1       0.94      1.00      0.97        48

    accuracy                           0.97       100
   macro avg       0.97      0.97      0.97       100
weighted avg       0.97      0.97      0.97       100



In [10]:
import joblib

# Save your trained model
joblib.dump(model, 'battle_model.pkl')


['battle_model.pkl']

In [11]:
""" MODEL TRAINING THIS TIME NEW DATA"""

' MODEL TRAINING THIS TIME NEW DATA'

In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
import joblib


In [14]:
# Load your clean dataset
df = pd.read_csv("battle_ready_countries_final.csv")

# Make sure all columns are numeric
df[['gdp', 'military', 'literacy', 'birth', 'death']] = df[['gdp', 'military', 'literacy', 'birth', 'death']].apply(pd.to_numeric)

# Create pairwise battle data
# For simplicity, generate all possible matchups (excluding self-matchups)
pairs = []
labels = []

for i in range(len(df)):
    for j in range(len(df)):
        if i != j:
            country_a = df.iloc[i]
            country_b = df.iloc[j]

            # Feature: differences between stats
            features = [
                country_a['gdp'] - country_b['gdp'],
                country_a['military'] - country_b['military'],
                country_a['literacy'] - country_b['literacy'],
                country_a['birth'] - country_b['birth'],
                country_a['death'] - country_b['death']
            ]
            pairs.append(features)

            # Label: 1 if country_a wins, else 0 (simplified — assume country with higher GDP+military wins)
            label = int((country_a['gdp'] + country_a['military']) > (country_b['gdp'] + country_b['military']))
            labels.append(label)

X = np.array(pairs)
y = np.array(labels)


In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [16]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)


In [17]:
y_pred = model.predict(X_test_scaled)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      4954
           1       1.00      1.00      1.00      4948

    accuracy                           1.00      9902
   macro avg       1.00      1.00      1.00      9902
weighted avg       1.00      1.00      1.00      9902



In [20]:
import joblib

# Save the trained model and the scaler
joblib.dump(model, "battle_model.pkl")
joblib.dump(scaler, "battle_scaler.pkl")

print("✅ Model and scaler saved successfully!")


✅ Model and scaler saved successfully!


In [18]:
import pandas as pd

df = pd.read_csv("battle_ready_countries_final.csv")


In [19]:
print("Max GDP:        ", df['gdp'].max())
print("Max Military:   ", df['military'].max())
print("Max Literacy:   ", df['literacy'].max())
print("Max Birth Rate: ", df['birth'].max())
print("Max Death Rate: ", df['death'].max())


Max GDP:         978007000000.0
Max Military:    30.0
Max Literacy:    100.0
Max Birth Rate:  46.6
Max Death Rate:  18.6
