# Data Cleaning and database creation

## Battles list

In [22]:
import pandas as pd
import re

def clean_battles_csv(
    csv_path: str,
    output_csv: str = "clean_battles.csv"
):
    # Load CSV
    df = pd.read_csv(csv_path)

    # Step 1: Clean battle_name
    def clean_battle_name(name):
        name = str(name)
        if "(" in name and ")" in name:
            match = re.search(r'\(([^)]+)\)', name)
            if match:
                content = match.group(1)
                if not content.isalpha():
                    name = re.sub(r'\([^)]*\)', '', name)
        return name.strip()

    df["battle_name"] = df["battle_name"].apply(clean_battle_name)

    # Step 2: Fill year column
    def extract_year(row):
        if pd.notnull(row["year"]):
            return row["year"]
        desc = str(row["description"])
        parts = desc.split('-')

        if len(parts) > 1 and any(x in parts[1] for x in ["BCE", "BC"]):
            try:
                year_candidate = int(re.findall(r'\d+', parts[1])[0])
                return -1 * year_candidate
            except:
                return None
        elif len(parts) > 0 and any(x in parts[0] for x in ["BCE", "BC"]):
            match = re.search(r'(\d+)\s*(BCE|BC)', parts[0])
            if match:
                return -1 * int(match.group(1))
        return None

    df["year"] = df.apply(extract_year, axis=1)

    # Step 3: Clean and fill conflict
    def extract_conflict(row):
        conflict = row.get("conflict", None)
        if pd.notnull(conflict):
            raw = str(conflict)
        else:
            desc_parts = str(row["description"]).split('-')
            if len(desc_parts) > 2:
                raw = desc_parts[2]
            else:
                return None

        # Normalize specific war names
        raw = re.sub(r'^.*(World War I).*$', 'World War I', raw)
        raw = re.sub(r'^.*(World War II).*$', 'World War II', raw)
        raw = re.sub(r'\bWWI\b', 'World War I', raw)
        raw = re.sub(r'\bWWII\b', 'World War II', raw)

        # Remove numeric or symbol prefix like '1645 -' or '? -'
        raw = re.sub(r'^[\d\?\s]*-\s*', '', raw)

        # Clean up unmatched characters and stray symbols
        raw = raw.strip()
        raw = re.sub(r'^[\d\s]*\)\-?', '', raw)
        raw = re.sub(r'^[\)\-,\s]+', '', raw)
        raw = re.sub(r'[\(\)]', '', raw)

        return raw.strip() if raw else None

    df["conflict"] = df.apply(extract_conflict, axis=1)

    # Step 4: Format columns and handle missing values
    df["battle_id"] = pd.to_numeric(df["battle_id"], errors="coerce").astype("Int64")
    df["year"] = pd.to_numeric(df["year"], errors="coerce").astype("Int64")
    df["description"] = df["description"].replace("None", pd.NA).astype("string")
    df["conflict"] = df["conflict"].replace("None", pd.NA).astype("string")
    if "wiki_link" not in df.columns:
        df["wiki_link"] = pd.NA
    df["wiki_link"] = df["wiki_link"].replace("None", pd.NA).astype("string")

    # Save cleaned version to CSV
    df.to_csv(output_csv, index=False)
    print(f"✅ Cleaned CSV saved as '{output_csv}' with {len(df)} rows.")



In [25]:
clean_battles_csv('/Users/louis/Desktop/Coding/Github/WarCast/Pre-clean data/battles_list.csv')

✅ Cleaned CSV saved as 'clean_battles.csv' with 962 rows.


### Upload to database file

In [26]:
import pandas as pd
import sqlite3

def insert_clean_csv_to_sqlite(
    csv_path: str,
    db_path: str,
    table_name: str,
    schema: str,
    replace_table: bool = False
):
    # Load the cleaned CSV
    df = pd.read_csv(csv_path)

    # Connect to the SQLite database
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()

    # Create table if needed (you define schema manually)
    cursor.execute(f"CREATE TABLE IF NOT EXISTS {table_name} ({schema});")

    # Choose insertion mode
    if replace_table:
        insertion_mode = "replace"
    else:
        insertion_mode = "append"

    # Insert the data
    df.to_sql(table_name, conn, if_exists=insertion_mode, index=False)

    conn.commit()
    conn.close()
    print(f"✅ Inserted {len(df)} rows into table '{table_name}' in '{db_path}'.")



In [27]:
insert_clean_csv_to_sqlite(
    csv_path='/Users/louis/Desktop/Coding/Github/WarCast/Clean data/clean_battles.csv',
    db_path="warcast.db",
    table_name="battles",
    schema="""
        battle_id INTEGER PRIMARY KEY,
        battle_name TEXT,
        year INTEGER,
        description TEXT,
        conflict TEXT,
        wiki_link TEXT
    """,
    replace_table=True
)

✅ Inserted 962 rows into table 'battles' in 'warcast.db'.


## Battle info

### Primary key resetting and column formatting

In [37]:
import pandas as pd

def clean_participants_csv(
    csv_path: str = "battle_participants.csv",
    output_path: str = "clean_battle_participants.csv"
):
    # Load CSV
    df = pd.read_csv(csv_path)

    # Step 1: Reset `participant_id` column
    if "participant_id" in df.columns:
        df.drop(columns=["participant_id"], inplace=True)
    df.insert(0, "participant_id", df.index + 1)

    # Step 2: Enforce specific formats
    df["participant_id"] = pd.to_numeric(df["participant_id"], downcast="integer")
    df["battle_id"] = pd.to_numeric(df["battle_id"], errors="coerce").astype("Int64")
    df["country"] = df["country"].astype("string")
    df["troops"] = pd.to_numeric(df["troops"], errors="coerce").astype("Int64")
    df["deaths"] = pd.to_numeric(df["deaths"], errors="coerce").astype("Int64")
    df["result"] = df["result"].astype("string")

    # Step 3: Save cleaned file
    df.to_csv(output_path, index=False)
    print(f"✅ Cleaned participant data saved to '{output_path}' with {len(df)} rows.")


In [40]:
clean_participants_csv('/Users/louis/Desktop/Coding/Github/WarCast/Pre-clean data/battle_info.csv')

✅ Cleaned participant data saved to 'clean_battle_participants.csv' with 3182 rows.


### Upload to the database

In [48]:
insert_clean_csv_to_sqlite(
    csv_path='/Users/louis/Desktop/Coding/Github/WarCast/Clean data/clean_battle_info.csv',
    db_path="warcast.db",
    table_name="battle_info",
    schema="""
        participant_id INTEGER PRIMARY KEY,
        battle_id INTEGER,
        country TEXT,
        troops INTEGER,
        deaths INTEGER,
        result TEXT
    """,
    replace_table=True
)

✅ Inserted 3182 rows into table 'battle_info' in 'warcast.db'.


## GDP

In [28]:
import pandas as pd
import sqlite3

def process_gdp_excel_to_sqlite(
    excel_path: str,
    db_path: str = "warcast.db",
    table_name: str = "gdp"
):
    # Step 1: Load Excel
    df = pd.read_excel(excel_path)
    
    # Step 2: Melt into long format
    df_melted = df.melt(
        id_vars=["Country Name"],
        var_name="year",
        value_name="gdp"
    )
    
    # Step 3: Clean
    df_melted.rename(columns={"Country Name": "country"}, inplace=True)
    df_melted = df_melted.dropna(subset=["gdp"])
    df_melted["year"] = pd.to_numeric(df_melted["year"], errors="coerce").dropna().astype(int)
    df_melted["gdp"] = pd.to_numeric(df_melted["gdp"], errors="coerce").round(2)

    # Step 4: Add surrogate key (optional if you're going to use AUTOINCREMENT in SQL)
    df_melted = df_melted.sort_values(by=["country", "year"]).reset_index(drop=True)
    df_melted.insert(0, "id", df_melted.index + 1)

    # Step 5: Save cleaned CSV (optional but useful for inspection)
    df_melted.to_csv("clean_gdp.csv", index=False)

    # Step 6: Create or insert into SQLite
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()

    # Create table (safe to run repeatedly due to IF NOT EXISTS)
    cursor.execute(f"""
    CREATE TABLE IF NOT EXISTS {table_name} (
        id INTEGER PRIMARY KEY,
        country TEXT,
        year INTEGER,
        gdp REAL,
        UNIQUE (country, year)
    );
    """)

    # Insert into database
    df_melted.to_sql(table_name, conn, if_exists="append", index=False)

    conn.commit()
    conn.close()

    print(f"✅ Processed and saved GDP data to table '{table_name}' in {db_path}")


In [29]:
process_gdp_excel_to_sqlite('/Users/louis/Desktop/Coding/Github/WarCast/Pre-clean data/GDP per country_1988-2022.xlsx')

✅ Processed and saved GDP data to table 'gdp' in warcast.db


## Political regime

In [30]:
import pandas as pd
import sqlite3

# Mapping from code to regime label
REGIME_MAP = {
    0: "closed autocracy",
    1: "electoral autocracy",
    2: "electoral democracy",
    3: "liberal democracy"
}

def process_political_regime_to_sqlite(
    csv_path: str = "political-regime.csv",
    db_path: str = "warcast.db",
    table_name: str = "political_regime"
):
    # Step 1: Load CSV
    df = pd.read_csv(csv_path)

    # Step 2: Rename relevant columns
    df = df.rename(columns={
        "Entity": "country",
        "Year": "year",
        df.columns[-1]: "regime_code"  # Last column is assumed to be the regime score
    })

    # Step 3: Map regime code to readable labels
    df["regime_type"] = df["regime_code"].map(REGIME_MAP)

    # Step 4: Sort and add surrogate ID
    df = df.sort_values(by=["country", "year"]).reset_index(drop=True)
    df.insert(0, "id", df.index + 1)

    # Step 5: Save full cleaned CSV including 'Code'
    df.to_csv("clean_political_regime.csv", index=False)

    # Step 6: Connect to SQLite and create table
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()

    cursor.execute(f"""
    CREATE TABLE IF NOT EXISTS {table_name} (
        id INTEGER PRIMARY KEY,
        country TEXT,
        year INTEGER,
        regime_code INTEGER,
        regime_type TEXT,
        UNIQUE (country, year)
    );
    """)

    # Step 7: Insert only relevant columns into SQL
    df_sql = df[["id", "country", "year", "regime_code", "regime_type"]]
    try:
        df_sql.to_sql(table_name, conn, if_exists="append", index=False)
        print(f"✅ Inserted data into table '{table_name}' in {db_path}")
    except Exception as e:
        print(f"❌ Failed to insert data: {e}")

    conn.commit()
    conn.close()


In [31]:
process_political_regime_to_sqlite('/Users/louis/Desktop/Coding/Github/WarCast/Raw Data/political-regime/political-regime.csv')

✅ Inserted data into table 'political_regime' in warcast.db


## Population

In [45]:
import pandas as pd

def clean_population_csv(
    csv_path: str = "",
    output_path: str = "clean_population.csv"
):
    # Load the raw file
    df = pd.read_csv(csv_path)

    # Step 1: Rename columns
    df.rename(columns={
        "Entity": "Country",
        "Population (historical)": "Population"
    }, inplace=True)

    # Step 2: Enforce data types
    df["Country"] = df["Country"].astype("string")
    df["Code"] = df["Code"].astype("string")
    df["Year"] = pd.to_numeric(df["Year"], errors="coerce").astype("Int64")
    df["Population"] = pd.to_numeric(df["Population"], errors="coerce").astype("Int64")

    # Step 3: Save cleaned file
    df.to_csv(output_path, index=False)
    print(f"✅ Cleaned population data saved to '{output_path}' with {len(df)} rows.")


In [46]:
clean_population_csv('/Users/louis/Desktop/Coding/Github/WarCast/Raw Data/population/population.csv')

✅ Cleaned population data saved to 'clean_population.csv' with 59177 rows.


### Upload to database

In [49]:
insert_clean_csv_to_sqlite(
    csv_path='/Users/louis/Desktop/Coding/Github/WarCast/Clean data/clean_population.csv',
    db_path="warcast.db",
    table_name="population",
    schema="""
        id INTEGER PRIMARY KEY,
        Country TEXT,
        Code TEXT,
        Year INTEGER,
        Population INTEGER
    """,
    replace_table=True
)

✅ Inserted 59177 rows into table 'population' in 'warcast.db'.


## Corruption Perception Index

In [55]:
import pandas as pd

def clean_cpi_scores(
    excel_path: str,
    output_path: str = "clean_cpi.csv"
):
    # Load Excel
    df = pd.read_csv(excel_path,encoding='latin1')

    # Step 1: Keep only country info and CPI score columns
    cpi_cols = [col for col in df.columns if "CPI score" in col]
    base_cols = ["Country", "ISO3"]
    df = df[base_cols + cpi_cols]

    # Step 2: Melt to long format
    df_long = df.melt(
        id_vars=["Country", "ISO3"],
        var_name="year",
        value_name="cpi_score"
    )

    # Step 3: Clean year column from 'CPI score YYYY' → YYYY
    df_long["year"] = df_long["year"].str.extract(r'(\d{4})').astype("Int64")

    # Step 4: Format columns
    df_long = df_long.rename(columns={"Country": "country", "ISO3": "code"})
    df_long["country"] = df_long["country"].astype("string")
    df_long["code"] = df_long["code"].astype("string")
    df_long["cpi_score"] = pd.to_numeric(df_long["cpi_score"], errors="coerce").astype("Int64")

    # Step 5: Add auto ID
    df_long = df_long.sort_values(by=["country", "year"]).reset_index(drop=True)
    df_long.insert(0, "id", df_long.index + 1)

    # Step 6: Save cleaned file
    df_long.to_csv(output_path, index=False)
    print(f"✅ Cleaned CPI data saved to '{output_path}' with {len(df_long)} rows.")

In [57]:
clean_cpi_scores('/Users/louis/Desktop/Coding/Github/WarCast/Pre-clean data/CPI2020_GlobalTablesTS_210125.csv')

✅ Cleaned CPI data saved to 'clean_cpi.csv' with 1260 rows.


### Upload to database

In [58]:
insert_clean_csv_to_sqlite(
    csv_path='/Users/louis/Desktop/Coding/Github/WarCast/Clean data/clean_population.csv',
    db_path="warcast.db",
    table_name="cpi",
    schema="""
        id INTEGER PRIMARY KEY,
        country TEXT,
        code TEXT,
        year INTEGER,
        cpi_score INTEGER
    """,
    replace_table=True
)

✅ Inserted 59177 rows into table 'cpi' in 'warcast.db'.


## Military investment

In [59]:
import pandas as pd

def clean_military_percent_gdp(
    csv_path: str = "Military percent GDP.csv",
    output_path: str = "clean_military_gdp.csv"
):
    # Load raw CSV
    df = pd.read_csv(csv_path)

    # Step 1: Drop unused columns
    df = df.drop(columns=["Indicator Name", "Indicator Code"], errors="ignore")

    # Step 2: Melt into tall format
    df_long = df.melt(
        id_vars=["Country Name", "Country Code"],
        var_name="year",
        value_name="military_percent_gdp"
    )

    # Step 3: Rename columns and enforce types
    df_long = df_long.rename(columns={
        "Country Name": "country",
        "Country Code": "code"
    })

    df_long["country"] = df_long["country"].astype("string")
    df_long["code"] = df_long["code"].astype("string")
    df_long["year"] = pd.to_numeric(df_long["year"], errors="coerce").astype("Int64")
    df_long["military_percent_gdp"] = pd.to_numeric(df_long["military_percent_gdp"], errors="coerce").astype("float")

    # Step 4: Add auto ID
    df_long = df_long.sort_values(by=["country", "year"]).reset_index(drop=True)
    df_long.insert(0, "id", df_long.index + 1)

    # Step 5: Save cleaned version
    df_long.to_csv(output_path, index=False)
    print(f"✅ Cleaned military GDP data saved to '{output_path}' with {len(df_long)} rows.")


In [60]:
clean_military_percent_gdp('/Users/louis/Desktop/Coding/Github/WarCast/Pre-clean data/Military percent GDP.csv')

✅ Cleaned military GDP data saved to 'clean_military_gdp.csv' with 17024 rows.


### Upload to the database

In [61]:
insert_clean_csv_to_sqlite(
    csv_path='/Users/louis/Desktop/Coding/Github/WarCast/Clean data/clean_population.csv',
    db_path="warcast.db",
    table_name="gdp_military",
    schema="""
        id INTEGER PRIMARY KEY,
        country TEXT,
        code TEXT,
        year INTEGER,
        military_percent_gdp REAL
    """,
    replace_table=True
)

✅ Inserted 59177 rows into table 'gdp_military' in 'warcast.db'.
