## COLDAT data exploration & preprocessing

In [3]:
import pandas as pd
from pathlib import Path 

In [4]:
# Project root is taken to be two directory levels above the current working
# directory (NOTE(review): this depends on where the kernel was started — confirm).
PROJECT_ROOT = Path.cwd().parents[1]
csv_path = PROJECT_ROOT.joinpath("data", "raw", "colonial", "coldat-dyads-raw.csv")

# Load the raw COLDAT dyads table into a DataFrame.
coldat_df = pd.read_csv(csv_path)

In [5]:
# Overview of the raw table: dimensions first, then missing-value count per column.
print(coldat_df.shape)
print(coldat_df.isna().sum())
print()
# Preview the first ten rows flagged col == 1 (actual colonial relationships).
print(coldat_df.loc[coldat_df['col'] == 1].head(10))

(1560, 7)
country             0
colonizer           0
col                 0
colstart_max     1399
colend_max       1399
colstart_mean    1399
colend_mean      1399
dtype: int64

               country colonizer  col  colstart_max  colend_max  \
27             Burundi   belgium    1        1923.0      1962.0   
39    Congo - Kinshasa   belgium    1        1885.0      1960.0   
143             Rwanda   belgium    1        1922.0      1962.0   
200  Antigua & Barbuda   britain    1        1632.0      1981.0   
203          Australia   britain    1        1829.0      1919.0   
206            Bahamas   britain    1        1783.0      1973.0   
207            Bahrain   britain    1        1861.0      1971.0   
208         Bangladesh   britain    1        1857.0      1947.0   
209           Barbados   britain    1        1627.0      1966.0   
212             Belize   britain    1        1798.0      1981.0   

     colstart_mean  colend_mean  
27          1922.0       1962.0  
39          1885

In [6]:
# Build the cleaned colonial-ties table in one step:
#   1. keep only rows flagged as actual colonial relationships (col == 1),
#   2. keep the colony/colonizer pair plus the *_max start/end columns,
#   3. rename by column name (not by position) so that a change to the
#      selection list cannot silently mislabel a column.
coldat_clean = (
    coldat_df
    .loc[coldat_df['col'] == 1, ['country', 'colonizer', 'colstart_max', 'colend_max']]
    .rename(columns={
        'country': 'colony',
        'colstart_max': 'col_start',
        'colend_max': 'col_end',
    })
)

# Save. DataFrame.to_csv does not create missing parent directories, so make
# sure the output folder exists first (e.g. on a fresh checkout).
output_path = PROJECT_ROOT / "data" / "processed" / "coldat_colonial_ties.csv"
output_path.parent.mkdir(parents=True, exist_ok=True)
coldat_clean.to_csv(output_path, index=False)

# Sanity check: resulting dimensions and number of ties per colonizer.
print(coldat_clean.shape)
print(coldat_clean['colonizer'].value_counts())

(161, 4)
colonizer
britain        68
france         33
spain          26
germany        12
portugal       12
netherlands     4
belgium         3
italy           3
Name: count, dtype: int64
