## MIP data exploration and preprocessing

In [1]:
import pandas as pd
from pathlib import Path

In [6]:
# Resolve the project root relative to the current working directory.
# NOTE(review): parents[1] assumes the notebook runs from a folder two levels
# below the project root — confirm against the repository layout.
PROJECT_ROOT = Path.cwd().parents[1]
xlsx_path = PROJECT_ROOT / "data" / "raw" / "military" / "mip-us-intervention.xlsx"

# Check for the workbook *before* reading it, so a wrong working directory
# is reported up front instead of surfacing as a traceback inside pandas.
print(xlsx_path.exists())  # should be True
if not xlsx_path.exists():
    raise FileNotFoundError(f"MIP workbook not found at {xlsx_path}")

mip_df = pd.read_excel(xlsx_path)


True


In [7]:


# Check missingness for key columns
key_columns = ['State B', 'styear', 'endyear', 'InterType', 'Objective', 'US HighAct']
print(mip_df[key_columns].isnull().sum())
print()
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() appends a stray "None" line.
mip_df[key_columns].info()

State B         0
styear          0
endyear        15
InterType     149
Objective     149
US HighAct     30
dtype: int64

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 570 entries, 0 to 569
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   State B     570 non-null    object 
 1   styear      570 non-null    int64  
 2   endyear     555 non-null    float64
 3   InterType   421 non-null    float64
 4   Objective   421 non-null    object 
 5   US HighAct  540 non-null    float64
dtypes: float64(3), int64(1), object(2)
memory usage: 26.8+ KB
None


In [8]:
# Is the missing InterType data concentrated in interventions that began
# before 1990?
started_1990_or_later = mip_df['styear'] >= 1990
intertype_is_missing = mip_df['InterType'].isnull()

print("Missing InterType by era:")
# Group labels in the output: False = styear < 1990, True = styear >= 1990.
print(intertype_is_missing.groupby(started_1990_or_later).sum())
print()

# Row counts per era, for comparison with the missing counts above.
n_before_1990 = (mip_df['styear'] < 1990).sum()
n_from_1990 = started_1990_or_later.sum()
print("Total rows pre-1990:", n_before_1990)
print("Total rows 1990+:", n_from_1990)

Missing InterType by era:
styear
False    123
True      26
Name: InterType, dtype: int64

Total rows pre-1990: 424
Total rows 1990+: 146


In [10]:
# Load and select columns
# The workbook is re-read here so this cell yields the same frame regardless
# of what earlier cells did to `mip_df`.
column_names = {
    'State B': 'target_country',
    'styear': 'start_year',
    'endyear': 'end_year',
    'InterType': 'intervention_type',
    'Objective': 'objective',
    'US HighAct': 'hostility_level',
}
mip_df = pd.read_excel(xlsx_path)
# Rename through an explicit source -> target mapping rather than assigning
# `.columns` positionally, so each label stays tied to its source column even
# if the selection order is edited later.
mip_df = mip_df[list(column_names)].rename(columns=column_names)

# Filter to 1990+
mip_df = mip_df[mip_df['start_year'] >= 1990]

# Save
output_path = PROJECT_ROOT / "data" / "processed" / "mip_us_interventions.csv"
# to_csv does not create missing directories; make sure the target exists.
output_path.parent.mkdir(parents=True, exist_ok=True)
mip_df.to_csv(output_path, index=False)

print(mip_df.shape)
print(mip_df.head())

(146, 6)
    target_country  start_year  end_year  intervention_type  \
424            CUB        1990    1990.0                NaN   
425            IRQ        1990    1991.0                NaN   
426            KUW        1990    1991.0                NaN   
427            LBR        1990    1991.0                1.0   
428            SAU        1990    1991.0                2.0   

                                   objective  hostility_level  
424                                      NaN             19.0  
425                                      NaN             12.0  
426                                      NaN             12.0  
427               Social Protection, ProtOwn             11.0  
428  Maintain/Build Foreign Regime Authority             12.0  
