# Modeling

## 0. Import Dependencies

In [4]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Single seed for the notebook; seeding NumPy's global RNG here makes every
# later call that draws from np.random repeatable across runs.
SEED = 42
np.random.seed(seed=SEED)

## 1. Set Up


In [6]:
# Project root. Setting the FINAL_PROJECT_DIR environment variable overrides
# the default below, so the notebook can run on another machine without
# editing this cell; when it is unset the original location is used.
BASE_DIR = Path(
    os.environ.get(
        "FINAL_PROJECT_DIR",
        r"C:/Users/black/Documents/Ironhack/final_project",
    )
)
FEATENG_DIR = BASE_DIR / "data" / "feat_eng"  # <root>/data/feat_eng
MODELED_DIR = BASE_DIR / "data" / "modeled"   # <root>/data/modeled

# Create both directories (including missing parents) up front; existing
# directories are left untouched.
for _data_dir in (FEATENG_DIR, MODELED_DIR):
    _data_dir.mkdir(parents=True, exist_ok=True)

## 2. Load Data

In [12]:
# Load every feature-engineered dataset from FEATENG_DIR (defined in the
# set-up cell) instead of cwd-relative, backslash-separated strings, which
# only resolve on Windows and only when the kernel starts in the project root.
# NOTE(review): all files except the first are read without a ".csv"
# extension, exactly as before — confirm these names match the files on disk.
individual_df = pd.read_csv(FEATENG_DIR / "ind_df_final.csv")
violence_df_all = pd.read_csv(FEATENG_DIR / "violence_df_final_all")
violence_df_all_target = pd.read_csv(FEATENG_DIR / "violence_df_final_all_target")
violence_df_fe = pd.read_csv(FEATENG_DIR / "violence_df_final_fe")
violence_df_fe_target = pd.read_csv(FEATENG_DIR / "violence_df_final_fe_target")
global_df = pd.read_csv(FEATENG_DIR / "global_df_final")

## 3. Individual Dataset 

**Target Variable** : violence (yes/no == 1/0)

*Which individual factors lead to experiencing violence as a woman?*

In [8]:
# Preview the first five rows of the individual-level dataset.
individual_df.head(n=5)

Unnamed: 0,education_ordinal,is_married,emp_unemployed,age_scaled,income_scaled,vawg_rate,young_unmarried,low_education_unemployed,married_unemployed,age_income,education_income,age_group_young,age_group_middle,economic_vulnerability,empowerment_score,high_vawg_environment,emp_employed,emp_semi employed,violence
0,2,1,True,-0.126628,-0.436516,18.585241,0,0,1,0.055275,-0.873033,0,1,1,2,0,False,False,1
1,3,1,True,1.635635,-0.436516,19.82012,0,0,1,-0.713981,-1.309549,0,0,1,3,0,False,False,0
2,3,0,True,-0.748603,-0.436516,19.82012,1,0,0,0.326777,-1.309549,1,0,1,3,0,False,False,0
3,3,0,True,-0.955928,-0.436516,19.82012,1,0,0,0.417278,-1.309549,1,0,1,3,0,False,False,0
4,1,1,True,1.946622,-0.436516,21.492651,0,1,1,-0.849733,-0.436516,0,0,2,1,0,False,False,1


In [17]:
# Dimensions of the individual-level dataset as (rows, columns).
individual_df.shape

(325, 19)

## 4. Violence Dataset

**Target Variable** : Intimate Partner Violence (ipv, continuous)
- composite features only (to reduce overfitting)
- compare models: Linear regression vs Random Forest vs XGBoost
-  *What country-level factors predict intimate partner violence?*

In [None]:
# Feature columns only; the matching target is loaded as a separate frame
# (violence_df_fe_target).
violence_df_fe.head(n=5)

Unnamed: 0,structural_inequality_index,violence_exposure_index,general_danger_index,economic_empowerment_gap,inequality_attitudes_interaction,education_attitudes_interaction,legal_violence_interaction
0,-0.964292,0.152586,0.903442,0.80047,-0.789501,0.005758,0.387501
1,-0.064801,-0.187819,0.111532,-0.261881,0.079435,0.757695,0.09849
2,1.960851,-2.058809,-0.600209,-0.780153,-2.403674,0.901379,2.010315
3,1.78176,-0.874997,-0.724152,-0.707869,-1.131737,0.442243,0.370937
4,1.747599,-0.510202,-0.277318,-0.531493,-1.110039,0.132028,0.267543


In [16]:
# Dimensions of the violence feature frame as (rows, columns).
violence_df_fe.shape

(50, 7)

In [14]:
# Target column only (ipv), kept in its own frame apart from the features.
violence_df_fe_target.head(n=5)

Unnamed: 0,ipv
0,30.0
1,73.0
2,38.0
3,29.0
4,53.0


## 5. Global Dataset

**Target Variable:** Intimate Partner Violence (ip_violence)
 
- Supervised learning with all features
- *Which structural factors lead to women experiencing more violence, and which ones can reduce it?*


In [11]:
# Preview the first five rows of the country-level global dataset.
global_df.head(n=5)

Unnamed: 0,country,economic_development_index,gender_inequality_composite,youth_vulnerability_index,freedom_index,maternal_health_risk,econ_gender_interaction,freedom_inequality_interaction,youth_compound_risk,wpsi,seats_parliament,maternal_mortality,ip_violence_missing,wpsi_missing,ip_violence
0,Afghanistan,-3.435148,2.012829,1.9381,-2.284678,1.558708,-6.914367,-4.598666,1.455041,0.286,27.2,638.0,0,0,46.0
1,Angola,-0.958283,1.248151,0.31729,-0.734784,1.415363,-1.196082,-0.917122,-2.584103,0.598,29.5,241.0,0,0,38.0
2,Albania,1.147539,-1.690309,-0.716077,0.823646,-0.704143,-1.939695,-1.392216,0.619054,0.796,35.7,15.0,0,0,13.0
3,Andorra,1.659922,-2.134899,-0.527925,0.826419,-0.849748,-3.543765,-1.76432,0.291153,0.846985,46.4,0.0,1,1,20.157011
4,United Arab Emirates,1.699965,-1.678001,0.247325,-0.354063,-0.879801,-2.852544,0.594118,-1.305647,0.868,50.0,3.0,1,0,20.359459


In [15]:
# Dimensions of the global dataset as (rows, columns).
global_df.shape

(195, 15)