In [12]:
# Initial imports.
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.datasets import make_classification
import sqlalchemy as sql
from getpass import getpass

In [2]:
# Ask for the database pasword
password = getpass('Enter database password')

Enter database password········


In [3]:
# Create engine to connect to database
engine = sql.create_engine(f'postgresql://postgres:{password}@obstetric-violence.clstnlifxcx7.us-west-2.rds.amazonaws.com:5432/ENDIREH_2021')

# Get list of table names
sql.inspect(engine).get_table_names()

['TVIV', 'TSDem', 'TB_SEC_III', 'TB_SEC_IV', 'TB_SEC_X', 'obstetric_violence']

In [4]:
# Read the obstetric_violence table and show the results
df = pd.read_sql_table('obstetric_violence', con=engine)
df

Unnamed: 0,ID_PER,ID_VIV,UPM,VIV_SEL,HOGAR,N_REN,CVE_ENT,NOM_ENT,CVE_MUN,NOM_MUN,...,P10_8_6,P10_8_7,P10_8_8,P10_8_9,P10_8_10,P10_8_11,P10_8_12,P10_8_13,P10_8_14,P10_8_15
0,0100128.05.1.02,100128.05,100128,5,1,2,1,AGUASCALIENTES,1,AGUASCALIENTES,...,,,,,,,,,,
1,0101482.03.1.03,101482.03,101482,3,1,3,1,AGUASCALIENTES,1,AGUASCALIENTES,...,,,,,,,,,,
2,0101631.04.1.01,101631.04,101631,4,1,1,1,AGUASCALIENTES,1,AGUASCALIENTES,...,,,,,,,,,,
3,0101876.04.1.02,101876.04,101876,4,1,2,1,AGUASCALIENTES,1,AGUASCALIENTES,...,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,
4,0102096.02.1.02,102096.02,102096,2,1,2,1,AGUASCALIENTES,5,JESÚS MARÍA,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110122,2805373.02.1.02,2805373.02,2805373,2,1,2,28,TAMAULIPAS,32,REYNOSA,...,,,,,,,,,,
110123,2806028.02.1.03,2806028.02,2806028,2,1,3,28,TAMAULIPAS,38,TAMPICO,...,,,,,,,,,,
110124,3103444.16.1.01,3103444.16,3103444,16,1,1,31,YUCATÁN,21,CHICHIMILÁ,...,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,1.0,
110125,3103573.19.1.04,3103573.19,3103573,19,1,4,31,YUCATÁN,35,HOCTÚN,...,,,,,,,,,,


In [5]:
# Creating a copy of the database to choose the features we will use to analyse
df_copy = df.copy()
df_copy

Unnamed: 0,ID_PER,ID_VIV,UPM,VIV_SEL,HOGAR,N_REN,CVE_ENT,NOM_ENT,CVE_MUN,NOM_MUN,...,P10_8_6,P10_8_7,P10_8_8,P10_8_9,P10_8_10,P10_8_11,P10_8_12,P10_8_13,P10_8_14,P10_8_15
0,0100128.05.1.02,100128.05,100128,5,1,2,1,AGUASCALIENTES,1,AGUASCALIENTES,...,,,,,,,,,,
1,0101482.03.1.03,101482.03,101482,3,1,3,1,AGUASCALIENTES,1,AGUASCALIENTES,...,,,,,,,,,,
2,0101631.04.1.01,101631.04,101631,4,1,1,1,AGUASCALIENTES,1,AGUASCALIENTES,...,,,,,,,,,,
3,0101876.04.1.02,101876.04,101876,4,1,2,1,AGUASCALIENTES,1,AGUASCALIENTES,...,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,
4,0102096.02.1.02,102096.02,102096,2,1,2,1,AGUASCALIENTES,5,JESÚS MARÍA,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110122,2805373.02.1.02,2805373.02,2805373,2,1,2,28,TAMAULIPAS,32,REYNOSA,...,,,,,,,,,,
110123,2806028.02.1.03,2806028.02,2806028,2,1,3,28,TAMAULIPAS,38,TAMPICO,...,,,,,,,,,,
110124,3103444.16.1.01,3103444.16,3103444,16,1,1,31,YUCATÁN,21,CHICHIMILÁ,...,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,1.0,
110125,3103573.19.1.04,3103573.19,3103573,19,1,4,31,YUCATÁN,35,HOCTÚN,...,,,,,,,,,,


In [7]:
# Replace empty cells with 'b' where b means blank 
df_copy.fillna('b', inplace=True)
df_copy

Unnamed: 0,ID_PER,ID_VIV,UPM,VIV_SEL,HOGAR,N_REN,CVE_ENT,NOM_ENT,CVE_MUN,NOM_MUN,...,P10_8_6,P10_8_7,P10_8_8,P10_8_9,P10_8_10,P10_8_11,P10_8_12,P10_8_13,P10_8_14,P10_8_15
0,0100128.05.1.02,100128.05,100128,5,1,2,1,AGUASCALIENTES,1,AGUASCALIENTES,...,b,b,b,b,b,b,b,b,b,b
1,0101482.03.1.03,101482.03,101482,3,1,3,1,AGUASCALIENTES,1,AGUASCALIENTES,...,b,b,b,b,b,b,b,b,b,b
2,0101631.04.1.01,101631.04,101631,4,1,1,1,AGUASCALIENTES,1,AGUASCALIENTES,...,b,b,b,b,b,b,b,b,b,b
3,0101876.04.1.02,101876.04,101876,4,1,2,1,AGUASCALIENTES,1,AGUASCALIENTES,...,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,b
4,0102096.02.1.02,102096.02,102096,2,1,2,1,AGUASCALIENTES,5,JESÚS MARÍA,...,b,b,b,b,b,b,b,b,b,b
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110122,2805373.02.1.02,2805373.02,2805373,2,1,2,28,TAMAULIPAS,32,REYNOSA,...,b,b,b,b,b,b,b,b,b,b
110123,2806028.02.1.03,2806028.02,2806028,2,1,3,28,TAMAULIPAS,38,TAMPICO,...,b,b,b,b,b,b,b,b,b,b
110124,3103444.16.1.01,3103444.16,3103444,16,1,1,31,YUCATÁN,21,CHICHIMILÁ,...,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,1.0,b
110125,3103573.19.1.04,3103573.19,3103573,19,1,4,31,YUCATÁN,35,HOCTÚN,...,b,b,b,b,b,b,b,b,b,b


In [9]:
# Remove columns that had data that wasn't usefull like ids, sampling information and table structure
df_copy = df_copy.drop(columns=['ID_VIV', 'ID_PER' ,'UPM', 'VIV_SEL', 'HOGAR', 'N_REN', 'CVE_ENT', 'CVE_MUN', 'COD_RES', 'EST_DIS', 'UPM_DIS', 'ESTRATO', 'NOMBRE', 'SEXO', 'COD_M15', 'CODIGO', 'REN_MUJ_EL', 'REN_INF_AD', 'N_REN_ESP','T_INSTRUM'])
df_copy

Unnamed: 0,NOM_ENT,NOM_MUN,FAC_VIV,DOMINIO,PAREN,EDAD,NIV,GRA,FAC_MUJ,P1_1,...,P10_8_6,P10_8_7,P10_8_8,P10_8_9,P10_8_10,P10_8_11,P10_8_12,P10_8_13,P10_8_14,P10_8_15
0,AGUASCALIENTES,AGUASCALIENTES,72,U,3,23,10.0,6.0,143,3,...,b,b,b,b,b,b,b,b,b,b
1,AGUASCALIENTES,AGUASCALIENTES,98,U,3,26,10.0,4.0,195,3,...,b,b,b,b,b,b,b,b,b,b
2,AGUASCALIENTES,AGUASCALIENTES,148,U,1,62,10.0,3.0,296,3,...,b,b,b,b,b,b,b,b,b,b
3,AGUASCALIENTES,AGUASCALIENTES,92,U,2,45,11.0,3.0,92,3,...,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,b
4,AGUASCALIENTES,JESÚS MARÍA,78,U,2,77,2.0,3.0,235,2,...,b,b,b,b,b,b,b,b,b,b
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110122,TAMAULIPAS,REYNOSA,341,U,2,43,4.0,3.0,341,2,...,b,b,b,b,b,b,b,b,b,b
110123,TAMAULIPAS,TAMPICO,351,U,3,16,4.0,1.0,703,3,...,b,b,b,b,b,b,b,b,b,b
110124,YUCATÁN,CHICHIMILÁ,218,C,1,31,3.0,3.0,218,2,...,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,1.0,b
110125,YUCATÁN,HOCTÚN,159,C,3,30,10.0,4.0,318,2,...,b,b,b,b,b,b,b,b,b,b


In [10]:
# Removing women that did not had a pregnancy on the last 5 years
df_copy = df_copy[df_copy.P10_2 == 1.0]
df_copy

Unnamed: 0,NOM_ENT,NOM_MUN,FAC_VIV,DOMINIO,PAREN,EDAD,NIV,GRA,FAC_MUJ,P1_1,...,P10_8_6,P10_8_7,P10_8_8,P10_8_9,P10_8_10,P10_8_11,P10_8_12,P10_8_13,P10_8_14,P10_8_15
3,AGUASCALIENTES,AGUASCALIENTES,92,U,2,45,11.0,3.0,92,3,...,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,b
7,AGUASCALIENTES,SAN FRANCISCO DE LOS ROMO,119,R,2,31,4.0,3.0,119,3,...,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,b
8,BAJA CALIFORNIA,TIJUANA,232,U,2,27,4.0,3.0,232,3,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,b,b,b
9,BAJA CALIFORNIA,TIJUANA,291,U,1,25,10.0,4.0,291,3,...,2.0,1.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,b
19,COLIMA,VILLA DE ÁLVAREZ,65,U,9,30,9.0,1.0,129,3,...,b,b,b,b,b,b,b,b,b,b
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110105,TAMAULIPAS,NUEVO LAREDO,284,U,2,25,3.0,3.0,284,2,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,b,b,b
110109,TLAXCALA,ATLANGATEPEC,84,R,2,33,10.0,5.0,84,3,...,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,3.0
110110,TLAXCALA,ATLTZAYANCA,74,C,2,33,10.0,5.0,74,2,...,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,b
110113,TLAXCALA,SAN PABLO DEL MONTE,105,U,2,35,2.0,6.0,316,2,...,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,b


In [51]:
#Function to create a dataset for each target question and store it in a dictionary
def DataFrame_X_y_split(df,targets):
    df.fillna('b',inplace=True)
    df_X_y_dict = {}
    categorical_features = df.dtypes[df.dtypes == 'object'].index.tolist()
    for target in targets:
        if target in categorical_features:
            categorical_features.remove(target)

    for col in categorical_features:
        df[col] = df[col].astype(str)
    encode_df = pd.get_dummies(df, columns=categorical_features)
    for target in targets:
        df_X = encode_df.loc[encode_df[target] != 'b'].drop(columns=targets)
        df_y = encode_df.loc[encode_df[target] != 'b'][target]
        df_X_y_dict[target] = {}
        df_X_y_dict[target]['X'] = df_X
        df_X_y_dict[target]['y'] = df_y
    return df_X_y_dict

In [52]:
#List of each target question we chose 
target = ['P10_8_1',
'P10_8_2',
'P10_8_3',
'P10_8_4',
'P10_8_5',
'P10_8_6',
'P10_8_7',
'P10_8_8',
'P10_8_9',
'P10_8_10',
'P10_8_11',
'P10_8_12',
'P10_8_13',
'P10_8_14',
'P10_8_15']

In [53]:
#Datasets for each target question
dataset_dictionary = DataFrame_X_y_split(df_copy, target)
dataset_dictionary

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.fillna('b',inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(str)


{'P10_8_1': {'X':         FAC_VIV  PAREN  EDAD   NIV  GRA  FAC_MUJ  P1_1  P1_2  P1_2_A  P1_3  \
  3            92      2    45  11.0  3.0       92     3     3       5    15   
  7           119      2    31   4.0  3.0      119     3     2       3     5   
  8           232      2    27   4.0  3.0      232     3     1       1     3   
  9           291      1    25  10.0  4.0      291     3     1       3     6   
  23          334      2    29   2.0  6.0      334     2     2       4     6   
  ...         ...    ...   ...   ...  ...      ...   ...   ...     ...   ...   
  110105      284      2    25   3.0  3.0      284     2     2       3     4   
  110109       84      2    33  10.0  5.0       84     3     2       5    19   
  110110       74      2    33  10.0  5.0       74     2     2       5    10   
  110113      105      2    35   2.0  6.0      316     2     3       4     6   
  110124      218      1    31   3.0  3.0      218     2     2       3     4   
  
          ...  P10_7_

In [139]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=18)

In [140]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [143]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=18) 

In [144]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [145]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

In [147]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)
# Displaying results
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Accuracy Score : 0.9967600515315246
Classification Report
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     81405
         1.0       1.00      0.62      0.77       878
         2.0       0.98      1.00      0.99     21731

    accuracy                           1.00    104014
   macro avg       0.99      0.87      0.92    104014
weighted avg       1.00      1.00      1.00    104014



In [148]:
# Sort the features by their importance.
sorted(zip(rf_model.feature_importances_, pd.DataFrame(data=X).columns), reverse=True)

[(0.2031990875710407, 'p10_7'),
 (0.18712158261438763, 'p10_4_1'),
 (0.13187124853225837, 'p10_6mes'),
 (0.12845377149565362, 'p10_6anio'),
 (0.11658249221263205, 'p10_3'),
 (0.039560390990765526, 'p10_5_01'),
 (0.02546040954581277, 'p10_5_02'),
 (0.02491437156261494, 'p10_5_07'),
 (0.020953709892517887, 'p10_2'),
 (0.011357528953952154, 'edad'),
 (0.008617827794357371, 'p10_5_05'),
 (0.006696286001653509, 'p2_16'),
 (0.005400474502628161, 'p4_7_ab'),
 (0.0047891676150730415, 'p4_5_ab'),
 (0.004598343144941996, 'p4_5_1_ab'),
 (0.004009810039171145, 'p3_1'),
 (0.003678094817291754, 'p2_11'),
 (0.003479677706743299, 'niv'),
 (0.0027267065957611095, 'p1_7'),
 (0.0024586034472776916, 'fac_viv'),
 (0.0024017099860649265, 'upm_dis'),
 (0.002384241569458134, 'p4bc_1'),
 (0.002377506071615297, 'n_ren_esp'),
 (0.0023090551435051734, 'p10_4_3'),
 (0.002280164708983225, 'p4_4_cve'),
 (0.0022465468377546017, 'p2_13'),
 (0.0021398009724603465, 'p2_9'),
 (0.0021325143168789244, 'gra'),
 (0.002046655