In [1]:
# Initial imports.
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.datasets import make_classification
import sqlalchemy as sql
from getpass import getpass

In [2]:
# Ask for the database pasword
password = getpass('Enter database password')

Enter database password··········


In [3]:
# Create engine to connect to database
engine = sql.create_engine(f'postgresql://postgres:{password}@obstetric-violence.clstnlifxcx7.us-west-2.rds.amazonaws.com:5432/ENDIREH_2021')

# Get list of table names
sql.inspect(engine).get_table_names()

['TVIV', 'TSDem', 'TB_SEC_III', 'TB_SEC_IV', 'TB_SEC_X', 'obstetric_violence']

In [4]:
# Read the obstetric_violence table and show the results
df = pd.read_sql_table('obstetric_violence', con=engine)
df

Unnamed: 0,ID_PER,ID_VIV,UPM,VIV_SEL,HOGAR,N_REN,CVE_ENT,NOM_ENT,CVE_MUN,NOM_MUN,...,P10_8_6,P10_8_7,P10_8_8,P10_8_9,P10_8_10,P10_8_11,P10_8_12,P10_8_13,P10_8_14,P10_8_15
0,0100128.05.1.02,100128.05,100128,5,1,2,1,AGUASCALIENTES,1,AGUASCALIENTES,...,,,,,,,,,,
1,0101482.03.1.03,101482.03,101482,3,1,3,1,AGUASCALIENTES,1,AGUASCALIENTES,...,,,,,,,,,,
2,0101631.04.1.01,101631.04,101631,4,1,1,1,AGUASCALIENTES,1,AGUASCALIENTES,...,,,,,,,,,,
3,0101876.04.1.02,101876.04,101876,4,1,2,1,AGUASCALIENTES,1,AGUASCALIENTES,...,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,
4,0102096.02.1.02,102096.02,102096,2,1,2,1,AGUASCALIENTES,5,JESÚS MARÍA,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110122,2805373.02.1.02,2805373.02,2805373,2,1,2,28,TAMAULIPAS,32,REYNOSA,...,,,,,,,,,,
110123,2806028.02.1.03,2806028.02,2806028,2,1,3,28,TAMAULIPAS,38,TAMPICO,...,,,,,,,,,,
110124,3103444.16.1.01,3103444.16,3103444,16,1,1,31,YUCATÁN,21,CHICHIMILÁ,...,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,1.0,
110125,3103573.19.1.04,3103573.19,3103573,19,1,4,31,YUCATÁN,35,HOCTÚN,...,,,,,,,,,,


In [5]:
# Creating a copy of the database to choose the features we will use to analyse
df_copy = df.copy()

In [6]:
# Remove columns that had data that wasn't usefull like ids, sampling information and table structure
df_copy = df_copy.drop(columns=['ID_VIV', 'ID_PER' ,'UPM', 'VIV_SEL', 'HOGAR', 'N_REN', 'CVE_ENT', 'CVE_MUN', 'COD_RES', 'EST_DIS', 'UPM_DIS', 'ESTRATO', 'NOMBRE', 'SEXO', 'COD_M15', 'CODIGO', 'REN_MUJ_EL', 'REN_INF_AD', 'N_REN_ESP','T_INSTRUM'])
df_copy

Unnamed: 0,NOM_ENT,NOM_MUN,FAC_VIV,DOMINIO,PAREN,EDAD,NIV,GRA,FAC_MUJ,P1_1,...,P10_8_6,P10_8_7,P10_8_8,P10_8_9,P10_8_10,P10_8_11,P10_8_12,P10_8_13,P10_8_14,P10_8_15
0,AGUASCALIENTES,AGUASCALIENTES,72,U,3,23,10.0,6.0,143,3,...,,,,,,,,,,
1,AGUASCALIENTES,AGUASCALIENTES,98,U,3,26,10.0,4.0,195,3,...,,,,,,,,,,
2,AGUASCALIENTES,AGUASCALIENTES,148,U,1,62,10.0,3.0,296,3,...,,,,,,,,,,
3,AGUASCALIENTES,AGUASCALIENTES,92,U,2,45,11.0,3.0,92,3,...,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,
4,AGUASCALIENTES,JESÚS MARÍA,78,U,2,77,2.0,3.0,235,2,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110122,TAMAULIPAS,REYNOSA,341,U,2,43,4.0,3.0,341,2,...,,,,,,,,,,
110123,TAMAULIPAS,TAMPICO,351,U,3,16,4.0,1.0,703,3,...,,,,,,,,,,
110124,YUCATÁN,CHICHIMILÁ,218,C,1,31,3.0,3.0,218,2,...,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,1.0,
110125,YUCATÁN,HOCTÚN,159,C,3,30,10.0,4.0,318,2,...,,,,,,,,,,


In [7]:
# Removing women that did not had a pregnancy on the last 5 years
df_copy = df_copy[df_copy.P10_2 == 1.0]
df_copy

Unnamed: 0,NOM_ENT,NOM_MUN,FAC_VIV,DOMINIO,PAREN,EDAD,NIV,GRA,FAC_MUJ,P1_1,...,P10_8_6,P10_8_7,P10_8_8,P10_8_9,P10_8_10,P10_8_11,P10_8_12,P10_8_13,P10_8_14,P10_8_15
3,AGUASCALIENTES,AGUASCALIENTES,92,U,2,45,11.0,3.0,92,3,...,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,
7,AGUASCALIENTES,SAN FRANCISCO DE LOS ROMO,119,R,2,31,4.0,3.0,119,3,...,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,
8,BAJA CALIFORNIA,TIJUANA,232,U,2,27,4.0,3.0,232,3,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,,,
9,BAJA CALIFORNIA,TIJUANA,291,U,1,25,10.0,4.0,291,3,...,2.0,1.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,
19,COLIMA,VILLA DE ÁLVAREZ,65,U,9,30,9.0,1.0,129,3,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110105,TAMAULIPAS,NUEVO LAREDO,284,U,2,25,3.0,3.0,284,2,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,,,
110109,TLAXCALA,ATLANGATEPEC,84,R,2,33,10.0,5.0,84,3,...,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,3.0
110110,TLAXCALA,ATLTZAYANCA,74,C,2,33,10.0,5.0,74,2,...,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,
110113,TLAXCALA,SAN PABLO DEL MONTE,105,U,2,35,2.0,6.0,316,2,...,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,


In [8]:
#List of each target question we chose 
target = ['P10_8_1',
'P10_8_2',
'P10_8_3',
'P10_8_4',
'P10_8_5',
'P10_8_6',
'P10_8_7',
'P10_8_8',
'P10_8_9',
'P10_8_10',
'P10_8_11',
'P10_8_12',
'P10_8_13',
'P10_8_14',
'P10_8_15']

In [9]:
#Function to create a dataset for each target question and store it in a dictionary
def DataFrame_X_y_split(df,targets, df_X_y_dict = {}):
    income_columns = ['P4_2', 'P4_5_AB', 'P4_7_AB', 'P4_9_1', 'P4_9_2', 'P4_9_3', 'P4_9_4', 'P4_9_5', 'P4_9_6', 'P4_9_7']
    df[income_columns] = df[income_columns].fillna(0)
    df[income_columns].apply(lambda x: x.astype(int))    
    df_copy[(df_copy[income_columns] == 999999)][income_columns] = 0
    df.fillna('b',inplace=True)
    categorical_features = df.dtypes[df.dtypes == 'object'].index.tolist()
    for target in targets:
        if target in categorical_features:
            categorical_features.remove(target)
    df[categorical_features].apply(lambda x: x.astype(str))
    encode_df = pd.get_dummies(df, columns=categorical_features)
    for target in targets:
        df_X = encode_df.loc[encode_df[target] != 'b'].drop(columns=targets)
        df_y = encode_df.loc[encode_df[target] != 'b',[target]]
        df_X_y_dict[target] = {}
        df_X_y_dict[target]['X'] = df_X
        df_X_y_dict[target]['y'] = df_y
    return df_X_y_dict

In [10]:
#Datasets for each target question
dataset_dictionary = DataFrame_X_y_split(df_copy, target)
dataset_dictionary['P10_8_1']['X']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,


Unnamed: 0,FAC_VIV,PAREN,EDAD,NIV,GRA,FAC_MUJ,P1_1,P1_2,P1_2_A,P1_3,...,P10_7_2.0,P10_7_3.0,P10_7_4.0,P10_7_5.0,P10_7_6.0,P10_7_7.0,P10_7_8.0,P10_7_9.0,P10_7_10.0,P10_7_b
3,92,2,45,11.0,3.0,92,3,3,5,15,...,0,0,0,0,1,0,0,0,0,0
7,119,2,31,4.0,3.0,119,3,2,3,5,...,0,0,0,0,0,0,0,0,0,0
8,232,2,27,4.0,3.0,232,3,1,1,3,...,0,0,0,0,1,0,0,0,0,0
9,291,1,25,10.0,4.0,291,3,1,3,6,...,0,0,0,0,1,0,0,0,0,0
23,334,2,29,2.0,6.0,334,2,2,4,6,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110105,284,2,25,3.0,3.0,284,2,2,3,4,...,1,0,0,0,0,0,0,0,0,0
110109,84,2,33,10.0,5.0,84,3,2,5,19,...,0,0,0,0,1,0,0,0,0,0
110110,74,2,33,10.0,5.0,74,2,2,5,10,...,0,0,0,0,0,1,0,0,0,0
110113,105,2,35,2.0,6.0,316,2,3,4,6,...,0,0,0,0,0,1,0,0,0,0


In [31]:
dataset_dictionary['P10_8_1']['y'].value_counts()

P10_8_1
2.0        17935
1.0         1387
dtype: int64

In [21]:
def RFC_Classifier(key, X, y, RFC_Results = {}):
    # Grab the y information from the target dataset
    y = y.astype('int').values.ravel()
    # Create the train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X.values, y, random_state=18, stratify=y)
    # Create a scaler instance
    scaler = StandardScaler()
    # Train the standard scaler using the X_train data
    X_scaler = scaler.fit(X_train)
    # Scale the X training data
    X_train_scaled = X_scaler.transform(X_train)
    # Scale the X test data
    X_test_scaled = X_scaler.transform(X_test)
    # Create instance of the Random Forest Classifier
    rf_model = RandomForestClassifier(n_estimators=256, random_state=18)
    # Fit the training data to the model
    rf_model = rf_model.fit(X_train_scaled, y_train)
    # Predict the results for the target question
    predictions = rf_model.predict(X_test_scaled)
    # Calculating the confusion matrix.
    cm = confusion_matrix(y_test, predictions)
    # Calculating the accuracy score.
    acc_score = accuracy_score(y_test, predictions)
    # Store the results results
    RFC_Results[key] = {}
    RFC_Results[key]['Predictions'] = predictions
    RFC_Results[key]["Confusion Matrix"] = cm
    RFC_Results[key]["Accuracy Score"] = acc_score
    RFC_Results[key]["Classification Report"] = classification_report(y_test, predictions)     
    RFC_Results[key]['Feature Importance'] = sorted(zip(rf_model.feature_importances_, pd.DataFrame(data=X).columns), reverse=True)
    return RFC_Results

In [22]:
RFC_Result = RFC_Classifier('P10_8_1',dataset_dictionary['P10_8_1']['X'],dataset_dictionary['P10_8_1']['y'])

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [28]:
RFC_Result['P10_8_1']['Feature Importance'][:10]

[(0.018907561913466004, 'FAC_MUJ'),
 (0.018876397927621556, 'FAC_VIV'),
 (0.017060290671145353, 'EDAD'),
 (0.01541584364959796, 'P1_3'),
 (0.015002649295750781, 'P10_6MES'),
 (0.014612596567405267, 'P4_7_AB'),
 (0.013993336364713718, 'P4_5_AB'),
 (0.012341231449811463, 'P10_6ANIO'),
 (0.012133080329136212, 'P1_2_A'),
 (0.012003968355341992, 'P1_7')]

In [33]:
print(RFC_Result['P10_8_1']['Classification Report'])

              precision    recall  f1-score   support

           1       0.00      0.00      0.00       347
           2       0.93      1.00      0.96      4484

    accuracy                           0.93      4831
   macro avg       0.46      0.50      0.48      4831
weighted avg       0.86      0.93      0.89      4831

