In [29]:
# Import dependencies
from getpass import getpass
from urllib.parse import quote

import numpy as np
import pandas as pd
import sqlalchemy as sql

In [2]:
# Prompt for the database password interactively so it is never stored in the notebook
password = getpass(prompt='Enter database password')

In [4]:
# Create engine to connect to database
# The password is percent-encoded before it is embedded in the URL: characters such as
# '@', ':', '/' or '%' would otherwise be read as URL delimiters and break the connection string.
engine = sql.create_engine(
    f'postgresql://postgres:{quote(password, safe="")}'
    '@obstetric-violence.clstnlifxcx7.us-west-2.rds.amazonaws.com:5432/ENDIREH_2021'
)

In [5]:
# List the names of the tables exposed by the connected database
(
    sql.inspect(engine)
    .get_table_names()
)

['TVIV', 'TSDem', 'TB_SEC_III', 'TB_SEC_IV', 'TB_SEC_X', 'obstetric_violence']

In [6]:
# Load the whole obstetric_violence table into a DataFrame, then display it
df = pd.read_sql_table(table_name='obstetric_violence', con=engine)
df

Unnamed: 0,ID_PER,ID_VIV,UPM,VIV_SEL,HOGAR,N_REN,CVE_ENT,NOM_ENT,CVE_MUN,NOM_MUN,...,P10_8_6,P10_8_7,P10_8_8,P10_8_9,P10_8_10,P10_8_11,P10_8_12,P10_8_13,P10_8_14,P10_8_15
0,0100128.05.1.02,100128.05,100128,5,1,2,1,AGUASCALIENTES,1,AGUASCALIENTES,...,,,,,,,,,,
1,0101482.03.1.03,101482.03,101482,3,1,3,1,AGUASCALIENTES,1,AGUASCALIENTES,...,,,,,,,,,,
2,0101631.04.1.01,101631.04,101631,4,1,1,1,AGUASCALIENTES,1,AGUASCALIENTES,...,,,,,,,,,,
3,0101876.04.1.02,101876.04,101876,4,1,2,1,AGUASCALIENTES,1,AGUASCALIENTES,...,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,
4,0102096.02.1.02,102096.02,102096,2,1,2,1,AGUASCALIENTES,5,JESÚS MARÍA,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110122,2805373.02.1.02,2805373.02,2805373,2,1,2,28,TAMAULIPAS,32,REYNOSA,...,,,,,,,,,,
110123,2806028.02.1.03,2806028.02,2806028,2,1,3,28,TAMAULIPAS,38,TAMPICO,...,,,,,,,,,,
110124,3103444.16.1.01,3103444.16,3103444,16,1,1,31,YUCATÁN,21,CHICHIMILÁ,...,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,1.0,
110125,3103573.19.1.04,3103573.19,3103573,19,1,4,31,YUCATÁN,35,HOCTÚN,...,,,,,,,,,,


In [24]:
# Build EDA_df: one summary row per column of the obstetric_violence table
EDA_df = pd.DataFrame(
    data={
        'Dtype': df.dtypes,
        'Number of Unique Values': df.nunique(),
        'Number of Non-Empty Entries': df.notna().sum(),
        'Number of Empty Entries': df.isna().sum(),
        # Distinct values found in each column, in value_counts() order
        'List of Values': [
            df[name].value_counts().index.tolist()
            for name in df.columns
        ],
    }
)
EDA_df.tail(15)

Unnamed: 0,Dtype,Number of Unique Values,Number of Non-Empty Entries,Number of Empty Entries,List of Values
P10_8_1,float64,2,19322,90805,"[2.0, 1.0]"
P10_8_2,float64,2,19322,90805,"[2.0, 1.0]"
P10_8_3,float64,2,19322,90805,"[2.0, 1.0]"
P10_8_4,float64,2,19322,90805,"[2.0, 1.0]"
P10_8_5,float64,2,19322,90805,"[2.0, 1.0]"
P10_8_6,float64,2,19322,90805,"[2.0, 1.0]"
P10_8_7,float64,2,19322,90805,"[2.0, 1.0]"
P10_8_8,float64,2,19322,90805,"[2.0, 1.0]"
P10_8_9,float64,2,19322,90805,"[2.0, 1.0]"
P10_8_10,float64,2,19322,90805,"[2.0, 1.0]"


In [69]:
# Check whether respondents who answered No (2.0) to question P10_2 have any answers
# recorded for the questions in section P10_8
answered_no = df['P10_2'] == 2.0
pregnancy_not_in_last_5_years = df.loc[answered_no]

# Column names P10_8_1 through P10_8_15
target = [f'P10_8_{number}' for number in range(1, 16)]

# An empty Series for a column means no non-null answers exist for it in this subset
for col in target:
    answer_counts = pregnancy_not_in_last_5_years[col].value_counts()
    print(answer_counts)

Series([], Name: P10_8_1, dtype: int64)
Series([], Name: P10_8_2, dtype: int64)
Series([], Name: P10_8_3, dtype: int64)
Series([], Name: P10_8_4, dtype: int64)
Series([], Name: P10_8_5, dtype: int64)
Series([], Name: P10_8_6, dtype: int64)
Series([], Name: P10_8_7, dtype: int64)
Series([], Name: P10_8_8, dtype: int64)
Series([], Name: P10_8_9, dtype: int64)
Series([], Name: P10_8_10, dtype: int64)
Series([], Name: P10_8_11, dtype: int64)
Series([], Name: P10_8_12, dtype: int64)
Series([], Name: P10_8_13, dtype: int64)
Series([], Name: P10_8_14, dtype: int64)
Series([], Name: P10_8_15, dtype: int64)


In [9]:
# The following columns contain information that serves as an ID for the interviewed individual and will be removed
# ID_cols = ['ID_VIV', 'ID_PER' ,'UPM', 'VIV_SEL', 'HOGAR', 'N_REN']

In [10]:
# The following columns have values that should be removed since they're part of the sampling strategy
# Sampling_cols = ['ESTRATO','EST_DIS','UPM_DIS']

In [16]:
# Look up the EDA summary row for the COD_RES column
EDA_df.loc['COD_RES', :]

Dtype                           int64
Number of Unique Values             1
Number of Non-Empty Entries    110127
Number of Empty Entries             0
List of Values                    [1]
Name: COD_RES, dtype: object

In [11]:
# Compare T_INSTRUM against question P3_8 entry by entry; if True, T_INSTRUM can be dropped.
# The comparison runs on the raw columns of df: matching EDA_df summary rows would only show
# that both columns share dtype, counts and value ordering, not that they agree on every row.
df['T_INSTRUM'].equals(df['P3_8'])

True

In [17]:
# Look up the EDA summary row for the SEXO column
EDA_df.loc['SEXO', :]

Dtype                           int64
Number of Unique Values             1
Number of Non-Empty Entries    110127
Number of Empty Entries             0
List of Values                    [2]
Name: SEXO, dtype: object

In [18]:
# Look up the EDA summary row for the COD_M15 column
EDA_df.loc['COD_M15', :]

Dtype                          float64
Number of Unique Values              1
Number of Non-Empty Entries     110127
Number of Empty Entries              0
List of Values                   [1.0]
Name: COD_M15, dtype: object

In [20]:
# Look up the EDA summary row for the CODIGO column
EDA_df.loc['CODIGO', :]

Dtype                          float64
Number of Unique Values              1
Number of Non-Empty Entries     110127
Number of Empty Entries              0
List of Values                   [1.0]
Name: CODIGO, dtype: object

In [21]:
# Look up the EDA summary row for the REN_MUJ_EL column
EDA_df.loc['REN_MUJ_EL', :]

Dtype                                                                    float64
Number of Unique Values                                                       16
Number of Non-Empty Entries                                               110127
Number of Empty Entries                                                        0
List of Values                 [2.0, 1.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, ...
Name: REN_MUJ_EL, dtype: object

In [22]:
# Look up the EDA summary row for the REN_INF_AD column
EDA_df.loc['REN_INF_AD', :]

Dtype                                                                    float64
Number of Unique Values                                                       13
Number of Non-Empty Entries                                                67048
Number of Empty Entries                                                    43079
List of Values                 [2.0, 1.0, 3.0, 4.0, 5.0, 6.0, 7.0, 9.0, 8.0, ...
Name: REN_INF_AD, dtype: object

In [23]:
# Look up the EDA summary row for the N_REN_ESP column
EDA_df.loc['N_REN_ESP', :]

Dtype                                                                    float64
Number of Unique Values                                                       16
Number of Non-Empty Entries                                                68540
Number of Empty Entries                                                    41587
List of Values                 [1.0, 2.0, 0.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, ...
Name: N_REN_ESP, dtype: object

In [None]:
# The following columns will be removed since they're not relevant to the machine learning model
# Discarded_cols = ['ID_VIV', 'ID_PER' ,'UPM', 'VIV_SEL', 'HOGAR', 'N_REN', 'CVE_ENT', 'CVE_MUN', 'COD_RES', 'EST_DIS', 'UPM_DIS', 'ESTRATO', 'NOMBRE', 'SEXO', 'COD_M15', 'CODIGO', 'REN_MUJ_EL', 'REN_INF_AD', 'N_REN_ESP','T_INSTRUM']

In [12]:
# The following columns contain information about the city and state of the interviewed person
# Location_cols = ['CVE_ENT','NOM_ENT','CVE_MUN','NOM_MUN']

In [13]:
# The following columns contain the weighting factors (ponderators) used to score the household and survey answers
# Ponderator_cols = ['FAC_VIV', 'FAC_MUJ']

In [14]:
# The following columns contain information that references another source
# External_cols = ['P4_4_CVE']

In [15]:
# The following columns use a number to refer to a non-specified answer

## EDAD
### Uses 98 for an unspecified age for people aged 15 or older
### Uses 99 for unspecified age

## P2_5
### 96 means lives in another household
### 97 means the person passed away
### 98 means unknown

## P2_6
### 96 means lives in another household
### 97 means the person passed away
### 98 means unknown

## GRA
### 9 means unspecified grade of education

## P2_8
### 9 means the person did not specify if they can read and write a message

## P2_10
### 8 means the person does not know if they're considered indigenous

## P2_14
### 99 means the person did not specify what they did last week related to their economic activity

## N_REN_ESP
### 00 means the husband or partner was not included in the resident list

## P4AB_2
### 00 means that it's been less than a year living without their partner in the same house
### 98 means they don't remember
### 99 means not specified

## P4B_2
### 8 means the person doesn't know where their ex-partner lives
### 9 means unspecified

## P4BC_1
### 98 means the person doesn't know their ex-partner's age
### 99 means not specified

## P4BC_2
### 98 means the person does not know their ex-partner's level of schooling

## P4C_1
### 8 means the person does not know if their ex-partner is currently enrolled in a school

## P4BC_3
### 8 means the person does not know if their ex-partner is considered indigenous

## P4_2
### 999998 means the person does not know their income
### 999999 means unspecified

## P4_2_1
### 8 means the person does not know when they're paid

## P4_3
### 8 means the person does not know if their partner/ex-partner works
### 9 means unspecified

## P4_5_AB
### 999998 means the person does not know their partner/ex-partner's income
### 999999 means unspecified

## P4_5_1_AB
### 8 means the person does not know when their partner/ex-partner receives payment from their work
### 9 means unspecified

## P4_6_AB
### 9 means the person did not specify if their partner/ex-partner contributes money to cover family expenses

## P4_7_AB
### 999998 means the person does not know how much money their partner/ex-partner gives them
### 999999 means unspecified

## P4_9_1
### 999998 means the person does not know how much money they receive from their retirement fund
### 999999 means unspecified

## P4_9_2
### 999998 means the person does not know how much money they receive from someone working in the US
### 999999 means unspecified

## P4_9_3
### 999998 means the person does not know how much money they receive from someone working in Mexico
### 999999 means unspecified

## P4_9_4
### 999998 means the person does not know how much money they receive from scholarships to their son/daughter
### 999999 means unspecified

## P4_9_5
### 999998 means the person does not know how much money they receive from scholarships to themselves
### 999999 means unspecified

## P4_9_6
### 999998 means the person does not know how much money they receive from government support programs
### 999999 means unspecified

## P4_9_7
### 999998 means the person does not know how much money they receive from other sources
### 999999 means unspecified

## P4_13_1
### 98 means the person doesn't know who owns the agricultural terrains

## P4_13_2
### 98 means the person doesn't know who owns the automobile or trucks

## P4_13_3
### 98 means the person doesn't know who owns the savings account

## P4_13_4
### 98 means the person doesn't know who owns the house

## P4_13_5
### 98 means the person doesn't know who owns the buildings

## P4_13_6
### 98 means the person doesn't know who owns the establishments

## P4_13_7
### 98 means the person doesn't know who owns the other houses

# Unspecified_cols = ['EDAD', 'P2_5', 'P2_6', 'GRA', 'P2_8', 'P2_10', 'P2_14', 'N_REN_ESP', 'P4AB_2', 'P4B_2', 'P4BC_1', 'P4BC_2', 'P4C_1', 'P4BC_3', 'P4_2', 'P4_2_1', 'P4_3', 'P4_5_AB', 'P4_5_1_AB', 'P4_6_AB', 'P4_7_AB', 'P4_9_1', 'P4_9_2', 'P4_9_3', 'P4_9_4', 'P4_9_5', 'P4_9_6', 'P4_9_7', 'P4_13_1', 'P4_13_2', 'P4_13_3', 'P4_13_4', 'P4_13_5', 'P4_13_6', 'P4_13_7', ]