## Beyond the Model: Data Ethics

**Goals:**

- Prepare a household dataset for binary classification.

- Create a logistic regression model to predict severe damage.

- Explain model prediction using odds ratio.

- Interrogate importance features based on caste.

In [1]:
import sqlite3
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from category_encoders import OneHotEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.utils.validation import check_is_fitted

### Prepare Data

Import data from sqlite server

In [3]:
def wrangle(db_path):
    # Connect to database
    conn = sqlite3.connect(db_path)

    # Construct query
    query = """
            SELECT h.*,
            s.*,
            i.vdcmun_id,
            d.damage_grade
        FROM household_demographics AS h
        JOIN id_map AS i ON i.household_id = h.household_id
        JOIN building_structure AS s ON i.building_id = s.building_id
        JOIN building_damage as d ON i.building_id = d.building_id
        WHERE district_id = 4
        """

    # Read query results into DataFrame
    df = pd.read_sql(query, conn, index_col="household_id")

    # Identify leaky columns
    drop_cols = [col for col in df.columns if "post_eq" in col]

    # Add high-cardinality / redundant column
    drop_cols.append("building_id")

    # Create binary target column
    df["damage_grade"] = df["damage_grade"].str[-1].astype(int)
    df["severe_damage"] = (df["damage_grade"] > 3).astype(int)

    # Drop old target
    drop_cols.append("damage_grade")

    # Drop multicollinearity column
    drop_cols.append("count_floors_pre_eq")
    
    # Drop columns
    df.drop(columns=drop_cols, inplace=True)

    return df

Importing from local dir:

In [5]:
df = pd.read_csv("data/household_df.csv")
df.head()

Unnamed: 0,household_id,gender_household_head,age_household_head,caste_household,education_level_household_head,income_level_household,size_household,is_bank_account_present_in_household,age_building,plinth_area_sq_ft,...,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,superstructure,vdcmun_id,severe_damage
0,16400201,Female,46,Chhetree,Class 5,Rs. 10-20 thousand,4,1,20,560,...,Flat,Mud mortar-Stone/Brick,Bamboo/Timber-Light roof,Mud,TImber/Bamboo-Mud,Not attached,Rectangular,"Stone, mud mortar",38,0
1,16408101,Male,66,Chhetree,Illiterate,Rs. 10 thousand,5,0,21,200,...,Flat,Mud mortar-Stone/Brick,Bamboo/Timber-Light roof,Mud,TImber/Bamboo-Mud,Not attached,Rectangular,"Stone, mud mortar",38,0
2,16408901,Male,54,Magar,Class 4,Rs. 10 thousand,5,1,18,315,...,Flat,Mud mortar-Stone/Brick,Bamboo/Timber-Light roof,Mud,TImber/Bamboo-Mud,Not attached,Rectangular,"Stone, mud mortar",38,0
3,16409801,Male,36,Chhetree,Class 5,Rs. 10 thousand,6,1,45,290,...,Flat,Mud mortar-Stone/Brick,Bamboo/Timber-Light roof,Mud,TImber/Bamboo-Mud,Not attached,Rectangular,"Stone, mud mortar",38,0
4,16410301,Female,39,Chhetree,Class 4,Rs. 10 thousand,3,0,21,230,...,Flat,Mud mortar-Stone/Brick,Bamboo/Timber-Light roof,Mud,TImber/Bamboo-Mud,Not attached,Rectangular,"Stone, mud mortar",38,0


High-Cardinality Features

In [8]:
df.select_dtypes("object").nunique()

gender_household_head              2
caste_household                   63
education_level_household_head    19
income_level_household             5
land_surface_condition             3
foundation_type                    5
roof_type                          3
ground_floor_type                  5
other_floor_type                   4
position                           4
plan_configuration                10
superstructure                    11
dtype: int64

In [10]:
df["caste_household"].value_counts().head()

caste_household
Gurung          15119
Brahman-Hill    13043
Chhetree         8766
Magar            8180
Sarki            6052
Name: count, dtype: int64

In [11]:
df["caste_household"].value_counts().tail()

caste_household
Byasi/Sauka    1
Kalar          1
Dhimal         1
Kalwar         1
Hyolmo         1
Name: count, dtype: int64

Next: Take the top 10 ethnicities, and group the rest as others.

In [14]:
df["caste_household"].value_counts().head(10)

caste_household
Gurung          15119
Brahman-Hill    13043
Chhetree         8766
Magar            8180
Sarki            6052
Newar            5906
Kami             3565
Tamang           2396
Kumal            2271
Damai/Dholi      1977
Name: count, dtype: int64

In [17]:
top_10=df["caste_household"].value_counts().head(10).index
top_10

Index(['Gurung', 'Brahman-Hill', 'Chhetree', 'Magar', 'Sarki', 'Newar', 'Kami',
       'Tamang', 'Kumal', 'Damai/Dholi'],
      dtype='object', name='caste_household')

In [21]:
df["caste_household"].apply(lambda c: c if c in top_10 else "other")#.value_counts()

caste_household
Gurung          15119
Brahman-Hill    13043
Chhetree         8766
other            8608
Magar            8180
Sarki            6052
Newar            5906
Kami             3565
Tamang           2396
Kumal            2271
Damai/Dholi      1977
Name: count, dtype: int64