### Logistic Regression Classifier - Malwai

In [1]:
%matplotlib inline

import os
import sys
import json
import keras

import numpy as np
import pandas as pd

from pathlib import Path
from pathlib import PureWindowsPath

from matplotlib import pyplot as plt
from IPython.display import display
import seaborn as sns
sns.set()


import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn import metrics

# Add our local functions to the path
sys.path.append(os.path.join(os.pardir, 'src'))
from models import evaluation
from data.load_data import (get_country_filepaths, 
                            split_features_labels_weights)


ALGORITHM_NAME = 'lr'
COUNTRY = 'mwi'


Using TensorFlow backend.


In [2]:
# Load the processed Mwai household:
mwi_hhold = pd.read_csv(r"C:\Users\micha\Documents\GitHub\mkp_code\Institute of Data Course\Project 3\data\csv_data\MWI_2010_household.csv")

In [73]:
mwi_hhold.poor.head

<bound method NDFrame.head of 0        0
1        0
2        0
3        0
4        1
        ..
12239    1
12240    1
12241    1
12242    0
12243    1
Name: poor, Length: 12244, dtype: int64>

In [74]:
y_test = mwi_hhold.poor

In [3]:
# Split Malawi data and Indonesia data
mwi_train, mwi_test = train_test_split(mwi_hhold, 
                                       test_size=0.25,
                                       random_state=1443,
                                       stratify=mwi_hhold.poor)

In [4]:
# summarize Train loaded data
print('Data has {:,} rows and {:,} columns' \
        .format(*mwi_train.shape))

print('Percent poor: {:0.1%} \tPercent non-poor: {:0.1%}' \
        .format(*mwi_train.poor.value_counts(normalize=True, ascending=True)))

# print first 5 rows of data
mwi_train.head()

Data has 9,183 rows and 488 columns
Percent poor: 45.1% 	Percent non-poor: 54.9%


Unnamed: 0,hid,wta_hh,wta_pop,der_hhsize,poor,hld_rooms,hld_nbcellpho,hld_selfscale,der_nchild10under,der_nmalesover10,...,com_medicines__Yes,com_clinic__Yes,com_distclinic__11 to 15 kilometers,com_distclinic__16 to 20 kilometers,com_distclinic__21 to 25 kilometers,com_distclinic__26 to 30 kilometers,com_distclinic__6 to 10 kilometers,com_distclinic__Above 30 kilometers,com_distclinic__nan,com_bank__Yes
7927,304044580160,170.1948,850.974,5,True,3,0.0,1.0,3,1,...,0,0,0,0,0,0,0,0,0,0
4210,205205820123,188.8692,566.6076,3,False,2,1.0,3.0,1,1,...,1,0,0,0,0,0,0,0,0,0
457,102016880095,143.9194,863.5164,6,True,5,0.0,2.0,3,1,...,0,0,0,0,0,0,0,0,0,0
11951,315356330210,234.7279,704.1837,3,False,3,3.0,3.0,1,1,...,1,1,0,0,0,0,0,0,1,1
11428,313061660067,78.5482,157.0964,2,False,1,0.0,2.0,0,1,...,1,0,0,1,0,0,0,0,0,0


In [5]:
# Select a few columns for this example
selected_columns = [
    'der_hhsize', 
    'der_nchild10under', 
    'der_nmalesover10', 
    'der_nfemalesover10',
    'der_nliterate', 
    'der_nemployedpastyear',
    'hld_electricity__Yes',
    'hld_toilet__None',
    'cons_0504__Yes',
    'cons_0508__Yes'
]

print("X shape with selected columns:", mwi_train[selected_columns].shape)

X shape with selected columns: (9183, 10)


## Checking for Multicollinearity:
### Calcualting the variance inflation factor (VIF) of each feature.

In [6]:
def get_vif(X, intercept_col='intercept'):
    if intercept_col is not None and intercept_col in X.columns:
        X = X.copy().drop(intercept_col, axis=1)
    
    vi_factors = [variance_inflation_factor(X.values, i)
                             for i in range(X.shape[1])]
    
    return pd.Series(vi_factors,
                     index=X.columns,
                     name='variance_inflaction_factor')

In [7]:
get_vif(mwi_train[selected_columns])

der_hhsize               1142.050480
der_nchild10under         206.466307
der_nmalesover10          138.021692
der_nfemalesover10        141.867242
der_nliterate               5.616642
der_nemployedpastyear       1.533041
hld_electricity__Yes        1.471163
hld_toilet__None            1.091630
cons_0504__Yes              1.344652
cons_0508__Yes              1.341629
Name: variance_inflaction_factor, dtype: float64

### Standardization 

In [8]:
# Standardize features
def standardize(df, numeric_only=True):
    if numeric_only is True:
    # find non-boolean columns
        cols = df.loc[:, df.dtypes != 'uint8'].columns
    else:
        cols = df.columns
    for field in cols:
        mean, std = df[field].mean(), df[field].std()
        # account for constant columns
        if np.all(df[field] - mean != 0):
            df.loc[:, field] = (df[field] - mean) / std
    
    return df

In [81]:
standardize(mwi_train)
mwi_train.head()

Unnamed: 0,hid,wta_hh,wta_pop,der_hhsize,poor,hld_rooms,hld_nbcellpho,hld_selfscale,der_nchild10under,der_nmalesover10,...,com_medicines__Yes,com_clinic__Yes,com_distclinic__11 to 15 kilometers,com_distclinic__16 to 20 kilometers,com_distclinic__21 to 25 kilometers,com_distclinic__26 to 30 kilometers,com_distclinic__6 to 10 kilometers,com_distclinic__Above 30 kilometers,com_distclinic__nan,com_bank__Yes
7517,0.877705,0.349928,0.391977,0.176255,0.907005,1.167342,-0.612173,1.096303,-1.20883,1.497847,...,-1.708513,-0.591278,3.098851,-0.198751,-0.091951,-0.152601,-0.541448,-0.14179,-0.591278,-0.25068
5259,-0.344649,0.460407,0.801713,0.626351,-1.10241,-0.520933,-0.612173,-1.13296,0.249676,0.545039,...,0.585241,1.691069,-0.322665,-0.198751,-0.091951,-0.152601,-0.541448,-0.14179,1.691069,-0.25068
6703,0.851865,0.688945,0.677486,0.176255,-1.10241,-1.36507,-0.612173,1.096303,0.249676,-0.407768,...,-1.708513,-0.591278,-0.322665,-0.198751,10.874137,-0.152601,-0.541448,-0.14179,-0.591278,-0.25068
10973,0.993296,-0.168602,0.166037,0.626351,-1.10241,-1.36507,-0.612173,-1.13296,0.249676,0.545039,...,0.585241,-0.591278,-0.322665,-0.198751,-0.091951,-0.152601,1.846697,-0.14179,-0.591278,-0.25068
7994,0.89067,-0.433803,-0.101975,0.626351,-1.10241,-0.520933,-0.612173,-0.018328,1.708181,-0.407768,...,-1.708513,1.691069,-0.322665,-0.198751,-0.091951,-0.152601,-0.541448,-0.14179,1.691069,-0.25068


In [9]:
get_vif(mwi_train[selected_columns])

der_hhsize               1142.050480
der_nchild10under         206.466307
der_nmalesover10          138.021692
der_nfemalesover10        141.867242
der_nliterate               5.616642
der_nemployedpastyear       1.533041
hld_electricity__Yes        1.471163
hld_toilet__None            1.091630
cons_0504__Yes              1.344652
cons_0508__Yes              1.341629
Name: variance_inflaction_factor, dtype: float64

In [10]:
selected_columns.remove('der_hhsize')
print(selected_columns)

get_vif(mwi_train[selected_columns])

['der_nchild10under', 'der_nmalesover10', 'der_nfemalesover10', 'der_nliterate', 'der_nemployedpastyear', 'hld_electricity__Yes', 'hld_toilet__None', 'cons_0504__Yes', 'cons_0508__Yes']


der_nchild10under        2.228858
der_nmalesover10         3.678918
der_nfemalesover10       4.027538
der_nliterate            5.616554
der_nemployedpastyear    1.528315
hld_electricity__Yes     1.470804
hld_toilet__None         1.091630
cons_0504__Yes           1.344010
cons_0508__Yes           1.341605
Name: variance_inflaction_factor, dtype: float64

In [11]:
 mwi_train.poor

7927      True
4210     False
457       True
11951    False
11428    False
         ...  
5430      True
11107    False
769      False
8471     False
7712     False
Name: poor, Length: 9183, dtype: bool

In [13]:
# X is standardised - prepare for model
y_train = mwi_train.poor
X_train = mwi_train[selected_columns]

In [14]:
model = sm.Logit(y_train, X_train[selected_columns])
model = model.fit()

Optimization terminated successfully.
         Current function value: 0.517076
         Iterations 8


In [15]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

NameError: name 'X' is not defined