### Logistic Regression Classifier - Malwai

In [42]:
%matplotlib inline

import os
import sys
import json
import keras

import numpy as np
import pandas as pd

from pathlib import Path
from pathlib import PureWindowsPath

from matplotlib import pyplot as plt
from IPython.display import display
import seaborn as sns
sns.set()


import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn import metrics

# Add our local functions to the path
sys.path.append(os.path.join(os.pardir, 'src'))
from models import evaluation
from data.load_data import (get_country_filepaths, 
                            split_features_labels_weights)


ALGORITHM_NAME = 'lr'
COUNTRY = 'mwi'


In [63]:
# Load the processed Mwai household:
mwi_hhold = pd.read_csv(r"C:\Users\micha\Documents\GitHub\mkp_code\Institute of Data Course\Project 3\data\csv_data\MWI_2010_household.csv")

In [65]:
mwi_hhold.poor = mwi_hhold.poor.replace('Poor',0)
mwi_hhold.poor = mwi_hhold.poor.replace('Non-poor',1)

In [66]:
# Split Malawi data and Indonesia data
mwi_train, mwi_test = train_test_split(mwi_hhold, 
                                       test_size=0.25,
                                       random_state=1443,
                                       stratify=mwi_hhold.poor)

In [67]:
# summarize Train loaded data
print('Data has {:,} rows and {:,} columns' \
        .format(*mwi_train.shape))

print('Percent poor: {:0.1%} \tPercent non-poor: {:0.1%}' \
        .format(*mwi_train.poor.value_counts(normalize=True, ascending=True)))

# print first 5 rows of data
mwi_train.head()

Data has 9,183 rows and 488 columns
Percent poor: 45.1% 	Percent non-poor: 54.9%


Unnamed: 0,hid,wta_hh,wta_pop,der_hhsize,poor,hld_rooms,hld_nbcellpho,hld_selfscale,der_nchild10under,der_nmalesover10,...,com_medicines__Yes,com_clinic__Yes,com_distclinic__11 to 15 kilometers,com_distclinic__16 to 20 kilometers,com_distclinic__21 to 25 kilometers,com_distclinic__26 to 30 kilometers,com_distclinic__6 to 10 kilometers,com_distclinic__Above 30 kilometers,com_distclinic__nan,com_bank__Yes
7927,304044580160,170.1948,850.974,5,True,3,0.0,1.0,3,1,...,0,0,0,0,0,0,0,0,0,0
4210,205205820123,188.8692,566.6076,3,False,2,1.0,3.0,1,1,...,1,0,0,0,0,0,0,0,0,0
457,102016880095,143.9194,863.5164,6,True,5,0.0,2.0,3,1,...,0,0,0,0,0,0,0,0,0,0
11951,315356330210,234.7279,704.1837,3,False,3,3.0,3.0,1,1,...,1,1,0,0,0,0,0,0,1,1
11428,313061660067,78.5482,157.0964,2,False,1,0.0,2.0,0,1,...,1,0,0,1,0,0,0,0,0,0


In [68]:
# Select a few columns for this example
selected_columns = [
    'der_hhsize', 
    'der_nchild10under', 
    'der_nmalesover10', 
    'der_nfemalesover10',
    'der_nliterate', 
    'der_nemployedpastyear',
    'hld_electricity__Yes',
    'hld_toilet__None',
    'cons_0504__Yes',
    'cons_0508__Yes'
]

print("X shape with selected columns:", mwi_train[selected_columns].shape)

X shape with selected columns: (9183, 10)


## Checking for Multicollinearity:
### Calcualting the variance inflation factor (VIF) of each feature.

In [69]:
def get_vif(X, intercept_col='intercept'):
    if intercept_col is not None and intercept_col in X.columns:
        X = X.copy().drop(intercept_col, axis=1)
    
    vi_factors = [variance_inflation_factor(X.values, i)
                             for i in range(X.shape[1])]
    
    return pd.Series(vi_factors,
                     index=X.columns,
                     name='variance_inflaction_factor')

In [70]:
get_vif(mwi_train[selected_columns])

der_hhsize               1142.050480
der_nchild10under         206.466307
der_nmalesover10          138.021692
der_nfemalesover10        141.867242
der_nliterate               5.616642
der_nemployedpastyear       1.533041
hld_electricity__Yes        1.471163
hld_toilet__None            1.091630
cons_0504__Yes              1.344652
cons_0508__Yes              1.341629
Name: variance_inflaction_factor, dtype: float64

### Standardization 

In [71]:
# Standardize features
def standardize(df, numeric_only=True):
    if numeric_only is True:
    # find non-boolean columns
        cols = df.loc[:, df.dtypes != 'uint8'].columns
    else:
        cols = df.columns
    for field in cols:
        mean, std = df[field].mean(), df[field].std()
        # account for constant columns
        if np.all(df[field] - mean != 0):
            df.loc[:, field] = (df[field] - mean) / std
    
    return df

In [72]:
standardize(mwi_train)
mwi_train.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


Unnamed: 0,hid,wta_hh,wta_pop,der_hhsize,poor,hld_rooms,hld_nbcellpho,hld_selfscale,der_nchild10under,der_nmalesover10,...,com_medicines__Yes,com_clinic__Yes,com_distclinic__11 to 15 kilometers,com_distclinic__16 to 20 kilometers,com_distclinic__21 to 25 kilometers,com_distclinic__26 to 30 kilometers,com_distclinic__6 to 10 kilometers,com_distclinic__Above 30 kilometers,com_distclinic__nan,com_bank__Yes
7927,0.898019,-0.463091,-0.289299,0.183527,1.10241,0.326073,-0.597364,-1.131563,0.97709,-0.398378,...,-1.700716,-0.593458,-0.321636,-0.198159,-0.096077,-0.150344,-0.545987,-0.150344,-0.592787,-0.25844
4210,-0.375839,-0.356672,-0.560014,-0.721591,-0.907005,-0.512205,0.398207,1.089716,-0.483297,-0.398378,...,0.587923,-0.593458,-0.321636,-0.198159,-0.096077,-0.150344,-0.545987,-0.150344,-0.592787,-0.25844
457,-1.705763,-0.612824,-0.277359,0.636087,1.10241,2.002628,-0.597364,-0.020924,0.97709,-0.398378,...,-1.700716,-0.593458,-0.321636,-0.198159,-0.096077,-0.150344,-0.545987,-0.150344,-0.592787,-0.25844
11951,1.043807,-0.095341,-0.429043,-0.721591,-0.907005,0.326073,2.389348,1.089716,-0.483297,-0.398378,...,0.587923,1.684855,-0.321636,-0.198159,-0.096077,-0.150344,-0.545987,-0.150344,1.686762,3.868954
11428,1.014233,-0.98535,-0.949866,-1.174151,-0.907005,-1.350483,-0.597364,-0.020924,-1.21349,-0.398378,...,0.587923,-0.593458,-0.321636,5.04591,-0.096077,-0.150344,-0.545987,-0.150344,-0.592787,-0.25844


In [73]:
get_vif(mwi_train[selected_columns])

der_hhsize               214.615373
der_nchild10under         83.536365
der_nmalesover10          48.129934
der_nfemalesover10        38.840336
der_nliterate              2.256896
der_nemployedpastyear      1.240740
hld_electricity__Yes       1.344728
hld_toilet__None           1.043755
cons_0504__Yes             1.194788
cons_0508__Yes             1.077015
Name: variance_inflaction_factor, dtype: float64

In [74]:
selected_columns.remove('der_hhsize')
print(selected_columns)

get_vif(mwi_train[selected_columns])

['der_nchild10under', 'der_nmalesover10', 'der_nfemalesover10', 'der_nliterate', 'der_nemployedpastyear', 'hld_electricity__Yes', 'hld_toilet__None', 'cons_0504__Yes', 'cons_0508__Yes']


der_nchild10under        1.054748
der_nmalesover10         1.511700
der_nfemalesover10       1.422928
der_nliterate            2.256803
der_nemployedpastyear    1.237311
hld_electricity__Yes     1.344378
hld_toilet__None         1.043731
cons_0504__Yes           1.194295
cons_0508__Yes           1.077012
Name: variance_inflaction_factor, dtype: float64

In [75]:
 mwi_train.poor

7927     1.102410
4210    -0.907005
457      1.102410
11951   -0.907005
11428   -0.907005
           ...   
5430     1.102410
11107   -0.907005
769     -0.907005
8471    -0.907005
7712    -0.907005
Name: poor, Length: 9183, dtype: float64

In [76]:
X_train = mwi_train[selected_columns]
y_train = mwi_train.poor

In [53]:
X_train.head

<bound method NDFrame.head of        der_nchild10under  der_nmalesover10  der_nfemalesover10  der_nliterate  \
7927            0.977090         -0.398378           -0.549568      -0.083075   
4210           -0.483297         -0.398378           -0.549568      -0.083075   
457             0.977090         -0.398378            0.535040      -0.083075   
11951          -0.483297         -0.398378           -0.549568       0.492247   
11428          -1.213490         -0.398378           -0.549568      -0.083075   
...                  ...               ...                 ...            ...   
5430           -0.483297          0.572252            1.619648      -0.658397   
11107           0.246897         -0.398378            0.535040      -0.083075   
769            -1.213490         -0.398378           -1.634176      -0.658397   
8471           -0.483297         -0.398378           -0.549568      -0.083075   
7712           -0.483297         -0.398378           -0.549568      -0.083075  

In [77]:
logreg = LogisticRegression()

In [78]:
# Fit Model
logreg.fit(X_train, y_train)

ValueError: Unknown label type: 'continuous'

In [None]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)