### Linear Discrimination Analysis - Malwai

In [74]:
%matplotlib inline

import os
import sys
import json

import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
from IPython.display import display
import seaborn as sns
sns.set()

# Use for 
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn import metrics

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# Add our local functions to the path
sys.path.append(os.path.join(os.pardir, 'src'))

ALGORITHM_NAME = 'lda'
COUNTRY = 'mwi'
TRAIN_PATH, TEST_PATH, QUESTIONS_PATH = get_country_filepaths(COUNTRY)

# load text of survey questions for reference
with open(QUESTIONS_PATH, 'r') as fp:
    questions = json.load(fp)

In [75]:
# Definition of Main Variables to Predict Poverty from the Household Survey

MWI_BASIC_FEATURES = [
    'der_nchild10under', 
    'der_nmalesover10', 
    'der_nfemalesover10',
    'der_nliterate', 
    'der_nemployedpastyear',
    'hld_electricity__Yes',
    'hld_toilet__None',
    'cons_0504__Yes',
    'cons_0508__Yes'
]

In [76]:
# Standardise Data
def standardize(df, numeric_only=True):
    if numeric_only is True:
    # find non-boolean columns
        cols = df.loc[:,df.dtypes != 'uint8'].columns
    else:
        cols = df.columns
    for field in cols:
        mean, std = df[field].mean(), df[field].std()
        # account for constant columns
        if np.all(df[field]-mean != 0):
            df.loc[:,field] = (df[field]-mean)/std
    
    return df

In [77]:
def drop_duplicate_columns(df, ignore=[], inplace=False):
    if not inplace:
        df = df.copy()

    # pairwise correlations
    corr = df.corr()
    corr[corr.columns] = np.triu(corr, k=1)
    corr = corr.stack()

    # for any perfectly correlated variables, drop one of them
    for ix, r in corr[(corr == 1)].to_frame().iterrows():
        first, second = ix

        if second in df.columns and second not in ignore:
            df.drop(second, inplace=True, axis=1)

    if not inplace:
        return df

In [52]:
# Split the data for analysis
def split_features_labels_weights(df,
                                  weights=['wta_pop', 'wta_hh'],
                                  weights_col=['wta_pop'],
                                  label_col=['poor']):

    '''Split data into features, labels, and weights dataframes'''
    data = df
    return (data.drop(weights + label_col, axis=1),
            data[label_col],
            data[weights_col])

In [80]:
# Load the processed Mwai household:
mwi_hhold = pd.read_csv(r"C:\Users\micha\Documents\GitHub\mkp_code\Institute of Data Course\Project 3\data\csv_data\MWI_2010_household.csv")

In [78]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

NameError: name 'X' is not defined

In [81]:
# load training data
X_train, y_train, w_train = split_features_labels_weights(mwi_hhold)

In [82]:
X_train.head

<bound method NDFrame.head of                 hid  der_hhsize  hld_rooms  hld_nbcellpho  hld_selfscale  \
0      101010160009           6          2            0.0            2.0   
1      101010160033           4          3            0.0            1.0   
2      101010160060          10          6            1.0            1.0   
3      101010160068           9          3            0.0            2.0   
4      101010160069           1          2            0.0            3.0   
...             ...         ...        ...            ...            ...   
12239  315556140342           3          2            1.0            1.0   
12240  315556140365           2          1            0.0            2.0   
12241  315556140388           2          2            1.0            2.0   
12242  315556140411           5          2            1.0            1.0   
12243  315556140434           7          4            1.0            3.0   

       der_nchild10under  der_nmalesover10  der_nfemaleso

In [83]:
y_test = 

<bound method NDFrame.head of                 hid    wta_hh    wta_pop  der_hhsize   poor  hld_rooms  \
10396  311040860017  154.1838  1079.2866           7   True          2   
5957   210370300058   38.0793   228.4758           6  False          3   
7847   304034440117  171.6786  1030.0717           6  False          5   
10050  310016470133  304.8801  2134.1606           7   True          2   
7475   303055060022  438.8862   877.7724           2   True          2   
...             ...       ...        ...         ...    ...        ...   
6461   210875310120  383.0713  2298.4277           6  False          5   
6367   210815730169  338.8980  3050.0820           9  False          7   
1624   105022680350  538.7578  2155.0312           4   True          3   
1780   105055080092  414.3365  2900.3555           7   True          4   
7889   304037660001  205.5182   205.5182           1  False          2   

       hld_nbcellpho  hld_selfscale  der_nchild10under  der_nmalesover10  ...  \


In [55]:
# summarize loaded data
print('Data has {:,} rows and {:,} columns' \
        .format(*X_train.shape))

print('Percent poor: {:0.1%} \tPercent non-poor: {:0.1%}' \
        .format(*y_train.poor.value_counts(normalize=True, ascending=True)))

# print first 5 rows of data
X_train.head()

Data has 12,244 rows and 485 columns
Percent poor: 45.1% 	Percent non-poor: 54.9%


Unnamed: 0,hid,der_hhsize,hld_rooms,hld_nbcellpho,hld_selfscale,der_nchild10under,der_nmalesover10,der_nfemalesover10,der_nliterate,der_nemployedpastyear,...,com_medicines__Yes,com_clinic__Yes,com_distclinic__11 to 15 kilometers,com_distclinic__16 to 20 kilometers,com_distclinic__21 to 25 kilometers,com_distclinic__26 to 30 kilometers,com_distclinic__6 to 10 kilometers,com_distclinic__Above 30 kilometers,com_distclinic__nan,com_bank__Yes
0,101010160009,6,2,0.0,2.0,4,1,1,2,0,...,0,0,0,0,1,0,0,0,0,0
1,101010160033,4,3,0.0,1.0,1,2,1,1,0,...,0,0,0,0,1,0,0,0,0,0
2,101010160060,10,6,1.0,1.0,4,3,3,4,1,...,0,0,0,0,1,0,0,0,0,0
3,101010160068,9,3,0.0,2.0,4,3,2,6,0,...,0,0,0,0,1,0,0,0,0,0
4,101010160069,1,2,0.0,3.0,0,1,0,1,0,...,0,0,0,0,1,0,0,0,0,0


In [56]:
selected_columns = MWI_BASIC_FEATURES
print("X shape with selected columns:", X_train[selected_columns].shape)

X shape with selected columns: (12244, 9)


In [63]:
# Select a few columns for this example
selected_columns = [
    'der_hhsize', 
    'der_nchild10under', 
    'der_nmalesover10', 
    'der_nfemalesover10',
    'der_nliterate', 
    'der_nemployedpastyear',
    'hld_electricity__Yes',
    'hld_toilet__None',
    'cons_0504__Yes',
    'cons_0508__Yes'
]

print("X shape with selected columns:", mwi_train[selected_columns].shape)

X shape with selected columns: (9183, 10)


In [64]:
# Same method for getting the coefficients as we used with Malawi
def get_coefs_df(X, coefs, index=None):
    coefs_df = pd.DataFrame(np.std(X, 0)*coefs)
    coefs_df.columns = ["coef_std"]
    coefs_df['coef'] = coefs
    coefs_df['abs'] = coefs_df.coef_std.apply(abs)
    if index is not None:
        coefs_df.index = index
    return coefs_df

In [65]:
# Fit the model
model = LinearDiscriminantAnalysis()
%time model.fit(X_train, y_train)

# Get an initial score
%time score = model.score(X_train, y_train)
print("In-sample score: {:0.2%}".format(score))



Wall time: 2.38 s
Wall time: 74 ms
In-sample score: 87.94%


In [71]:
# Store coefficients
coefs = get_coefs_df(X_train, model.coef_[0])


In [None]:
# Load the test set
X_test, y_test, w_test = load_data(TEST_PATH, selected_columns=selected_columns)

# Run the model
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:,1]

In [72]:
# Run the model
y_pred = model.predict(X_train)
y_prob = model.predict_proba(X_train)[:,1]

In [73]:
# Evaluate performance and store model
metrics = evaluation.evaluate_model(y_train, y_pred, y_prob, 
                                    compare_models=['lr_simple'],
                                    store_model=True,
                                    model_name='simple', 
                                    prefix=ALGORITHM_NAME,
                                    country=COUNTRY,
                                    model=model, 
                                    features=coefs)

TypeError: auc() got an unexpected keyword argument 'reorder'

In [24]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

NameError: name 'X' is not defined