In [61]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from sklearn.utils import resample

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

import pickle
import warnings
warnings.filterwarnings('ignore')

### Loading the dataset

In [156]:
data = pd.read_csv('Train_v2.csv')

In [157]:
df = data.copy()
df.head()

Unnamed: 0,country,year,uniqueid,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type,bank_account
0,Kenya,2018,uniqueid_1,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed,Yes
1,Kenya,2018,uniqueid_2,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent,No
2,Kenya,2018,uniqueid_3,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed,Yes
3,Kenya,2018,uniqueid_4,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private,No
4,Kenya,2018,uniqueid_5,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed,No


In [158]:
df.bank_account.value_counts()

No     20212
Yes     3312
Name: bank_account, dtype: int64

### Descriptive information about the dataset

In [159]:
df.describe()

Unnamed: 0,year,household_size,age_of_respondent
count,23524.0,23524.0,23524.0
mean,2016.975939,3.797483,38.80522
std,0.847371,2.227613,16.520569
min,2016.0,1.0,16.0
25%,2016.0,2.0,26.0
50%,2017.0,3.0,35.0
75%,2018.0,5.0,49.0
max,2018.0,21.0,100.0


### Checking for missing/NAN value

In [160]:
df.isnull().sum()

country                   0
year                      0
uniqueid                  0
location_type             0
cellphone_access          0
household_size            0
age_of_respondent         0
gender_of_respondent      0
relationship_with_head    0
marital_status            0
education_level           0
job_type                  0
bank_account              0
dtype: int64

In [161]:
df.isna().sum()

country                   0
year                      0
uniqueid                  0
location_type             0
cellphone_access          0
household_size            0
age_of_respondent         0
gender_of_respondent      0
relationship_with_head    0
marital_status            0
education_level           0
job_type                  0
bank_account              0
dtype: int64

In [162]:
df.head()

Unnamed: 0,country,year,uniqueid,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type,bank_account
0,Kenya,2018,uniqueid_1,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed,Yes
1,Kenya,2018,uniqueid_2,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent,No
2,Kenya,2018,uniqueid_3,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed,Yes
3,Kenya,2018,uniqueid_4,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private,No
4,Kenya,2018,uniqueid_5,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed,No


### Dropping unwanted columns

In [163]:
df = df.drop('country',axis=1)

In [164]:
df = df.drop('year',axis=1)

In [165]:
df = df.drop('uniqueid',axis=1)

### Converting categorical featuers to binary values

In [166]:
location = pd.get_dummies(df.location_type)

In [167]:
cell_phone = pd.get_dummies(df.cellphone_access)

In [168]:
Gender = pd.get_dummies(df.gender_of_respondent)

In [169]:
df.relationship_with_head.unique()

array(['Spouse', 'Head of Household', 'Other relative', 'Child', 'Parent',
       'Other non-relatives'], dtype=object)

In [170]:
Relationship_HOH = pd.get_dummies(df.relationship_with_head)

In [171]:
Relationship_HOH.head()

Unnamed: 0,Child,Head of Household,Other non-relatives,Other relative,Parent,Spouse
0,0,0,0,0,0,1
1,0,1,0,0,0,0
2,0,0,0,1,0,0
3,0,1,0,0,0,0
4,1,0,0,0,0,0


In [172]:
Relationship_HOH.columns.values

array(['Child', 'Head of Household', 'Other non-relatives',
       'Other relative', 'Parent', 'Spouse'], dtype=object)

In [173]:
columns = ['Rlship_HOH(Child)', 'Head of Household', 'Rlship_HOH(Other non-relatives)',
       'Rlship_HOH(Other relative)', 'Rlship_HOH(Parent)', 'Rlship_HOH(Spouse)']

In [174]:
Relationship_HOH.columns=columns 

In [175]:
marital_status = pd.get_dummies(df.marital_status)

In [176]:
education_level = pd.get_dummies(df.education_level)

In [177]:
job_type = pd.get_dummies(df.job_type)

In [178]:
bank_account = df.bank_account.map({'Yes':1, 'No':0})

In [179]:
bank_account = pd.DataFrame(bank_account)

In [180]:
df = df.drop(['location_type','cellphone_access','gender_of_respondent','relationship_with_head',
             'marital_status','education_level','job_type','bank_account'],axis= 1)

In [181]:
df = pd.concat([df,location,cell_phone,Gender,Relationship_HOH,marital_status,education_level,job_type,bank_account], axis = 1)

In [182]:
pd.options.display.max_columns=None

In [183]:
df.head()

Unnamed: 0,household_size,age_of_respondent,Rural,Urban,No,Yes,Female,Male,Rlship_HOH(Child),Head of Household,Rlship_HOH(Other non-relatives),Rlship_HOH(Other relative),Rlship_HOH(Parent),Rlship_HOH(Spouse),Divorced/Seperated,Dont know,Married/Living together,Single/Never Married,Widowed,No formal education,Other/Dont know/RTA,Primary education,Secondary education,Tertiary education,Vocational/Specialised training,Dont Know/Refuse to answer,Farming and Fishing,Formally employed Government,Formally employed Private,Government Dependent,Informally employed,No Income,Other Income,Remittance Dependent,Self employed,bank_account
0,3,24,1,0,0,1,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1
1,5,70,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,5,26,0,1,0,1,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1
3,5,34,1,0,0,1,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0
4,8,26,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0


### Handing class imbalance

In [207]:
class_0 = df[df.bank_account == 0]
class_1 = df[df.bank_account == 1]

In [211]:
account_new = resample(class_1, replace=True, n_samples=20212,random_state = 42)

In [212]:
upsample = pd.concat([class_0,account_new])

In [217]:
upsample.bank_account.value_counts()

1    20212
0    20212
Name: bank_account, dtype: int64

### Another checkpoint

In [237]:
new_df = upsample.copy()

In [240]:
new_df = new_df.sample(frac = 1, random_state = 1)

In [241]:
new_df.head()

Unnamed: 0,household_size,age_of_respondent,Rural,Urban,No,Yes,Female,Male,Rlship_HOH(Child),Head of Household,Rlship_HOH(Other non-relatives),Rlship_HOH(Other relative),Rlship_HOH(Parent),Rlship_HOH(Spouse),Divorced/Seperated,Dont know,Married/Living together,Single/Never Married,Widowed,No formal education,Other/Dont know/RTA,Primary education,Secondary education,Tertiary education,Vocational/Specialised training,Dont Know/Refuse to answer,Farming and Fishing,Formally employed Government,Formally employed Private,Government Dependent,Informally employed,No Income,Other Income,Remittance Dependent,Self employed,bank_account
4111,4,35,1,0,0,1,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1
12675,5,47,0,1,0,1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1
6685,2,24,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
15511,2,48,0,1,0,1,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1
3759,7,36,1,0,0,1,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1


### Created a check point

In [242]:
clean_df = new_df.copy()

### Saving cleaned dataset locally as .csv file

In [243]:
clean_df.to_csv(r'C:\Users\obehi\Desktop\Biintel\Data set\ml\lish-moa\clean_df.csv', index = False)

### Standardizing the dataset

In [245]:
scaler = StandardScaler()

In [246]:
df_std = scaler.fit_transform(clean_df)

### Spliting features and target

In [247]:
featuers = df_std[:,:-1]

In [248]:
target = clean_df.iloc[:,-1:]

In [249]:
featuers = featuers.astype(int)

In [250]:
target= target.astype(int)

In [252]:
x_train,x_test,y_train,y_test = train_test_split(featuers, target,random_state=42) 

### Fitting the model

In [254]:
clf = RandomForestClassifier().fit(x_train, y_train)

### Predicting the target outcome

In [255]:
y_pred = clf.predict(x_test)

In [256]:
y_pred

array([0, 1, 1, ..., 0, 1, 0])

### Measuring model accuracy

In [257]:
cross_score = cross_val_score(clf,y_test,y_pred, cv=5)

In [258]:
print(cross_score)
print('mean: ',np.mean(cross_score))

[0.8264095  0.81454006 0.80999505 0.8149431  0.81683168]
mean:  0.8165438774990867


### Saving the model using pickle

In [259]:
pkl_clf = 'model.pkl'

with open(pkl_clf,'wb') as file:
    pickle.dump(clf, file)

In [260]:
pickle.dump(scaler,open('scaler.pickle','wb'))

### Loading the model again via pickle it to see if it is working fine for reuse

In [261]:
with open(pkl_clf,'rb') as file:
    pk_model = pickle.load(file)

In [262]:
pk_model

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)