In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from sklearn.utils import resample

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

import pickle
import warnings
warnings.filterwarnings('ignore')

### Loading the dataset

In [2]:
data = pd.read_csv('Train_v2.csv')

In [3]:
df = data.copy()
df.head()

Unnamed: 0,country,year,uniqueid,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type,bank_account
0,Kenya,2018,uniqueid_1,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed,Yes
1,Kenya,2018,uniqueid_2,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent,No
2,Kenya,2018,uniqueid_3,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed,Yes
3,Kenya,2018,uniqueid_4,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private,No
4,Kenya,2018,uniqueid_5,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed,No


In [4]:
df.bank_account.value_counts()

No     20212
Yes     3312
Name: bank_account, dtype: int64

### Descriptive information about the dataset

In [5]:
df.describe()

Unnamed: 0,year,household_size,age_of_respondent
count,23524.0,23524.0,23524.0
mean,2016.975939,3.797483,38.80522
std,0.847371,2.227613,16.520569
min,2016.0,1.0,16.0
25%,2016.0,2.0,26.0
50%,2017.0,3.0,35.0
75%,2018.0,5.0,49.0
max,2018.0,21.0,100.0


### Checking for missing/NAN value

In [6]:
df.isnull().sum()

country                   0
year                      0
uniqueid                  0
location_type             0
cellphone_access          0
household_size            0
age_of_respondent         0
gender_of_respondent      0
relationship_with_head    0
marital_status            0
education_level           0
job_type                  0
bank_account              0
dtype: int64

In [7]:
df.isna().sum()

country                   0
year                      0
uniqueid                  0
location_type             0
cellphone_access          0
household_size            0
age_of_respondent         0
gender_of_respondent      0
relationship_with_head    0
marital_status            0
education_level           0
job_type                  0
bank_account              0
dtype: int64

In [8]:
df.head()

Unnamed: 0,country,year,uniqueid,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type,bank_account
0,Kenya,2018,uniqueid_1,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed,Yes
1,Kenya,2018,uniqueid_2,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent,No
2,Kenya,2018,uniqueid_3,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed,Yes
3,Kenya,2018,uniqueid_4,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private,No
4,Kenya,2018,uniqueid_5,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed,No


### Dropping unwanted columns

In [9]:
df = df.drop('country',axis=1)

In [10]:
df = df.drop('year',axis=1)

In [11]:
df = df.drop('uniqueid',axis=1)

### Converting categorical featuers to binary values

In [12]:
location = pd.get_dummies(df.location_type)

In [13]:
cell_phone = pd.get_dummies(df.cellphone_access)

In [14]:
Gender = pd.get_dummies(df.gender_of_respondent)

In [15]:
df.relationship_with_head.unique()

array(['Spouse', 'Head of Household', 'Other relative', 'Child', 'Parent',
       'Other non-relatives'], dtype=object)

In [16]:
Relationship_HOH = pd.get_dummies(df.relationship_with_head)

In [17]:
Relationship_HOH.head()

Unnamed: 0,Child,Head of Household,Other non-relatives,Other relative,Parent,Spouse
0,0,0,0,0,0,1
1,0,1,0,0,0,0
2,0,0,0,1,0,0
3,0,1,0,0,0,0
4,1,0,0,0,0,0


In [18]:
Relationship_HOH.columns.values

array(['Child', 'Head of Household', 'Other non-relatives',
       'Other relative', 'Parent', 'Spouse'], dtype=object)

In [19]:
columns = ['Rlship_HOH(Child)', 'Head of Household', 'Rlship_HOH(Other non-relatives)',
       'Rlship_HOH(Other relative)', 'Rlship_HOH(Parent)', 'Rlship_HOH(Spouse)']

In [20]:
Relationship_HOH.columns=columns 

In [21]:
marital_status = pd.get_dummies(df.marital_status)

In [22]:
education_level = pd.get_dummies(df.education_level)

In [23]:
job_type = pd.get_dummies(df.job_type)

In [24]:
bank_account = df.bank_account.map({'Yes':1, 'No':0})

In [25]:
bank_account = pd.DataFrame(bank_account)

In [26]:
df = df.drop(['location_type','cellphone_access','gender_of_respondent','relationship_with_head',
             'marital_status','education_level','job_type','bank_account'],axis= 1)

In [27]:
df = pd.concat([df,location,cell_phone,Gender,Relationship_HOH,marital_status,education_level,job_type,bank_account], axis = 1)

In [28]:
pd.options.display.max_columns=None

In [29]:
df.head()

Unnamed: 0,household_size,age_of_respondent,Rural,Urban,No,Yes,Female,Male,Rlship_HOH(Child),Head of Household,Rlship_HOH(Other non-relatives),Rlship_HOH(Other relative),Rlship_HOH(Parent),Rlship_HOH(Spouse),Divorced/Seperated,Dont know,Married/Living together,Single/Never Married,Widowed,No formal education,Other/Dont know/RTA,Primary education,Secondary education,Tertiary education,Vocational/Specialised training,Dont Know/Refuse to answer,Farming and Fishing,Formally employed Government,Formally employed Private,Government Dependent,Informally employed,No Income,Other Income,Remittance Dependent,Self employed,bank_account
0,3,24,1,0,0,1,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1
1,5,70,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,5,26,0,1,0,1,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1
3,5,34,1,0,0,1,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0
4,8,26,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0


### Handing class imbalance

In [30]:
class_0 = df[df.bank_account == 0]
class_1 = df[df.bank_account == 1]

In [31]:
account_new = resample(class_1, replace=True, n_samples=20212,random_state = 42)

In [32]:
upsample = pd.concat([class_0,account_new])

In [33]:
upsample.bank_account.value_counts()

1    20212
0    20212
Name: bank_account, dtype: int64

### Another checkpoint

In [34]:
new_df = upsample.copy()

In [35]:
new_df = new_df.sample(frac = 1, random_state = 1)

In [36]:
new_df.head()

Unnamed: 0,household_size,age_of_respondent,Rural,Urban,No,Yes,Female,Male,Rlship_HOH(Child),Head of Household,Rlship_HOH(Other non-relatives),Rlship_HOH(Other relative),Rlship_HOH(Parent),Rlship_HOH(Spouse),Divorced/Seperated,Dont know,Married/Living together,Single/Never Married,Widowed,No formal education,Other/Dont know/RTA,Primary education,Secondary education,Tertiary education,Vocational/Specialised training,Dont Know/Refuse to answer,Farming and Fishing,Formally employed Government,Formally employed Private,Government Dependent,Informally employed,No Income,Other Income,Remittance Dependent,Self employed,bank_account
17931,3,59,0,1,1,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0
12111,4,38,1,0,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1
12840,3,35,0,1,0,1,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1
7520,5,29,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1948,2,35,0,1,0,1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1


### Created a check point

In [37]:
clean_df = new_df.copy()

### Saving cleaned dataset locally as .csv file

In [38]:
clean_df.to_csv(r'C:\Users\obehi\Desktop\Biintel\Data set\ml\lish-moa\clean_df.csv', index = False)

In [39]:
featuers = clean_df.iloc[:,:-1]

In [40]:
target = clean_df.iloc[:,-1:]

### Standardizing the dataset

In [41]:
scaler = StandardScaler()

In [57]:
df_std = scaler.fit_transform(featuers)

### Spliting features and target

In [58]:
x_train,x_test,y_train,y_test = train_test_split(featuers, target,random_state=42) 

### Fitting the model

In [59]:
clf = RandomForestClassifier().fit(x_train, y_train)

### Predicting the target outcome

In [60]:
y_pred = clf.predict(x_test)

In [61]:
y_pred

array([1, 1, 1, ..., 0, 1, 0], dtype=int64)

### Measuring model accuracy

In [62]:
cross_score = cross_val_score(clf,y_test,y_pred, cv=5)

In [63]:
print(cross_score)
print('mean: ',np.mean(cross_score))

[0.90405539 0.91740851 0.91538842 0.92132608 0.90594059]
mean:  0.9128237977928677


### Saving the model using pickle

In [64]:
pkl_clf = 'model.pkl'

with open(pkl_clf,'wb') as file:
    pickle.dump(clf, file)

In [65]:
pickle.dump(scaler,open('scaler.pickle','wb'))

### Loading the model again via pickle it to see if it is working fine for reuse

In [66]:
with open(pkl_clf,'rb') as file:
    pk_model = pickle.load(file)

In [67]:
pk_model

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)