# Import Libraries and Load Dataset

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

import joblib

In [2]:
# Load the survey data and discard the 'uniqueid' column, which this
# notebook treats as unnecessary.
df = pd.read_csv("Financial_inclusion_dataset.csv").drop(columns=['uniqueid'])

# Preview the first rows.
df.head()

Unnamed: 0,country,year,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,Yes,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,2018,No,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,2018,Yes,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,2018,No,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,2018,No,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed


In [3]:
# Column names, dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23524 entries, 0 to 23523
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   country                 23524 non-null  object
 1   year                    23524 non-null  int64 
 2   bank_account            23524 non-null  object
 3   location_type           23524 non-null  object
 4   cellphone_access        23524 non-null  object
 5   household_size          23524 non-null  int64 
 6   age_of_respondent       23524 non-null  int64 
 7   gender_of_respondent    23524 non-null  object
 8   relationship_with_head  23524 non-null  object
 9   marital_status          23524 non-null  object
 10  education_level         23524 non-null  object
 11  job_type                23524 non-null  object
dtypes: int64(3), object(9)
memory usage: 2.2+ MB


In [4]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,year,household_size,age_of_respondent
count,23524.0,23524.0,23524.0
mean,2016.975939,3.797483,38.80522
std,0.847371,2.227613,16.520569
min,2016.0,1.0,16.0
25%,2016.0,2.0,26.0
50%,2017.0,3.0,35.0
75%,2018.0,5.0,49.0
max,2018.0,21.0,100.0


In [5]:
# Count rows that repeat an earlier row exactly.
# NOTE(review): 'uniqueid' was dropped when the data was loaded, so identical
# rows may be distinct respondents -- confirm before de-duplicating.
df.duplicated().sum()

np.int64(4429)

In [6]:
# Number of missing values in each column.
df.isnull().sum()

country                   0
year                      0
bank_account              0
location_type             0
cellphone_access          0
household_size            0
age_of_respondent         0
gender_of_respondent      0
relationship_with_head    0
marital_status            0
education_level           0
job_type                  0
dtype: int64

In [7]:
# Remove age outliers: keep rows whose age lies within 1.5 * IQR of the
# first and third quartiles.
age = df['age_of_respondent']
Q1 = age.quantile(0.25)
Q3 = age.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

is_outlier = (age < lower_fence) | (age > upper_fence)

# .copy() detaches the filtered frame from the original, so the column
# assignments in the encoding cells below do not raise SettingWithCopyWarning.
df = df[~is_outlier].copy()


# Encode Categorical Features

In [8]:
# Show only the object-dtype columns, i.e. the ones that still hold strings.
df.select_dtypes(object)

Unnamed: 0,country,bank_account,location_type,cellphone_access,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,Yes,Rural,Yes,Female,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,No,Rural,No,Female,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,Yes,Urban,Yes,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,No,Rural,Yes,Female,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,No,Urban,No,Male,Child,Single/Never Married,Primary education,Informally employed
...,...,...,...,...,...,...,...,...,...
23519,Uganda,No,Rural,Yes,Female,Head of Household,Divorced/Seperated,No formal education,Other Income
23520,Uganda,No,Rural,Yes,Female,Head of Household,Single/Never Married,Secondary education,Other Income
23521,Uganda,No,Rural,Yes,Female,Parent,Widowed,Primary education,Other Income
23522,Uganda,No,Urban,Yes,Female,Parent,Divorced/Seperated,Secondary education,Self employed


In [9]:
# Distinct values of location_type.
df['location_type'].unique()

array(['Rural', 'Urban'], dtype=object)

In [10]:
# Distinct values of the target column, bank_account.
df['bank_account'].unique()

array(['Yes', 'No'], dtype=object)

In [11]:
# Distinct values of country.
df['country'].unique()

array(['Kenya', 'Rwanda', 'Tanzania', 'Uganda'], dtype=object)

In [12]:
# Distinct values of cellphone_access.
df['cellphone_access'].unique()

array(['Yes', 'No'], dtype=object)

In [13]:
# Distinct values of gender_of_respondent.
df['gender_of_respondent'].unique()

array(['Female', 'Male'], dtype=object)

In [14]:
# Distinct values of relationship_with_head.
df['relationship_with_head'].unique()

array(['Spouse', 'Head of Household', 'Other relative', 'Child', 'Parent',
       'Other non-relatives'], dtype=object)

In [15]:
# Distinct values of marital_status.
df['marital_status'].unique()

array(['Married/Living together', 'Widowed', 'Single/Never Married',
       'Divorced/Seperated', 'Dont know'], dtype=object)

In [16]:
# Distinct values of education_level.
df['education_level'].unique()

array(['Secondary education', 'No formal education',
       'Vocational/Specialised training', 'Primary education',
       'Tertiary education', 'Other/Dont know/RTA'], dtype=object)

In [17]:
# Distinct values of job_type.
df['job_type'].unique()

array(['Self employed', 'Government Dependent',
       'Formally employed Private', 'Informally employed',
       'Formally employed Government', 'Farming and Fishing',
       'Remittance Dependent', 'Other Income',
       'Dont Know/Refuse to answer', 'No Income'], dtype=object)

In [18]:
# Turn the Yes/No target into integers: 'Yes' -> 1, 'No' -> 0.
target_codes = {'Yes': 1, 'No': 0}
df['bank_account'] = df['bank_account'].map(target_codes)

In [19]:
# Label-encode every categorical feature and persist one fitted encoder per
# column, so the same label -> integer mapping can be reloaded later.
cat_cols = ['country','cellphone_access', 'location_type', 'gender_of_respondent',
            'relationship_with_head', 'marital_status',
            'education_level', 'job_type']

# Guard against re-running this cell on an already-encoded frame: fitting on
# the integer codes would overwrite the saved encoders with ones that no
# longer know the original category labels.
already_encoded = [col for col in cat_cols if pd.api.types.is_numeric_dtype(df[col])]
if already_encoded:
    raise RuntimeError(
        f"Columns already encoded: {already_encoded}. "
        "Reload the dataset and re-run the notebook from the top."
    )

# Fitted encoders, keyed by column name.
encoders = {}

for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

    # Save encoder with a clear name
    joblib.dump(le, f'{col}_encoder.sav')

# Train/Test ML Classifier

In [20]:

# Separate the features from the target.
x = df.drop('bank_account', axis=1)
y = df['bank_account']

# One seed drives both the split and the forest, so the saved model and the
# reported metrics can be reproduced after a kernel restart.
RANDOM_STATE = 43

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=RANDOM_STATE
)

# Save the columns (and their order) the model is trained on
model_columns = x_train.columns.tolist()
joblib.dump(model_columns, 'model_columns.sav')

model = RandomForestClassifier(max_depth=8, random_state=RANDOM_STATE)
model.fit(x_train, y_train)

# Evaluate on the held-out 20% and display the report; classification_report
# returns a formatted string, so print() is needed to render it.
y_pred = model.predict(x_test)
result = classification_report(y_test, y_pred)
print(result)



# Save Model

In [21]:
# Persist the trained classifier to disk; joblib.dump returns the list of
# file names it wrote, which is what the cell displays.
joblib.dump(model, 'bank_model.sav')

['bank_model.sav']