# **Financial Inclusion by IndabaX Sudan 2021 (Hackathon)**

# **Team: 10011**

## **Imports**

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from xgboost import XGBClassifier
from scipy import stats
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score

## **Preprocessing and Encoding**

In [None]:
def transform(df):
    """Turn a raw survey DataFrame into the feature matrix used by the model.

    Steps:
      1. Training data only (recognised by having exactly 13 columns): remove
         rows whose ``age_of_respondent`` or ``household_size`` lies 2 or more
         standard deviations away from the mean.
      2. Encode the three two-valued columns as booleans.
      3. One-hot encode the multi-valued categorical columns.

    NOTE: the outlier rows of step 1 are dropped from ``df`` itself (in place).
    Pass ``df.copy()`` if the caller's frame must stay untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data with at least the columns ``location_type``, ``year``,
        ``cellphone_access``, ``household_size``, ``age_of_respondent``,
        ``gender_of_respondent``, ``job_type``, ``education_level``,
        ``country`` and ``marital_status``.

    Returns
    -------
    pandas.DataFrame
        Feature frame sharing the index of the rows kept from ``df``: the six
        base columns (``year`` appears exactly once) followed by the dummy
        columns.
    """
    # Getting rid of outliers in the numerical variables.
    # The choice of the number 2 comes from the fact that in these two features
    # the mean almost equals the median, which can be thought of as a normal
    # distribution; the Empirical Rule then implies that almost 95% of the data
    # lies within 2 z-scores of the mean.
    if len(df.columns) == 13:  # 13 is the number of columns in the training set
        # Absolute z-scores so that both tails count as outliers.
        age_z = np.abs(stats.zscore(df['age_of_respondent'].to_numpy()))
        size_z = np.abs(stats.zscore(df['household_size'].to_numpy()))
        is_outlier = (age_z >= 2) | (size_z >= 2)
        # The mask is positional; translate it to index labels so the drop is
        # correct for any index, and remove all rows in one call.
        df.drop(index=df.index[is_outlier], inplace=True)

    # Binary and numerical features. 'year' is kept here as a single numeric
    # column: pd.get_dummies leaves integer columns untouched, so also passing
    # it through the dummy step only produced duplicate year_x / year_y columns.
    base_cols = ['location_type', 'year', 'cellphone_access', 'household_size',
                 'age_of_respondent', 'gender_of_respondent']
    features = df[base_cols].copy()

    # Converting features that have two unique elements from object-type to binary-type
    features['cellphone_access'] = features['cellphone_access'] == 'Yes'
    features['location_type'] = features['location_type'] == 'Rural'
    features['gender_of_respondent'] = features['gender_of_respondent'] == 'Male'

    # Converting features that have more than two unique elements to multiple dummy variables
    categorical_cols = ['job_type', 'education_level', 'country', 'marital_status']
    dummies = pd.get_dummies(df[categorical_cols])

    # Both frames come from df and share its index, so they are placed side by side.
    New_df = pd.concat([features, dummies], axis=1)
    assert len(features) == len(New_df)

    return New_df

## **Preparing the Data**

In [None]:
#Loading the training and the test sets from the Data folder
Train_Data, Test_Data = (
    pd.read_csv(csv_path) for csv_path in ('Data/Train.csv', 'Data/Test.csv')
)

In [None]:
#Having a look into the data: display the first rows of the training set
Train_Data.head()

Unnamed: 0,country,year,uniqueid,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,uniqueid_1,Yes,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,2018,uniqueid_2,No,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,2018,uniqueid_3,Yes,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,2018,uniqueid_4,No,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,2018,uniqueid_5,No,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed


In [None]:
#Having a look into the features (no missing values).
#Since the feature 'year' has just 3 unique values,
#it can be treated as a categorical variable
Train_Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23524 entries, 0 to 23523
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   country                 23524 non-null  object
 1   year                    23524 non-null  int64 
 2   uniqueid                23524 non-null  object
 3   bank_account            23524 non-null  object
 4   location_type           23524 non-null  object
 5   cellphone_access        23524 non-null  object
 6   household_size          23524 non-null  int64 
 7   age_of_respondent       23524 non-null  int64 
 8   gender_of_respondent    23524 non-null  object
 9   relationship_with_head  23524 non-null  object
 10  marital_status          23524 non-null  object
 11  education_level         23524 non-null  object
 12  job_type                23524 non-null  object
dtypes: int64(3), object(10)
memory usage: 2.3+ MB


In [None]:
#Let's learn more about our features: load and display the variable definitions file
vars_def = pd.read_csv('Data/VariableDefinitions.csv')
vars_def

Unnamed: 0,Variable Definitions,Unnamed: 1
0,country,Country interviewee is in.
1,year,Year survey was done in.
2,uniqueid,Unique identifier for each interviewee
3,location_type,"Type of location: Rural, Urban"
4,cellphone_access,"If interviewee has access to a cellphone: Yes, No"
5,household_size,Number of people living in one house
6,age_of_respondent,The age of the interviewee
7,gender_of_respondent,"Gender of interviewee: Male, Female"
8,relationship_with_head,The interviewee’s relationship with the head o...
9,marital_status,The martial status of the interviewee: Married...


In [None]:
#Having a look into the statistical measures of the numerical columns
#(notice the outliers in the two numerical variables household_size and
#age_of_respondent: their maxima lie far above their 75% quartiles)
Train_Data.describe()

Unnamed: 0,year,household_size,age_of_respondent
count,23524.0,23524.0,23524.0
mean,2016.975939,3.797483,38.80522
std,0.847371,2.227613,16.520569
min,2016.0,1.0,16.0
25%,2016.0,2.0,26.0
50%,2017.0,3.0,35.0
75%,2018.0,5.0,49.0
max,2018.0,21.0,100.0


In [None]:
#Preprocess the data.
#transform() drops outlier rows from the frame it is given, so the training data is
#passed as a copy: Train_Data stays intact and re-running this cell gives the same result.
X = transform(Train_Data.copy())
X_Test = transform(Test_Data)

#Convert the y-values from object-type (Yes, No) into binary-type (1, 0).
#The labels are selected through X.index so that they always line up with the
#feature rows that survived the outlier filter.
lb = LabelBinarizer()
y = lb.fit_transform(Train_Data.loc[X.index, 'bank_account']).ravel()
assert len(X) == len(y), "features and labels must have the same number of rows"

In [None]:
#Now, let's check the features again, this time on the preprocessed frame
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21656 entries, 0 to 23522
Data columns (total 32 columns):
 #   Column                                           Non-Null Count  Dtype
---  ------                                           --------------  -----
 0   location_type                                    21656 non-null  bool 
 1   year_x                                           21656 non-null  int64
 2   cellphone_access                                 21656 non-null  bool 
 3   household_size                                   21656 non-null  int64
 4   age_of_respondent                                21656 non-null  int64
 5   gender_of_respondent                             21656 non-null  bool 
 6   year_y                                           21656 non-null  int64
 7   job_type_Dont Know/Refuse to answer              21656 non-null  uint8
 8   job_type_Farming and Fishing                     21656 non-null  uint8
 9   job_type_Formally employed Government            2

In [None]:
#Let's check the statistics again (notice how the ranges of the two previous
#numerical variables, household_size and age_of_respondent, have changed)
X.describe()

Unnamed: 0,year_x,household_size,age_of_respondent,year_y,job_type_Dont Know/Refuse to answer,job_type_Farming and Fishing,job_type_Formally employed Government,job_type_Formally employed Private,job_type_Government Dependent,job_type_Informally employed,job_type_No Income,job_type_Other Income,job_type_Remittance Dependent,job_type_Self employed,education_level_No formal education,education_level_Other/Dont know/RTA,education_level_Primary education,education_level_Secondary education,education_level_Tertiary education,education_level_Vocational/Specialised training,country_Kenya,country_Rwanda,country_Tanzania,country_Uganda,marital_status_Divorced/Seperated,marital_status_Dont know,marital_status_Married/Living together,marital_status_Single/Never Married,marital_status_Widowed
count,21656.0,21656.0,21656.0,21656.0,21656.0,21656.0,21656.0,21656.0,21656.0,21656.0,21656.0,21656.0,21656.0,21656.0,21656.0,21656.0,21656.0,21656.0,21656.0,21656.0,21656.0,21656.0,21656.0,21656.0,21656.0,21656.0,21656.0,21656.0,21656.0
mean,2016.964583,3.646795,36.81534,2016.964583,0.005264,0.22645,0.017409,0.047377,0.005911,0.249215,0.025582,0.044099,0.099049,0.279645,0.17053,0.001293,0.556012,0.184706,0.052041,0.035417,0.253371,0.372414,0.290589,0.083626,0.092676,0.000323,0.461535,0.351265,0.0942
std,0.84154,1.926871,14.078164,0.84154,0.072365,0.418543,0.130791,0.212449,0.076655,0.432569,0.157888,0.205319,0.298734,0.448835,0.376107,0.035935,0.496864,0.388068,0.222115,0.184837,0.434952,0.483459,0.454045,0.276832,0.289985,0.017976,0.49853,0.477377,0.292114
min,2016.0,1.0,16.0,2016.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2016.0,2.0,26.0,2016.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2017.0,3.0,34.0,2017.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2018.0,5.0,46.0,2018.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
max,2018.0,8.0,71.0,2018.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## **Defining the Model**

In [None]:
#Splitting the Training set into Training and Testing sets.
#Features and labels are split in one call so that their rows are guaranteed to stay
#paired (instead of relying on two separate calls sharing the same random_state).
X_Train, X_test, y_Train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [None]:
#XGBoost hyperparameters (they were found by tuning the hyperparameters)
xg_params = {
    'max_depth': 5,
    'min_child_weight': 5,
    'gamma': 0.6,
    'subsample': 0.6,
    'colsample_bytree': 0.6,
}

#Defining the XGBoost model with these specific hyperparameters
xg_model = XGBClassifier(**xg_params)

In [None]:
#Fitting the model with the training split only (X_Train, y_Train)
xg_model.fit(X_Train,y_Train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.6, gamma=0.6,
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=5, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=0.6, verbosity=1)

In [None]:
#Using 10-fold Cross-Validation on the training split to evaluate the model.
#cross_val_score fits fresh copies of xg_model on each fold, so the hold-out split
#(X_test, y_test) is not used here and stays unseen.
scores = cross_val_score(xg_model, X_Train, y_Train, scoring="neg_mean_absolute_error", cv=10)
mae_scores = - scores
print("Scores:", mae_scores)
print("Mean:", mae_scores.mean())
print("Standard deviation:", mae_scores.std())

#Scoring the model that was fitted on the training split against the hold-out split
print("Hold-out MAE:", mean_absolute_error(y_test, xg_model.predict(X_test)))

Scores: [0.11981567 0.12442396 0.11316397 0.12009238 0.11778291 0.11316397
 0.09006928 0.12240185 0.10854503 0.11547344]
Mean: 0.11449324719830567
Standard deviation: 0.009317034564534971


## **Submission**

In [None]:
#Fitting the model with the whole dataset and predicting the test set values.
#The test features are selected in the training column order (a dummy column that is
#missing from the test set is filled with 0). This also leaves out a 'bank_account'
#column added by an earlier run of this cell, so the cell can be re-run safely.
xg_model.fit(X,y)
X_Test['bank_account'] = xg_model.predict(X_Test.reindex(columns=X.columns, fill_value=0))

In [None]:
#Building the submission ids in the "<uniqueid> x <country>" form
submission_ids = Test_Data["uniqueid"] + " x " + Test_Data["country"]

#Exporting the submission file
submission = pd.DataFrame({"uniqueid": submission_ids, "bank_account": X_Test["bank_account"]})
submission.to_csv('Final_Submission.csv', index=False)