# Modeling on the TD Challenge

## Importing Dataset

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('TDs Challenge.csv')
df.head(5)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

## Data Preprocessing

In [4]:
df.drop(columns = 'duration', inplace = True)

### Label Encoding

In [5]:
from sklearn.preprocessing import LabelEncoder

object_columns = df.select_dtypes(include = 'object').columns
numeric_columns = df.select_dtypes(include = ['int64', 'float64']).columns

le = LabelEncoder()

df_le = df[object_columns].apply(le.fit_transform)
df_le.head(5)

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,day_of_week,poutcome,y
0,3,1,0,0,0,0,1,6,1,1,0
1,7,1,3,1,0,0,1,6,1,1,0
2,7,1,3,0,2,0,1,6,1,1,0
3,0,1,1,0,0,0,1,6,1,1,0
4,7,1,3,0,0,2,1,6,1,1,0


In [6]:
df[object_columns]

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,day_of_week,poutcome,y
0,housemaid,married,basic.4y,no,no,no,telephone,may,mon,nonexistent,no
1,services,married,high.school,unknown,no,no,telephone,may,mon,nonexistent,no
2,services,married,high.school,no,yes,no,telephone,may,mon,nonexistent,no
3,admin.,married,basic.6y,no,no,no,telephone,may,mon,nonexistent,no
4,services,married,high.school,no,no,yes,telephone,may,mon,nonexistent,no
...,...,...,...,...,...,...,...,...,...,...,...
41183,retired,married,professional.course,no,yes,no,cellular,nov,fri,nonexistent,yes
41184,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,nonexistent,no
41185,retired,married,university.degree,no,yes,no,cellular,nov,fri,nonexistent,no
41186,technician,married,professional.course,no,no,no,cellular,nov,fri,nonexistent,yes


## Modeling using a Decision Tree

In [7]:
df[numeric_columns]

Unnamed: 0,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
0,56,1,999,0,1.1,93.994,-36.4,4.857,5191.0
1,57,1,999,0,1.1,93.994,-36.4,4.857,5191.0
2,37,1,999,0,1.1,93.994,-36.4,4.857,5191.0
3,40,1,999,0,1.1,93.994,-36.4,4.857,5191.0
4,56,1,999,0,1.1,93.994,-36.4,4.857,5191.0
...,...,...,...,...,...,...,...,...,...
41183,73,1,999,0,-1.1,94.767,-50.8,1.028,4963.6
41184,46,1,999,0,-1.1,94.767,-50.8,1.028,4963.6
41185,56,2,999,0,-1.1,94.767,-50.8,1.028,4963.6
41186,44,1,999,0,-1.1,94.767,-50.8,1.028,4963.6


In [8]:
df_prep_le = pd.concat([df_le, df[numeric_columns]], axis = 1)
df_prep_le.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 20 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   job             41188 non-null  int32  
 1   marital         41188 non-null  int32  
 2   education       41188 non-null  int32  
 3   default         41188 non-null  int32  
 4   housing         41188 non-null  int32  
 5   loan            41188 non-null  int32  
 6   contact         41188 non-null  int32  
 7   month           41188 non-null  int32  
 8   day_of_week     41188 non-null  int32  
 9   poutcome        41188 non-null  int32  
 10  y               41188 non-null  int32  
 11  age             41188 non-null  int64  
 12  campaign        41188 non-null  int64  
 13  pdays           41188 non-null  int64  
 14  previous        41188 non-null  int64  
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

In [9]:
df_prep_le['y']

0        0
1        0
2        0
3        0
4        0
        ..
41183    1
41184    0
41185    0
41186    1
41187    0
Name: y, Length: 41188, dtype: int32

### Train Test Split

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df_prep_le.drop(columns = 'y'),
                                                    df_prep_le['y'],
                                                    test_size = 0.25,
                                                    random_state = 42)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(30891, 19)
(10297, 19)
(30891,)
(10297,)


### Trying a Model with the Default Parameters

In [11]:
from sklearn.tree import DecisionTreeClassifier

dt_default = DecisionTreeClassifier(random_state = 42)
dt_default.fit(X_train, y_train)

DecisionTreeClassifier(random_state=42)

### Model Evaluation

In [12]:
print(f"Training Accuracy: {round(dt_default.score(X_train, y_train) * 100, 2)}%")
print(f"Test Accuracy: {round(dt_default.score(X_test, y_test) * 100, 2)}%")

Training Accuracy: 99.54%
Test Accuracy: 83.87%


### Using Appropriate Evaluation Metrics

#### Accuracy and Precision

In [13]:
from sklearn.metrics import accuracy_score, precision_score

y_pred = dt_default.predict(X_test)
print(f"Accuracy Score: {round(accuracy_score(y_pred, y_test) * 100, 2)}%")
print(f"Precision Score: {round(precision_score(y_pred, y_test) * 100, 2)}%")

Accuracy Score: 83.87%
Precision Score: 34.26%


### Fine Tuning the Tree

#### Pruning

In [14]:
dt_pruned = DecisionTreeClassifier(max_depth = 3, random_state = 42)
dt_pruned.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=3, random_state=42)

In [15]:
y_pred = (dt_pruned.predict_proba(X_test)[:, 0] < 0.5).astype('int')
print(f"Accuracy Score: {round(accuracy_score(y_pred, y_test) * 100, 2)}%")
print(f"Precision Score: {round(precision_score(y_pred, y_test) * 100, 2)}%")

Accuracy Score: 89.93%
Precision Score: 19.17%


### Calculating Additional Metrics

In [16]:
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

print(f"Recall Score: {round(recall_score(y_pred, y_test) * 100, 2)}%")
print(f"F1 Score: {round(f1_score(y_pred, y_test) * 100, 2)}%")

Recall Score: 67.79%
F1 Score: 29.89%


In [17]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_pred, y_test)

array([[9039,  932],
       [ 105,  221]], dtype=int64)

## Modeling using a Random Forest

### Trying a model with the default parameters

In [18]:
from sklearn.ensemble import RandomForestClassifier

forest_default = RandomForestClassifier(random_state = 42)
forest_default.fit(X_train, y_train)

RandomForestClassifier(random_state=42)

### Evaluation

In [19]:
y_pred = forest_default.predict(X_test)
print(f"Accuracy Score: {round(accuracy_score(y_pred, y_test) * 100, 2)}%")
print(f"Precision Score: {round(precision_score(y_pred, y_test) * 100, 2)}%")

Accuracy Score: 89.46%
Precision Score: 29.84%


### Fine tuning the Random Forest

In [20]:
forest_tuned = RandomForestClassifier(n_estimators = 500, random_state = 42)
forest_tuned.fit(X_train, y_train)

RandomForestClassifier(n_estimators=500, random_state=42)

In [21]:
y_pred = forest_tuned.predict(X_test)
print(f"Accuracy Score: {round(accuracy_score(y_pred, y_test) * 100, 2)}%")
print(f"Precision Score: {round(precision_score(y_pred, y_test) * 100, 2)}%")

Accuracy Score: 89.44%
Precision Score: 29.92%


## Trying Synthetic Minority Oversampling Technique

### Investigating Class Imbalance

In [22]:
df_prep_le['y'].value_counts(normalize = True)

0    0.887346
1    0.112654
Name: y, dtype: float64

### Oversampling Using SMOTE

In [23]:
from imblearn.over_sampling import SMOTE
oversample = SMOTE(random_state = 42)
X, y = oversample.fit_resample(df_prep_le.drop(columns = 'y'), df_prep_le['y']) 

y.value_counts(normalize = True)

1    0.5
0    0.5
Name: y, dtype: float64

In [44]:
# train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

# Creating the classifier
dt_default = DecisionTreeClassifier(random_state = 42)
dt_default.fit(X_train, y_train)

# Evaluation
y_pred = (dt_default.predict_proba(X_test)[:, 0] < 0.5).astype('int')
print(f"Accuracy Score: {round(accuracy_score(y_pred, y_test) * 100, 2)}%")
print(f"Precision Score: {round(precision_score(y_pred, y_test) * 100, 2)}%")
print(f"Recall Score: {round(recall_score(y_pred, y_test) * 100, 2)}%")
print(f"F1 Score: {round(f1_score(y_pred, y_test) * 100, 2)}%")

Accuracy Score: 88.5%
Precision Score: 89.92%
Recall Score: 87.69%
F1 Score: 88.79%


## Extracting Feature Importances

In [45]:
feature_importances = dict([(feature, importance) for feature, importance in zip(list(X.columns), list(dt_default.feature_importances_))])
feature_importances = {k: v for k, v in sorted(feature_importances.items(), key=lambda item: item[1])}
feature_importances

{'emp.var.rate': 0.0028484475940761945,
 'pdays': 0.009305592516797148,
 'month': 0.012858443021086791,
 'poutcome': 0.015670443644907404,
 'previous': 0.015833723993107875,
 'marital': 0.021888112861030498,
 'default': 0.024356074103913584,
 'loan': 0.0291200797205551,
 'contact': 0.03290519498384469,
 'cons.price.idx': 0.033389105782290464,
 'day_of_week': 0.04331532261532875,
 'campaign': 0.04533656758272361,
 'housing': 0.04616670456569549,
 'education': 0.0466019568299039,
 'job': 0.05317986828231709,
 'cons.conf.idx': 0.054216257303517125,
 'age': 0.1069580659757818,
 'euribor3m': 0.19567080243885143,
 'nr.employed': 0.21037923618427096}