# Instructor Do: Dealing with Categorical Data in ML

In [1]:
# initial imports
import pandas as pd
from pathlib import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import numpy as np

## Dataset Information

The file `loans_data.csv`, contains simulated data about loans, there are a total of 500 records. Each row represents a loan application along an arbitrary year, where every column represents the following data about every loan application.

* `amount`: The loan amount in USD.
* `term`: The loan term in months.
* `month`: The month of the year when the loan was requested.
* `age`: Age of the loan applicant.
* `education`: Educational level of the loan applicant.
* `gender`: Gender of the loan applicant.
* `bad`: Stands for a bad or good loan applicant (`1` - bad, `0` - good).

In [2]:
# Load data
file_path = Path('HR_Employee_Data.csv')
loans_df = pd.read_csv(file_path)
loans_df.head()
loans_df.isnull()

Unnamed: 0,Emp_Id,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department,salary
0,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...
14994,False,False,False,False,False,False,False,False,False,False,False
14995,False,False,False,False,False,False,False,False,False,False,False
14996,False,False,False,False,False,False,False,False,False,False,False
14997,False,False,False,False,False,False,False,False,False,False,False


In [3]:
loans_df.isnull().sum()

Emp_Id                   0
satisfaction_level       0
last_evaluation          0
number_project           0
average_montly_hours     0
time_spend_company       0
Work_accident            0
left                     0
promotion_last_5years    0
Department               0
salary                   0
dtype: int64

In [4]:
loans_df.dropna()

Unnamed: 0,Emp_Id,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department,salary
0,IND02438,38%,53%,2,157,3,0,1,0,sales,low
1,IND28133,80%,86%,5,262,6,0,1,0,sales,medium
2,IND07164,11%,88%,7,272,4,0,1,0,sales,medium
3,IND30478,72%,87%,5,223,5,0,1,0,sales,low
4,IND24003,37%,52%,2,159,3,0,1,0,sales,low
...,...,...,...,...,...,...,...,...,...,...,...
14994,IND40221,40%,57%,2,151,3,0,1,0,support,low
14995,IND24196,37%,48%,2,160,3,0,1,0,support,low
14996,IND33544,37%,53%,2,143,3,0,1,0,support,low
14997,IND40533,11%,96%,6,280,4,0,1,0,support,low


In [5]:
loans_df['satisfaction_level'] = loans_df['satisfaction_level'].str.rstrip('%').astype('float')/100

## Dummy Encoding (Binary Encoded Data)

In [6]:
loans_df['last_evaluation'] = loans_df['last_evaluation'].str.rstrip('%').astype('float')/100

In [7]:
# Binary encoding using Pandas (single column)
loans_binary_encoded = pd.get_dummies(loans_df, columns=["Department"], drop_first=True)
loans_binary_encoded.head()

Unnamed: 0,Emp_Id,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,salary,Department_RandD,Department_accounting,Department_hr,Department_management,Department_marketing,Department_product_mng,Department_sales,Department_support,Department_technical
0,IND02438,0.38,0.53,2,157,3,0,1,0,low,0,0,0,0,0,0,1,0,0
1,IND28133,0.8,0.86,5,262,6,0,1,0,medium,0,0,0,0,0,0,1,0,0
2,IND07164,0.11,0.88,7,272,4,0,1,0,medium,0,0,0,0,0,0,1,0,0
3,IND30478,0.72,0.87,5,223,5,0,1,0,low,0,0,0,0,0,0,1,0,0
4,IND24003,0.37,0.52,2,159,3,0,1,0,low,0,0,0,0,0,0,1,0,0


In [8]:
# Binary encoding using Pandas (multiple columns)
loans_binary_encoded = pd.get_dummies(loans_binary_encoded, columns=["salary",])
loans_binary_encoded.head() 

Unnamed: 0,Emp_Id,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department_RandD,...,Department_hr,Department_management,Department_marketing,Department_product_mng,Department_sales,Department_support,Department_technical,salary_high,salary_low,salary_medium
0,IND02438,0.38,0.53,2,157,3,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
1,IND28133,0.8,0.86,5,262,6,0,1,0,0,...,0,0,0,0,1,0,0,0,0,1
2,IND07164,0.11,0.88,7,272,4,0,1,0,0,...,0,0,0,0,1,0,0,0,0,1
3,IND30478,0.72,0.87,5,223,5,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
4,IND24003,0.37,0.52,2,159,3,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0


In [9]:
loans_binary_encoded.drop('Emp_Id' , axis=1 , inplace=True)

In [10]:
loans_binary_encoded.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department_RandD,Department_accounting,Department_hr,Department_management,Department_marketing,Department_product_mng,Department_sales,Department_support,Department_technical,salary_high,salary_low,salary_medium
0,0.38,0.53,2,157,3,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1
2,0.11,0.88,7,272,4,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1
3,0.72,0.87,5,223,5,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0
4,0.37,0.52,2,159,3,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0


In [11]:
# Saving the encoded dataset
file_path = Path("loans_data_encoded.csv")
loans_binary_encoded.to_csv(file_path, index=False)

## Scaling Data

In [12]:
# Creating the scaler instance
data_scaler = StandardScaler()

In [13]:
df = pd.read_csv("loans_data_encoded.csv")
data_scaler.fit_transform(df)

array([[-0.93649469, -1.08727529, -1.46286291, ..., -0.29980859,
         1.02477511, -0.8681323 ],
       [ 0.75281433,  0.84070693,  0.97111292, ..., -0.29980859,
        -0.97582386,  1.15189816],
       [-2.02247906,  0.95755433,  2.59376348, ..., -0.29980859,
        -0.97582386,  1.15189816],
       ...,
       [-0.97671633, -1.08727529, -1.46286291, ..., -0.29980859,
         1.02477511, -0.8681323 ],
       [-2.02247906,  1.42494396,  1.7824382 , ..., -0.29980859,
         1.02477511, -0.8681323 ],
       [-0.97671633, -1.14569899, -1.46286291, ..., -0.29980859,
         1.02477511, -0.8681323 ]])

In [14]:
# Fitting the scaler
data_scaler.fit(loans_binary_encoded)

StandardScaler()

In [15]:
# Transforming the data
loans_data_scaled = data_scaler.transform(loans_binary_encoded)
loans_data_scaled[:5]

array([[-0.93649469, -1.08727529, -1.46286291, -0.88203988, -0.34123516,
        -0.41116529,  1.788917  , -0.14741182, -0.2353205 , -0.23214788,
        -0.22764728, -0.20939051, -0.24632222, -0.25295305,  1.61955144,
        -0.41779149, -0.4706553 , -0.29980859,  1.02477511, -0.8681323 ],
       [ 0.75281433,  0.84070693,  0.97111292,  1.22042276,  1.71343614,
        -0.41116529,  1.788917  , -0.14741182, -0.2353205 , -0.23214788,
        -0.22764728, -0.20939051, -0.24632222, -0.25295305,  1.61955144,
        -0.41779149, -0.4706553 , -0.29980859, -0.97582386,  1.15189816],
       [-2.02247906,  0.95755433,  2.59376348,  1.4206573 ,  0.34365527,
        -0.41116529,  1.788917  , -0.14741182, -0.2353205 , -0.23214788,
        -0.22764728, -0.20939051, -0.24632222, -0.25295305,  1.61955144,
        -0.41779149, -0.4706553 , -0.29980859, -0.97582386,  1.15189816],
       [ 0.43104118,  0.89913063,  0.97111292,  0.43950807,  1.02854571,
        -0.41116529,  1.788917  , -0.14741182, -

In [16]:
# Define features set
X = loans_binary_encoded.copy()
X.drop("left", axis=1, inplace=True)
X.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,Department_RandD,Department_accounting,Department_hr,Department_management,Department_marketing,Department_product_mng,Department_sales,Department_support,Department_technical,salary_high,salary_low,salary_medium
0,0.38,0.53,2,157,3,0,0,0,0,0,0,0,0,1,0,0,0,1,0
1,0.8,0.86,5,262,6,0,0,0,0,0,0,0,0,1,0,0,0,0,1
2,0.11,0.88,7,272,4,0,0,0,0,0,0,0,0,1,0,0,0,0,1
3,0.72,0.87,5,223,5,0,0,0,0,0,0,0,0,1,0,0,0,1,0
4,0.37,0.52,2,159,3,0,0,0,0,0,0,0,0,1,0,0,0,1,0


In [17]:
#Define target vector
y = loans_df["left"].values.reshape(-1,1)
y[:5]

array([[1],
       [1],
       [1],
       [1],
       [1]])

In [18]:
# Splitting into Train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Creating StandardScaler instance
scaler = StandardScaler()

# Fitting Standard Scaler
X_scaler = scaler.fit(X_train)

# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [22]:
# Create a classifier object
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    classifier = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=learning_rate,
    max_features=2,
    max_depth=3,
    random_state=0
    )

#Fit the model 
classifier.fit(X_train_scaled, y_train.ravel())
print("learning rate: ", learning_rate)

#Score the model
print("Accuracy score (train): {0:.3f}".format(
    classifier.score(
        X_train_scaled,
        y_train.ravel())))
print("Accuracy score (test): {0:.3f}".format(
    classifier.score(
        X_test_scaled,
        y_test.ravel())))
print()

learning rate:  1
Accuracy score (train): 0.982
Accuracy score (test): 0.971



In [29]:
# Choose a learning rate and create classifier 
classifier = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.75,
    max_features=2,
    max_depth=3,
    random_state=0
    
    )

#Fit the model
classifier.fit(X_train_scaled, y_train.ravel())

# Make Prediction
predictions = classifier.predict(X_test_scaled)
pd.DataFrame({"Prediction": predictions, "Actual": y_test.ravel()}).head(20)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,1,1
3,0,0
4,0,0
5,1,1
6,0,0
7,0,0
8,1,1
9,1,1


In [30]:
# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)
print(f"Accuracy Score : {acc_score}")

Accuracy Score : 0.972


In [32]:
#Generate the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"]
)

#Displaying results
display(cm_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,2824,40
Actual 1,65,821


In [33]:
#Generate classification report 
print("Classification Report")
print(classification_report(y_test, predictions))

Classification Report
              precision    recall  f1-score   support

           0       0.98      0.99      0.98      2864
           1       0.95      0.93      0.94       886

    accuracy                           0.97      3750
   macro avg       0.97      0.96      0.96      3750
weighted avg       0.97      0.97      0.97      3750

