# Instructor Do: Dealing with Categorical Data in ML

In [4]:
# initial imports
import pandas as pd
from path import Path

ModuleNotFoundError: No module named 'path'

## Dataset Information

The file `loans_data.csv`, contains simulated data about loans, there are a total of 500 records. Each row represents a loan application along an arbitrary year, where every column represents the following data about every loan application.

* `amount`: The loan amount in USD.
* `term`: The loan term in months.
* `month`: The month of the year when the loan was requested.
* `age`: Age of the loan applicant.
* `education`: Educational level of the loan applicant.
* `gender`: Gender of the loan applicant.
* `bad`: Stands for a bad or good loan applicant (`1` - bad, `0` - good).

In [2]:
# Load data
file_path = Path("../Resources/loans_data.csv")
loans_df = pd.read_csv(file_path)
loans_df.head()

# The bad column is the target column 0 = Approval 1 = Denial

Unnamed: 0,amount,term,month,age,education,gender,bad
0,1000,30,June,45,High School or Below,male,0
1,1000,30,July,50,Bachelor,female,0
2,1000,30,August,33,Bachelor,female,0
3,1000,15,September,27,college,male,0
4,1000,30,October,28,college,female,0


In [3]:
# Use the pd.get_dummies() module to convert the gender column from text to numerical values
    # pd.get_dummies() takes two arguments: the dataframe (loans_df) and the column we want to convert ['gender']
    

    # Binary encoding using Pandas (single column)
loans_binary_encoded = pd.get_dummies(loans_df, columns=["gender"])
loans_binary_encoded.head()

# This splits the gender column into two which contain 0 = false and 1 = true

Unnamed: 0,amount,term,month,age,education,bad,gender_female,gender_male
0,1000,30,June,45,High School or Below,0,0,1
1,1000,30,July,50,Bachelor,0,1,0
2,1000,30,August,33,Bachelor,0,1,0
3,1000,15,September,27,college,0,0,1
4,1000,30,October,28,college,0,1,0


In [4]:
# Binary encoding using Pandas (multiple columns)

# It is also possible to encode two columns at the same time using the syntax: columns=["column1", "column2"]
loans_binary_encoded = pd.get_dummies(loans_df, columns=["education", "gender"])
loans_binary_encoded.head()

# The education column has now been separted into 4 columns again with 0 = False and 1 = True
    # So 1 populates the column with the previous text

Unnamed: 0,amount,term,month,age,bad,education_Bachelor,education_High School or Below,education_Master or Above,education_college,gender_female,gender_male
0,1000,30,June,45,0,0,1,0,0,0,1
1,1000,30,July,50,0,1,0,0,0,1,0
2,1000,30,August,33,0,1,0,0,0,1,0
3,1000,15,September,27,0,0,0,0,1,0,1
4,1000,30,October,28,0,0,0,0,1,1,0


In [3]:
# 18.6.2 Encode Labels With Scikit-learn

# Import the module
from sklearn.preprocessing import LabelEncoder

# An instance of the label encoder object is created and assigned the variable le
le = LabelEncoder()

# A copy of the original loans_df is created for this example,
    # This step is not necessary for using label encoder
df2 = loans_df.copy()

# The label encoder's fit_transform() method is used to first train the label encoder
    # Then convert the text data into numerical data
df2['education'] = le.fit_transform(df2['education']) 
df2.head()

NameError: name 'loans_df' is not defined

## Integer Encoding

In [2]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df2 = loans_df.copy()
df2['education'] = le.fit_transform(df2['education'])

NameError: name 'loans_df' is not defined

In [7]:
df2.head()

Unnamed: 0,amount,term,month,age,education,gender,bad
0,1000,30,June,45,1,male,0
1,1000,30,July,50,0,female,0
2,1000,30,August,33,0,female,0
3,1000,15,September,27,3,male,0
4,1000,30,October,28,3,female,0


In [1]:
# Skill drill 18.6.2
df2['gender'] = le.fit_transform(df2['gender'])

NameError: name 'le' is not defined

# Custom Encoding

In [7]:
# 18.6.3 Create custom encoding

# Creating an instance of label encoder
label_encoder = LabelEncoder()
loans_df["month_le"] = label_encoder.fit_transform(loans_df["month"])
loans_df.head()

Unnamed: 0,amount,term,month,age,education,gender,bad,month_le
0,1000,30,June,45,High School or Below,male,0,6
1,1000,30,July,50,Bachelor,female,0,5
2,1000,30,August,33,Bachelor,female,0,1
3,1000,15,September,27,college,male,0,11
4,1000,30,October,28,college,female,0,10


In [8]:
# create a dictionary of the months of the year 
    # apply a custom function to convert the month names to their corresponding integers

# Months dictionary
months_num = {
    "January": 1,
    "February": 2,
    "March": 3,
    "April": 4,
    "May": 5,
    "June": 6,
    "July": 7,
    "August": 8,
    "September": 9,
    "October": 10,
    "November": 11,
    "December": 12,
}



In [9]:
# Months' names encoded using the dictionary values

# A lambda function is applied to the month column to perform the actual conversion
loans_df["month_num"] = loans_df["month"].apply(lambda x: months_num[x])
loans_df.head()



Unnamed: 0,amount,term,month,age,education,gender,bad,month_le,month_num
0,1000,30,June,45,High School or Below,male,0,6,6
1,1000,30,July,50,Bachelor,female,0,5,7
2,1000,30,August,33,Bachelor,female,0,1,8
3,1000,15,September,27,college,male,0,11,9
4,1000,30,October,28,college,female,0,10,10


In [10]:
# This code is merely cleanup—
    # It drops the unnecessary columns related to the month

# Drop the month and month_le columns
loans_df = loans_df.drop(["month", "month_le"], axis=1)
loans_df.head()

Unnamed: 0,amount,term,age,education,gender,bad,month_num
0,1000,30,45,High School or Below,male,0,6
1,1000,30,50,Bachelor,female,0,7
2,1000,30,33,Bachelor,female,0,8
3,1000,15,27,college,male,0,9
4,1000,30,28,college,female,0,10
