# Muhammad Aysh MLDP
## IBM Employee Attrition Model

![image-2.png](attachment:image-2.png)





In [1]:
# Import all necessary libraries

import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

import warnings
#warnings.filterwarnings("ignore")
#Use this if you only want to disable future warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
# Read the dataset into a pandas dataframe

df_analyze=pd.read_csv("IBM.csv")


Data Dictionary (pre-cleaning)

- Age: Age of employee
- Attrition: Employee attrition status
- Department: Department of work
- DistanceFromHome
- Education: 1-Below College; 2- College; 3-Bachelor; 4-Master; 5-Doctor;
- EducationField
- EnvironmentSatisfaction: 1-Low; 2-Medium; 3-High; 4-Very High;
- JobSatisfaction: 1-Low; 2-Medium; 3-High; 4-Very High;
- MaritalStatus
- MonthlyIncome
- NumCompaniesWorked: Number of companies worked prior to IBM
- WorkLifeBalance: 1-Bad; 2-Good; 3-Better; 4-Best;
- YearsAtCompany: Current years of service in IBM


## Exploratory Data Analysis


### Assumptions before Data Analysis

1. Younger people have a higher attrition rate
2. People with lower Job Satisfaction have a higher attrition rate
3. People with lower Environment Satisfaction have a higher attrition rate
4. People who are Single have a higher attrition rate
5. One factor contributing to attrition is a lower monthly income


In [None]:
# View 5 randomly sampled rows from the dataset to get a gist of the info

df_analyze.sample(5)

In [None]:
# Print the no. of rows and columns of the dataset

df_analyze.shape

In [None]:
# Print the size of the dataset

print(df_analyze.size)

In [None]:
# Print the columns in the dataset

list(df_analyze.columns)

In [None]:
# Print the last 5 rows of the dataset
df_analyze.tail()

In [None]:
# Print the data types of the dataset
# Binary encoding can be done on features like attrition 

print(df_analyze.dtypes)

In [None]:
# Print the dataset info (column dtypes and non-null counts)
# Shows that there are no missing values in any of the rows
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" to the output.

df_analyze.info()

In [None]:
# check for unique values in each column. This is especially useful for categorical data

df_analyze.nunique()

In [None]:
# Find out the min. and max age in the Dataframe which can aid me for the future parts
age_range = df_analyze['Age'].describe()

print("Age Range:")
print(age_range[['min', 'max']])


In [None]:
# Perform the same thing for distance from home
distancefrmhome_range = df_analyze['DistanceFromHome'].describe()

print("DistanceFromHome Range:")
print(distancefrmhome_range[['min', 'max']])


In [None]:
# Visualise how employee ages are distributed

# White background with grid lines for every following plot
sns.set(style="whitegrid")

# One figure / one axes, drawn on explicitly
fig, ax = plt.subplots(figsize=(10, 6))

# 20-bin histogram with a kernel density estimate overlaid
sns.histplot(data=df_analyze, x="Age", bins=20, kde=True, color="skyblue", ax=ax)

# Axis labels and title
ax.set_xlabel("Age")
ax.set_ylabel("Frequency")
ax.set_title("Distribution of Age")

plt.show()

## Analyse correlation between age and attrition rate

In [None]:
# Create a new column for age groups.
# right=False makes each bin closed on the left and open on the right:
# [18, 30), [30, 40), [40, 50), [50, 61). With pandas' default (right=True) the
# bins would be (18, 30], (30, 40], ... which silently drops 18-year-olds (NaN)
# and puts e.g. age 30 into the group labelled '18-29'.
age_bins = [18, 30, 40, 50, 61]
age_labels = ['18-29', '30-39', '40-49', '50-60']
df_analyze['AgeGroup'] = pd.cut(df_analyze['Age'], bins=age_bins, labels=age_labels, right=False)

# Calculate attrition percentages within each age group
# (rows = age group, columns = Attrition value, each row sums to 1)
attrition_by_age = df_analyze.groupby('AgeGroup')['Attrition'].value_counts(normalize=True).unstack()
attrition_by_age = attrition_by_age.fillna(0)

# Create a stacked bar chart
attrition_by_age.plot(kind='bar', stacked=True, color=['green', 'red'], figsize=(10, 6))
plt.xlabel("Age Group")
plt.ylabel("Percentage")
plt.title("Attrition by Age Group")
plt.legend(title="Attrition", labels=["No", "Yes"])

plt.show()

In [None]:
# Create a new column with two coarse age groups: under 30 vs. 30 and above.
# right=False gives the left-closed bins [18, 30) and [30, 61) so that the
# labels are accurate; the default right-closed bins would drop age 18 (NaN)
# and count 30-year-olds in the '18-29' group.
age_bins = [18, 30, 61]
age_labels = ['18-29', '30-60']
df_analyze['AgeGroup'] = pd.cut(df_analyze['Age'], bins=age_bins, labels=age_labels, right=False)

# Calculate attrition percentages within each age group
attrition_by_age = df_analyze.groupby('AgeGroup')['Attrition'].value_counts(normalize=True).unstack()
attrition_by_age = attrition_by_age.fillna(0)

# Create a stacked bar chart
attrition_by_age.plot(kind='bar', stacked=True, color=['green', 'red'], figsize=(10, 6))
plt.xlabel("Age Group")
plt.ylabel("Percentage")
plt.title("Attrition by Age Group")
plt.legend(title="Attrition", labels=["No", "Yes"])

plt.show()

<font color=blue>__From this, we can see that those of the youngest age group (18-29) are more likely to leave. This could be due to job hopping becoming more popular amongst younger people__</font>

## Analyse correlation between job satisfaction and attrition rate

In [None]:
# Percentage of leavers ('Yes') and stayers ('No') within each job satisfaction level.
# crosstab(normalize='index') divides every row by its total and fills a missing
# level/outcome combination with 0; adding two separate value_counts() Series
# would instead yield NaN for any level that lacks one of the outcomes.
attrition_share = pd.crosstab(df_analyze['JobSatisfaction'], df_analyze['Attrition'], normalize='index').sort_index() * 100
attrition_percentage = attrition_share['Yes']
no_attrition_percentage = attrition_share['No']

# Create a stacked bar chart: leavers at the bottom, stayers stacked on top
fig, ax = plt.subplots()
x = range(len(attrition_percentage))
ax.bar(x, attrition_percentage, label='Attrition (%)')
ax.bar(x, no_attrition_percentage, bottom=attrition_percentage, label='No Attrition (%)')

# Set labels, title, and legend
ax.set_xlabel('Job Satisfaction')
ax.set_ylabel('Percentage of Employees')
ax.set_title('Attrition by Job Satisfaction')
ax.set_xticks(x)
ax.set_xticklabels(attrition_percentage.index)
ax.legend()

plt.show()


<font color=blue>__From this, we can see that Job Satisfaction plays a minor role in affecting the attrition rate. Employees with lower job satisfaction seem to have a higher attrition rate__</font>

## Analyse correlation between environment satisfaction and attrition rate

In [None]:
# Percentage of leavers ('Yes') and stayers ('No') within each environment satisfaction level.
# crosstab(normalize='index') divides every row by its total and fills a missing
# level/outcome combination with 0; adding two separate value_counts() Series
# would instead yield NaN for any level that lacks one of the outcomes.
attrition_share = pd.crosstab(df_analyze['EnvironmentSatisfaction'], df_analyze['Attrition'], normalize='index').sort_index() * 100
attrition_percentage = attrition_share['Yes']
no_attrition_percentage = attrition_share['No']

# Create a stacked bar chart: leavers at the bottom, stayers stacked on top
fig, ax = plt.subplots()
x = range(len(attrition_percentage))
ax.bar(x, attrition_percentage, label='Attrition (%)')
ax.bar(x, no_attrition_percentage, bottom=attrition_percentage, label='No Attrition (%)')

# Set labels, title, and legend
ax.set_xlabel('Environment Satisfaction')
ax.set_ylabel('Percentage of Employees')
ax.set_title('Attrition by Environment Satisfaction')
ax.set_xticks(x)
ax.set_xticklabels(attrition_percentage.index)
ax.legend()

plt.show()


## Analyse correlation between work life balance level and attrition rate

In [None]:
# Percentage of leavers ('Yes') and stayers ('No') within each work life balance level.
# crosstab(normalize='index') divides every row by its total and fills a missing
# level/outcome combination with 0; adding two separate value_counts() Series
# would instead yield NaN for any level that lacks one of the outcomes.
attrition_share = pd.crosstab(df_analyze['WorkLifeBalance'], df_analyze['Attrition'], normalize='index').sort_index() * 100
attrition_percentage = attrition_share['Yes']
no_attrition_percentage = attrition_share['No']

# Create a stacked bar chart: leavers at the bottom, stayers stacked on top
fig, ax = plt.subplots()
x = range(len(attrition_percentage))
ax.bar(x, attrition_percentage, label='Attrition (%)')
ax.bar(x, no_attrition_percentage, bottom=attrition_percentage, label='No Attrition (%)')

# Set labels, title, and legend
ax.set_xlabel('Work Life Balance')
ax.set_ylabel('Percentage of Employees')
ax.set_title('Attrition by Work Life Balance')
ax.set_xticks(x)
ax.set_xticklabels(attrition_percentage.index)
ax.legend()

plt.show()


<font color=blue>__From this, we can see that those employees with the Work Life Balance level of 1 have an increased attrition rate.__</font>

## Analyse correlation between Education Level and attrition rate

In [None]:
# Percentage of leavers ('Yes') and stayers ('No') within each education level.
# crosstab(normalize='index') divides every row by its total and fills a missing
# level/outcome combination with 0; adding two separate value_counts() Series
# would instead yield NaN for any level that lacks one of the outcomes.
attrition_share = pd.crosstab(df_analyze['Education'], df_analyze['Attrition'], normalize='index').sort_index() * 100
attrition_percentage = attrition_share['Yes']
no_attrition_percentage = attrition_share['No']

# Create a stacked bar chart: leavers at the bottom, stayers stacked on top
fig, ax = plt.subplots()
x = range(len(attrition_percentage))
ax.bar(x, attrition_percentage, label='Attrition (%)')
ax.bar(x, no_attrition_percentage, bottom=attrition_percentage, label='No Attrition (%)')

# Set labels, title, and legend
ax.set_xlabel('Education Level')
ax.set_ylabel('Percentage of Employees')
ax.set_title('Attrition by Education Level')
ax.set_xticks(x)
ax.set_xticklabels(attrition_percentage.index)
ax.legend()

plt.show()


<font color=blue>__From this, we can see that education level does not really affect the attrition level.__</font>

## Analyse correlation between distance from home and attrition rate

In [None]:
# Box plot of commuting distance, split by attrition outcome
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='Attrition', y='DistanceFromHome', data=df_analyze, palette='Set2', ax=ax)

# Axis labels and title
ax.set_xlabel('Attrition')
ax.set_ylabel('Distance From Home')
ax.set_title('Distribution of Distance From Home by Attrition')

plt.show()


<font color=blue>__From this, we can see that the distance from home has a slight effect on the attrition rate. This might be attributed to the fact that employees dislike travelling too far for work__</font>

## Income correlation with attrition

In [None]:
# Box plot of monthly income, split by attrition outcome
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df_analyze, x='Attrition', y='MonthlyIncome', palette='Set2', ax=ax)

# Axis labels and title
ax.set_xlabel('Attrition')
ax.set_ylabel('Monthly Income')
ax.set_title('Monthly Income by Attrition')

plt.show()


<font color=blue>__From this, we can see that income plays quite a big part in attrition rate, with those of a lower income being more likely to leave.__</font>

## Number of companies worked + age group correlation with attrition

__This heatmap explores how attrition connects with the number of jobs employees had before joining the current company. We want to find out if more job changes lead to higher attrition. Keep in mind that older employees may have had more jobs in the past, so age matters too. The heatmap helps us see the combined effect of job changes and age on attrition.__

In [None]:
# Create age bins
age_bins = [18, 30, 40, 50, 60]  # Define age groups as needed
age_labels = ['18-30', '31-40', '41-50', '51-60']

# Convert 'Attrition' to binary (1 for 'Yes' and 0 for 'No')
df_analyze['AttritionBinary'] = df_analyze['Attrition'].map({'Yes': 1, 'No': 0})

# Group employees into age categories.
# include_lowest=True closes the first bin on the left ([18, 30]); without it the
# bin is (18, 30] and 18-year-olds get no age group (NaN) and vanish from the pivot.
df_analyze['AgeGroup'] = pd.cut(df_analyze['Age'], bins=age_bins, labels=age_labels, include_lowest=True)

# Pivot table: mean of the 0/1 attrition flag (= attrition rate) for every
# combination of age group (rows) and number of previous companies (columns)
pivot_table = df_analyze.pivot_table(index='AgeGroup', columns='NumCompaniesWorked', values='AttritionBinary', aggfunc='mean')

# Create a heatmap to visualize the percentages
plt.figure(figsize=(10, 6))
sns.heatmap(pivot_table * 100, cmap='YlGnBu', annot=True, fmt='.2f', cbar=False)
plt.title('Attrition Percentage vs. Age Group and Number of Companies Worked At')
plt.xlabel('Number of Companies Worked At')
plt.ylabel('Age Group')
plt.show()

<font color=blue>__From this, we can see that while those aged 18-30 who have worked at 5-6 companies have a much higher attrition percentage, the patterns for the other pairings are too inconsistent to reach a conclusion.__</font>

## Marital Status correlation with attrition

In [None]:
# Calculate the percentage of attrition within each marital status group
# (rows = marital status, columns = Attrition value, each row sums to 1)
attrition_by_marital = df_analyze.groupby('MaritalStatus')['Attrition'].value_counts(normalize=True).unstack()
attrition_by_marital = attrition_by_marital.fillna(0)

# Create a stacked bar chart.
# DataFrame.plot opens its own figure (sized through figsize), so no separate
# plt.figure() call is made here; such a call only leaves an empty extra figure
# in the cell output.
attrition_by_marital.plot(kind='bar', stacked=True, color=['green', 'red'], figsize=(8, 6))
plt.title('Attrition by Marital Status (Percentage)')
plt.xlabel('Marital Status')
plt.ylabel('Percentage')
plt.xticks(rotation=0)
plt.legend(title='Attrition', labels=['No', 'Yes'], loc='upper right')
plt.show()


<font color=blue>__The data indicates that single individuals have a higher attrition rate. This might be because single employees typically have fewer family responsibilities, allowing them to take more career risks and explore new job opportunities.__</font>

In [None]:
# Bar chart of monthly income per marital status, with one bar per attrition outcome
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=df_analyze, x='MaritalStatus', y='MonthlyIncome', hue='Attrition', ax=ax)
ax.set_title('Income vs. Marital Status by Attrition')
ax.set_xlabel('Marital Status')
ax.set_ylabel('Monthly Income')
ax.legend(title='Attrition')
plt.show()


<font color=blue>__The data indicates that income still plays a part in the attrition rate amongst the different marital statuses__</font>


### Data Analysis conclusions

__After analysing the data and checking my assumptions, these are the conclusions I've come up with__

1. Younger employees (18-29) are more likely to leave, possibly due to a tendency for job hopping among this age group.
2. Job satisfaction has a minor role in affecting attrition, with lower job satisfaction associated with higher attrition rates.
3. Employees with an Environment Satisfaction rating of 1 have a significantly higher attrition rate.
4. Distance from home slightly affects the attrition rate, possibly because employees dislike long commutes.
5. Income significantly impacts attrition, with lower-income employees more likely to leave.
6. Among employees aged 18-30 who have worked at 5-6 companies, there is a notably higher attrition percentage, but other age and experience pairings show inconsistent patterns.
7. Single individuals have a higher attrition rate, possibly due to fewer family responsibilities, allowing them to take more career risks.
8. Income still plays a part in attrition across different marital statuses.
9. Employees with the lowest level of Work Life Balance of 1 seem to have a higher attrition rate.



## Data Cleaning


In [None]:
# Reload the dataset
df=pd.read_csv("IBM.csv")


In [None]:
# Preview the first rows of the freshly reloaded dataset
df.head()

In [None]:
# Check for duplicates

df[df.duplicated()]

In [None]:
# Check for Null values
# Counts the missing entries in every column of the reloaded dataset

df.isnull().sum()

In [None]:
# Perform binary encoding on the Attrition feature: 'Yes' -> True, 'No' -> False

df['Attrition'] = df['Attrition'].map({'Yes': True, 'No': False})


In [None]:
# Visualize outliers: one box plot per numerical column

numeric_cols = ['Age', 'DistanceFromHome', 'MonthlyIncome', 'NumCompaniesWorked', 'YearsAtCompany']

# 2 x 3 grid of panels with extra vertical spacing between the rows
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(18, 10))
plt.subplots_adjust(hspace=0.5)

# Fill the panels row by row, titling each with its column name
for col_name, panel in zip(numeric_cols, axes.flat):
    sns.boxplot(x=df[col_name], ax=panel)
    panel.set_title(col_name)

# Five columns on six panels: remove the unused last one
fig.delaxes(axes[1, 2])

plt.show()


In [None]:
def _iqr_bounds(values):
    """Return the (lower, upper) outlier fences Q1 - 1.5*IQR and Q3 + 1.5*IQR of a numeric Series."""
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    return q1 - 1.5 * iqr, q3 + 1.5 * iqr


# Remove outliers for YearsAtCompany (rows outside the fences are dropped; bounds are inclusive)
lower_bound_YearsAtCompany, upper_bound_YearsAtCompany = _iqr_bounds(df['YearsAtCompany'])
df = df[df['YearsAtCompany'].between(lower_bound_YearsAtCompany, upper_bound_YearsAtCompany)]

# Remove outliers for MonthlyIncome. The fences are computed AFTER the
# YearsAtCompany filter, i.e. on the already reduced data.
lower_bound_MonthlyIncome, upper_bound_MonthlyIncome = _iqr_bounds(df['MonthlyIncome'])
# .copy() makes the result an independent frame, so adding or overwriting columns
# on it later does not raise pandas' SettingWithCopyWarning.
df = df[df['MonthlyIncome'].between(lower_bound_MonthlyIncome, upper_bound_MonthlyIncome)].copy()

# Visualize the boxplots after removing outliers
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))
plt.subplots_adjust(hspace=0.5)

# Box plots for each filtered column
sns.boxplot(x=df['YearsAtCompany'], ax=axes[0, 0])
axes[0, 0].set_title('YearsAtCompany (Outliers Removed)')

sns.boxplot(x=df['MonthlyIncome'], ax=axes[0, 1])
axes[0, 1].set_title('MonthlyIncome (Outliers Removed)')

# Remove the empty subplots of the second row
fig.delaxes(axes[1, 0])
fig.delaxes(axes[1, 1])

plt.show()


In [None]:
# Find the minimum and maximum values of MonthlyIncome
min_monthly_income = df['MonthlyIncome'].min()
max_monthly_income = df['MonthlyIncome'].max()

# Display the range
print(f"MonthlyIncome Range: {min_monthly_income} to {max_monthly_income}")


In [None]:
# Define income categories and labels.
# Resulting groups: 1 = up to 3000, 2 = (3000, 7000], 3 = (7000, 10000], 4 = above 10000.
# The outer edges are open-ended: with fixed outer edges (1009 / 14000) any income
# outside them would become NaN and make the astype(int) below raise. Incomes
# inside the old 1009-14000 range fall into exactly the same category as before.
bins = [float('-inf'), 3000, 7000, 10000, float('inf')]
labels = [1, 2, 3, 4]

# Create a new column 'IncomeCategory' with the income groups
df['IncomeCategory'] = pd.cut(df['MonthlyIncome'], bins=bins, labels=labels)

# Convert 'IncomeCategory' from categorical to integer type
df['IncomeCategory'] = df['IncomeCategory'].astype(int)

# Remove the 'MonthlyIncome' column, which is replaced by the category
df = df.drop('MonthlyIncome', axis=1)

# Display the resulting column
print(df[['IncomeCategory']])


In [None]:
df.head()

In [None]:
# Remove the features Education, EducationField, Department, DistanceFromHome, NumCompaniesWorked as they do not contribute much to the effect on Attrition

columns_to_remove = ['Education', 'EducationField', 'Department', 'DistanceFromHome', 'NumCompaniesWorked']
df = df.drop(columns=columns_to_remove)

In [None]:
df.head()

In [None]:
# Check unique values in the 'Age' column
print(df['Age'].unique())

# Print the first rows of the 'Age' column
print(df[['Age']].head())

In [None]:
# Binning of age as age group patterns may be non-linear
# Define the age bins
age_bins = [18, 30, 40, 50, 60]

# Define the bin labels
age_labels = ['18-30', '31-40', '41-50', '51-60']

# Create a new column 'AgeGroup' with the age group labels.
# include_lowest=True closes the first bin on the left ([18, 30]). Without it the
# bin is (18, 30], so 18-year-olds get NaN and end up with all AgeGroup_* dummy
# columns set to zero.
df['AgeGroup'] = pd.cut(df['Age'], bins=age_bins, labels=age_labels, include_lowest=True)

# Perform one-hot encoding on the 'AgeGroup' column
df = pd.get_dummies(df, columns=['AgeGroup'], prefix='AgeGroup')

# Drop the original 'Age' column
df = df.drop(columns=['Age'])

# Display the resulting column names
print(df.columns)


In [None]:
# Display the first few rows of the updated DataFrame
df.head()

In [None]:
# Perform one-hot encoding on the 'MaritalStatus' column
df = pd.get_dummies(df, columns=['MaritalStatus'], prefix='MaritalStatus')

# Display the resulting DataFrame
print(df.head())


In [None]:
# Display the first few rows of the updated DataFrame
df.head()

### Train Test Split 

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the target
X = df.drop(['Attrition'], axis=1)
y = df['Attrition']

# Calculate the class weights: n_samples / (2 * count of the class), one value per class
class_weights = len(y) / (2 * y.value_counts())

# Split the data into training and testing sets (70% train, 30% test).
# stratify=y keeps the proportion of True/False Attrition identical in both
# parts; on an imbalanced target an unstratified split can leave the test set
# with a noticeably different share of the minority class.
X_train_base, X_test, y_train_base, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

### SMOTE oversampling for True values

<font color=grey>__SMOTE is used to address my class imbalance of Attrition which can help the model be equally trained for both__</font>

In [None]:
# Count how many rows of the cleaned frame fall into each Attrition class
is_false = df['Attrition'] == False
is_true = df['Attrition'] == True
num_false = int(is_false.sum())
num_true = int(is_true.sum())

print("Number of False Attrition: ", num_false)
print("Number of True Attrition: ", num_true)


In [None]:
# Perform SMOTE oversampling on the training data
from imblearn.over_sampling import SMOTE

# Oversample the minority class of the training split only (the test split is untouched).
# random_state pins the synthetic samples SMOTE generates, so the resampled
# training set, and every model fitted on it, is identical on each run.
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train_base, y_train_base)

print("Support in the resampled training set:")
print(pd.Series(y_train).value_counts())



In [None]:
# Count the classes of the oversampled training target produced by SMOTE
num_false_smote = int((y_train == False).sum())
num_true_smote = int((y_train == True).sum())

print("Number of False Attrition (after SMOTE): ", num_false_smote)
print("Number of True Attrition (after SMOTE): ", num_true_smote)


### StandardScaler normalization

<font color=grey>__Standard Scaler is used to transform numerical features by scaling them and ensuring that all features contribute equally to the model and enhancing the convergence speed of certain algorithms sensitive to feature scale.__</font>

In [None]:
from sklearn.preprocessing import StandardScaler

# Identify the numerical columns (the one-hot AgeGroup_* / MaritalStatus_* columns are left unscaled)
numerical_columns = ['EnvironmentSatisfaction', 'JobSatisfaction', 'YearsAtCompany', 'IncomeCategory', 'WorkLifeBalance']

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit and transform the scaler on the training set
# NOTE: this overwrites the columns of X_train in place, so running the cell a
# second time would scale already-scaled values.
X_train[numerical_columns] = scaler.fit_transform(X_train[numerical_columns])

# Transform the testing set using the same scaler
# (transform only, so the test data does not influence the fitted mean / variance)
X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])


In [None]:
import joblib

# Save the fitted scaler to a file
joblib.dump(scaler, 'ibm_scaler.pkl')

## Model Training and Testing

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Baseline model: a decision tree with a fixed seed, fitted on the training data
dt_clf = DecisionTreeClassifier(random_state=42)
dt_clf = dt_clf.fit(X_train, y_train)

# Predict the held-out test rows
y_pred = dt_clf.predict(X_test)

# Overall share of correct predictions
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Per-class precision / recall / f1
dt_report = classification_report(y_test, y_pred)
print(dt_report)


In [None]:
importance = dt_clf.feature_importances_


In [None]:
feature_indexes_by_importance = importance.argsort()


In [None]:
# Use the model's input columns (X) as labels. df.columns still contains the
# target 'Attrition', which X does not, so labelling feature_importances_ with
# df.columns shifts every label that follows 'Attrition' by one position.
feature_labels = np.array(X.columns)


In [None]:
# Print each feature label, from least important to most important
ranked_labels = feature_labels[feature_indexes_by_importance]
ranked_scores = importance[feature_indexes_by_importance]
for label, score in zip(ranked_labels, ranked_scores):
    print("{} - {:.2f}%".format(label, (score * 100.0)))

### Random Forest Classifier Model testing

In [None]:
# Import necessary libraries
from sklearn.ensemble import RandomForestClassifier

# Random forest with a fixed seed, fitted on the training data
rf_clf = RandomForestClassifier(random_state=42)
rf_clf = rf_clf.fit(X_train, y_train)

# Predict the held-out test rows
y_pred = rf_clf.predict(X_test)

# Overall share of correct predictions
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Per-class precision / recall / f1
rf_report = classification_report(y_test, y_pred)
print(rf_report)


<font color=blue>__This model performed slightly better with having a higher accuracy.__</font>

### Mid modelling thoughts

<font color=green>__While these models have a good f1 score for False values, the True values need to be worked on. Thus, I need to use better algorithms and other methods to balance out precision, recall and the f1 score of True values__</font>


### XGBoost Model Testing using threshold and scaling the weight to attain a higher f1-score for True

In [None]:
import xgboost as xgb

# Extra weight given to the positive (True) class during training
scale_pos_weight = 8

# Initialize the XGBoost classifier with hyperparameters
xgb_clf_two = xgb.XGBClassifier(
    n_estimators=100,  # Number of boosting rounds (you can adjust this)
    learning_rate=0.1,  # Step size shrinkage to prevent overfitting
    max_depth=3,  # Maximum tree depth
    min_child_weight=1,  # Minimum sum of instance weight (Hessian) needed in a child
    gamma=0,  # Minimum loss reduction required to make a further partition on a leaf node
    subsample=1,  # Fraction of training data to randomly sample for growing trees
    colsample_bytree=1,  # Fraction of features to be used for growing trees
    objective='binary:logistic',  # Binary classification objective
    random_state=42,
    scale_pos_weight=scale_pos_weight  # Adjust this value
)

# Fit the model to the training data
xgb_clf_two.fit(X_train, y_train)

# Class probabilities for the test data; column 1 is the probability of the positive class
y_pred_proba = xgb_clf_two.predict_proba(X_test)

# Adjust the classification threshold: predict True only above this probability
threshold = 0.6
# Cast to bool so the predictions carry the same label type as the boolean
# y_test, matching the other thresholded models in this notebook.
y_pred = (y_pred_proba[:, 1] > threshold).astype(bool)

# Calculate accuracy and print classification report
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print(report)


<font color=blue>__This model managed to slightly increase recall. However, I can still do some hyperparameter tuning to fit my needs__</font>

### XGB Model Testing again but with Random Search first to find the best parameters

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from scipy.stats import uniform, randint

# Seeded base estimator so that every candidate (and the refit) is reproducible
xgb_clf_three = xgb.XGBClassifier(random_state=42)

# Define a random search space for hyperparameters.
# scipy's randint(low, high) excludes `high`; uniform(loc, scale) samples from [loc, loc + scale].
param_dist = {
    'n_estimators': randint(100, 1000),  # Random integer from 100 to 999
    'learning_rate': uniform(0.01, 0.2),  # Random float between 0.01 and 0.21
    'max_depth': randint(3, 6),  # Random integer from 3 to 5
    'min_child_weight': randint(1, 4),  # Random integer from 1 to 3
    'gamma': uniform(0, 0.2),  # Random float between 0 and 0.2
    'subsample': uniform(0.8, 0.2),  # Random float between 0.8 and 1.0
    'colsample_bytree': uniform(0.8, 0.2),  # Random float between 0.8 and 1.0
}

# Create the RandomizedSearchCV object: 10 sampled configurations, 3-fold CV, scored by accuracy
random_search = RandomizedSearchCV(estimator=xgb_clf_three, param_distributions=param_dist, n_iter=10, cv=3, scoring='accuracy', random_state=42)

# Fit the model to the training data and find the best hyperparameters
random_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = random_search.best_params_

# RandomizedSearchCV refits the best configuration on the whole training set by
# default (refit=True), so that estimator is reused instead of training an
# identical model a second time.
final_xgb_model = random_search.best_estimator_

# Make predictions on the test data
y_pred = final_xgb_model.predict(X_test)

# Calculate accuracy and print classification report
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Best Hyperparameters:", best_params)
print("Accuracy:", accuracy)
print(report)


<font color=blue>__This model performed slightly better, having a higher f1-score for the true values, which may be due to the random search finding better parameters.__</font>

### XGB Model Testing again but with Optuna to find the best parameters

In [None]:
import sklearn.metrics as metrics
from sklearn.metrics import accuracy_score
import optuna

def objective(trial):
    """Optuna objective for XGBoost: cross-validated error rate of one sampled configuration.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        1 - mean 3-fold cross-validated accuracy on the training data
        (lower is better, the study minimizes this value).
    """
    # Imported locally so the function does not depend on another cell's imports
    from sklearn.model_selection import cross_val_score

    # Define hyperparameter search space.
    # suggest_float replaces the deprecated suggest_uniform; the ranges are unchanged.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'max_depth': trial.suggest_int('max_depth', 3, 5),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 4),
        'gamma': trial.suggest_float('gamma', 0, 0.2),
        'subsample': trial.suggest_float('subsample', 0.8, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.8, 1.0),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 5, 10),
    }

    # Initialize the XGBoost classifier (seeded so equal parameters give equal scores)
    xgb_clf_base = xgb.XGBClassifier(**params, random_state=42)

    # Score the candidate on folds of the TRAINING data only. Selecting
    # hyperparameters by their accuracy on X_test / y_test would tune the model to
    # the test set and make the test metrics reported afterwards overly optimistic.
    cv_accuracy = cross_val_score(xgb_clf_base, X_train, y_train, cv=3, scoring='accuracy').mean()

    return 1.0 - cv_accuracy  # Optuna minimizes, so we use (1 - accuracy) to maximize accuracy

# Create the Optuna study and optimize the objective function.
# A seeded sampler makes the sequence of suggested hyperparameters, and therefore
# the best configuration found, reproducible across runs.
study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=20)

# Get the best hyperparameters
best_params = study.best_params

# Use the best hyperparameters to create the final model (seeded for reproducibility)
xgb_clf_opt = xgb.XGBClassifier(**best_params, random_state=42)

# Fit the final model to the training data
xgb_clf_opt.fit(X_train, y_train)

# Make predictions on the test data
y_pred = xgb_clf_opt.predict(X_test)

# Calculate accuracy and print classification report
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Best Hyperparameters:", best_params)
print("Accuracy:", accuracy)
print(report)


In [None]:
importance = xgb_clf_opt.feature_importances_


In [None]:
feature_indexes_by_importance = importance.argsort()


In [None]:
# Use the model's input columns (X) as labels. df.columns still contains the
# target 'Attrition', which X does not, so labelling feature_importances_ with
# df.columns shifts every label that follows 'Attrition' by one position.
feature_labels = np.array(X.columns)


In [None]:
# Print each feature label, from least important to most important
ranked_labels = feature_labels[feature_indexes_by_importance]
ranked_scores = importance[feature_indexes_by_importance]
for label, score in zip(ranked_labels, ranked_scores):
    print("{} - {:.2f}%".format(label, (score * 100.0)))

In [None]:
# Confusion matrix of the Optuna-tuned XGBoost model on the test set
test_predictions = xgb_clf_opt.predict(X_test)
cm = metrics.confusion_matrix(y_test, test_predictions)

# Render the matrix with a reversed blue colour map
disp = metrics.ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap='Blues_r')

### LightGBM Model Testing with Optuna to find the best parameters

In [None]:
import lightgbm as lgb
import optuna


# Define the LightGBM objective function for Optuna
def objective(trial):
    """Optuna objective for LightGBM: cross-validated error rate at a fixed 0.9 threshold.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        1 - accuracy of the thresholded out-of-fold predictions on the training
        data (lower is better, the study minimizes this value).
    """
    # Imported locally so the function does not depend on another cell's imports
    from sklearn.model_selection import cross_val_predict

    # Define hyperparameter search space.
    # suggest_float replaces the deprecated suggest_uniform / suggest_loguniform
    # (log=True keeps the log-uniform sampling of the learning rate).
    params = {
        'objective': 'binary',
        'metric': 'binary_error',
        'boosting_type': 'gbdt',
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 6, 10),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 20),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 8, 10),
    }

    # Initialize the LightGBM classifier
    lgb_clf_base = lgb.LGBMClassifier(**params, verbose=-1)

    # Out-of-fold probabilities of the positive class, computed on the TRAINING
    # data only. Selecting hyperparameters by their accuracy on X_test / y_test
    # would tune the model to the test set and bias the test metrics reported later.
    y_val_prob = cross_val_predict(lgb_clf_base, X_train, y_train, cv=3, method='predict_proba')[:, 1]

    # Predict True only when the model is very confident
    threshold = 0.9
    y_val_pred = (y_val_prob > threshold).astype(bool)

    # Calculate accuracy of the thresholded out-of-fold predictions
    accuracy = accuracy_score(y_train, y_val_pred)

    return 1.0 - accuracy

# Create the Optuna study and optimize the objective function.
# A seeded sampler makes the sequence of suggested hyperparameters, and therefore
# the best configuration found, reproducible across runs.
study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=50)

# Get the best hyperparameters
best_params = study.best_params

# Use the best hyperparameters to create the final model (seeded for reproducibility)
lgb_clf_optuna = lgb.LGBMClassifier(**best_params, random_state=42, verbose=-1)

# Fit the final model to the training data
lgb_clf_optuna.fit(X_train, y_train)

# Make predictions on the test data
y_pred_prob = lgb_clf_optuna.predict_proba(X_test)[:, 1]  # Probability of class 1
threshold = 0.9 # Adjust the threshold as needed
y_pred = (y_pred_prob > threshold).astype(bool)

# Calculate accuracy and print classification report
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Best Hyperparameters:", best_params)
print("Accuracy:", accuracy)
print(report)

### Best hyperparameters discovered from Optuna

After one of my Optuna runs, I was able to retrieve the best hyperparameters so far, as seen below, but I was not able to reproduce them because I had not set a random_state

In [None]:
# Best Hyperparameters: {'learning_rate': 0.020756915668425923, 'max_depth': 6, 'min_child_samples': 7, 'subsample': 0.9116578004400187, 'colsample_bytree': 0.9580980020238541, 'scale_pos_weight': 8.636121660101466}

In [None]:
import lightgbm as lgb
from sklearn.metrics import accuracy_score, classification_report

# Hyperparameters recorded from an earlier Optuna run
best_params = dict(
    learning_rate=0.020756915668425923,
    max_depth=6,
    min_child_samples=7,
    subsample=0.9116578004400187,
    colsample_bytree=0.9580980020238541,
    scale_pos_weight=8.636121660101466,
)

# Build the LightGBM classifier from the recorded values and train it
lgb_clf_predefined = lgb.LGBMClassifier(verbose=-1, **best_params)
lgb_clf_predefined.fit(X_train, y_train)

# Positive-class probability for every test row
y_pred_prob = lgb_clf_predefined.predict_proba(X_test)[:, 1]

# Predict True only above the chosen probability threshold
threshold = 0.85
y_pred = (y_pred_prob > threshold).astype(bool)

# Evaluate on the test set
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Predefined Hyperparameters:", best_params)
print("Accuracy:", accuracy)
print(report)


<font color=blue>__This model in my opinion is the best out of the rest after experimenting as it has double the f1-score for the True values as compared to the first model, without it being at the expense of the False f1 score. It has a decent balance of precision and recall.__</font>

In [None]:
import joblib

joblib.dump(lgb_clf_predefined, 'ibm_lgm_clf_model.pkl')

In [None]:
print(df)

In [None]:
df.info()