<a href="https://colab.research.google.com/github/nadairshaid/big-pandas-MIT/blob/decision_tree/Copy_of_Decision_Tree_Hackathon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Importing and Mounting**

In [1]:
from google.colab import drive
drive.mount('/content/drive')


ModuleNotFoundError: No module named 'google.colab'

In [None]:
# Libraries to help with reading and manipulating data
import pandas as pd
import numpy as np

# Libaries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Removes the limit for the number of displayed columns
pd.set_option("display.max_columns", None)
# Sets the limit for the number of displayed rows
pd.set_option("display.max_rows", 200)
# Setting the precision of floating numbers to 5 decimal points
pd.set_option("display.float_format", lambda x: "%.5f" % x)

# To tune model, get different metric scores, and split data
from sklearn.metrics import (
    f1_score,
    accuracy_score,
    recall_score,
    precision_score,
    confusion_matrix,
    roc_auc_score,
    plot_confusion_matrix,
    classification_report,
    precision_recall_curve
)
from sklearn import metrics

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score

# To be used for data scaling and one hot encoding
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

# To impute missing values
from sklearn.impute import SimpleImputer

# To help with model building
from sklearn.linear_model import LogisticRegression

# To build classification models 
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


# For tuning the model
from sklearn.model_selection import GridSearchCV

# To supress warnings
import warnings
warnings.filterwarnings("ignore")

In [None]:
#Importing data
survey_train = pd.read_csv('/content/drive/MyDrive/MITHackathon/Surveydata_train.csv') 
survey_test = pd.read_csv('/content/drive/MyDrive/MITHackathon/Surveydata_test.csv') 
travel_train = pd.read_csv('/content/drive/MyDrive/MITHackathon/Traveldata_train.csv') 
travel_test = pd.read_csv('/content/drive/MyDrive/MITHackathon/Traveldata_test.csv') 


**Getting Info**

In [None]:
survey_train.shape


In [None]:
survey_train.head()

In [None]:
survey_train.tail()

In [None]:
survey_train.info()

In [None]:
travel_train.shape

In [None]:
travel_train.head()

In [None]:
travel_train.tail()

In [None]:
travel_train.info()

**Merging Data Frames**

In [None]:
df = travel_train.merge(survey_train, how='outer', on= 'ID', indicator= True)

In [None]:
df

In [None]:

num_cols = df.describe().columns.tolist()
cat_cols = df.describe(include='object').columns.tolist()

In [None]:
df.isna().sum().sort_values(ascending=False)

In [None]:
nona_df = df.copy()

In [None]:
df=df.drop(['_merge','ID'],axis=1)

In [None]:
def fill_nas_with_mode(df, col):
    mode = df[col].mode().iloc[0]
    df[col].fillna(mode, inplace=True)

def fill_nas_with_median(df, col):
    median = df[col].median()
    df[col].fillna(median, inplace=True)

for col in cat_cols:
    fill_nas_with_mode(nona_df, col)

for col in num_cols:
    fill_nas_with_median(nona_df, col)


In [None]:
# Creating list of dummy columns
nona_df.drop(columns=['ID','_merge'], inplace=True)
dumm_df = pd.get_dummies(nona_df, drop_first=True)

dumm_df.columns = dumm_df.columns.str.strip()
dumm_df.columns = dumm_df.columns.str.replace(' ', '_')

# scaling the data
scaler = StandardScaler()
scaled_df = pd.DataFrame(scaler.fit_transform(dumm_df), columns=dumm_df.columns)


In [None]:
scaled_df.head()

In [None]:
dumm_df.head()

In [None]:
Y= dumm_df.Overall_Experience
X= dumm_df.drop(columns = ['Overall_Experience'])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state = 1, stratify = Y)

In [None]:
# Creating metric function 
def metrics_score(actual, predicted):
    print(classification_report(actual, predicted))

    cm = confusion_matrix(actual, predicted)
    plt.figure(figsize=(8,5))
    
    sns.heatmap(cm, annot=True,  fmt='.2f', xticklabels=['Not Attrite', 'Attrite'], yticklabels=['Not Attrite', 'Attrite'])
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()

**Decision Tree**

In [None]:
X_train

In [None]:
y_train

In [None]:
dt = DecisionTreeClassifier(class_weight = {0: 0.17, 1: 0.83}, random_state = 1)


In [None]:
dt.fit(X_train, y_train)

In [None]:
y_train_pred_dt = dt.predict(X_train)

metrics_score(y_train, y_train_pred_dt)


In [None]:
y_test_pred_dt = dt.predict(X_test)

metrics_score(y_test, y_test_pred_dt)


In [None]:
importances = dt.feature_importances_

columns = X.columns

importance_df = pd.DataFrame(importances, index = columns, columns = ['Importance']).sort_values(by = 'Importance', ascending = False)

plt.figure(figsize = (13, 13))

sns.barplot(importance_df.Importance,importance_df.index)


In [None]:
features = list(X.columns)

plt.figure(figsize = (30, 20))

tree.plot_tree(dt, max_depth = 4, feature_names = features, filled = True, fontsize = 12, node_ids = True, class_names = True)

plt.show()


**Using Random Forest**

In [None]:
# Fitting the Random Forest classifier on the training data
rf_estimator = RandomForestClassifier(class_weight = {0: 0.17, 1: 0.83}, random_state = 1)

rf_estimator.fit(X_train, y_train)


In [None]:
# Checking performance on the training data
y_pred_train_rf = rf_estimator.predict(X_train)

metrics_score(y_train, y_pred_train_rf)


In [None]:
# Checking performance on the testing data
y_pred_test_rf = rf_estimator.predict(X_test)

metrics_score(y_test, y_pred_test_rf)


In [None]:
importances = rf_estimator.feature_importances_

columns = X.columns

importance_df = pd.DataFrame(importances, index = columns, columns = ['Importance']).sort_values(by = 'Importance', ascending = False)

plt.figure(figsize = (13, 13))

sns.barplot(importance_df.Importance, importance_df.index)


SVM with RBF kernel has good recall among all the models and Random Forest, has less recall compared to SVM but F1 Score, Accuracy and Precision values are good in case of Random Forest. It may be possible to further try and tune the model, and the HR department can use this model to predict whether an employee is at risk of attrition or not.
