In [18]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import pickle
from sklearn.metrics import roc_auc_score

In [19]:
df = pd.read_csv("/Users/phanindrasai/Downloads/kub/fast_api/HR_comma_sep.csv")

In [20]:
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Departments,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [21]:
df.rename(columns={'Departments ':'departments'}, inplace=True)

In [22]:
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,departments,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [23]:
df['salary'] = df['salary'].map({'low':0, 'medium':1, 'high':2})

In [24]:
enc = LabelEncoder()
df['departments'] = enc.fit_transform(df.departments)

In [25]:
y = df['left']
X = df.drop('left', axis=1, inplace=True)
x_train, x_test, y_train, y_test = train_test_split(df, y, test_size=0.15)

In [26]:
class my_classifier(BaseEstimator,):
    def __init__(self, estimator=None):
        self.estimator = estimator
    def fit(self, X, y=None):
        self.estimator.fit(X,y)
        return self
    def predict(self, X, y=None):
        return self.estimator.predict(X,y)
    def predict_proba(self, X):
        return self.estimator.predict_proba(X)
    def score(self, X, y):
        return self.estimator.score(X, y)

In [27]:
pipe = Pipeline([ ('clf', my_classifier())])

In [28]:
parameters = [
             {'clf':[RandomForestClassifier()],
             'clf__n_estimators': [75, 100, 125,],
             'clf__min_samples_split': [2,4,6],
             'clf__max_depth': [5, 10, 15,]

             },
           ]

In [29]:
grid = GridSearchCV(pipe, parameters, cv=5, scoring='roc_auc')
grid.fit(x_train,y_train)

model = grid.best_estimator_
score = grid.best_score_

print(f'The estimator is found to be {model} with an ROC-AUC score of {score}')

The estimator is found to be Pipeline(steps=[('clf',
                 RandomForestClassifier(max_depth=15, min_samples_split=6,
                                        n_estimators=125))]) with an ROC-AUC score of 0.9940976530401915


In [30]:
y_pred = model.predict(x_test)

In [31]:
roc_auc = roc_auc_score(y_test, y_pred)

In [34]:
print(f'The ROC-AUC for test data is found to be {roc_auc}')

The ROC-AUC for test data is found to be 0.9757009345794392


In [37]:
import pickle

with open('model.pkl', 'wb') as file:
    pickle.dump(model, file)