In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the raw HR attrition table; the relative path is resolved against the
# kernel's current working directory.
attrition = pd.read_csv(filepath_or_buffer='HREmployeeAttrition.csv')

In [3]:
# Encode the target column: 1 where the raw label is 'Yes', 0 otherwise.
# The guard makes the cell idempotent. Without it, re-running the cell would
# compare the already-encoded 0/1 integers to 'Yes' and silently overwrite the
# whole target with zeros.
if not pd.api.types.is_numeric_dtype(attrition['Attrition']):
    attrition['Attrition'] = np.where(attrition['Attrition'] == 'Yes', 1, 0)

In [4]:
# Keep only the object-dtype (string) columns and report how many there are.
categorical = attrition.select_dtypes(include=['object'])
print(categorical.shape[1])

8


In [5]:
# Keep only the integer columns (both 32- and 64-bit widths are accepted) and
# report how many there are. The encoded 'Attrition' target is among them.
numerical = attrition.select_dtypes(include=['int32', 'int64'])
print(numerical.shape[1])

27


In [6]:
# One-hot encode every categorical column (one indicator column per level).
attrition_cat = pd.get_dummies(data=categorical)

In [7]:
# Put the integer columns and the one-hot indicators side by side, aligned on
# the row index.
attrition_final = pd.concat(objs=[numerical, attrition_cat], axis='columns')

In [8]:
# Remove the target from the feature matrix so it cannot leak into training.
# errors='ignore' keeps the cell re-runnable: on a second execution the column
# is already gone and a plain drop would raise KeyError.
attrition_final = attrition_final.drop(columns='Attrition', errors='ignore')

In [9]:
# The encoded 0/1 attrition label is the prediction target.
target = attrition.loc[:, 'Attrition']

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
# 70/30 hold-out split with a fixed seed; stratifying on the label keeps the
# class proportions the same in the train and test partitions.
x_train, x_test, y_train, y_test = train_test_split(
    attrition_final,
    target,
    test_size=0.3,
    random_state=0,
    stratify=attrition['Attrition'],
)

In [12]:
# Show (rows, columns) of the train and test feature matrices.
tuple(part.shape for part in (x_train, x_test))

((1029, 55), (441, 55))

In [13]:
# Encode any remaining categorical columns in the two partitions.
# The features were already one-hot encoded before the split, so this is
# normally a no-op. Encoding train and test independently can still produce
# different column sets or orders if a level is missing from one partition,
# so the test frame is explicitly aligned to the training columns: indicators
# missing from test are filled with 0, and columns unknown to train are dropped.
x_train = pd.get_dummies(x_train)
x_test = pd.get_dummies(x_test).reindex(columns=x_train.columns, fill_value=0)

In [14]:
# Preview the first five training rows.
x_train.head(n=5)

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,Over18_Y,OverTime_No,OverTime_Yes
1078,44,136,28,3,1,1523,4,32,3,4,...,1,0,0,0,0,1,0,1,1,0
866,40,1184,2,4,1,1212,2,62,3,2,...,0,0,1,0,0,1,0,1,1,0
84,34,1153,1,2,1,110,1,94,3,2,...,0,0,0,0,0,1,0,1,1,0
458,40,1094,28,3,1,615,3,58,1,3,...,0,0,1,0,1,0,0,1,1,0
832,37,367,25,2,1,1161,3,52,2,2,...,0,0,0,0,1,0,0,1,1,0


In [15]:
from sklearn.preprocessing import StandardScaler

In [16]:
# Fit a standard scaler on the training features only, so no test-set
# statistics are used.
# NOTE(review): the scaler is only fitted here. No visible cell calls
# sc.transform, so x_train / x_test remain unscaled after this cell.
sc = StandardScaler().fit(x_train)
sc

StandardScaler()

In [17]:
# Preview the first five training rows again (fitting the scaler does not
# modify x_train, so the values are unchanged).
x_train.iloc[:5]

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,Over18_Y,OverTime_No,OverTime_Yes
1078,44,136,28,3,1,1523,4,32,3,4,...,1,0,0,0,0,1,0,1,1,0
866,40,1184,2,4,1,1212,2,62,3,2,...,0,0,1,0,0,1,0,1,1,0
84,34,1153,1,2,1,110,1,94,3,2,...,0,0,0,0,0,1,0,1,1,0
458,40,1094,28,3,1,615,3,58,1,3,...,0,0,1,0,1,0,0,1,1,0
832,37,367,25,2,1,1161,3,52,2,2,...,0,0,0,0,1,0,0,1,1,0


In [18]:
from imblearn.over_sampling import SMOTE

In [19]:
# Oversample the minority class on the TRAINING split only, so the test set
# stays untouched. A fixed seed makes the synthetic samples reproducible.
# `fit_sample` was a deprecated alias that has been removed from
# imbalanced-learn; `fit_resample` is the supported spelling.
# NOTE: the name `os` shadows the standard-library module of the same name.
# It is kept so that any later cell referring to `os` keeps working.
os = SMOTE(random_state=0)
smote_train, smote_target = os.fit_resample(x_train, y_train)

In [20]:
# (rows, columns) of the oversampled training features.
np.shape(smote_train)

(1726, 55)

In [21]:
# Length of the oversampled label vector; must match the row count above.
np.shape(smote_target)

(1726,)

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,classification_report
from sklearn.model_selection import GridSearchCV
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
import xgboost as xgb

In [23]:
# Candidate estimators and their hyper-parameter grids, keyed by a short model
# name. Each entry has:
#   'model'  - an unfitted estimator instance
#   'params' - the grid of parameter values to search (empty dict = defaults)
#
# Stochastic estimators are seeded so repeated runs give the same results.
#
# Disabled XGBoost configuration, kept for reference:
#   'xgb': {'model': xgb.XGBClassifier(n_jobs=-1),
#           'params': {'n_estimators': [100, 200, 500, 750],
#                      'learning_rate': [0.01, 0.02, 0.05, 0.1, 0.25],
#                      'min_child_weight': [1, 5, 7, 10],
#                      'gamma': [0.1, 0.5, 1, 1.5, 5],
#                      'subsample': [0.6, 0.8, 1.0],
#                      'colsample_bytree': [0.6, 0.8, 1.0],
#                      'max_depth': [3, 4, 5, 10, 12]}},
model_params = {
    'svm': {
        'model': svm.SVC(gamma='auto'),
        'params': {
            'C': [1, 10, 20],
            'kernel': ['rbf', 'linear'],
        },
    },
    'random_forest': {
        # Seeded: bootstrap sampling and feature selection are random.
        'model': RandomForestClassifier(random_state=0),
        'params': {
            'n_estimators': [1, 5, 10],
        },
    },
    'logistic_regression': {
        # `multi_class` is omitted: it is deprecated in recent scikit-learn
        # releases and 'auto' is already the default behaviour.
        'model': LogisticRegression(solver='liblinear'),
        'params': {
            'C': [1, 5, 10],
        },
    },
    'naive_bayes_gaussian': {
        'model': GaussianNB(),
        'params': {},
    },
    'naive_bayes_multinomial': {
        # MultinomialNB rejects negative feature values, so it must be fed
        # unscaled (non-negative) features.
        'model': MultinomialNB(),
        'params': {},
    },
    'decision_tree': {
        # Seeded: tie-breaking between equally good splits is random.
        'model': DecisionTreeClassifier(random_state=0),
        'params': {
            'criterion': ['gini', 'entropy'],
        },
    },
}

In [None]:
# Baseline logistic regression, evaluated on the untouched hold-out set.
#
# Training uses the SMOTE-balanced data (smote_train / smote_target) instead
# of the raw imbalanced split, and both train and test features are passed
# through the scaler fitted on the training split (`sc`). The earlier cells
# built the oversampled set and the scaler but never used them. Unscaled
# features with very different ranges also keep the default solver from
# converging. `liblinear` matches the solver used in `model_params`.
#
# `sc` was fitted on the unscaled x_train, so it is applied here exactly once
# to each matrix; x_train / x_test themselves are never overwritten.
model = LogisticRegression(C=5, solver='liblinear', random_state=0)
model.fit(sc.transform(smote_train), smote_target)
model_predictions = model.predict(sc.transform(x_test))

# Accuracy alone is misleading on an imbalanced target, so the per-class
# precision / recall report is printed as well.
accuracy = accuracy_score(y_test, model_predictions)
report = classification_report(y_test, model_predictions)
print(accuracy)
print(report)