In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import pylab
import statsmodels.api as sm
from pprint import pprint as pp
%matplotlib inline

In [3]:
from scipy import stats
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, Normalizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, VotingClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [None]:
# Load the training and test CSV files from the current working directory.
df_train, df_test = (pd.read_csv(path) for path in ("train.csv", "test.csv"))

In [None]:
def avg_training_score_grading(x):
    """Bucket an average training score into three grades.

    Returns 0 for scores below 59, 1 for scores from 59 up to (but not
    including) 79, and 2 for everything else. A value that fails both
    comparisons (e.g. NaN) therefore also lands in grade 2.
    """
    if x < 59:
        return 0
    return 1 if x < 79 else 2

In [None]:
def no_of_trainings_grade(x):
    """Bucket a training count into three grades.

    Returns 0 for counts up to 5, 1 for counts of 6 or 7, and 2 otherwise.
    A value that fails both comparisons (e.g. NaN) also yields 2.
    """
    # Upper bounds (inclusive) for grades 0 and 1, checked in ascending order.
    for grade, upper_bound in enumerate((5, 7)):
        if x <= upper_bound:
            return grade
    return 2

In [None]:
def education_encode(x):
    """Map an education label to an ordinal grade.

    Only the first four characters of ``str(x)`` are inspected:

    * ``'Belo'`` -> 0
    * ``'Bach'`` -> 1
    * ``'Mast'`` -> 2

    Any other value (including a missing value, whose string form is
    ``'nan'``) returns ``np.nan`` so it can be imputed later.
    """
    prefix = str(x)[:4]
    grades = {'Belo': 0, 'Bach': 1, 'Mast': 2}
    # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
    return grades.get(prefix, np.nan)

In [None]:
def region_encode(x):
    """Return the integer that follows the ``'region_'`` prefix of ``x``.

    The prefix itself is not validated; the first ``len('region_')``
    characters are simply skipped and the remainder is parsed with ``int``.
    """
    prefix_length = len('region_')
    region_number = x[prefix_length:]
    return int(region_number)

In [None]:
def age_bining(x):
    """Round an age down to the nearest multiple of 3 (after ``int`` conversion)."""
    years = int(x)
    # Subtracting the remainder is the same as floor-dividing by 3 and scaling back.
    return years - years % 3
    

In [None]:
# Replace each age with the start of its 3-year bin, in both frames.
for frame in (df_train, df_test):
    frame['age'] = frame['age'].apply(age_bining)

In [None]:
# Turn 'region_<n>' strings into the integer <n>, in both frames.
# Not re-runnable: a second pass would receive integers, which region_encode cannot slice.
for frame in (df_train, df_test):
    frame['region'] = frame['region'].apply(region_encode)

In [None]:
# Collapse the raw training count into the 0/1/2 grade, in both frames.
for frame in (df_train, df_test):
    frame['no_of_trainings'] = frame['no_of_trainings'].apply(no_of_trainings_grade)

In [None]:
# Collapse the average training score into the 0/1/2 grade, in both frames.
for frame in (df_train, df_test):
    frame['avg_training_score'] = frame['avg_training_score'].apply(avg_training_score_grading)

In [None]:
# Encode education labels as 0/1/2; unrecognised or missing labels become NaN.
for frame in (df_train, df_test):
    frame['education'] = frame['education'].apply(education_encode)

In [None]:
# Fill missing `previous_year_rating` values with the most frequent rating found
# in the training data. The same training-derived value is used for the test set.
#
# This is done with pandas instead of sklearn.preprocessing.Imputer, which was
# deprecated in scikit-learn 0.20 and removed in 0.22.
# Series.mode() ignores NaN and returns its results sorted, so .iloc[0] picks the
# smallest value on a tie.
rating_mode = df_train['previous_year_rating'].mode().iloc[0]

train_previous_year_rating = df_train['previous_year_rating'].fillna(rating_mode).astype(float)
df_train['previous_year_rating'] = train_previous_year_rating

test_previous_year_rating = df_test['previous_year_rating'].fillna(rating_mode).astype(float)
df_test['previous_year_rating'] = test_previous_year_rating

In [None]:
# Treat empty-string education entries as missing values in both frames.
# np.nan is used because the capitalised np.NaN alias was removed in NumPy 2.0.
df_train['education'] = df_train['education'].replace(r'', np.nan)
df_test['education'] = df_test['education'].replace(r'', np.nan)

In [None]:
# Cast the education column to the generic object dtype in both frames.
for frame in (df_train, df_test):
    frame['education'] = frame['education'].astype(object)

In [None]:
# Fill missing `education` grades with the most frequent grade found in the
# training data. The same training-derived value is used for the test set.
#
# This is done with pandas instead of sklearn.preprocessing.Imputer, which was
# deprecated in scikit-learn 0.20 and removed in 0.22.
# pd.to_numeric turns the object-typed column back into floats, so the result
# is a float column.
train_education_numeric = pd.to_numeric(df_train['education'])
# Series.mode() ignores NaN and returns sorted results: ties resolve to the smallest grade.
education_mode = train_education_numeric.mode().iloc[0]

train_education_filled = train_education_numeric.fillna(education_mode)
df_train['education'] = train_education_filled

test_education_filled = pd.to_numeric(df_test['education']).fillna(education_mode)
df_test['education'] = test_education_filled

In [None]:
# Columns that are label-encoded and then expanded into one-hot dummy columns.
categorical_columns = ['department', 'region', 'gender', 'recruitment_channel', 'no_of_trainings',
                       'awards_won?', 'age', 'length_of_service', 'KPIs_met >80%',
                       'avg_training_score', 'previous_year_rating', 'education']


def _replace_with_dummies(frame, col, dummy_var, classes, tag=""):
    """Return a copy of `frame` in which `col` is replaced by dummy columns.

    One column named ``<col>_<class>`` is appended per entry of `classes`,
    taken from the matching column of `dummy_var`. The first class is skipped
    so that it acts as the baseline level. `tag` is prepended to the progress
    messages ("" for the training frame, "test " for the test frame).
    """
    encoded = frame.drop([col], axis=1)
    for i, value in enumerate(classes):
        print(tag + "One hot encoding  for value-----> ", value)
        newcolumn = col + "_" + str(value)
        print(tag + "One hot encoding  for label class -----> ", newcolumn)
        if i == 0:
            # Baseline level: never materialised as a column.
            print("new " + tag + "dropped column -----> ", newcolumn)
            continue
        encoded[newcolumn] = dummy_var[:, i]
    print(tag + "dropped column -----> ", col)
    return encoded


# Maps each encoded column to the array of original class labels (LabelEncoder.classes_).
dict_labelencode = {}
for col in categorical_columns:
    # The encoder is fitted on the training data only.
    # NOTE: LabelEncoder.transform raises ValueError if the test data contains a
    # label that never occurs in the training data.
    le = LabelEncoder()
    train_codes = le.fit_transform(df_train[col])
    test_codes = le.transform(df_test[col])
    dict_labelencode[col] = le.classes_

    print("One hot encoding  for -----> ", col)
    # No `categorical_features` argument: 'all' was its default, and the
    # parameter was removed in scikit-learn 0.22.
    onehotencoder = OneHotEncoder()
    train_dummies = onehotencoder.fit_transform(train_codes.reshape(-1, 1)).toarray()
    df_train = _replace_with_dummies(df_train, col, train_dummies, le.classes_)

    print("############## test data ###################")
    test_dummies = onehotencoder.transform(test_codes.reshape(-1, 1)).toarray()
    df_test = _replace_with_dummies(df_test, col, test_dummies, le.classes_, tag="test ")

In [None]:
# Feature columns: everything except the identifier (and, for train, the target).
trainlist = df_train.columns.drop(['employee_id', 'is_promoted'])
testlist = df_test.columns.drop(['employee_id'])

In [None]:
# Training features and target as NumPy arrays. The double brackets keep y1 as a
# single-column 2-D array, i.e. shape (n_rows, 1).
X1 = df_train[trainlist].values
y1 = df_train[['is_promoted']].values

# Features of the unlabelled test set, used for the final submission.
X2 = df_test[testlist].values

In [None]:
# Hold out 5% of the labelled rows for validation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X1,
    y1,
    test_size=0.05,
    random_state=0,
)

In [None]:
# Display summary statistics for df_train as it stands at this point in the notebook.
df_train.describe()

# Bagged Decision Tree Classifier

In [None]:
# Base learner: an entropy-criterion decision tree.
clf_tree = DecisionTreeClassifier(criterion='entropy', max_depth=50, min_samples_split=10)

# Bagging ensemble of 90 such trees.
# The base learner is passed positionally because its keyword changed name:
# `base_estimator` up to scikit-learn 1.1, `estimator` from 1.2 on (old name
# removed in 1.4). The positional form works with both.
# random_state is fixed so the validation scores and the submission are reproducible.
bag_classifier = BaggingClassifier(clf_tree, n_estimators=90, random_state=0)

# The target is flattened to 1-D; a column-vector target makes scikit-learn emit
# a DataConversionWarning.
bag_classifier.fit(X_train, np.ravel(y_train))

In [None]:
# Score the ensemble on the held-out rows: accuracy first, then F1.
bg_y_pred = bag_classifier.predict(X_test)
for metric in (accuracy_score, f1_score):
    print(metric(y_test, bg_y_pred))

In [84]:
# Predict on the unlabelled test features and store the result on df_test.
bg_y_pred2 = bag_classifier.predict(X2)
df_test['is_promoted'] = bg_y_pred2

# Write only the id and the prediction, in that order, without the index.
header = ['employee_id', 'is_promoted']
df_test.to_csv('bag_submission.csv', columns=header, index=False)