In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the training and test CSV files into DataFrames (train first, then test).

data, data_test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

In [3]:
# Handling the null (missing) values.
# The columns 'previous_year_rating' and 'education' contain NaN values.

# Replace NaN in 'previous_year_rating' with the mode of the observed ratings.
# The filled column is assigned back to the frame: calling
# fillna(inplace=True) on an attribute-accessed column is a chained in-place
# operation and does not reliably update `data` under pandas Copy-on-Write.
rating_mode = data['previous_year_rating'].mode()[0]
data['previous_year_rating'] = data['previous_year_rating'].fillna(rating_mode)

# Probabilistically fill NaN in 'education': replacements are drawn from the
# observed categories, weighted by their relative frequencies.
s = data['education'].value_counts(normalize=True)
missing = data['education'].isnull()
# A dedicated, seeded generator keeps the imputation reproducible across
# kernel restarts without touching NumPy's global random state.
education_rng = np.random.RandomState(0)
data.loc[missing, 'education'] = education_rng.choice(
    s.index, size=int(missing.sum()), p=s.values
)

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,2542,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,48945,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0


In [6]:
# Frequency of each rating value after imputation.
data['previous_year_rating'].value_counts()

3.0    22742
5.0    11741
4.0     9877
1.0     6223
2.0     4225
Name: previous_year_rating, dtype: int64

In [7]:
# Frequency of each education level after imputation.
data['education'].value_counts()

Bachelor's          36669
Master's & above    14925
Below Secondary       805
Name: education, dtype: int64

In [10]:
# Count of missing values per column in the training frame.
data.isna().sum()

employee_id             0
department              0
region                  0
education               0
gender                  0
recruitment_channel     0
no_of_trainings         0
age                     0
previous_year_rating    0
length_of_service       0
KPIs_met >80%           0
awards_won?             0
avg_training_score      0
is_promoted             0
dtype: int64

In [11]:
# Separate the target column ('is_promoted') from the feature columns.

Y = data['is_promoted']
df_raw = data.drop(columns=['is_promoted'])

In [14]:
# One-hot encode the categorical columns of the training features.
# Each listed column is replaced by indicator columns named '<column>_<value>',
# appended after the remaining (non-categorical) columns.

col = ('department','region','education','gender','recruitment_channel')
df = pd.get_dummies(df_raw, columns=list(col))

In [15]:
# Move 'employee_id' out of the feature columns and into the row index.
df = df.set_index('employee_id')

In [16]:
# Write the encoded training features to disk. index=False leaves the row
# index out of the file.
# NOTE(review): if 'employee_id' was moved into the index beforehand, it is
# therefore not saved with the features — confirm this is intended.
df.to_csv(r'Encoded_train_data.csv', index = False)

In [263]:
from sklearn.preprocessing import PolynomialFeatures

# Hand-built polynomial expansion: the encoded features placed side by side
# with their element-wise 2nd, 3rd and 4th powers.
# Disabled alternative: PolynomialFeatures(interaction_only=False).fit_transform(df)
df_poly = np.hstack([df] + [df**exponent for exponent in (2, 3, 4)])

In [34]:
from sklearn.model_selection import train_test_split as tts

# Hold out 30% of the rows for evaluation, using a fixed random_state.
X_train, X_test, y_train, y_test = tts(
    df,
    Y,
    test_size=0.3,
    random_state=4,
)

In [200]:
# Logistic regression fitted on the training split.

from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=0)
clf = clf.fit(X_train, y_train)



In [31]:
# Support vector classifier with default settings, fitted on the training split.

from sklearn import svm

clf2 = svm.SVC()
clf2.fit(X=X_train, y=y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [37]:
# Random forest classifier fitted on the training split.

from sklearn.ensemble import RandomForestClassifier
#from sklearn.metrics import f1_score
#from sklearn.metrics import accuracy_score

# 100 trees, each split considering half of the features; the out-of-bag
# score is computed as well (oob_score=True).
rfc = RandomForestClassifier(
    n_estimators=100,
    max_features=0.5,
    min_samples_split=2,
    oob_score=True,
    random_state=42,
)
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=0.5, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=True, random_state=42, verbose=0, warm_start=False)

In [38]:
# Feature importances of the fitted random forest, followed by its F1 score
# on the training split and on the held-out split.
# NOTE(review): only the random forest is scored in this cell, so the claim
# that it gives the best prediction score is not demonstrated here.

# f1_score is imported here so the cell does not fail with a NameError when
# the notebook is run top to bottom on a fresh kernel.
from sklearn.metrics import f1_score

feat_importances = pd.Series(rfc.feature_importances_, index=X_train.columns)
print(feat_importances)
yp_train = rfc.predict(X_train)
yp_test = rfc.predict(X_test)

# A large gap between the two printed scores indicates overfitting.
ftrain = f1_score(y_train, yp_train)
ftest = f1_score(y_test, yp_test)
print(ftrain)
print(ftest)

0.9989427578915572
0.48308172826652784


In [169]:
# Count of missing values per column in the test frame.
data_test.isna().sum()

employee_id                0
department                 0
region                     0
education               1034
gender                     0
recruitment_channel        0
no_of_trainings            0
age                        0
previous_year_rating    1812
length_of_service          0
KPIs_met >80%              0
awards_won?                0
avg_training_score         0
dtype: int64

In [39]:
# Fill the missing values of the test set.

# 'previous_year_rating': use the mode of the TRAINING ratings. The filled
# column is assigned back to the frame, because fillna(inplace=True) on an
# attribute-accessed column is a chained in-place operation and does not
# reliably update `data_test` under pandas Copy-on-Write.
train_rating_mode = data['previous_year_rating'].mode()[0]
data_test['previous_year_rating'] = data_test['previous_year_rating'].fillna(train_rating_mode)

# 'education': draw replacements from the categories observed in the test
# set, weighted by their relative frequencies.
s = data_test['education'].value_counts(normalize=True)
missing = data_test['education'].isnull()
# A dedicated, seeded generator keeps the imputation reproducible across
# kernel restarts without touching NumPy's global random state.
test_education_rng = np.random.RandomState(1)
data_test.loc[missing, 'education'] = test_education_rng.choice(
    s.index, size=int(missing.sum()), p=s.values
)

In [46]:
# One-hot encode the categorical columns of the test set the same way as the
# training features: each listed column becomes '<column>_<value>' indicators.
col = ('department','region','education','gender','recruitment_channel')
test = pd.get_dummies(data_test, columns=list(col))

# Move 'employee_id' into the index so it is not fed to the model as a feature.
# (set_index returns a new frame; its result must be kept.)
test = test.set_index('employee_id')

# Align with the columns the models were trained on: same set, same order.
# Indicator columns for categories absent from the test set are added as 0;
# columns unknown to the training data are dropped.
test = test.reindex(columns=X_train.columns, fill_value=0)

In [45]:
# (rows, columns) of the test frame.
data_test.shape

(23490, 13)

In [47]:
# Predict promotions for the test set with the fitted random forest.
y_pred = rfc.predict(test)
# Sum of the predicted labels.
print(y_pred.sum())

# Build the output table with an explicitly named prediction column.
# (A rename() whose result is discarded, keyed on the string "0" instead of
# the integer column label 0, leaves the column unnamed in the output.)
# Values are paired by position: row i of the predictions belongs to row i of
# data_test.
Y_pred = pd.DataFrame({
    'employee_id': data_test['employee_id'].values,
    'is_promoted': y_pred,
})

813


In [48]:
# Save the predictions to CSV; index=False omits the row index from the file.
Y_pred.to_csv(r'Y_pred2.csv', index = False)