In [29]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.metrics import f1_score
from sklearn.svm import OneClassSVM
from sklearn.neighbors import LocalOutlierFactor

In [2]:
# Read both competition files from the working directory:
# `train` is the reporting history, `test` lists the employees to predict for.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

### Emp_IDs common to both the train and test sets - used to isolate the rows to predict on

In [3]:
# Employee IDs present in both files; later used to separate the rows to predict on.
train_ids = train['Emp_ID'].unique()
test_ids = test['Emp_ID'].unique()
common = set(train_ids).intersection(test_ids)

In [4]:
# Sanity check: (rows, columns) of the train frame followed by the test frame.
print(*(frame.shape for frame in (train, test)))

(19104, 13) (741, 1)


### Feature Processing and Engineering

In [5]:
# Convert the three date-like columns to datetime64.
# errors='coerce' turns anything unparseable into NaT instead of raising.
for date_col in ('MMM-YY', 'Dateofjoining', 'LastWorkingDate'):
    train[date_col] = pd.to_datetime(train[date_col], errors='coerce')

## Deriving Features at EMP ID level
### 1. Calculating the Average Business Value 
### 2. Average Quarterly Rating
### 3. Number of reporting instances for each Emp ID 

In [6]:
# Per-employee aggregates, each a two-column frame keyed by Emp_ID.
sub_train = train[['Emp_ID','Total Business Value']]
sub_train_q = train[['Emp_ID','Quarterly Rating']]

# 1. Mean business value over all of an employee's reporting rows.
strain = (
    sub_train.groupby('Emp_ID')['Total Business Value']
    .mean()
    .reset_index()
    .rename(columns={'Total Business Value': 'Avg Total Business Value'})
)

# 2. Mean quarterly rating over the same rows.
qtrain = (
    sub_train_q.groupby('Emp_ID')['Quarterly Rating']
    .mean()
    .reset_index()
    .rename(columns={'Quarterly Rating': 'Avg Quarterly Rating'})
)

# 3. Number of reporting rows per employee (count of non-null MMM-YY values).
rows = (
    train[['Emp_ID','MMM-YY']]
    .groupby('Emp_ID')['MMM-YY']
    .count()
    .reset_index()
    .rename(columns={'MMM-YY': 'entries'})
)

In [7]:
# Attach the three per-employee aggregates to every reporting row (inner joins on Emp_ID).
train_final = (
    train
    .merge(strain, on='Emp_ID')
    .merge(qtrain, on='Emp_ID')
    .merge(rows, on='Emp_ID')
)

In [8]:
# Row count should match `train`; the merges add the three aggregate columns.
train_final.shape

(19104, 16)

### Deriving the target column for the train data based on whether LastWorkingDate is null

In [9]:
# Order rows chronologically, then keep only the latest reporting row per employee.
train_final = train_final.sort_values(by="MMM-YY", ascending=True)
train_final.drop_duplicates(subset="Emp_ID", keep='last', inplace=True)

# target = 1 when that latest row carries a LastWorkingDate, otherwise 0.
has_last_working_date = train_final['LastWorkingDate'].notna()
train_final['target'] = np.where(has_last_working_date, 1, 0)

In [10]:
# Missing values per column; only LastWorkingDate is expected to contain nulls.
train_final.isnull().sum()

MMM-YY                        0
Emp_ID                        0
Age                           0
Gender                        0
City                          0
Education_Level               0
Salary                        0
Dateofjoining                 0
LastWorkingDate             765
Joining Designation           0
Designation                   0
Total Business Value          0
Quarterly Rating              0
Avg Total Business Value      0
Avg Quarterly Rating          0
entries                       0
target                        0
dtype: int64

### Scaling the numerical features - Salary and Avg Total Business Value

In [11]:
from sklearn import preprocessing
from sklearn.preprocessing import RobustScaler

In [12]:
# Robust scaler: centres each feature and scales it by the 25th-75th percentile range.
scaler_options = dict(
    with_centering=True,
    with_scaling=True,
    quantile_range=(25.0, 75.0),
    copy=True,
)
rscaler = RobustScaler(**scaler_options)

In [13]:
# Fit the scaler on the two numeric columns and append the scaled values as new columns.
# NOTE: despite the 'log_' prefix these columns hold robust-scaled values, not logarithms;
# the names are kept unchanged because they become model feature names.
scaled_values = rscaler.fit_transform(train_final[['Salary','Avg Total Business Value']])
train_final_sub = pd.DataFrame(scaled_values, columns=['log_salary','log_bv'])

# Reset the left-hand index so both frames line up positionally in the column-wise concat.
train_final = pd.concat(
    [train_final.reset_index(drop=True), train_final_sub],
    axis=1,
)

### Checking whether an employee's Designation has increased since joining

In [14]:
# Flag employees whose current Designation is strictly higher than their Joining Designation.
desig_increased = train_final['Designation'] > train_final['Joining Designation']
train_final['Desig_Change'] = np.where(desig_increased, 1, 0)

### Label encoding and one-hot encoding the categorical features: Gender, Education_Level and City

In [15]:
# One LabelEncoder per categorical column, kept under its own name so the fitted
# classes stay available afterwards.
le_city = preprocessing.LabelEncoder()
le_ed = preprocessing.LabelEncoder()
le_gender = preprocessing.LabelEncoder()

# Cast each column to 'category', then replace it with its integer codes.
for col_name, encoder in (
    ('Gender', le_gender),
    ('City', le_city),
    ('Education_Level', le_ed),
):
    as_category = train_final[col_name].astype('category')
    train_final[col_name] = encoder.fit_transform(as_category)

In [16]:
# Expand the integer-coded categorical columns into one indicator column per level.
one_hot_columns = ['Gender','City','Education_Level']
train_final = pd.get_dummies(train_final, columns=one_hot_columns)

### Creating the Training and Testing datasets

In [17]:
# Employees listed in test.csv are held out for the final predictions;
# every other employee is used for model fitting and validation.
in_test_file = train_final['Emp_ID'].isin(common)

# Take explicit copies: a boolean-mask selection is a slice of `train_final`, and
# modifying such a slice later (e.g. an in-place sort) triggers SettingWithCopyWarning.
X = train_final[~in_test_file].copy()
X_test = train_final[in_test_file].copy()

In [22]:
# Identifier, date, target and raw (unscaled / per-row) columns that are excluded
# from the feature matrix; Age is left out as well.
drop_cols = [
    'MMM-YY', 'Emp_ID', 'Age', 'Dateofjoining', 'LastWorkingDate', 'target',
    'Total Business Value', 'Avg Total Business Value', 'Salary', 'Quarterly Rating',
]

# Labels must be extracted before the target column is dropped from X.
y = X['target']
X = X.drop(columns=drop_cols)

# Sort by reassignment rather than inplace=True: X_test is a slice of train_final,
# and sorting a slice in place raises SettingWithCopyWarning.
X_test = X_test.sort_values("Emp_ID")

# Derived after the sort so its row order matches X_test['Emp_ID'].
X_test_final = X_test.drop(columns=drop_cols)

### Employees who are not going to leave are far fewer than those who are. The classes are too imbalanced for upsampling or ensemble techniques, which would probably overfit, so we use anomaly detection instead

In [25]:
# Class balance of the training labels (1 = has a LastWorkingDate, 0 = does not).
y.value_counts()

1    1616
0      24
Name: target, dtype: int64

### Employing One Class SVM

In [30]:
# Stratified 80/20 hold-out so the few target == 0 rows appear in both parts.
trainX, testX, trainy, testy = train_test_split(X, y, test_size=0.2, random_state=2, stratify=y)
model = OneClassSVM(gamma='auto', nu=0.2, kernel='rbf')

# Fit on the majority class only (target == 1): it defines what "normal" looks like,
# so the model reports everything else as an outlier.
trainX = trainX[trainy == 1]
model.fit(trainX)

# OneClassSVM.predict returns +1 for inliers and -1 for outliers.
yhat = model.predict(testX)

# Re-label the hold-out targets to the same convention (0 -> -1, 1 stays 1).
# Done with a non-mutating `where` rather than boolean-mask assignment, which writes
# into a split-derived Series and can raise SettingWithCopyWarning.
testy = testy.where(testy != 0, -1)

# F1 on the outlier class, i.e. the rare target == 0 rows.
score = f1_score(testy, yhat, pos_label=-1)
print('F1 Score: %.3f' % score)

F1 Score: 0.083


### Make predictions on the given Test data

In [31]:
# Score the held-out employees; raw predictions are +1 (inlier) / -1 (outlier).
test_pred = model.predict(X_test_final)
print(pd.DataFrame(test_pred)[0].value_counts())

# Pair each prediction with its Emp_ID on a shared 0..n-1 index.
# X_test_final was derived from X_test, so the two are in the same row order.
n_pred = len(test_pred)
test_pred_series = pd.Series(test_pred, index=range(0, n_pred))
test_x = pd.Series(X_test['Emp_ID'].values, index=range(0, n_pred))
X_test_ = pd.DataFrame({'Emp_ID': test_x, 'Target': test_pred_series})

# Map the SVM labels back to the 0/1 target encoding: -1 -> 0, everything else -> 1.
is_outlier = X_test_['Target'] == -1
X_test_['Target'] = np.where(is_outlier, 0, 1)

-1    374
 1    367
Name: 0, dtype: int64


### After converting -1 to 0

In [32]:
# Distribution of the final 0/1 predictions.
X_test_['Target'].value_counts()

0    374
1    367
Name: Target, dtype: int64

In [33]:
# Write the Emp_ID / Target pairs to disk without the positional index column.
X_test_.to_csv(path_or_buf="test_predictions.csv", index=False)