# Feature Engineering

In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import scipy

from sklearn.model_selection import train_test_split

In [2]:
from sklearn.metrics import f1_score

In [3]:
# Load the cleaned train/test CSVs. Each file carries an 'Unnamed: 0' column,
# which is dropped, and both frames are indexed by employee_id.
train_df = (
    pd.read_csv('../cleaned_data/train.csv')
    .drop(columns='Unnamed: 0')
    .set_index('employee_id')
)
test_df = (
    pd.read_csv('../cleaned_data/test.csv')
    .drop(columns='Unnamed: 0')
    .set_index('employee_id')
)

## Correlational Analysis

In [None]:
# Rank columns by |correlation with is_promoted|, largest first, and show the
# ten names at positions 1-10 (position 0 is skipped).
(
    train_df.corr()['is_promoted']
    .abs()
    .sort_values()
    .iloc[::-1]
    .index[1:11]
)

In [None]:
# Hand-picked feature subset used by the first random-forest model.
# NOTE(review): "department_sales_and_martketing" looks misspelled; presumably
# it matches the column name in the cleaned CSV -- confirm before changing it.
features_1 = [
    "predicted_rating",
    "KPIs_met >80%",
    "awards_won?",
    "avg_training_score",
    "previous_year_rating",
    "region_4",
    "region_22",
    "department_sales_and_martketing",
    "department_technology",
    "Master's & above",
]

In [None]:
# Echo the selected feature names as this cell's output.
features_1

In [None]:
# Preview the first rows of the training frame (indexed by employee_id).
train_df.head()

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Target and feature matrix, restricted to the hand-picked features_1 subset.
Y = train_df['is_promoted']
X = train_df.drop(['is_promoted'], axis=1)[features_1]
newtest_df = test_df[features_1]

# Hold out 20% of the labelled rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Fit on the training split only and report F1 on the held-out split.
# Scoring on the rows the model was fitted on (as this cell previously did)
# measures memorisation, not generalisation, and inflates the reported F1.
clf = RandomForestClassifier(max_depth=40, random_state=0, min_samples_split=10)
clf.fit(X_train, y_train)
print(f1_score(y_test, clf.predict(X_test)))

# Refit on every labelled row before predicting the unlabelled test set, so
# `clf` and `pred` are produced from the full training data.
clf = RandomForestClassifier(max_depth=40, random_state=0, min_samples_split=10)
clf.fit(X, Y)
pred = clf.predict(newtest_df)

In [None]:
# Summary statistics of the training frame, limited to the describe() output
# columns at positions 1 through 9.
(
    train_df
    .describe()
    .iloc[:, 1:10]
)

## Scaling Data

In [19]:
from sklearn.preprocessing import StandardScaler

# Feature matrix is every column except the target; target is is_promoted.
X = train_df.drop(columns='is_promoted')
Y = train_df['is_promoted']

# The scaler is instantiated but scaling is currently disabled (lines below
# are commented out), so the models that follow see unscaled features.
# NOTE(review): if re-enabled as written, the scaler would be fitted on all
# rows before the split; fitting on X_train only would avoid leaking hold-out
# statistics.
scaler = StandardScaler()
# X = scaler.fit_transform(X)
# newtest_df = scaler.transform(test_df)

# 80/20 split with a fixed seed so the hold-out rows are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [20]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on the training split; F1 is reported on the hold-out split.
clf = RandomForestClassifier(
    max_depth=40,
    min_samples_split=10,
    random_state=0,
).fit(X_train, y_train)
print(f1_score(y_true=y_test, y_pred=clf.predict(X_test)))

0.42892768079800503


In [21]:
from sklearn.svm import SVC

# Default-parameter SVC on the training split; F1 is reported on the hold-out split.
clf = SVC().fit(X_train, y_train)
print(f1_score(y_true=y_test, y_pred=clf.predict(X_test)))

0.43835616438356173


## Correlational Analysis

In [8]:
# Signed correlation of every column with is_promoted, largest first; show
# positions 1-19 (position 0 is skipped).
(
    train_df.corr()['is_promoted']
    .sort_values()
    .iloc[::-1]
    .iloc[1:20]
)

predicted_rating          0.265593
KPIs_met >80%             0.221582
awards_won?               0.195871
avg_training_score        0.181147
previous_year_rating      0.153230
region_4                  0.038031
region_22                 0.037893
department_technology     0.031085
Master's & above          0.029343
region_7                  0.023841
region_17                 0.022513
referred                  0.018641
region_25                 0.017911
region_28                 0.017812
region_23                 0.016664
department_procurement    0.015548
department_analytics      0.012369
department_operations     0.009113
region_3                  0.006215
Name: is_promoted, dtype: float64