### Outline
- Preprocessing
- Simulate Futur Data
- KNN
- Linear Regression
- Logistic Regression
- SVM

In [1]:
import pandas as pd

In [37]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [8]:
data = pd.read_csv("hr_data_clean.csv")
data = data.drop("Unnamed: 0", axis=1)
data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,division,salary,left
0,0.65,0.53,2.0,157.0,3.0,0,0,sales,KOSONG,1
1,0.8,0.86,5.0,262.0,6.0,0,0,sales,medium,1
2,0.11,0.72,7.0,272.0,4.0,0,0,KOSONG,medium,1
3,0.72,0.87,5.0,223.0,5.0,0,0,sales,low,1
4,0.37,0.52,2.0,159.0,3.0,0,0,sales,low,1


In [23]:
data_numerical = data._get_numeric_data()
data_numerical.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,left
0,0.65,0.53,2.0,157.0,3.0,0,0,1
1,0.8,0.86,5.0,262.0,6.0,0,0,1
2,0.11,0.72,7.0,272.0,4.0,0,0,1
3,0.72,0.87,5.0,223.0,5.0,0,0,1
4,0.37,0.52,2.0,159.0,3.0,0,0,1


In [24]:
data_numerical = data_numerical.drop(["Work_accident", "promotion_last_5years", "left"], axis=1)
data_numerical.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company
0,0.65,0.53,2.0,157.0,3.0
1,0.8,0.86,5.0,262.0,6.0
2,0.11,0.72,7.0,272.0,4.0
3,0.72,0.87,5.0,223.0,5.0
4,0.37,0.52,2.0,159.0,3.0


In [26]:
data_categorical = data.drop(data_numerical.columns, axis=1)
data_categorical = data_categorical.drop("left", axis=1)
data_categorical.head()

Unnamed: 0,Work_accident,promotion_last_5years,division,salary
0,0,0,sales,KOSONG
1,0,0,sales,medium
2,0,0,KOSONG,medium
3,0,0,sales,low
4,0,0,sales,low


###### Standardize & Dummy Categorical

In [27]:
data_categorical_dummy = pd.get_dummies(data_categorical)
data_categorical_dummy.head()

Unnamed: 0,Work_accident,promotion_last_5years,division_IT,division_KOSONG,division_RandD,division_accounting,division_hr,division_management,division_marketing,division_product_mng,division_sales,division_support,division_technical,salary_KOSONG,salary_high,salary_low,salary_medium
0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0
1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0


In [29]:
standardizer = StandardScaler()
standardizer.fit(data_numerical)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [32]:
data_numerical_std = pd.DataFrame(standardizer.transform(data_numerical))
data_numerical_std.columns = data_numerical.columns
data_numerical_std.index = data_numerical.index
data_numerical_std.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company
0,0.138937,-1.093826,-1.462863,-0.88204,-0.341235
1,0.749318,0.844835,0.971113,1.220423,1.713436
2,-2.058434,0.022372,2.593763,1.420657,0.343655
3,0.423782,0.903582,0.971113,0.439508,1.028546
4,-1.000441,-1.152573,-1.462863,-0.841993,-0.341235


###### Standar Deviasi dan Mean

In [33]:
data_numerical_std["satisfaction_level"].mean() # mean

1.8473566344012805e-15

In [35]:
data_numerical_std["satisfaction_level"].std() # standard deviation

1.0000333372226655

In [36]:
data_input = pd.concat([data_categorical_dummy,  data_numerical_std], axis=1)
data_input.head()

Unnamed: 0,Work_accident,promotion_last_5years,division_IT,division_KOSONG,division_RandD,division_accounting,division_hr,division_management,division_marketing,division_product_mng,...,division_technical,salary_KOSONG,salary_high,salary_low,salary_medium,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0.138937,-1.093826,-1.462863,-0.88204,-0.341235
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0.749318,0.844835,0.971113,1.220423,1.713436
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,-2.058434,0.022372,2.593763,1.420657,0.343655
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0.423782,0.903582,0.971113,0.439508,1.028546
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,-1.000441,-1.152573,-1.462863,-0.841993,-0.341235


###### Simulate Future Data

In [38]:
data_output = data["left"]

In [39]:
#train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    data_input, 
    data_output, 
    test_size = 0.2, 
    random_state=123)

In [40]:
x_train.head()

Unnamed: 0,Work_accident,promotion_last_5years,division_IT,division_KOSONG,division_RandD,division_accounting,division_hr,division_management,division_marketing,division_product_mng,...,division_technical,salary_KOSONG,salary_high,salary_low,salary_medium,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company
3553,1,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,-1.244593,-0.447606,1.782438,-0.36143,-1.026126
2112,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,-0.105215,0.551098,0.159788,-0.021031,-1.026126
1794,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,-0.959748,-1.211321,-1.462863,-0.841993,-0.341235
13885,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1.359699,0.316109,0.971113,-0.041055,4.452998
11250,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,-0.471444,0.316109,-0.651538,1.280493,-0.341235


###### Benchmark

In [41]:
y_train.value_counts(normalize=True)

0    0.76148
1    0.23852
Name: left, dtype: float64

###### K-Nearest Neighbor

In [49]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_recall_fscore_support

In [54]:
knn = KNeighborsClassifier(n_neighbors=11)

In [55]:
knn.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=11, p=2,
           weights='uniform')

In [56]:
knn.score(x_train, y_train)

0.94957913159429952

In [57]:
knn.score(x_test, y_test)

0.94299999999999995

In [58]:
x_proba = knn.predict_proba(x_test)

In [61]:
x_proba[:1]

array([[ 1.,  0.]])

###### Linear Regression

In [62]:
from sklearn.linear_model import LinearRegression

In [63]:
linreg = LinearRegression()

In [64]:
linreg.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [67]:
linreg.score(x_train, y_train)

0.23708594307074493

In [68]:
linreg.score(x_test, y_test)

0.20586302960843816

In [69]:
linreg.predict(x_train)

array([ 0.21435547,  0.27392578,  0.45556641, ...,  0.34960938,
        0.20849609,  0.13720703])

###### Logistic Regression

In [70]:
from sklearn.linear_model import LogisticRegression

In [71]:
logreg = LogisticRegression()

In [72]:
logreg.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [73]:
logreg.predict(x_train)

array([0, 0, 1, ..., 0, 0, 0])

In [74]:
logreg.score(x_train, y_train)

0.79298274856238016

In [76]:
logreg.score(x_test, y_test)

0.78766666666666663

In [77]:
logreg.predict_proba(x_train)

array([[ 0.90978422,  0.09021578],
       [ 0.77074319,  0.22925681],
       [ 0.47026472,  0.52973528],
       ..., 
       [ 0.64401352,  0.35598648],
       [ 0.82031095,  0.17968905],
       [ 0.89165717,  0.10834283]])

###### SVM

In [78]:
from sklearn.svm import SVC

In [85]:
svm = SVC(kernel="linear", probability=True)

In [86]:
svm.fit(x_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [87]:
svm.score(x_train, y_train)

0.77648137344778734

In [88]:
svm.score(x_test, y_test)

0.77533333333333332

In [89]:
svm.predict_proba(x_train)

array([[ 0.77359937,  0.22640063],
       [ 0.77346076,  0.22653924],
       [ 0.77330198,  0.22669802],
       ..., 
       [ 0.77346788,  0.22653212],
       [ 0.77357004,  0.22642996],
       [ 0.77349649,  0.22650351]])

In [67]:
linreg.score(x_train, y_train)

0.23708594307074493

###### SVM with kernel function

In [90]:
poly_svm = SVC(kernel="poly")

In [93]:
poly_svm.fit(x_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='poly',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [94]:
poly_svm.score(x_train, y_train)

0.94382865238769897

In [95]:
poly_svm.score(x_test, y_test)

0.93733333333333335

### Save the Next Session

### Tree-based Method

##### Outline
- Decision Tree
- Bagging
- Random Forest
- Boosting

In [96]:
from sklearn.tree import DecisionTreeClassifier

In [97]:
dectree = DecisionTreeClassifier()

In [98]:
dectree.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [99]:
dectree.score(x_train, y_train)

1.0

In [100]:
dectree.score(x_test, y_test)

0.97066666666666668

###### Bagging

In [101]:
from sklearn.ensemble import BaggingClassifier

###### Base Classifier

In [102]:
base_dectree = DecisionTreeClassifier()

In [151]:
bagging_dectree = BaggingClassifier(base_estimator=base_dectree, n_estimators=75, random_state=123)

In [152]:
bagging_dectree.fit(x_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=75, n_jobs=1, oob_score=False,
         random_state=123, verbose=0, warm_start=False)

In [153]:
bagging_dectree.score(x_test, y_test)

0.98733333333333329

###### Random Forest

In [154]:
from sklearn.ensemble import RandomForestClassifier

In [155]:
base_dectree = DecisionTreeClassifier()

In [211]:
random_forest = RandomForestClassifier(n_estimators=10, random_state=123)

In [212]:
random_forest.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=123, verbose=0, warm_start=False)

In [213]:
random_forest.score(x_test, y_test)

0.98299999999999998

# Boosting

In [214]:
from sklearn.ensemble import AdaBoostClassifier

In [215]:
base_ = DecisionTreeClassifier()

In [221]:
adaboost = AdaBoostClassifier(base_estimator=base_, n_estimators=50, random_state=123)

In [222]:
adaboost.fit(x_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          learning_rate=1.0, n_estimators=50, random_state=123)

In [223]:
adaboost.score(x_train, y_train)

1.0

In [225]:
adaboost.score(x_test, y_test)

0.97199999999999998