In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC

from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler, OneHotEncoder, StandardScaler

In [2]:
df = pd.read_csv("data/general_data.csv", index_col="EmployeeID")
df.drop(['EmployeeCount','StandardHours'],axis=1, inplace = True) # Remove boring columns 
df = df[~df.isna().apply(any, axis=1)] # Remove nan columns

cat_cols = list(df.dtypes[df.dtypes == 'object'].index.values)
cat_cols.remove('Attrition') # remove target column 
num_cols = list(df.dtypes[df.dtypes != 'object'].index.values) + ["Attrition"]
num_cols.remove('Attrition')

# 5.1 Prediction

In [3]:
scaler = StandardScaler()
polynomilas = PolynomialFeatures(3)
encoder = OneHotEncoder(sparse=False)

In [4]:
X = df[df.columns.difference(['Attrition'])]
y = df.Attrition 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [6]:
# Train data
poly = polynomilas.fit_transform(X_train[num_cols])
scaled = scaler.fit_transform(poly)
encoded = encoder.fit_transform(X_train[cat_cols])
print(f"numerical shape is {scaled.shape}")
print(f"categorical  shape is {encoded.shape}")
X_train_proc = np.concatenate((scaled, encoded), axis=1)
X_train_proc.shape

numerical shape is (3505, 560)
categorical  shape is (3505, 27)


(3505, 587)

In [7]:
# Test data
poly_test = polynomilas.transform(X_test[num_cols])
scaled_test = scaler.transform(poly_test)
encoded_test = encoder.transform(X_test[cat_cols])
print(f"numerical shape is {scaled_test.shape}")
print(f"categorical  shape is {encoded_test.shape}")
X_test_proc = np.concatenate((scaled_test, encoded_test), axis=1)
X_test_proc.shape

numerical shape is (877, 560)
categorical  shape is (877, 27)


(877, 587)

In [8]:
# No optimalizations 
# neigh = KNeighborsClassifier(n_neighbors=3)
neigh = LogisticRegression()
neigh.fit(X_train[num_cols], y_train) 
y_pred = neigh.predict(X_test[num_cols])

print(f1_score(y_test, y_pred, average='micro'))
print(precision_score(y_test, y_pred, average="macro"))
print(recall_score(y_test, y_pred, average="macro"))
print(classification_report(y_test,y_pred))

0.8608893956670466
0.43044469783352335
0.5
              precision    recall  f1-score   support

          No       0.86      1.00      0.93       755
         Yes       0.00      0.00      0.00       122

    accuracy                           0.86       877
   macro avg       0.43      0.50      0.46       877
weighted avg       0.74      0.86      0.80       877



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [125]:
# With optimalizations 
# neigh = KNeighborsClassifier(n_neighbors=3)
neigh = LogisticRegression()
neigh.fit(X_train_proc, y_train) 
y_pred = neigh.predict(X_test_proc)

print(f1_score(y_test, y_pred, average='micro'))
print(precision_score(y_test, y_pred, average="macro"))
print(recall_score(y_test, y_pred, average="macro"))
print(classification_report(y_test,y_pred))

0.9019384264538198
0.8358204992033988
0.705954836608403
              precision    recall  f1-score   support

          No       0.91      0.98      0.94       755
         Yes       0.76      0.43      0.55       122

    accuracy                           0.90       877
   macro avg       0.84      0.71      0.75       877
weighted avg       0.89      0.90      0.89       877



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


# 5.2 Feature selection

In [126]:
from sklearn.feature_selection import RFE

In [127]:
feature_selection = RFE(LogisticRegression())
feature_selection = feature_selection.fit(X_train_proc, y_train) 
print(feature_selection.support_)
print(feature_selection.ranking_)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

[False  True False  True  True False False False False  True  True  True
 False  True False False False False False  True False False False False
 False False  True  True  True  True  True  True False False False  True
 False False False False False False False  True False  True False  True
 False  True False  True  True False False  True  True  True False  True
  True False  True  True  True  True False False  True False False  True
 False False False False False False False False False  True False  True
 False  True False False False False False False False  True False  True
 False  True  True  True  True  True  True  True  True  True  True False
 False False False False False False False  True  True False  True  True
  True  True False False False False  True  True False False False  True
  True False  True  True False False False  True  True  True  True False
  True False False False False  True  True  True False  True False False
  True False False  True  True  True False False  T

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [128]:
neigh = LogisticRegression()
temp= pd.DataFrame(X_train_proc)
X_train_proc_selected = temp[temp.columns[feature_selection.support_]].values
neigh.fit(X_train_proc_selected, y_train) 
temp= pd.DataFrame(X_test_proc)
X_test_proc_selected = temp[temp.columns[feature_selection.support_]].values
y_pred = neigh.predict(X_test_proc_selected)

print(f1_score(y_test, y_pred, average='micro'))
print(precision_score(y_test, y_pred, average="macro"))
print(recall_score(y_test, y_pred, average="macro"))
print(classification_report(y_test,y_pred))

0.8985176738882554
0.8256295738269479
0.6970958636412985
              precision    recall  f1-score   support

          No       0.91      0.98      0.94       755
         Yes       0.74      0.42      0.53       122

    accuracy                           0.90       877
   macro avg       0.83      0.70      0.74       877
weighted avg       0.89      0.90      0.89       877



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [117]:
X_train_proc

array([[ 0.        ,  2.00045994,  2.17956966, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.        ,  0.56668265,  1.68897665, ...,  1.        ,
         0.        ,  1.        ],
       [ 0.        , -0.75680407, -1.00928488, ...,  1.        ,
         0.        ,  1.        ],
       ...,
       [ 0.        ,  1.89016938, -0.88663663, ...,  0.        ,
         1.        ,  1.        ],
       [ 0.        , -0.53622295,  0.09454938, ...,  1.        ,
         0.        ,  1.        ],
       [ 0.        , -0.09506071,  1.44368015, ...,  1.        ,
         0.        ,  1.        ]])