In [38]:
import pandas as pd

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.preprocessing import PolynomialFeatures,  MinMaxScaler

# Polynomiálne črty 
Niekedy sa v naších dátach môžu nachádzať závislosti medzi stĺpcami, ktoré nemusia byť lineárne. V tomto prípade nám vedia pomôcť polynomiálne črty. Ide o techniku, kde vytvárame nové stĺpce prenásobením starých stĺcov medzi sebou aj s využitím exponentov. 

Polynomiálne črty pre n=1:
- 1 (bias)
- stĺpec na prvú

In [39]:
test = pd.DataFrame([[1,2], [1,2], [3,4]])
test

Unnamed: 0,0,1
0,1,2
1,1,2
2,3,4


In [40]:
pf = PolynomialFeatures(1)
pd.DataFrame(pf.fit_transform(test))

Unnamed: 0,0,1,2
0,1.0,1.0,2.0
1,1.0,1.0,2.0
2,1.0,3.0,4.0


Polynomiálne črty pre n=2
- 1 (bias)
- stĺpec na prvú
- stĺpec na druhú
- interakcia medzi pármi čŕt

Príklad pre stĺpce A a B 

[1, a, b, a^2, ab, b^2].

In [28]:
test

Unnamed: 0,0,1
0,1,2
1,1,2
2,3,4


In [29]:
pf = PolynomialFeatures(2)
pd.DataFrame(pf.fit_transform(test))

Unnamed: 0,0,1,2,3,4,5
0,1.0,1.0,2.0,1.0,2.0,4.0
1,1.0,1.0,2.0,1.0,2.0,4.0
2,1.0,3.0,4.0,9.0,12.0,16.0


In [30]:
pf = PolynomialFeatures(3)
pd.DataFrame(pf.fit_transform(test))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.0,1.0,2.0,1.0,2.0,4.0,1.0,2.0,4.0,8.0
1,1.0,1.0,2.0,1.0,2.0,4.0,1.0,2.0,4.0,8.0
2,1.0,3.0,4.0,9.0,12.0,16.0,27.0,36.0,48.0,64.0


# Vyhodnotenie predstracovania

In [45]:
df = pd.read_csv("processed_data/numerical.csv", index_col="EmployeeID")
y = df['Attrition_num']
X = df[df.columns.difference(['Attrition_num', 'Attrition'])]
X.columns

Index(['Age', 'DistanceFromHome', 'Education', 'JobLevel', 'MonthlyIncome',
       'NumCompaniesWorked', 'PercentSalaryHike', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'YearsAtCompany',
       'YearsSinceLastPromotion', 'YearsWithCurrManager'],
      dtype='object')

In [46]:
pf = PolynomialFeatures(3)
scaler = MinMaxScaler()

In [49]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

X_train_sc = scaler.fit_transform(X_train)
X_train_sc_pf = pf.fit_transform(X_train_sc)

X_test_sc = scaler.fit_transform(X_test)
X_test_sc_pf = pf.fit_transform(X_test_sc)

In [50]:
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X_train_sc_pf, y_train) 
y_pred = neigh.predict(X_test_sc_pf)

print(f1_score(y_test, y_pred, average='micro'))
print(precision_score(y_test, y_pred, average="macro"))
print(recall_score(y_test, y_pred, average="macro"))
print(classification_report(y_test,y_pred))

0.9258836944127709
0.8371227066482541
0.8710509173813918
              precision    recall  f1-score   support

           0       0.97      0.95      0.96       755
           1       0.71      0.80      0.75       122

    accuracy                           0.93       877
   macro avg       0.84      0.87      0.85       877
weighted avg       0.93      0.93      0.93       877

