In [None]:
# Library imports
import pandas as pd
import numpy as np
from datetime import datetime

import matplotlib.pylab as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
import xgboost as xgb

# Load the semicolon-separated CSV files kept in the data folder
uczestnicy = pd.read_csv('data/PPK_Uczestnicy.csv', sep=';')
pracodawcy = pd.read_csv('data/PPK_Pracodawcy.csv', sep=';')

# Left-join the second file onto the first: EMPL_ID on the left matches ID on the right
df = uczestnicy.merge(pracodawcy, how='left', left_on='EMPL_ID', right_on='ID')


In [None]:
#Age to int

def to_int(age):
    """Return the whole-number part of an age value.

    Parameters
    ----------
    age : str or number
        Either a string using a decimal comma (e.g. ``"34,5"``), a plain
        integer string (e.g. ``"34"``), or an already numeric value.

    Returns
    -------
    int
        The part of the value before the decimal comma, as an integer.

    Raises
    ------
    ValueError
        If the text before the comma is not a valid integer.
    """
    if isinstance(age, str):
        # partition() returns the whole string when no comma is present.
        # Slicing up to str.find(",") would use -1 in that case and silently
        # cut off the last digit ("34" -> 3).
        whole_part = age.partition(",")[0]
        return int(whole_part)
    # Already numeric: truncate towards zero like int() does.
    return int(age)

# Convert every AGE entry to an integer, element by element
df["AGE"] = df["AGE"].map(to_int)

In [None]:
# Drop the columns that are not needed
df.drop(
    columns=[
        'MEMBER_ID',
        'EMPL_ID',
        'WORK_START',
        'WORK_STOP',
        'LOGICAL_FACTOR_1',
        'LOGICAL_FACTOR_2',
        'ID',
        'PKD_CODE',
        'PPK_BANK',
        'NUMERICAL_VALUE',
    ],
    inplace=True,
)

In [None]:
# Group rare nationalities into one category, 0.
# A nationality counts as rare when fewer than MIN_NATIONALITY_ROWS rows carry it.
MIN_NATIONALITY_ROWS = 5000

# Count every nationality once and broadcast the count back onto the rows,
# instead of rescanning the whole column for each distinct value.
nationality_counts = df['NATIONALITY'].map(df['NATIONALITY'].value_counts())

# Rows with a missing nationality get a NaN count, which compares False here,
# so they are left untouched.
df.loc[nationality_counts < MIN_NATIONALITY_ROWS, 'NATIONALITY'] = 0

In [None]:
# Turn the categorical columns into dummy variables: each distinct value of a
# column becomes its own new column (compare the previous and the next cell).
categorical_columns = ['SEX', 'COMPANY_SIZE', 'COMPANY_TYPE', 'VOIVODESHIP', 'NATIONALITY', 'PPK_STAGE']
df = pd.get_dummies(df, columns=categorical_columns, prefix=categorical_columns)

In [None]:
# 
def _parse_ymd(text):
    """Parse a 'YYYY-MM-DD' string into a datetime at midnight."""
    year, month, day = map(int, text.split('-'))
    return datetime(year, month, day)


def find_period(data):
    """Return the number of calendar days between a start and a stop date.

    Parameters
    ----------
    data : tuple
        ``(start, stop)``. ``start`` is a ``'YYYY-MM-DD'`` string. ``stop`` is
        either a ``'YYYY-MM-DD'`` string or a non-string placeholder for a
        missing date (e.g. the float NaN pandas uses for empty cells), in
        which case today's date is used.

    Returns
    -------
    int
        ``stop - start`` in days. Negative when ``stop`` precedes ``start``.

    Raises
    ------
    ValueError
        If a date string does not describe a valid calendar date.
    """
    start, stop = data
    start_date = _parse_ymd(start)

    if isinstance(stop, str):
        stop_date = _parse_ymd(stop)
    else:
        # Missing stop date: measure up to today, truncated to midnight so the
        # result is a whole number of days.
        stop_date = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)

    # Real date arithmetic keeps the sign of the month/day differences and
    # accounts for actual month lengths and leap years. Adding absolute
    # month/day differences overstates periods that cross a year or month
    # boundary (e.g. November -> February).
    return (stop_date - start_date).days


# Days between CREATED_AT and RESIGN_DATE for every row, computed by find_period
df['DURATION'] = [
    find_period(date_pair)
    for date_pair in zip(df['CREATED_AT'], df['RESIGN_DATE'])
]

# True for rows that have a resignation date
df['RESIGNED'] = df['RESIGN_DATE'].notna()

df.drop(columns=['SIGN_DATE', 'RESIGN_DATE'], inplace=True)
# Not sure what to do with these yet: they can be thrown away, or something smarter can be done with them
df.drop(
    columns=["CREATED_AT", "UOZ_START_DATE", "UOP_SIGN_DATE", 'REGION_CODE'],
    inplace=True,
)

### Drop a few columns that do not fit for now, then do the TEST/TRAIN split

In [None]:
# This column is outright 100% correlated with the target
df.drop("RESIGNED", axis="columns", inplace=True)

# This one is suspiciously correlated with the target as well
# NOTE(review): DURATION is computed from RESIGN_DATE, so it may leak the same
# information as RESIGNED — confirm before keeping it as a feature.
# df.drop(columns=["DURATION"], inplace = True)

In [None]:
# Display the columns that remain in df at this point
df.columns


In [None]:
# Positive class = SUSPENDED
TARGET_COLUMN = "IS_SUSPENDED"

# Select the features by name rather than by position: slicing df.columns[1:]
# is only correct while the target happens to be the first column, otherwise
# the target itself ends up among the features. The 'proba' column that a later
# cell adds to df is excluded too, so re-running this cell does not feed earlier
# predictions back into the model.
# NOTE(review): if the first column of df is not the target, it is now used as a
# feature — confirm that is intended.
features = df.drop(columns=[TARGET_COLUMN, "proba"], errors="ignore")
target = df[TARGET_COLUMN]

X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    stratify=target,
    test_size=0.10,
    random_state=56,
)

# Build and fit the model
xgb_model = xgb.XGBClassifier(
    max_depth=5,
    learning_rate=0.08,
    objective='binary:logistic',
    n_jobs=-1,
).fit(X_train, y_train)

print('Accuracy of XGB classifier on training set: {:.2f}'
       .format(xgb_model.score(X_train, y_train)))
print('Accuracy of XGB classifier on test set: {:.2f}'
       .format(xgb_model.score(X_test[X_train.columns], y_test)))

In [None]:
# Predicted labels for the held-out test split
y_pred = xgb_model.predict(X_test)

In [None]:
# Text summary comparing the true test labels with the predictions
print(classification_report(y_test, y_pred))

In [None]:
from xgboost import plot_importance

# Draw the fitted model's feature importances on a 10x8 inch figure
fig, ax = plt.subplots(figsize=(10, 8))
plot_importance(booster=xgb_model, ax=ax)

In [None]:
# Store the second predict_proba column (index 1) for every row of df,
# then preview it next to AGE
df['proba'] = xgb_model.predict_proba(df.loc[:, X_train.columns])[:, 1]
df.loc[:, ['AGE', 'proba']].head(n=50)

In [None]:
# Test data
testowe = pd.read_csv('data/PPK_Uczestnicy_TEST.csv',sep=';')

In [None]:
# Display the columns present in the test file
testowe.columns

In [None]:
# How often each distinct RESIGN_DATE value occurs in the test data
testowe["RESIGN_DATE"].value_counts()

In [None]:
# Number of test rows with a missing RESIGN_DATE
testowe["RESIGN_DATE"].isna().sum()