In [90]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import ols
import scipy.stats as stats
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from boruta import BorutaPy
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.feature_selection import f_regression
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import cross_val_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
# Two variants of the prepared data are loaded (paths are relative to the working directory):
#   *_old -> presumably the set the original note called `df`: part of the feature
#            selection was done on the whole data set and part on the training split
#   *_new -> feature selection done on the training split only

df_old, test_old, fr_old = (
    pd.read_excel(workbook, index_col=0)
    for workbook in (
        '../data/input_processed/train.xlsx',
        '../data/input_processed/test.xlsx',
        '../data/input_processed/feature_ranking.xlsx',
    )
)

df_new, test_new, fr_new = (
    pd.read_excel(workbook, index_col=0)
    for workbook in (
        '../data/input_processed/train_1.xlsx',
        '../data/input_processed/test_1.xlsx',
        '../data/input_processed/feature_ranking_1.xlsx',
    )
)

In [None]:
def grid_search(df, x_col, model, param_grid, y_col, cv):
    """Tune ``model`` with an exhaustive grid search and report its fit on ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the feature columns and the target column.
    x_col : list
        Labels of the feature columns to train on.
    model : estimator
        Unfitted estimator used as the template for the search.
    param_grid : dict or list of dict
        Parameter grid handed to ``GridSearchCV``.
    y_col : str
        Label of the target column.
    cv : int or cross-validation splitter
        Forwarded to ``GridSearchCV`` as ``cv``.

    Returns
    -------
    tuple
        ``(best_model, accuracy)`` where ``best_model`` is the refitted best
        estimator and ``accuracy`` is its balanced accuracy measured on the
        same rows it was fitted on (an in-sample figure, not a validation score).
    """
    features = df.loc[:, x_col]
    target = df.loc[:, y_col]

    # A distinct local name keeps the function's own name usable inside its body.
    search = GridSearchCV(model, param_grid, scoring='balanced_accuracy', cv=cv)
    search.fit(features, target)
    best_model = search.best_estimator_

    # These predictions are made on the training rows, so the score is optimistic.
    y_pred = best_model.predict(features)

    accuracy = balanced_accuracy_score(target, y_pred)
    confusion = confusion_matrix(target, y_pred)

    print(f"On Data: {x_col}, and model {model}")
    print("Confusion matrix:")
    print(confusion)
    print("Accuracy:", accuracy)
    # The mean cross-validated score of the winning candidate is the figure to compare on.
    print("Best cross-validated balanced accuracy:", search.best_score_)
    print("Best model has parameters:")
    print(best_model)

    # Return the tuned estimator; the template passed in is never fitted by the search.
    return best_model, accuracy

def get_model_name(model):
    """Return the class name of ``model``, cut at the first ``(`` if one is present."""
    class_name = model.__class__.__name__
    head, _sep, _tail = class_name.partition("(")
    return head

def cross_validation(model, _X, _y, _cv=5):
    """Cross-validate ``model`` and summarise its validation balanced accuracy.

    Parameters
    ----------
    model : estimator
        Estimator passed to ``cross_validate``.
    _X : array-like
        Feature data.
    _y : array-like
        Target values.
    _cv : int or cross-validation splitter, default 5
        Forwarded to ``cross_validate`` as ``cv``.

    Returns
    -------
    tuple
        ``(mean, std)`` of the per-fold validation balanced accuracy, both
        multiplied by 100.
    """
    # Only validation scores are reported, so training scores are not requested.
    results = cross_validate(estimator=model,
                             X=_X,
                             y=_y,
                             cv=_cv,
                             scoring=['balanced_accuracy'])
    val_acc_scores = results['test_balanced_accuracy']
    val_acc_mean = np.mean(val_acc_scores) * 100
    val_acc_std = np.std(val_acc_scores) * 100

    return val_acc_mean, val_acc_std
           
def cross_validation_accuracy(models, df, x_cols, y_col, cv=5):
    """Cross-validate every model on every feature subset and tabulate the scores.

    Parameters
    ----------
    models : iterable of estimators
        Models to evaluate.
    df : pandas.DataFrame
        Frame holding the feature columns and the target column.
    x_cols : dict
        Maps a subset name to the list of feature column labels in that subset.
    y_col : str
        Label of the target column.
    cv : int or cross-validation splitter, default 5
        Forwarded to ``cross_validate`` as ``cv``.

    Returns
    -------
    pandas.DataFrame
        One row per (model, feature subset) with the columns 'Data Type',
        'Mean Validation Accuracy', 'Validation Accuracy Std. Dev.' and 'Model'.
        Both accuracy figures are multiplied by 100.
    """
    columns = ['Data Type', 'Mean Validation Accuracy', 'Validation Accuracy Std. Dev.', 'Model']
    records = []

    for model in models:
        model_name = get_model_name(model)

        for x_col_name, x_col in x_cols.items():
            # Only the validation scores are used, so training scores are not requested.
            scores = cross_validate(model, df.loc[:, x_col], df.loc[:, y_col],
                                    cv=cv, scoring='balanced_accuracy')
            fold_scores = scores['test_score']

            records.append({
                'Data Type': x_col_name,
                'Mean Validation Accuracy': np.mean(fold_scores) * 100,
                'Validation Accuracy Std. Dev.': np.std(fold_scores) * 100,
                'Model': model_name,
            })

    # DataFrame.append was removed in pandas 2.0; build the frame once from the records.
    return pd.DataFrame(records, columns=columns)

In [None]:
# The column subsets used later on are defined here, using the criteria computed
# at the end of 01_Data_preparation.
# Any other criterion can be added in 01 so that it is included in
# feature_ranking.xlsx; "FRE" (sic, possibly RFE) was mentioned there - see the
# feature selection lecture.

y_col = 'account_status'

boruta = fr_old.loc[fr_old['boruta_rank'].isin([1])].index.tolist()
mi_score = fr_old.loc[fr_old['mi_score'].gt(0.01)].index.tolist()
f_score = fr_old.loc[fr_old['sign_fscore_0_1'].eq(1)].index.tolist()
Importance = fr_old.loc[fr_old['Importance'].gt(0.01)].index.tolist()
Correlation = fr_old.loc[fr_old['Corr'].gt(0.1)].index.tolist()

# Subset name -> list of feature labels taken from the index of fr_old.
x_cols_old = dict(
    boruta=boruta,
    mi_score=mi_score,
    f_score=f_score,
    Importance=Importance,
    Correlation=Correlation,
)

In [None]:
# Definition of x_cols_new: the same selection rules, applied to fr_new.
# The module-level names boruta, mi_score, f_score, Importance and Correlation
# are rebound here, so from this cell on they refer to the fr_new selections.

y_col = 'account_status'

boruta = fr_new.loc[fr_new['boruta_rank'].isin([1])].index.tolist()
mi_score = fr_new.loc[fr_new['mi_score'].gt(0.01)].index.tolist()
f_score = fr_new.loc[fr_new['sign_fscore_0_1'].eq(1)].index.tolist()
Importance = fr_new.loc[fr_new['Importance'].gt(0.01)].index.tolist()
Correlation = fr_new.loc[fr_new['Corr'].gt(0.1)].index.tolist()

# Subset name -> list of feature labels taken from the index of fr_new.
x_cols_new = dict(
    boruta=boruta,
    mi_score=mi_score,
    f_score=f_score,
    Importance=Importance,
    Correlation=Correlation,
)

# TUTAJ TUNUJEMY HIPERPARAMETRY

Bagging zastosowany do:
- Decision tree (nazwa BaggingClassifier - nazwać odpowiednio do decision tree)

modele które mamy:
- SVC
- LogisticRegression
- Decision tree

modele do dorzucenia:
- KNN

Po wyestymowaniu modeli zrobić walidację krzyżową (CV).

### SVM

In [None]:
# Parameter space over which the best SVC configuration is searched.
# gamma is not used by the linear kernel, so that kernel gets its own sub-grid;
# a single flat grid would refit every linear candidate once per gamma value.
param_grid = [
    {'kernel': ['linear'], 'C': [0.1, 1, 10, 20]},
    {'kernel': ['rbf'], 'C': [0.1, 1, 10, 20], 'gamma': [0.01, 0.1, 1]},
]
results_df = pd.DataFrame(columns=['Data Type', 'Accuracy'])

In [None]:
# Grid search for the SVC on the "old" data set, one run per feature subset.
records = []
for x_col, feature_columns in x_cols_old.items():
    # grid_search returns a (model, score) pair; only the score belongs in the table.
    _model, accuracy = grid_search(df_old, feature_columns, SVC(), param_grid, y_col, 5)
    records.append({'Data Type': x_col, 'Accuracy': accuracy})

# DataFrame.append was removed in pandas 2.0; build the table once from the records.
results_df = pd.DataFrame(records, columns=['Data Type', 'Accuracy'])
print(results_df)

In [None]:
# Keep the SVC results for the "old" data under their own name, since results_df
# is rebound by the next grid-search cell; the last line displays the table.
results_df_SVM_old = results_df
results_df_SVM_old

In [None]:
# Grid search for the SVC on the "new" data set, one run per feature subset.
records = []
for x_col, feature_columns in x_cols_new.items():
    # grid_search returns a (model, score) pair; only the score belongs in the table.
    _model, accuracy = grid_search(df_new, feature_columns, SVC(), param_grid, y_col, 5)
    records.append({'Data Type': x_col, 'Accuracy': accuracy})

# DataFrame.append was removed in pandas 2.0; build the table once from the records.
results_df = pd.DataFrame(records, columns=['Data Type', 'Accuracy'])
print(results_df)

In [None]:
# Keep the SVC results for the "new" data under their own name, since results_df
# is rebound by later cells; the last line displays the table.
results_df_SVM_new = results_df
results_df_SVM_new

In [None]:
# Old and new SVC results side by side. The two tables are joined column-wise,
# relabelled, and the duplicated feature-subset column of the "new" table is left out.
df_compare_SVC = pd.concat([results_df_SVM_old, results_df_SVM_new], axis=1)
df_compare_SVC.columns = ['Data Type', 'Accuracy Old', 'Data Type New', 'Accuracy New']
df_compare_SVC = df_compare_SVC[
    [label for label in df_compare_SVC.columns if label != 'Data Type New']
]

In [None]:
# Display the old-vs-new SVC comparison table.
df_compare_SVC

### Decision Tree

In [None]:
# Parameter space searched for the decision tree.
param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=[None, 'sqrt', 'log2'],
)

In [None]:
# Grid search for the decision tree on the "old" data set, one run per feature subset.

model = DecisionTreeClassifier()

records = []
for x_col, feature_columns in x_cols_old.items():
    # grid_search returns a (model, score) pair; only the score belongs in the table.
    _model, accuracy = grid_search(df_old, feature_columns, model, param_grid, y_col, 5)
    records.append({'Data Type': x_col, 'Accuracy': accuracy})

# DataFrame.append was removed in pandas 2.0; build the table once from the records.
results_df = pd.DataFrame(records, columns=['Data Type', 'Accuracy'])

In [None]:
# Keep the decision-tree results for the "old" data under their own name, since
# results_df is rebound by the next grid-search cell; the last line displays the table.
results_DT_old = results_df
results_DT_old

In [None]:
# Grid search for the decision tree on the "new" data set, one run per feature subset.
model = DecisionTreeClassifier()

records = []
for x_col, feature_columns in x_cols_new.items():
    # grid_search returns a (model, score) pair; only the score belongs in the table.
    _model, accuracy = grid_search(df_new, feature_columns, model, param_grid, y_col, 5)
    records.append({'Data Type': x_col, 'Accuracy': accuracy})

# DataFrame.append was removed in pandas 2.0; build the table once from the records.
results_df = pd.DataFrame(records, columns=['Data Type', 'Accuracy'])

In [None]:
# Keep the decision-tree results for the "new" data under their own name, since
# results_df is rebound by later cells; the last line displays the table.
results_DT_new = results_df
results_DT_new

In [None]:
# Old and new decision-tree results side by side. The two tables are joined
# column-wise, relabelled, and the duplicated feature-subset column of the "new"
# table is left out.
df_compare_DT = pd.concat([results_DT_old, results_DT_new], axis=1)
df_compare_DT.columns = ['Data Type', 'Accuracy Old', 'Data Type New', 'Accuracy New']
df_compare_DT = df_compare_DT[
    [label for label in df_compare_DT.columns if label != 'Data Type New']
]

In [None]:
# Display the old-vs-new decision-tree comparison table.
df_compare_DT

### Logistic regression

In [None]:
# Parameter space searched for the logistic regression.
# Some solvers only support certain types of penalties.
param_grid = dict(
    C=[0.001, 0.01, 0.1, 1, 10, 100, 1000],
    penalty=['l1', 'l2'],
    solver=['liblinear', 'saga'],
)

In [None]:
# Grid search for the logistic regression on the "new" data set, one run per feature subset.

model = LogisticRegression()

records = []
for x_col, feature_columns in x_cols_new.items():
    # grid_search returns a (model, score) pair; only the score belongs in the table.
    _model, accuracy = grid_search(df_new, feature_columns, model, param_grid, y_col, 5)
    records.append({'Data Type': x_col, 'Accuracy': accuracy})

# DataFrame.append was removed in pandas 2.0; build the table once from the records.
results_df = pd.DataFrame(records, columns=['Data Type', 'Accuracy'])

In [None]:
# Keep the logistic-regression results for the "new" data under their own name;
# the last line displays the table.
results_LR_new = results_df
results_LR_new

# TU SPRAWDZAMY SKUTECZNOŚĆ NA CV

In [None]:
# The models are defined here; their settings were picked partly ad hoc and partly
# from the earlier grid-search results.
# It would be good to compare the models within each family, e.g. a table of mean
# accuracy and std. dev. for DecisionTreeClassifier for every data type and for
# different parameters (e.g. criterion = 'gini' or 'entropy'), and to compare the
# plain DecisionTreeClassifier with BaggingClassifier(DecisionTreeClassifier()),
# and likewise for each model family.
# Covered here: DecisionTree, SVC, LogisticRegression and RandomForest. KNN has not
# been evaluated yet and may be worth trying.

modelDT_1 = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=10,
    min_samples_leaf=4,
    min_samples_split=10,
)
modelDT_B1 = BaggingClassifier(DecisionTreeClassifier(), random_state=42)

model3 = SVC(kernel='rbf', C=20, gamma='scale')
model4 = LogisticRegression(penalty='l2', fit_intercept=False, solver='lbfgs')

model6 = RandomForestClassifier(random_state=42)

# Evaluation order: tree, SVC, logistic regression, bagged trees, random forest.
models = [modelDT_1, model3, model4, modelDT_B1, model6]


In [None]:
# `df` and `x_cols` are not defined anywhere in this notebook, so the original call
# failed on a fresh kernel.
# NOTE(review): the train-only feature-selection variant (df_new / x_cols_new) is
# assumed here - confirm this is the intended data set.
results = cross_validation_accuracy(models, df_new, x_cols_new, y_col)

In [None]:
# Best-performing (model, feature subset) combinations first.
results.sort_values('Mean Validation Accuracy', ascending=False)

In [None]:
# Tune an SVC on the Boruta-selected features and score the winner on the test set.
# gamma is not used by the linear kernel, so that kernel gets its own sub-grid;
# a single flat grid would refit every linear candidate once per gamma value.
param_grid = [
    {'kernel': ['linear'], 'C': [0.1, 1, 10, 20]},
    {'kernel': ['rbf'], 'C': [0.1, 1, 10, 20], 'gamma': [0.01, 0.1, 1]},
]

svm = SVC()

# The search object gets its own name: binding it to `grid_search` would replace
# the helper function of that name and break the earlier cells on a re-run.
svm_search = GridSearchCV(svm, param_grid, scoring='balanced_accuracy', cv=5)

# `df` and `test` are not defined anywhere in this notebook.
# NOTE(review): df_new / test_new are assumed because `boruta` was last bound from
# fr_new - confirm this is the intended data set.
svm_search.fit(df_new.loc[:, boruta], df_new.loc[:, y_col])
best_model = svm_search.best_estimator_
y_pred = best_model.predict(test_new.loc[:, boruta])
accuracy = balanced_accuracy_score(test_new.loc[:, y_col], y_pred)
confusion = confusion_matrix(test_new.loc[:, y_col], y_pred)
print(confusion)
accuracy


## Bagging

### DecisionTree

In [None]:
# Bagged decision trees, scored with repeated stratified 10-fold cross-validation
# on the Boruta-selected features.
# `df` is not defined anywhere in this notebook.
# NOTE(review): df_new is assumed because `boruta` was last bound from fr_new -
# confirm this is the intended data set.
model = BaggingClassifier(DecisionTreeClassifier(), random_state=42)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

n_scores = cross_val_score(model, df_new.loc[:, boruta], df_new.loc[:, y_col],
                           scoring='balanced_accuracy', cv=cv, n_jobs=-1, error_score='raise')
print('Balanced Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))


In [None]:
# Fit the bagged trees on the training data and predict the held-out test set.
# `df` and `test` are not defined anywhere in this notebook.
# NOTE(review): df_new / test_new are assumed because `boruta` was last bound from
# fr_new - confirm this is the intended data set.
model.fit(df_new.loc[:, boruta], df_new.loc[:, y_col])
predictions = model.predict(test_new.loc[:, boruta])

In [None]:
# Test-set balanced accuracy and confusion matrix for the bagged trees.
# `test` is not defined anywhere in this notebook.
# NOTE(review): test_new is assumed - confirm this matches the data `predictions`
# was computed on.
y_test = test_new.loc[:, y_col]
balanced_acc = balanced_accuracy_score(y_test, predictions)
confusion_mat = confusion_matrix(y_test, predictions)
print('Balanced Accuracy: %.3f' % balanced_acc)
print(confusion_mat)

In [None]:
# Put the predictions next to the true labels and show the misclassified rows.
# `test` is not defined anywhere in this notebook.
# NOTE(review): test_new is assumed - confirm this matches the data `predictions`
# was computed on.
predictions_series = pd.Series(predictions, name='Prediction')

# reset_index(inplace=True) on a temporary `.loc[...]` result changes nothing in the
# source frame, so the labels would keep the test frame's own index and concat would
# pair them with the wrong predictions. Take an explicitly re-indexed copy instead.
y_true = test_new.loc[:, y_col].reset_index(drop=True)

results_df = pd.concat([predictions_series, y_true], axis=1)
results_df[results_df['Prediction'] != results_df[y_col]]