In [10]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import ols
import scipy.stats as stats
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from boruta import BorutaPy
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.feature_selection import f_regression
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import cross_val_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

In [4]:
#os.chdir("C:/Users/micha/OneDrive/Dokumenty/GitHub/Machine_Learning/ML_classification")

# df -> data with part of the feature selection on whole df and part on train
# df_new -> data with feature selection on train df

df_old = pd.read_excel('../data/input_processed/train.xlsx', index_col=0)
test_old = pd.read_excel('../data/input_processed/test.xlsx', index_col=0)
fr_old = pd.read_excel('../data/input_processed/feature_ranking.xlsx', index_col=0)

df_new = pd.read_excel('../data/input_processed/train_1.xlsx', index_col=0)
test_new = pd.read_excel('../data/input_processed/test_1.xlsx', index_col=0)
fr_new = pd.read_excel('../data/input_processed/feature_ranking_1.xlsx', index_col=0)

In [50]:
def grid_search(df, x_col, model, param_grid, y_col, cv):

    grid_search = GridSearchCV(model, param_grid, scoring='balanced_accuracy', cv=cv)

    grid_search.fit(df.loc[:, x_col], df.loc[:, y_col])
    best_model = grid_search.best_estimator_

    y_pred = best_model.predict(df.loc[:, x_col])

    accuracy = balanced_accuracy_score(df.loc[:, y_col], y_pred)
    confusion = confusion_matrix(df.loc[:, y_col], y_pred)

    print(f"On Data: {x_col}, and model {model}")
    print("Confusion matrix:")
    print(confusion)
    print("Accuracy:", accuracy)
    print("Best model has parameters:")
    print(best_model)

    return model, accuracy

def get_model_name(model):
    return model.__class__.__name__.split("(")[0]


def cross_validation(model, _X, _y, _cv=5):
      _scoring = ['balanced_accuracy']
      results = cross_validate(estimator=model,
                               X=_X,
                               y=_y,
                               cv=_cv,
                               scoring=_scoring,
                               return_train_score=True)
      val_acc_scores = results['test_balanced_accuracy']
      val_acc_mean = np.mean(val_acc_scores) * 100
      val_acc_std = np.std(val_acc_scores) * 100

      return val_acc_mean, val_acc_std
      #return {"Training Accuracy scores": results['train_balanced_accuracy'],
       #       "Mean Training Accuracy": results['train_balanced_accuracy'].mean()*100,
        #      
         #     "Validation Accuracy scores": results['test_balanced_accuracy'],
          #    "Mean Validation Accuracy": results['test_balanced_accuracy'].mean()*100,
           #   }
           
def cross_validation_accuracy(models, df, x_cols, y_col, cv=5):
    results_df = pd.DataFrame(columns=['Model', 'Data Type', 'Mean Validation Accuracy', 'Validation Accuracy Std. Dev.'])
    
    for model_name, model in models.items():
        for x_col_name, x_col in x_cols.items():
            scores = cross_validate(model, df.loc[:, x_col], df.loc[:, y_col], cv=cv, scoring='balanced_accuracy', return_train_score=True)
            mean_accuracy = np.mean(scores['test_score']) * 100
            std_accuracy = np.std(scores['test_score']) * 100
            
            results_df = results_df.append({'Model': model_name, 'Data Type': x_col_name, 'Mean Validation Accuracy': mean_accuracy, 'Validation Accuracy Std. Dev.': std_accuracy, 'Model': model_name}, ignore_index=True)
    
    return results_df

In [6]:
# tu są definiowane których kolumn będziemy później używać za pomocą kryteriów wyliczonych na koniec 01_Data_preparation
# Jeśli masz w głowie jakieś inne to możesz dodać w 01, żeby były uwzględnione w feature_ranking.xlsx, oni tam podawali 
# FRE np. możesz spojrzeć w feature selection na wykład. 

boruta = fr_old[fr_old['boruta_rank'].isin([1])].index.tolist()
mi_score = fr_old[fr_old['mi_score'] > 0.01].index.tolist()
f_score = fr_old[fr_old['sign_fscore_0_1'] == 1].index.tolist()
Importance = fr_old[fr_old['Importance'] > 0.01].index.tolist()
Correlation = fr_old[fr_old['Corr'] > 0.1].index.tolist()
y_col = 'account_status'

x_cols_old = {
    'boruta': boruta,
    'mi_score': mi_score,
    'f_score': f_score,
    'Importance': Importance,
    'Correlation':Correlation
}

In [7]:
# zdefioniowanie x_cols_new

boruta = fr_new[fr_new['boruta_rank'].isin([1])].index.tolist()
mi_score = fr_new[fr_new['mi_score'] > 0.01].index.tolist()
f_score = fr_new[fr_new['sign_fscore_0_1'] == 1].index.tolist()
Importance = fr_new[fr_new['Importance'] > 0.01].index.tolist()
Correlation = fr_new[fr_new['Corr'] > 0.1].index.tolist()
y_col = 'account_status'

x_cols_new = {
    'boruta': boruta,
    'mi_score': mi_score,
    'f_score': f_score,
    'Importance': Importance,
    'Correlation':Correlation
}

# TUTAJ TUNUJEMY HYPERPARAMETRY

### SVM

In [13]:
param_grid = {
    'kernel': ['linear', 'rbf'],
    'C': [0.1, 1, 10,20],
    'gamma': [0.01, 0.1, 1]
}
results_df = pd.DataFrame(columns=['Data Type', 'Accuracy'])

# Tu definiujesz zbiór parametrów na przestrzeni której będzies szukać najlepszego doboru parametrów.

In [None]:
# grid search SVM for old df

for x_col in x_cols_old:
    accuracy = grid_search(df_old, x_cols_old[x_col], SVC(), param_grid, y_col, 5)
    results_df = results_df.append({'Data Type': x_col, 'Accuracy': accuracy}, ignore_index=True)
print(results_df)

In [None]:
# save results to other df
results_df_SVM_old = results_df
results_df_SVM_old

In [14]:
# grid search SVM for new df 
results_df = pd.DataFrame(columns=['Data Type', 'Accuracy'])

for x_col in x_cols_new:
    accuracy = grid_search(df_new, x_cols_new[x_col], SVC(), param_grid, y_col, 5)
    results_df = results_df.append({'Data Type': x_col, 'Accuracy': accuracy}, ignore_index=True)
print(results_df)

On Data: ['customer_age', 'customer_available_credit_limit', 'total_products', 'period_inactive', 'contacts_in_last_year', 'credit_card_debt_balance', 'remaining_credit_limit', 'transaction_amount_ratio', 'total_transaction_amount', 'total_transaction_count', 'transaction_count_ratio', 'average_utilization'], and model SVC()
Confusion matrix:
[[ 743  347]
 [ 145 5550]]
Accuracy: 0.8280952227529379
Best model has parameters:
SVC(C=20, gamma=1)


  results_df = results_df.append({'Data Type': x_col, 'Accuracy': accuracy}, ignore_index=True)


On Data: ['period_inactive', 'contacts_in_last_year', 'credit_card_debt_balance', 'remaining_credit_limit', 'transaction_amount_ratio', 'total_transaction_amount', 'total_transaction_count', 'transaction_count_ratio', 'average_utilization'], and model SVC()
Confusion matrix:
[[ 630  460]
 [ 165 5530]]
Accuracy: 0.7745044341165194
Best model has parameters:
SVC(C=20, gamma=1)


  results_df = results_df.append({'Data Type': x_col, 'Accuracy': accuracy}, ignore_index=True)


On Data: ['customer_sex', 'customer_education', 'total_products', 'period_inactive', 'contacts_in_last_year', 'credit_card_debt_balance', 'transaction_amount_ratio', 'total_transaction_amount', 'total_transaction_count', 'transaction_count_ratio', 'average_utilization', 'customer_single'], and model SVC()
Confusion matrix:
[[ 748  342]
 [ 136 5559]]
Accuracy: 0.8311789675475831
Best model has parameters:
SVC(C=20, gamma=1)


  results_df = results_df.append({'Data Type': x_col, 'Accuracy': accuracy}, ignore_index=True)


On Data: ['period_inactive', 'contacts_in_last_year', 'credit_card_debt_balance', 'remaining_credit_limit', 'transaction_amount_ratio', 'total_transaction_amount', 'total_transaction_count', 'transaction_count_ratio', 'average_utilization'], and model SVC()
Confusion matrix:
[[ 630  460]
 [ 165 5530]]
Accuracy: 0.7745044341165194
Best model has parameters:
SVC(C=20, gamma=1)


  results_df = results_df.append({'Data Type': x_col, 'Accuracy': accuracy}, ignore_index=True)


On Data: ['total_products', 'period_inactive', 'contacts_in_last_year', 'credit_card_debt_balance', 'transaction_amount_ratio', 'total_transaction_amount', 'total_transaction_count', 'transaction_count_ratio', 'average_utilization'], and model SVC()
Confusion matrix:
[[ 682  408]
 [ 155 5540]]
Accuracy: 0.7992356082512424
Best model has parameters:
SVC(C=20, gamma=1)
     Data Type                     Accuracy
0       boruta  (SVC(), 0.8280952227529379)
1     mi_score  (SVC(), 0.7745044341165194)
2      f_score  (SVC(), 0.8311789675475831)
3   Importance  (SVC(), 0.7745044341165194)
4  Correlation  (SVC(), 0.7992356082512424)


  results_df = results_df.append({'Data Type': x_col, 'Accuracy': accuracy}, ignore_index=True)


In [15]:
# save results to other df
results_df_SVM_new = results_df
results_df_SVM_new

Unnamed: 0,Data Type,Accuracy
0,boruta,"(SVC(), 0.8280952227529379)"
1,mi_score,"(SVC(), 0.7745044341165194)"
2,f_score,"(SVC(), 0.8311789675475831)"
3,Importance,"(SVC(), 0.7745044341165194)"
4,Correlation,"(SVC(), 0.7992356082512424)"


In [None]:
df_compare_SVC = pd.concat([results_df_SVM_old, results_df_SVM_new], axis=1)
df_compare_SVC.columns = ['Data Type', 'Accuracy Old', 'Data Type New', 'Accuracy New']
df_compare_SVC = df_compare_SVC.drop(columns=['Data Type New'])

In [None]:
df_compare_SVC

### Decision Tree

In [22]:
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2']
}

In [23]:
# grid search DT for old df

model = DecisionTreeClassifier()
results_df = pd.DataFrame(columns=['Data Type', 'Accuracy'])

for x_col in x_cols_old:
    accuracy = grid_search(df_old, x_cols_old[x_col], model, param_grid, y_col, 5)
    results_df = results_df.append({'Data Type': x_col, 'Accuracy': accuracy}, ignore_index=True)

On Data: ['customer_age', 'customer_available_credit_limit', 'total_products', 'period_inactive', 'contacts_in_last_year', 'credit_card_debt_balance', 'remaining_credit_limit', 'transaction_amount_ratio', 'total_transaction_amount', 'total_transaction_count', 'transaction_count_ratio', 'average_utilization'], and model DecisionTreeClassifier()
Confusion matrix:
[[1084   55]
 [  41 5908]]
Accuracy: 0.9724100567436615
Best model has parameters:
DecisionTreeClassifier(max_depth=10)


  results_df = results_df.append({'Data Type': x_col, 'Accuracy': accuracy}, ignore_index=True)


On Data: ['customer_number_of_dependents', 'total_products', 'period_inactive', 'contacts_in_last_year', 'credit_card_debt_balance', 'remaining_credit_limit', 'transaction_amount_ratio', 'total_transaction_amount', 'total_transaction_count', 'transaction_count_ratio', 'average_utilization'], and model DecisionTreeClassifier()
Confusion matrix:
[[1072   67]
 [  97 5852]]
Accuracy: 0.9624356045998833
Best model has parameters:
DecisionTreeClassifier(criterion='entropy', max_depth=10, min_samples_leaf=2,
                       min_samples_split=5)


  results_df = results_df.append({'Data Type': x_col, 'Accuracy': accuracy}, ignore_index=True)


On Data: ['customer_age', 'customer_sex', 'customer_number_of_dependents', 'customer_education', 'customer_salary_range', 'customer_available_credit_limit', 'total_products', 'period_inactive', 'contacts_in_last_year', 'credit_card_debt_balance', 'transaction_amount_ratio', 'total_transaction_amount', 'total_transaction_count', 'transaction_count_ratio', 'average_utilization', 'customer_married', 'customer_single'], and model DecisionTreeClassifier()
Confusion matrix:
[[1090   49]
 [  73 5876]]
Accuracy: 0.9723544184686015
Best model has parameters:
DecisionTreeClassifier(max_depth=10, min_samples_leaf=2)


  results_df = results_df.append({'Data Type': x_col, 'Accuracy': accuracy}, ignore_index=True)


On Data: ['period_inactive', 'contacts_in_last_year', 'credit_card_debt_balance', 'remaining_credit_limit', 'transaction_amount_ratio', 'total_transaction_amount', 'total_transaction_count', 'transaction_count_ratio', 'average_utilization'], and model DecisionTreeClassifier()
Confusion matrix:
[[1053   86]
 [ 116 5833]]
Accuracy: 0.9524980478639699
Best model has parameters:
DecisionTreeClassifier(criterion='entropy', max_depth=10, min_samples_leaf=2)


  results_df = results_df.append({'Data Type': x_col, 'Accuracy': accuracy}, ignore_index=True)


On Data: ['total_products', 'period_inactive', 'contacts_in_last_year', 'credit_card_debt_balance', 'transaction_amount_ratio', 'total_transaction_amount', 'total_transaction_count', 'transaction_count_ratio', 'average_utilization'], and model DecisionTreeClassifier()
Confusion matrix:
[[1060   79]
 [ 135 5814]]
Accuracy: 0.9539740117601898
Best model has parameters:
DecisionTreeClassifier(criterion='entropy', max_depth=10, min_samples_leaf=4)


  results_df = results_df.append({'Data Type': x_col, 'Accuracy': accuracy}, ignore_index=True)


In [24]:
results_DT_old = results_df
results_DT_old

Unnamed: 0,Data Type,Accuracy
0,boruta,"(DecisionTreeClassifier(), 0.9724100567436615)"
1,mi_score,"(DecisionTreeClassifier(), 0.9624356045998833)"
2,f_score,"(DecisionTreeClassifier(), 0.9723544184686015)"
3,Importance,"(DecisionTreeClassifier(), 0.9524980478639699)"
4,Correlation,"(DecisionTreeClassifier(), 0.9539740117601898)"


In [25]:
# grid search DT for new df
model = DecisionTreeClassifier()

results_df = pd.DataFrame(columns=['Data Type', 'Accuracy'])

for x_col in x_cols_new:
    accuracy = grid_search(df_new, x_cols_new[x_col], model, param_grid, y_col, 5)
    results_df = results_df.append({'Data Type': x_col, 'Accuracy': accuracy}, ignore_index=True)

On Data: ['customer_age', 'customer_available_credit_limit', 'total_products', 'period_inactive', 'contacts_in_last_year', 'credit_card_debt_balance', 'remaining_credit_limit', 'transaction_amount_ratio', 'total_transaction_amount', 'total_transaction_count', 'transaction_count_ratio', 'average_utilization'], and model DecisionTreeClassifier()
Confusion matrix:
[[1011   79]
 [  82 5613]]
Accuracy: 0.9565621702604087
Best model has parameters:
DecisionTreeClassifier(max_depth=10, min_samples_leaf=2, min_samples_split=10)


  results_df = results_df.append({'Data Type': x_col, 'Accuracy': accuracy}, ignore_index=True)


On Data: ['period_inactive', 'contacts_in_last_year', 'credit_card_debt_balance', 'remaining_credit_limit', 'transaction_amount_ratio', 'total_transaction_amount', 'total_transaction_count', 'transaction_count_ratio', 'average_utilization'], and model DecisionTreeClassifier()
Confusion matrix:
[[1006   84]
 [ 113 5582]]
Accuracy: 0.9515469065895563
Best model has parameters:
DecisionTreeClassifier(criterion='entropy', max_depth=10, min_samples_leaf=2,
                       min_samples_split=5)


  results_df = results_df.append({'Data Type': x_col, 'Accuracy': accuracy}, ignore_index=True)


On Data: ['customer_sex', 'customer_education', 'total_products', 'period_inactive', 'contacts_in_last_year', 'credit_card_debt_balance', 'transaction_amount_ratio', 'total_transaction_amount', 'total_transaction_count', 'transaction_count_ratio', 'average_utilization', 'customer_single'], and model DecisionTreeClassifier()
Confusion matrix:
[[ 999   91]
 [ 122 5573]]
Accuracy: 0.9475457306022504
Best model has parameters:
DecisionTreeClassifier(criterion='entropy', max_depth=10, min_samples_leaf=4,
                       min_samples_split=5)


  results_df = results_df.append({'Data Type': x_col, 'Accuracy': accuracy}, ignore_index=True)


On Data: ['period_inactive', 'contacts_in_last_year', 'credit_card_debt_balance', 'remaining_credit_limit', 'transaction_amount_ratio', 'total_transaction_amount', 'total_transaction_count', 'transaction_count_ratio', 'average_utilization'], and model DecisionTreeClassifier()
Confusion matrix:
[[1003   87]
 [ 111 5584]]
Accuracy: 0.9503463524256752
Best model has parameters:
DecisionTreeClassifier(criterion='entropy', max_depth=10, min_samples_leaf=2,
                       min_samples_split=5)


  results_df = results_df.append({'Data Type': x_col, 'Accuracy': accuracy}, ignore_index=True)


On Data: ['total_products', 'period_inactive', 'contacts_in_last_year', 'credit_card_debt_balance', 'transaction_amount_ratio', 'total_transaction_amount', 'total_transaction_count', 'transaction_count_ratio', 'average_utilization'], and model DecisionTreeClassifier()
Confusion matrix:
[[1014   76]
 [ 144 5551]]
Accuracy: 0.9524949456709975
Best model has parameters:
DecisionTreeClassifier(criterion='entropy', max_depth=10, min_samples_leaf=4)


  results_df = results_df.append({'Data Type': x_col, 'Accuracy': accuracy}, ignore_index=True)


In [26]:
results_DT_new = results_df
results_DT_new

Unnamed: 0,Data Type,Accuracy
0,boruta,"(DecisionTreeClassifier(), 0.9565621702604087)"
1,mi_score,"(DecisionTreeClassifier(), 0.9515469065895563)"
2,f_score,"(DecisionTreeClassifier(), 0.9475457306022504)"
3,Importance,"(DecisionTreeClassifier(), 0.9503463524256752)"
4,Correlation,"(DecisionTreeClassifier(), 0.9524949456709975)"


In [27]:
results_DT_new.sort_values(by=['Accuracy'], ascending=False)

Unnamed: 0,Data Type,Accuracy
0,boruta,"(DecisionTreeClassifier(), 0.9565621702604087)"
4,Correlation,"(DecisionTreeClassifier(), 0.9524949456709975)"
1,mi_score,"(DecisionTreeClassifier(), 0.9515469065895563)"
3,Importance,"(DecisionTreeClassifier(), 0.9503463524256752)"
2,f_score,"(DecisionTreeClassifier(), 0.9475457306022504)"


In [None]:
df_compare_DT = pd.concat([results_DT_old, results_DT_new], axis=1)
df_compare_DT.columns = ['Data Type', 'Accuracy Old', 'Data Type New', 'Accuracy New']
df_compare_DT = df_compare_DT.drop(columns=['Data Type New'])

In [None]:
df_compare_DT

### Logistic regression

In [13]:
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga']  # Some solvers only support certain types of penalties
}

In [14]:
# grid search for logistic regression for new df

model = LogisticRegression()
results_df = pd.DataFrame(columns=['Data Type', 'Accuracy'])

for x_col in x_cols_new:
    accuracy = grid_search(df_new, x_cols_new[x_col], model, param_grid, y_col, 5)
    results_df = results_df.append({'Data Type': x_col, 'Accuracy': accuracy}, ignore_index=True)

On Data: ['customer_age', 'customer_available_credit_limit', 'total_products', 'period_inactive', 'contacts_in_last_year', 'credit_card_debt_balance', 'remaining_credit_limit', 'transaction_amount_ratio', 'total_transaction_amount', 'total_transaction_count', 'transaction_count_ratio', 'average_utilization'], and model LogisticRegression()
Confusion matrix:
[[ 502  588]
 [ 186 5509]]
Accuracy: 0.7139451152225919
Best model has parameters:
LogisticRegression(C=10, penalty='l1', solver='liblinear')


  results_df = results_df.append({'Data Type': x_col, 'Accuracy': accuracy}, ignore_index=True)


On Data: ['period_inactive', 'contacts_in_last_year', 'credit_card_debt_balance', 'remaining_credit_limit', 'transaction_amount_ratio', 'total_transaction_amount', 'total_transaction_count', 'transaction_count_ratio', 'average_utilization'], and model LogisticRegression()
Confusion matrix:
[[ 468  622]
 [ 192 5503]]
Accuracy: 0.6978220070720332
Best model has parameters:
LogisticRegression(C=10, penalty='l1', solver='saga')


  results_df = results_df.append({'Data Type': x_col, 'Accuracy': accuracy}, ignore_index=True)


On Data: ['customer_sex', 'customer_education', 'total_products', 'period_inactive', 'contacts_in_last_year', 'credit_card_debt_balance', 'transaction_amount_ratio', 'total_transaction_amount', 'total_transaction_count', 'transaction_count_ratio', 'average_utilization', 'customer_single'], and model LogisticRegression()
Confusion matrix:
[[ 519  571]
 [ 185 5510]]
Accuracy: 0.7218310766727614
Best model has parameters:
LogisticRegression(C=100, penalty='l1', solver='saga')


  results_df = results_df.append({'Data Type': x_col, 'Accuracy': accuracy}, ignore_index=True)


On Data: ['period_inactive', 'contacts_in_last_year', 'credit_card_debt_balance', 'remaining_credit_limit', 'transaction_amount_ratio', 'total_transaction_amount', 'total_transaction_count', 'transaction_count_ratio', 'average_utilization'], and model LogisticRegression()
Confusion matrix:
[[ 468  622]
 [ 192 5503]]
Accuracy: 0.6978220070720332
Best model has parameters:
LogisticRegression(C=10, penalty='l1', solver='saga')


  results_df = results_df.append({'Data Type': x_col, 'Accuracy': accuracy}, ignore_index=True)


On Data: ['total_products', 'period_inactive', 'contacts_in_last_year', 'credit_card_debt_balance', 'transaction_amount_ratio', 'total_transaction_amount', 'total_transaction_count', 'transaction_count_ratio', 'average_utilization'], and model LogisticRegression()
Confusion matrix:
[[ 508  582]
 [ 190 5505]]
Accuracy: 0.716346223550354
Best model has parameters:
LogisticRegression(C=100, penalty='l1', solver='liblinear')


  results_df = results_df.append({'Data Type': x_col, 'Accuracy': accuracy}, ignore_index=True)


In [15]:
results_LR_new = results_df
results_LR_new

Unnamed: 0,Data Type,Accuracy
0,boruta,"(LogisticRegression(), 0.7139451152225919)"
1,mi_score,"(LogisticRegression(), 0.6978220070720332)"
2,f_score,"(LogisticRegression(), 0.7218310766727614)"
3,Importance,"(LogisticRegression(), 0.6978220070720332)"
4,Correlation,"(LogisticRegression(), 0.716346223550354)"


### KNN

In [9]:
param_grid = {
    'n_neighbors': [3, 5, 7, 10, 15, 20],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]
}

In [11]:
# grid search for KN for new df

model = KNeighborsClassifier()
results_df = pd.DataFrame(columns=['Data Type', 'Accuracy'])

for x_col in x_cols_new:
    accuracy = grid_search(df_new, x_cols_new[x_col], model, param_grid, y_col, 5)
    results_df = results_df.append({'Data Type': x_col, 'Accuracy': accuracy}, ignore_index=True)

On Data: ['customer_age', 'customer_available_credit_limit', 'total_products', 'period_inactive', 'contacts_in_last_year', 'credit_card_debt_balance', 'remaining_credit_limit', 'transaction_amount_ratio', 'total_transaction_amount', 'total_transaction_count', 'transaction_count_ratio', 'average_utilization'], and model KNeighborsClassifier()
Confusion matrix:
[[ 749  341]
 [ 141 5554]]
Accuracy: 0.8311987015811391
Best model has parameters:
KNeighborsClassifier(n_neighbors=10, p=1)


  results_df = results_df.append({'Data Type': x_col, 'Accuracy': accuracy}, ignore_index=True)


On Data: ['period_inactive', 'contacts_in_last_year', 'credit_card_debt_balance', 'remaining_credit_limit', 'transaction_amount_ratio', 'total_transaction_amount', 'total_transaction_count', 'transaction_count_ratio', 'average_utilization'], and model KNeighborsClassifier()
Confusion matrix:
[[ 734  356]
 [ 196 5499]]
Accuracy: 0.8194891704456669
Best model has parameters:
KNeighborsClassifier(n_neighbors=10, p=1)


  results_df = results_df.append({'Data Type': x_col, 'Accuracy': accuracy}, ignore_index=True)


On Data: ['customer_sex', 'customer_education', 'total_products', 'period_inactive', 'contacts_in_last_year', 'credit_card_debt_balance', 'transaction_amount_ratio', 'total_transaction_amount', 'total_transaction_count', 'transaction_count_ratio', 'average_utilization', 'customer_single'], and model KNeighborsClassifier()
Confusion matrix:
[[ 639  451]
 [ 143 5552]]
Accuracy: 0.7805643933596991
Best model has parameters:
KNeighborsClassifier(n_neighbors=10, p=1)


  results_df = results_df.append({'Data Type': x_col, 'Accuracy': accuracy}, ignore_index=True)


On Data: ['period_inactive', 'contacts_in_last_year', 'credit_card_debt_balance', 'remaining_credit_limit', 'transaction_amount_ratio', 'total_transaction_amount', 'total_transaction_count', 'transaction_count_ratio', 'average_utilization'], and model KNeighborsClassifier()
Confusion matrix:
[[ 734  356]
 [ 196 5499]]
Accuracy: 0.8194891704456669
Best model has parameters:
KNeighborsClassifier(n_neighbors=10, p=1)


  results_df = results_df.append({'Data Type': x_col, 'Accuracy': accuracy}, ignore_index=True)


On Data: ['total_products', 'period_inactive', 'contacts_in_last_year', 'credit_card_debt_balance', 'transaction_amount_ratio', 'total_transaction_amount', 'total_transaction_count', 'transaction_count_ratio', 'average_utilization'], and model KNeighborsClassifier()
Confusion matrix:
[[ 752  338]
 [ 159 5536]]
Accuracy: 0.8309945147441422
Best model has parameters:
KNeighborsClassifier(n_neighbors=10, p=1)


  results_df = results_df.append({'Data Type': x_col, 'Accuracy': accuracy}, ignore_index=True)


In [12]:
results_KNN_new = results_df
results_KNN_new

Unnamed: 0,Data Type,Accuracy
0,boruta,"(KNeighborsClassifier(), 0.8311987015811391)"
1,mi_score,"(KNeighborsClassifier(), 0.8194891704456669)"
2,f_score,"(KNeighborsClassifier(), 0.7805643933596991)"
3,Importance,"(KNeighborsClassifier(), 0.8194891704456669)"
4,Correlation,"(KNeighborsClassifier(), 0.8309945147441422)"


# Definicja najlepszych modeli, bagging

In [56]:
# definte models with best parameters, according to grid search
model_SVC = SVC(C = 20, gamma = 1)                                                                              # f_score
model_DT_1 = DecisionTreeClassifier(max_depth=10, min_samples_leaf=2, min_samples_split=10)                     # boruta
model_DT_2 = DecisionTreeClassifier(criterion='entropy', max_depth=10, min_samples_leaf=4)                      # Correlation
model_DT_3 = DecisionTreeClassifier(criterion='entropy', max_depth=10, min_samples_leaf=2, min_samples_split=5) #mi_score
model_LR = LogisticRegression(C=100, penalty='l1', solver='saga')                                               # f_score
model_KNN = KNeighborsClassifier(n_neighbors=10, p=1)                                                           # boruta

In [68]:
bagging_model_names = ['b_SVC', 'b_DT_1', 'b_DT_2', 'b_DT_3', 'b_LR', 'b_KNN']
models = [model_SVC, model_DT_1, model_DT_2, model_DT_3, model_LR, model_KNN]

In [69]:
# dictinary with bagging models to use in cross_validation_accuracy function
bagging_models = {}
for model_name, model in zip(bagging_model_names, models):
    bagging_models[model_name] = BaggingClassifier(model, random_state=42)

In [59]:
model_names = ['SVC', 'DT_1', 'DT_2', 'DT_3', 'LR', 'KNN']

In [70]:
# dictinary with models to use in cross_validation_accuracy function
normal_models = {}
for model_name, model in zip(model_names, models):
    normal_models[model_name] = model

In [71]:
normal_models

{'SVC': SVC(C=20, gamma=1),
 'DT_1': DecisionTreeClassifier(max_depth=10, min_samples_leaf=2, min_samples_split=10),
 'DT_2': DecisionTreeClassifier(criterion='entropy', max_depth=10, min_samples_leaf=4),
 'DT_3': DecisionTreeClassifier(criterion='entropy', max_depth=10, min_samples_leaf=2,
                        min_samples_split=5),
 'LR': LogisticRegression(C=100, penalty='l1', solver='saga'),
 'KNN': KNeighborsClassifier(n_neighbors=10, p=1)}

In [72]:
bagging_models

{'b_SVC': BaggingClassifier(estimator=SVC(C=20, gamma=1), random_state=42),
 'b_DT_1': BaggingClassifier(estimator=DecisionTreeClassifier(max_depth=10,
                                                    min_samples_leaf=2,
                                                    min_samples_split=10),
                   random_state=42),
 'b_DT_2': BaggingClassifier(estimator=DecisionTreeClassifier(criterion='entropy',
                                                    max_depth=10,
                                                    min_samples_leaf=4),
                   random_state=42),
 'b_DT_3': BaggingClassifier(estimator=DecisionTreeClassifier(criterion='entropy',
                                                    max_depth=10,
                                                    min_samples_leaf=2,
                                                    min_samples_split=5),
                   random_state=42),
 'b_LR': BaggingClassifier(estimator=LogisticRegression(C=100, penalty='l

# Cross validation and fitting models

In [61]:
cv_results_bagging_models = cross_validation_accuracy(bagging_models, df_new, x_cols_new, y_col)

  results_df = results_df.append({'Model': model_name, 'Data Type': x_col_name, 'Mean Validation Accuracy': mean_accuracy, 'Validation Accuracy Std. Dev.': std_accuracy, 'Model': model_name}, ignore_index=True)
  results_df = results_df.append({'Model': model_name, 'Data Type': x_col_name, 'Mean Validation Accuracy': mean_accuracy, 'Validation Accuracy Std. Dev.': std_accuracy, 'Model': model_name}, ignore_index=True)
  results_df = results_df.append({'Model': model_name, 'Data Type': x_col_name, 'Mean Validation Accuracy': mean_accuracy, 'Validation Accuracy Std. Dev.': std_accuracy, 'Model': model_name}, ignore_index=True)
  results_df = results_df.append({'Model': model_name, 'Data Type': x_col_name, 'Mean Validation Accuracy': mean_accuracy, 'Validation Accuracy Std. Dev.': std_accuracy, 'Model': model_name}, ignore_index=True)
  results_df = results_df.append({'Model': model_name, 'Data Type': x_col_name, 'Mean Validation Accuracy': mean_accuracy, 'Validation Accuracy Std. Dev.': 

In [73]:
cv_results_normal_models = cross_validation_accuracy(normal_models, df_new, x_cols_new, y_col)

  results_df = results_df.append({'Model': model_name, 'Data Type': x_col_name, 'Mean Validation Accuracy': mean_accuracy, 'Validation Accuracy Std. Dev.': std_accuracy, 'Model': model_name}, ignore_index=True)
  results_df = results_df.append({'Model': model_name, 'Data Type': x_col_name, 'Mean Validation Accuracy': mean_accuracy, 'Validation Accuracy Std. Dev.': std_accuracy, 'Model': model_name}, ignore_index=True)
  results_df = results_df.append({'Model': model_name, 'Data Type': x_col_name, 'Mean Validation Accuracy': mean_accuracy, 'Validation Accuracy Std. Dev.': std_accuracy, 'Model': model_name}, ignore_index=True)
  results_df = results_df.append({'Model': model_name, 'Data Type': x_col_name, 'Mean Validation Accuracy': mean_accuracy, 'Validation Accuracy Std. Dev.': std_accuracy, 'Model': model_name}, ignore_index=True)
  results_df = results_df.append({'Model': model_name, 'Data Type': x_col_name, 'Mean Validation Accuracy': mean_accuracy, 'Validation Accuracy Std. Dev.': 

In [63]:
cv_results_bagging_models.sort_values(by=['Mean Validation Accuracy'], ascending=[False])

Unnamed: 0,Model,Data Type,Mean Validation Accuracy,Validation Accuracy Std. Dev.
10,b_DT_2,boruta,89.981071,1.465231
15,b_DT_3,boruta,89.657997,0.932212
5,b_DT_1,boruta,89.451031,1.688827
9,b_DT_1,Correlation,88.954338,1.508679
7,b_DT_1,f_score,88.927999,1.486707
19,b_DT_3,Correlation,88.600092,1.452258
14,b_DT_2,Correlation,88.409799,1.433106
17,b_DT_3,f_score,88.407826,1.466368
12,b_DT_2,f_score,88.250679,1.476224
16,b_DT_3,mi_score,87.955031,0.578017


In [74]:
cv_results_normal_models.sort_values(by=['Mean Validation Accuracy'], ascending=[False])

Unnamed: 0,Model,Data Type,Mean Validation Accuracy,Validation Accuracy Std. Dev.
5,DT_1,boruta,89.258282,1.055933
14,DT_2,Correlation,88.512698,0.417653
19,DT_3,Correlation,88.28334,0.764142
12,DT_2,f_score,88.086242,1.034579
9,DT_1,Correlation,88.055956,1.282371
10,DT_2,boruta,88.042142,1.015262
15,DT_3,boruta,87.78556,1.426828
7,DT_1,f_score,87.715322,1.275489
16,DT_3,mi_score,87.696877,0.926072
17,DT_3,f_score,87.605134,1.588708


In [78]:
all_results = pd.concat([cv_results_bagging_models, cv_results_normal_models], axis=0)

In [80]:
all_results.sort_values(by=['Mean Validation Accuracy'], ascending=[False])

Unnamed: 0,Model,Data Type,Mean Validation Accuracy,Validation Accuracy Std. Dev.
10,b_DT_2,boruta,89.981071,1.465231
15,b_DT_3,boruta,89.657997,0.932212
5,b_DT_1,boruta,89.451031,1.688827
5,DT_1,boruta,89.258282,1.055933
9,b_DT_1,Correlation,88.954338,1.508679
7,b_DT_1,f_score,88.927999,1.486707
19,b_DT_3,Correlation,88.600092,1.452258
14,DT_2,Correlation,88.512698,0.417653
14,b_DT_2,Correlation,88.409799,1.433106
17,b_DT_3,f_score,88.407826,1.466368


# Check with test dataset

In [89]:
# Step 2: Make predictions using the trained model

# testnew
# df_new

# b_DT_3	mi_score	87.955031	0.578017

best_model = bagging_models['b_DT_3']
x_cols_best_model = x_cols_new['mi_score']

x_train_best_model = df_new.loc[:, x_cols_best_model]
y_train_best_model = df_new.loc[:, y_col]

x_test_best_model = test_new.loc[:, x_cols_best_model]
y_test_best_model = test_new.loc[:, y_col]

best_model.fit(x_train_best_model, y_train_best_model)

y_test_pred = best_model.predict(x_test_best_model)

# Step 3: Compare predicted values with actual values
test_accuracy = balanced_accuracy_score(y_test_best_model, y_test_pred)

print("Test Accuracy:", test_accuracy)

Test Accuracy: 0.879514832850357
