In [1]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import ols
import scipy.stats as stats
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from boruta import BorutaPy
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.feature_selection import f_regression
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import cross_val_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression




In [3]:
#os.chdir("C:/Users/micha/OneDrive/Dokumenty/GitHub/Machine_Learning/ML_classification")

df = pd.read_excel('../data/input_processed/train.xlsx', index_col=0)
test = pd.read_excel('../data/input_processed/test.xlsx', index_col=0)
fr = pd.read_excel('../data/input_processed/feature_ranking.xlsx', index_col=0)


In [430]:
def grid_search(df, x_col, model, param_grid, y_col, cv):

    grid_search = GridSearchCV(model, param_grid, scoring='balanced_accuracy', cv=cv)

    grid_search.fit(df.loc[:, x_col], df.loc[:, y_col])
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(df.loc[:, x_col])
    accuracy = balanced_accuracy_score(df.loc[:, y_col], y_pred)
    confusion = confusion_matrix(df.loc[:, y_col], y_pred)
    print(f"On Data: {x_col}, and model {model}")
    print("Confusion matrix:")
    print(confusion)
    print("Accuracy:", accuracy)
    print("Best model has parameters:")
    print(best_model)
    return model, accuracy

def get_model_name(model):
    return model.__class__.__name__.split("(")[0]

def cross_validation(model, _X, _y, _cv=5):
      _scoring = ['balanced_accuracy']
      results = cross_validate(estimator=model,
                               X=_X,
                               y=_y,
                               cv=_cv,
                               scoring=_scoring,
                               return_train_score=True)
      val_acc_scores = results['test_balanced_accuracy']
      val_acc_mean = np.mean(val_acc_scores) * 100
      val_acc_std = np.std(val_acc_scores) * 100

      return val_acc_mean, val_acc_std
      #return {"Training Accuracy scores": results['train_balanced_accuracy'],
       #       "Mean Training Accuracy": results['train_balanced_accuracy'].mean()*100,
        #      
         #     "Validation Accuracy scores": results['test_balanced_accuracy'],
          #    "Mean Validation Accuracy": results['test_balanced_accuracy'].mean()*100,
           #   }
           
def cross_validation_accuracy(models, df, x_cols, y_col, cv=5):
    results_df = pd.DataFrame(columns=['Data Type', 'Mean Validation Accuracy', 'Validation Accuracy Std. Dev.'])
    
    for model in models:
        model_name = get_model_name(model)
        
        for x_col_name, x_col in x_cols.items():
            scores = cross_validate(model, df.loc[:, x_col], df.loc[:, y_col], cv=cv, scoring='balanced_accuracy', return_train_score=True)
            mean_accuracy = np.mean(scores['test_score']) * 100
            std_accuracy = np.std(scores['test_score']) * 100
            
            results_df = results_df.append({'Data Type': x_col_name, 'Mean Validation Accuracy': mean_accuracy, 'Validation Accuracy Std. Dev.': std_accuracy, 'Model': model_name}, ignore_index=True)
    
    return results_df

In [4]:
# tu są definiowane których kolumn będziemy później używać za pomocą kryteriów wyliczonych na koniec 01_Data_preparation
# Jeśli masz w głowie jakieś inne to możesz dodać w 01, żeby były uwzględnione w feature_ranking.xlsx, oni tam podawali 
# FRE np. możesz spojrzeć w feature selection na wykład. 

boruta = fr[fr['boruta_rank'].isin([1])].index.tolist()
mi_score = fr[fr['mi_score'] > 0.01].index.tolist()
f_score = fr[fr['sign_fscore_0_1'] == 1].index.tolist()
Importance = fr[fr['Importance'] > 0.01].index.tolist()
Correlation = fr[fr['Corr'] > 0.1].index.tolist()
y_col = 'account_status'

x_cols = {
    'boruta': boruta,
    'mi_score': mi_score,
    'f_score': f_score,
    'Importance': Importance,
    'Correlation':Correlation
}

# TUTAJ TUNUJEMY HYPERPARAMETRY

### SVM

In [336]:
param_grid = {
    'kernel': ['linear', 'rbf'],
    'C': [0.1, 1, 10,20],
    'gamma': [0.01, 0.1, 1]
}
results_df = pd.DataFrame(columns=['Data Type', 'Accuracy'])

# Tu definiujesz zbiór parametrów na przestrzeni której będzies szukać najlepszego doboru parametrów.

In [338]:
for x_col in x_cols:
    accuracy = grid_search(df, x_col, SVC(), param_grid, y_col, 5)
    results_df = results_df.append({'Data Type': x_col, 'Accuracy': accuracy}, ignore_index=True)
print(results_df)

On Data: ['customer_age', 'customer_available_credit_limit', 'total_products', 'period_inactive', 'contacts_in_last_year', 'credit_card_debt_balance', 'remaining_credit_limit', 'transaction_amount_ratio', 'total_transaction_amount', 'total_transaction_count', 'transaction_count_ratio', 'average_utilization'], and model SVC()
Confusion matrix:
[[ 776  363]
 [ 147 5802]]
Accuracy: 0.8282946750628808
Best model has parameters:
SVC(C=20, gamma=1)


  results_df = results_df.append({'Data Type': x_col, 'Accuracy': accuracy}, ignore_index=True)


On Data: ['customer_number_of_dependents', 'total_products', 'period_inactive', 'contacts_in_last_year', 'credit_card_debt_balance', 'remaining_credit_limit', 'transaction_amount_ratio', 'total_transaction_amount', 'total_transaction_count', 'transaction_count_ratio', 'average_utilization'], and model SVC()
Confusion matrix:
[[ 744  395]
 [ 151 5798]]
Accuracy: 0.813911074097638
Best model has parameters:
SVC(C=20, gamma=1)


  results_df = results_df.append({'Data Type': x_col, 'Accuracy': accuracy}, ignore_index=True)


On Data: ['customer_age', 'customer_sex', 'customer_number_of_dependents', 'customer_education', 'customer_salary_range', 'customer_available_credit_limit', 'total_products', 'period_inactive', 'contacts_in_last_year', 'credit_card_debt_balance', 'transaction_amount_ratio', 'total_transaction_amount', 'total_transaction_count', 'transaction_count_ratio', 'average_utilization', 'customer_married', 'Single'], and model SVC()
Confusion matrix:
[[ 900  239]
 [  93 5856]]
Accuracy: 0.8872669667591561
Best model has parameters:
SVC(C=20, gamma=1)


  results_df = results_df.append({'Data Type': x_col, 'Accuracy': accuracy}, ignore_index=True)


On Data: ['period_inactive', 'contacts_in_last_year', 'credit_card_debt_balance', 'remaining_credit_limit', 'transaction_amount_ratio', 'total_transaction_amount', 'total_transaction_count', 'transaction_count_ratio', 'average_utilization'], and model SVC()
Confusion matrix:
[[ 651  488]
 [ 168 5781]]
Accuracy: 0.7716569771946532
Best model has parameters:
SVC(C=20, gamma=1)


  results_df = results_df.append({'Data Type': x_col, 'Accuracy': accuracy}, ignore_index=True)


On Data: ['total_products', 'period_inactive', 'contacts_in_last_year', 'credit_card_debt_balance', 'transaction_amount_ratio', 'total_transaction_amount', 'total_transaction_count', 'transaction_count_ratio', 'average_utilization'], and model SVC()
Confusion matrix:
[[ 706  433]
 [ 154 5795]]
Accuracy: 0.7969776314948647
Best model has parameters:
SVC(C=20, gamma=1)
                                           Data Type  \
0  [customer_age, customer_available_credit_limit...   
1  [customer_number_of_dependents, total_products...   
2  [customer_age, customer_sex, customer_number_o...   
3  [period_inactive, contacts_in_last_year, credi...   
4  [total_products, period_inactive, contacts_in_...   

                      Accuracy  
0  (SVC(), 0.8282946750628808)  
1   (SVC(), 0.813911074097638)  
2  (SVC(), 0.8872669667591561)  
3  (SVC(), 0.7716569771946532)  
4  (SVC(), 0.7969776314948647)  


  results_df = results_df.append({'Data Type': x_col, 'Accuracy': accuracy}, ignore_index=True)


In [339]:
results_df

Unnamed: 0,Data Type,Accuracy
0,"[customer_age, customer_available_credit_limit...","(SVC(), 0.8282946750628808)"
1,"[customer_number_of_dependents, total_products...","(SVC(), 0.813911074097638)"
2,"[customer_age, customer_sex, customer_number_o...","(SVC(), 0.8872669667591561)"
3,"[period_inactive, contacts_in_last_year, credi...","(SVC(), 0.7716569771946532)"
4,"[total_products, period_inactive, contacts_in_...","(SVC(), 0.7969776314948647)"


### Decision Tree

In [340]:
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2']
}

In [341]:
model = DecisionTreeClassifier()

for x_col in x_cols:
    accuracy = grid_search(df, x_col, model, param_grid, y_col, cv)
    results_df = results_df.append({'Data Type': x_col, 'Accuracy': accuracy}, ignore_index=True)
    


On Data: ['customer_age', 'customer_available_credit_limit', 'total_products', 'period_inactive', 'contacts_in_last_year', 'credit_card_debt_balance', 'remaining_credit_limit', 'transaction_amount_ratio', 'total_transaction_amount', 'total_transaction_count', 'transaction_count_ratio', 'average_utilization'], and model DecisionTreeClassifier()
Confusion matrix:
[[1045   94]
 [ 102 5847]]
Accuracy: 0.9501628637093964
Best model has parameters:
DecisionTreeClassifier(criterion='entropy', max_depth=10, min_samples_leaf=4,
                       min_samples_split=10)


  results_df = results_df.append({'Data Type': x_col, 'Accuracy': accuracy}, ignore_index=True)


On Data: ['customer_number_of_dependents', 'total_products', 'period_inactive', 'contacts_in_last_year', 'credit_card_debt_balance', 'remaining_credit_limit', 'transaction_amount_ratio', 'total_transaction_amount', 'total_transaction_count', 'transaction_count_ratio', 'average_utilization'], and model DecisionTreeClassifier()
Confusion matrix:
[[1050   89]
 [ 103 5846]]
Accuracy: 0.9522737237841524
Best model has parameters:
DecisionTreeClassifier(criterion='entropy', max_depth=10, min_samples_leaf=2,
                       min_samples_split=10)


  results_df = results_df.append({'Data Type': x_col, 'Accuracy': accuracy}, ignore_index=True)


On Data: ['customer_age', 'customer_sex', 'customer_number_of_dependents', 'customer_education', 'customer_salary_range', 'customer_available_credit_limit', 'total_products', 'period_inactive', 'contacts_in_last_year', 'credit_card_debt_balance', 'transaction_amount_ratio', 'total_transaction_amount', 'total_transaction_count', 'transaction_count_ratio', 'average_utilization', 'customer_married', 'Single'], and model DecisionTreeClassifier()
Confusion matrix:
[[1062   77]
 [  84 5865]]
Accuracy: 0.9591384095806452
Best model has parameters:
DecisionTreeClassifier(criterion='entropy', max_depth=10, min_samples_leaf=2,
                       min_samples_split=10)


  results_df = results_df.append({'Data Type': x_col, 'Accuracy': accuracy}, ignore_index=True)


On Data: ['period_inactive', 'contacts_in_last_year', 'credit_card_debt_balance', 'remaining_credit_limit', 'transaction_amount_ratio', 'total_transaction_amount', 'total_transaction_count', 'transaction_count_ratio', 'average_utilization'], and model DecisionTreeClassifier()
Confusion matrix:
[[1066   73]
 [ 108 5841]]
Accuracy: 0.9588771900929631
Best model has parameters:
DecisionTreeClassifier(max_depth=10, min_samples_leaf=2)


  results_df = results_df.append({'Data Type': x_col, 'Accuracy': accuracy}, ignore_index=True)


On Data: ['total_products', 'period_inactive', 'contacts_in_last_year', 'credit_card_debt_balance', 'transaction_amount_ratio', 'total_transaction_amount', 'total_transaction_count', 'transaction_count_ratio', 'average_utilization'], and model DecisionTreeClassifier()
Confusion matrix:
[[1047   92]
 [ 116 5833]]
Accuracy: 0.9498641584873238
Best model has parameters:
DecisionTreeClassifier(criterion='entropy', max_depth=10, min_samples_leaf=2,
                       min_samples_split=10)


  results_df = results_df.append({'Data Type': x_col, 'Accuracy': accuracy}, ignore_index=True)


In [318]:
param_grid = {
    'criterion' : 'entropy',
    'max_depth': 10,
    'min_samples_leaf': 2,
    'min_samples_split': 10
}

In [328]:
model = DecisionTreeClassifier(**param_grid)
model.fit(df.loc[:,Correlation], df.loc[:,y_col])

y_pred = model.predict(test.loc[:, Correlation])
accuracy = balanced_accuracy_score(test.loc[:,y_col], y_pred)
confusion = confusion_matrix(test.loc[:,y_col], y_pred)
print(confusion)
accuracy


[[ 392   96]
 [ 103 2448]]


0.8814511827570031

# TU SPRAWDZAMY SKUTECZNOŚĆ NA CV

In [2]:
# Tu są zdefiniowane modele, ja je definiowałem trochę z buta, trochę z wcześniejszych wyników gridSearchu
# Dobrze by to wyglądało jakbyś tu np. porównał modele dla konkretnych typów. Czyli np. tabela z mean_accuracy i std.dev
# dla DecisionTreeClassifier kla każdego typu danych i np. różnych parametrów (np. criterion = 'gini' lub 'entropy')
# I porównać normalny DecisionTreeClassifier z BaggingClassifier(DecisionTreeClassifier()) i tak dla każdej z grup modeli
# mamy tu DecisionTree, SVC, LogisticRegreesion i RandomForest, możesz też spróbować policzyć KNN bo tego nie liczyłem a może wyniki będą spoko


modelDT_1 = DecisionTreeClassifier(criterion='entropy', max_depth=10, min_samples_leaf=4,
                       min_samples_split=10)
modelDT_B1 = BaggingClassifier(DecisionTreeClassifier(), random_state = 42)

model3 = SVC(kernel = 'rbf',C=20, gamma='scale')
model4 = LogisticRegression(penalty = 'l2', fit_intercept = False, solver = 'lbfgs')

model6 = RandomForestClassifier(random_state = 42)
models = [modelDT_1, model3, model4, modelDT_B1, model6] 


NameError: name 'DecisionTreeClassifier' is not defined

In [444]:
results = cross_validation_accuracy(models, df, x_cols, y_col)

  results_df = results_df.append({'Data Type': x_col_name, 'Mean Validation Accuracy': mean_accuracy, 'Validation Accuracy Std. Dev.': std_accuracy, 'Model': model_name}, ignore_index=True)
  results_df = results_df.append({'Data Type': x_col_name, 'Mean Validation Accuracy': mean_accuracy, 'Validation Accuracy Std. Dev.': std_accuracy, 'Model': model_name}, ignore_index=True)
  results_df = results_df.append({'Data Type': x_col_name, 'Mean Validation Accuracy': mean_accuracy, 'Validation Accuracy Std. Dev.': std_accuracy, 'Model': model_name}, ignore_index=True)
  results_df = results_df.append({'Data Type': x_col_name, 'Mean Validation Accuracy': mean_accuracy, 'Validation Accuracy Std. Dev.': std_accuracy, 'Model': model_name}, ignore_index=True)
  results_df = results_df.append({'Data Type': x_col_name, 'Mean Validation Accuracy': mean_accuracy, 'Validation Accuracy Std. Dev.': std_accuracy, 'Model': model_name}, ignore_index=True)
  results_df = results_df.append({'Data Type': x_c

In [448]:
results.sort_values(by=['Mean Validation Accuracy'], ascending=[False])

Unnamed: 0,Data Type,Mean Validation Accuracy,Validation Accuracy Std. Dev.,Model
15,boruta,91.440624,1.060495,BaggingClassifier
16,mi_score,90.999607,0.489255,BaggingClassifier
17,f_score,90.871118,0.775036,BaggingClassifier
19,Correlation,90.205244,0.947151,BaggingClassifier
24,Correlation,90.12358,1.626829,RandomForestClassifier
20,boruta,89.750003,1.860963,RandomForestClassifier
18,Importance,89.597758,0.672876,BaggingClassifier
21,mi_score,89.308131,1.818857,RandomForestClassifier
22,f_score,89.227653,1.919099,RandomForestClassifier
23,Importance,88.416829,0.903984,RandomForestClassifier


Bagging zastosowny do:
- Decision tree (nazwa BaggingClassifier - nazwać odpowiednio do decision tree)

modele które mamy:
- SVC
- LogisticRegression
- Decision tree

modele do dorzucenia:
- KNN

cv po wyestymowaniu zrobić
- 

In [244]:
param_grid = {
    'kernel': ['linear', 'rbf'],
    'C': [0.1, 1, 10,20],
    'gamma': [0.01, 0.1, 1]
}

svm = SVC()

grid_search = GridSearchCV(svm, param_grid, scoring='balanced_accuracy', cv=5)

grid_search.fit(df.loc[:, boruta], df.loc[:,y_col])
best_model = grid_search.best_estimator_
y_pred = best_model.predict(test.loc[:, boruta])
accuracy = balanced_accuracy_score(test.loc[:,y_col], y_pred)
confusion = confusion_matrix(test.loc[:,y_col], y_pred)
print(confusion)
accuracy


KeyboardInterrupt: 

## Bagging

### DecisionTree

In [400]:
model = BaggingClassifier(DecisionTreeClassifier(), random_state = 42)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

n_scores = cross_val_score(model, df.loc[:, boruta], df.loc[:,y_col], scoring='balanced_accuracy', cv=cv, n_jobs=-1, error_score='raise')
print('Balanced Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))


Balanced Accuracy: 0.906 (0.022)


In [280]:
model.fit(df.loc[:, boruta], df.loc[:, y_col])
predictions = model.predict(test.loc[:, boruta])

In [281]:
balanced_acc = balanced_accuracy_score(test.loc[:, y_col], predictions)
confusion_mat = confusion_matrix(test.loc[:, y_col], predictions)
print('Balanced Accuracy: %.3f' % balanced_acc)
print(confusion_mat)

Balanced Accuracy: 0.763
[[ 178  147]
 [  38 1663]]


In [200]:
predictions_series = pd.Series(predictions, name = 'Prediction')
predictions_series.reset_index(drop=True, inplace=True)
test.loc[:, y_col].reset_index(drop=True, inplace=True)
results_df = pd.concat([predictions_series, test.loc[:, y_col]], axis=1)
results_df[results_df['Prediction'] != results_df['account_status']]

Unnamed: 0,Prediction,account_status
12,0,1
49,1,0
71,1,0
91,1,0
144,1,0
...,...,...
1886,1,0
1914,1,0
1933,1,0
1982,1,0
