In [513]:
# standard libraries
import os
import pandas as pd
import numpy as np
#import re
import os
from IPython.display import Image
from abc import ABC, abstractmethod
import time
#import sklearn
#import time

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from tabulate import tabulate
from IPython.display import clear_output
import xgboost

# data pre-processing
from scipy.io import arff
#from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.impute._base import _BaseImputer
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection._split import BaseShuffleSplit
from sklearn.datasets import load_digits
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

# prediction models
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.svm._base import BaseSVC 
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
import tensorflow as tf

# import warnings filter
import warnings
warnings.filterwarnings('ignore')
from warnings import simplefilter 
simplefilter(action='ignore', category=FutureWarning)

## Data Loading

In [2]:
class FilePathManager:
    def __init__(self, local_dir: str):
        self.local_dir = local_dir
    
    def retrieve_full_path(self):
        return os.getcwd()+'/'+self.local_dir

In [3]:
class Loader:
    df = pd.DataFrame()
    
    def load_data(self, file_name):
        pass
    
    def get_df(self):
        pass
    
    def size(self):
        return len(self.df)

In [4]:
from typing import Callable
 
class CSVLoader(Loader):
    def __init__(self, file_path_manager: FilePathManager):
        self.file_path_manager = file_path_manager
        
    def load_data(self, _prepare_data: Callable[[pd.DataFrame], pd.DataFrame] = None):
        self.df = pd.read_csv(self.file_path_manager.retrieve_full_path())
        if _prepare_data:
            self.df = _prepare_data(self.df)
    
    def get_df(self):
        return self.df;
    
    def size(self):
        return len(self.df)  

In [154]:
def clean_data(df):
    df['y'] = df['y'].astype(int)
    df['x32'] = df['x32'].str.replace('%','').astype(float)
    df['x37'] = df['x37'].str.replace('$','').astype(float)
    return df

In [69]:
labelencoder = LabelEncoder()

In [89]:
my_cat_vars=[*cat_vars]
df[my_cat_vars].astype(str)

Unnamed: 0,x32,x29,x30,x37,x24
0,0.0%,July,tuesday,$1313.96,euorpe
1,-0.02%,Aug,wednesday,$1962.78,asia
2,-0.01%,July,wednesday,$430.47,asia
3,0.01%,July,wednesday,$-2366.29,asia
4,0.01%,July,tuesday,$-620.66,asia
...,...,...,...,...,...
159995,0.0%,Aug,wednesday,$-891.96,asia
159996,-0.01%,May,wednesday,$1588.65,asia
159997,-0.0%,Jun,wednesday,$687.46,asia
159998,-0.02%,May,wednesday,$439.21,asia


In [93]:
df['x24']

0         2
1         1
2         1
3         1
4         1
         ..
159995    1
159996    1
159997    1
159998    1
159999    1
Name: x24, Length: 160000, dtype: int64

In [62]:
df[['x32', 'x29', 'x30', 'x37', 'x24']]

Unnamed: 0,x32,x29,x30,x37,x24
0,0.0%,July,tuesday,$1313.96,euorpe
1,-0.02%,Aug,wednesday,$1962.78,asia
2,-0.01%,July,wednesday,$430.47,asia
3,0.01%,July,wednesday,$-2366.29,asia
4,0.01%,July,tuesday,$-620.66,asia
...,...,...,...,...,...
159995,0.0%,Aug,wednesday,$-891.96,asia
159996,-0.01%,May,wednesday,$1588.65,asia
159997,-0.0%,Jun,wednesday,$687.46,asia
159998,-0.02%,May,wednesday,$439.21,asia


In [155]:
loader = CSVLoader(FilePathManager('final_project(5).csv'))
loader.load_data(clean_data)
df = loader.get_df()

# Model Setup

In [246]:
class BaseImputer:
    def fit(self, X, y=None):
        pass
    
    def transform(self, X):
        pass

class BaseModel:

    def fit(self, X, y, sample_weight=None):
        pass
    
    def predict(self, X):
        pass

In [465]:
class Modeling:
    _X_train_fitted = None
    _X_test_fitted = None
    _y_train = None
    _y_test = None
    _y_preds = None
    _y_preds_proba = None
    
    def __init__(self, data: pd.DataFrame, 
                 target_name: str, 
                 shuffle_splitter: BaseShuffleSplit, 
                 imputer: BaseImputer, 
                 model: BaseModel, scaler = None, encoder = None):
        self._data = data
        self._target_name = target_name
        self._shuffle_splitter = shuffle_splitter
        self._imputer = imputer
        self._model = model
        self._encoder = encoder
        self._X, self._y = self._split_data()
        self._scaler = scaler
        
    @property
    def X(self):
        return self._X
    
    @property
    def y(self):
        return self._y

    @property
    def model(self):
        return self._model
    
    @model.setter
    def model(self, model):
        self._model = model
     
    @property
    def X_train(self):
        return self._X_train_fitted
    
    @property
    def X_test(self):
        return self._X_test_fitted
    
    @property
    def y_train(self):
        return self._y_train
    
    @property
    def y_test(self):
        return self._y_test
    
    @property
    def y_preds(self):
        return self._y_preds
    
    def _split_data(self):
        X = self._data.copy()
        return X.drop([self._target_name], axis=1) , X[self._target_name]
    
    def _shuffle_split(self):
        X = self.X
        y = self.y
        for train_index, test_index in self._shuffle_splitter.split(X,y):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y[train_index], y[test_index]
        return X_train, X_test, y_train, y_test
    
    def _fit_imputer(self, train):
        if self._imputer is not None:
            self._imputer.fit(train)
    
    def _fit_scaler(self, train, cont_vars = None):
        transform_cols = None
        if cont_vars is None:
            transform_cols = self.X.columns
        else:
            transform_cols = cont_vars
            
        if self._scaler is not None:
            self._scaler.fit(train[transform_cols])
    
    def _impute_data(self, X: pd.DataFrame):
        if self._imputer is not None:
            return pd.DataFrame(self._imputer.transform(X), columns = self.X.columns, index = X.index)
        return X
    
    def _scale_data(self, X: pd.DataFrame, cont_vars = None):
        transform_cols = None
        if cont_vars is None:
            transform_cols = X.columns
        else:
            transform_cols = cont_vars
        scaled_data = X[transform_cols]
        if self._scaler is not None:
            scaled_data = pd.DataFrame(self._scaler.transform(X[transform_cols]), columns = transform_cols)
        X[transform_cols] = scaled_data
        return X
    
    def _encode_data(self):
        df = self.X.copy()
        cont_vars = df.describe().columns
        cat_vars = set(df.columns) - set(cont_vars)
        for column in [*cat_vars]:
            df[column] = self._encoder.fit_transform(df[column].astype(str))
        self._X = df
        return cont_vars, cat_vars
        
    
    def prepare(self):
        cont_vars = None
        if self._encoder is not None: 
            cont_vars, _ = self._encode_data()
        X_train, X_test, y_train, y_test = self._shuffle_split()   
        self._fit_imputer(X_train)
        X_train = self._impute_data(X_train)
        X_test = self._impute_data(X_test)
        self._fit_scaler(X_train, cont_vars)
        self._X_train_fitted = self._scale_data(X_train, cont_vars)
        self._X_test_fitted = self._scale_data(X_test, cont_vars)
        self._y_train = y_train
        self._y_test = y_test
        
    def prepare_and_train(self):
        self.prepare()
        return self.train()
        
    def train(self):
        self._model.fit(self.X_train, self.y_train)
        self._y_preds = self._model.predict(self.X_train)
        self._y_preds_proba = self._model.predict_proba(self.X_train)
        
        return self.metrics(self.y_train, self.y_preds, self._y_preds_proba)
        
    def test(self):
        return self.metrics(self.y_test, self._model.predict(self.X_test), self._model.predict_proba(self.X_test))
       
    @abstractmethod
    def metrics(self, y_true = None, y_pred = None, y_preds_proba = None):
        pass

In [225]:
df.columns

Index(['x0', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10',
       'x11', 'x12', 'x13', 'x14', 'x15', 'x16', 'x17', 'x18', 'x19', 'x20',
       'x21', 'x22', 'x23', 'x24', 'x25', 'x26', 'x27', 'x28', 'x29', 'x30',
       'x31', 'x32', 'x33', 'x34', 'x35', 'x36', 'x37', 'x38', 'x39', 'x40',
       'x41', 'x42', 'x43', 'x44', 'x45', 'x46', 'x47', 'x48', 'x49', 'y'],
      dtype='object')

In [427]:
class XGBModel:
    _model = None
    
    def __init__(self, params, num_round: int = 100):
        self._params = params
        self._num_round = num_round
        
    def fit(self, X, y, sample_weight=None):
        dtrain = xgb.DMatrix(X, label = y)
        self._model = xgb.train(self._params, dtrain)
        
    def predict(self, X):
        dtest = xgb.DMatrix(X)
        return self._model.predict(dtest)

In [467]:
class ClassificationModeling(Modeling):
    def __init__(self, 
                 data: pd.DataFrame, 
                 target_name: str, 
                 shuffle_splitter: BaseShuffleSplit, 
                 imputer: BaseImputer, 
                 model: BaseModel, 
                 scaler = None,
                 encoder = None,
                 beta: int = 1, 
                 classification: str = 'binary'):
        super().__init__(data, target_name, shuffle_splitter, imputer, model, scaler, encoder)
        self.beta = beta
        self.classification = classification
        
    @abstractmethod
    def metrics(self, y_true = None, y_pred = None, y_preds_proba=None):
        pass

In [449]:
from typing import Type, TypeVar

class XGBClassificationModeling(ClassificationModeling):
    TXGB = TypeVar("TXGB", bound=XGBClassifier)
    all_models = [];
    
    def __init__(self, 
             data: pd.DataFrame, 
             target_name: str, 
             shuffle_splitter: BaseShuffleSplit, 
             imputer: BaseImputer, 
             model: BaseModel, 
             scaler = None,
             encoder = None,
             beta: int = 1, 
             classification: str = 'binary'):
         super().__init__(data, target_name, shuffle_splitter, imputer, model, scaler, encoder, beta, classification)
        
            
    def parameter_tuning(self, params, class_to_instantiate: Type[TXGB]):
        list_of_models = []
        combination = []
        params_base = {}
        output = []
        for key, value in params.items():
            if isinstance(value, list):
                combination.append((key,value))
            else:
                params_base[key]=value
              
        result = XGBClassificationModeling.get_combinations(combination)

        for r in result:
            list_of_models.append(class_to_instantiate(**{**params_base, **r}))
            
        for a_model in list_of_models:
            self.model = a_model
            startTrain = time.time()
            train_metrics = self.train()
            endTrain = time.time()
            test_metrics = self.test()
            endTest = time.time()
            train_time = endTrain - startTrain
            test_time = endTest - endTrain
            output.append({'model': a_model, 'train_metrics': {**train_metrics,**{'elapsed_time':train_time}}, 'test_metrics': {**test_metrics,**{'elapsed_time':test_time}}})
        self.all_models = output
        return output
        
    def find_best_model(self):
        max_accuracy = self.all_models[0]['test_metrics']['accuracy']
        location = 0
        for indx, output_metrics in enumerate(self.all_models):
            if max_accuracy < output_metrics['test_metrics']['accuracy']:
                max_accuracy = output_metrics['test_metrics']['accuracy']
                location = indx
            elif max_accuracy == output_metrics['test_metrics']['accuracy']:
                if output_metrics['test_metrics']['elapsed_time'] < self.all_models[location]['test_metrics']['elapsed_time']:
                    location = indx
                
        return self.all_models[location]
    
    @staticmethod
    def get_combinations(tuples):
        length = len(tuples)
        if length > 1:
            total_params = []
            tuple_copy = tuples.copy()
            a_tuple = tuple_copy.pop(0)
            params_list = XGBClassificationModeling.get_combinations(tuple_copy)
            for value in a_tuple[1]:
                for a_params in params_list:
                    temp = { a_tuple[0]: value}
                    total_params.append({**temp, **a_params})
            return total_params
        else:
            params_list = []
            a_tuple =  tuples[0]
            for value in a_tuple[1]:
                temp = {}
                temp[a_tuple[0]] = value
                params_list.append(temp)
            return params_list
            
    
    def metrics(self, y_true = None, y_pred = None, y_pred_proba = None):
        if y_true is None and y_pred is None:
            y_true = self.y_train
            y_pred = self.y_preds       
        return {
                'matrix': confusion_matrix(y_true, y_pred), 
                'accuracy': round(accuracy_score(y_true, y_pred), 5), 
                'precision': precision_score(y_true, y_pred, average=self.classification), 
                'recall': recall_score(y_true, y_pred, average=self.classification),
                'f1': f1_score(y_true, y_pred),
                'y_pred': y_pred,
                'y_pred_proba': y_pred_proba
               }

In [228]:
xgb_classifier = XGBClassificationModeling(loader.get_df(),'y',
                                           StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=12343),
                                           None, XGBClassifier, None, LabelEncoder(), beta=1)

In [229]:
xgb_classifier.prepare()

In [230]:
xgb_classifier.X['x24']

0         2
1         1
2         1
3         1
4         1
         ..
159995    1
159996    1
159997    1
159998    1
159999    1
Name: x24, Length: 160000, dtype: int64

In [231]:
xgb_results = xgb_classifier.parameter_tuning( { 
    'max_depth': [3],
    'learning_rate': [0.1],
    'n_estimators': [100],
    'colsample_bytree': [0.3],
 }, XGBClassifier)



In [232]:
xgb_results

[{'model': XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                colsample_bynode=1, colsample_bytree=0.3,
                enable_categorical=False, gamma=0, gpu_id=-1,
                importance_type=None, interaction_constraints='',
                learning_rate=0.1, max_delta_step=0, max_depth=3,
                min_child_weight=1, missing=nan, monotone_constraints='()',
                n_estimators=100, n_jobs=8, num_parallel_tree=1, predictor='auto',
                random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                subsample=1, tree_method='exact', validate_parameters=1,
                verbosity=None),
  'train_metrics': {'matrix': array([[60663,  6399],
          [12781, 32157]]),
   'accuracy': 0.82875,
   'precision': 0.8340336134453782,
   'recall': 0.7155859183764297,
   'f1': 0.7702828945792514,
   'elapsed_time': 24.367619037628174},
  'test_metrics': {'matrix': array([[25890,  2851],
          [ 5630, 13629]]),
 

In [233]:
xgb_results = xgb_classifier.parameter_tuning( { 
    'max_depth': [3,6,10],
    'learning_rate': [0.05, 0.1],
    'n_estimators': [100, 500, 1000],
    'colsample_bytree': [0.3, 0.7],
 }, XGBClassifier)





In [236]:
xgb_classifier.find_best_model()

{'model': XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
               colsample_bynode=1, colsample_bytree=0.7,
               enable_categorical=False, gamma=0, gpu_id=-1,
               importance_type=None, interaction_constraints='',
               learning_rate=0.1, max_delta_step=0, max_depth=10,
               min_child_weight=1, missing=nan, monotone_constraints='()',
               n_estimators=1000, n_jobs=8, num_parallel_tree=1,
               predictor='auto', random_state=0, reg_alpha=0, reg_lambda=1,
               scale_pos_weight=1, subsample=1, tree_method='exact',
               validate_parameters=1, verbosity=None),
 'train_metrics': {'matrix': array([[67062,     0],
         [    0, 44938]]),
  'accuracy': 1.0,
  'precision': 1.0,
  'recall': 1.0,
  'f1': 1.0,
  'elapsed_time': 101146.86843967438},
 'test_metrics': {'matrix': array([[27657,  1084],
         [ 1452, 17807]]),
  'accuracy': 0.94717,
  'precision': 0.9426181779683447,
  'recall

In [549]:
from typing import Type, TypeVar

class TuningClassificationModeling(ClassificationModeling):
    TClass = None
    all_models = [];
    
    def __init__(self, 
             data: pd.DataFrame, 
             target_name: str, 
             shuffle_splitter: BaseShuffleSplit, 
             imputer: BaseImputer, 
             model: BaseModel, 
             scaler = None,
             encoder = None,
             beta: int = 1, 
             classification: str = 'binary',
                 classification_type: str = 'logistic'):
        super().__init__(data, target_name, shuffle_splitter, imputer, model, scaler, encoder, beta, classification)
        if classification_type == 'logistic':
            TClass = TypeVar("TClass", bound=LogisticRegression)
        elif classification_type == 'xgb':
            TClass = TypeVar("TClass", bound=XGBClassifier)
        elif classification_type == 'neural':
            TClass = TypeVar("TClass", bound=NNModel)
            

    def parameter_tuning(self, params, class_to_instantiate: Type[TClass]):
        list_of_models = []
        combination = []
        params_base = {}
        output = []
        for key, value in params.items():
            if isinstance(value, list):
                combination.append((key,value))
            else:
                params_base[key]=value
        result = {}
        if len(combination) > 0:       
            result = TuningClassificationModeling.get_combinations(combination)
        print(params_base)
        for r in result:
            list_of_models.append(class_to_instantiate(**{**params_base, **r}))
            
        for a_model in list_of_models:
            self.model = a_model
            startTrain = time.time()
            train_metrics = self.train()
            endTrain = time.time()
            test_metrics = self.test()
            endTest = time.time()
            train_time = endTrain - startTrain
            test_time = endTest - endTrain
            output.append({'model': a_model, 'train_metrics': {**train_metrics,**{'elapsed_time':train_time}}, 'test_metrics': {**test_metrics,**{'elapsed_time':test_time}}})
        self.all_models = output
        return output
        
    def find_best_model(self, metric):
        max_accuracy = self.all_models[0]['test_metrics'][metric]
        location = 0
        for indx, output_metrics in enumerate(self.all_models):
            if max_accuracy < output_metrics['test_metrics'][metric]:
                max_accuracy = output_metrics['test_metrics'][metric]
                location = indx
            elif max_accuracy == output_metrics['test_metrics'][metric]:
                if output_metrics['test_metrics']['elapsed_time'] < self.all_models[location]['test_metrics']['elapsed_time']:
                    location = indx
                
        return self.all_models[location]
    
    @staticmethod
    def get_combinations(tuples):
        length = len(tuples)
        if length > 1:
            total_params = []
            tuple_copy = tuples.copy()
            a_tuple = tuple_copy.pop(0)
            params_list = TuningClassificationModeling.get_combinations(tuple_copy)
            for value in a_tuple[1]:
                for a_params in params_list:
                    temp = { a_tuple[0]: value}
                    total_params.append({**temp, **a_params})
            return total_params
        else:
            params_list = []
            a_tuple =  tuples[0]
            for value in a_tuple[1]:
                temp = {}
                temp[a_tuple[0]] = value
                params_list.append(temp)
            return params_list
            
    
    def metrics(self, y_true = None, y_pred = None, y_pred_proba=None):
        if y_true is None and y_pred is None:
            y_true = self.y_train
            y_pred = self.y_preds
        conf_matrix = confusion_matrix(y_true, y_pred)
        return  {
                'matrix': conf_matrix, 
                'auc': roc_auc_score(y_true, y_pred),
                'accuracy': round(accuracy_score(y_true, y_pred), 5), 
                'precision': precision_score(y_true, y_pred, average=self.classification), 
                'recall': recall_score(y_true, y_pred, average=self.classification),
                'f1': f1_score(y_true, y_pred),
                'cost': TuningClassificationModeling.cost_calc(conf_matrix),
                'y_pred': y_pred,
                'y_pred_proba': y_pred_proba
               }
    
    @staticmethod
    def cost_calc(conf_matrix):
        cost_matrix = np.array([[0,-100],[-25,0]])
        cost = np.sum(cost_matrix*conf_matrix)/np.sum(conf_matrix)
        return cost

In [516]:
linear_modeling = TuningClassificationModeling(loader.get_df(),'y',
                                           StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=12343),
                                           SimpleImputer(missing_values=np.nan, strategy='mean'), LogisticRegression, None, LabelEncoder(), beta=1)

In [517]:
linear_modeling.prepare()

In [518]:
linear_result = linear_modeling.parameter_tuning( { 
    'penalty':'l2', #change to l1
    'random_state':1,
    'solver': 'liblinear',
    'C':  [0.001, 0.01, 1, 10],
 }, LogisticRegression)

{'penalty': 'l2', 'random_state': 1, 'solver': 'liblinear'}


In [520]:
linear_result

[{'model': LogisticRegression(C=0.001, random_state=1, solver='liblinear'),
  'train_metrics': {'matrix': array([[55387, 11675],
          [21518, 23420]]),
   'auc': 0.673534929921288,
   'accuracy': 0.70363,
   'precision': 0.6673315287077931,
   'recall': 0.5211624905425253,
   'f1': 0.5852585808354054,
   'cost': -15.227232142857142,
   'y_pred': array([1, 1, 0, ..., 0, 0, 1]),
   'y_pred_proba': array([[0.14075395, 0.85924605],
          [0.33423334, 0.66576666],
          [0.61417412, 0.38582588],
          ...,
          [0.8398927 , 0.1601073 ],
          [0.65163991, 0.34836009],
          [0.30405237, 0.69594763]]),
   'elapsed_time': 6.521888971328735},
  'test_metrics': {'matrix': array([[23648,  5093],
          [ 9243, 10016]]),
   'auc': 0.6714326204801649,
   'accuracy': 0.70133,
   'precision': 0.6629161426964061,
   'recall': 0.5200685393841841,
   'f1': 0.5828677839851025,
   'cost': -15.424479166666666,
   'y_pred': array([1, 0, 0, ..., 0, 1, 0]),
   'y_pred_proba':

In [519]:
linear_modeling.find_best_model('auc')

{'model': LogisticRegression(C=0.001, random_state=1, solver='liblinear'),
 'train_metrics': {'matrix': array([[55387, 11675],
         [21518, 23420]]),
  'auc': 0.673534929921288,
  'accuracy': 0.70363,
  'precision': 0.6673315287077931,
  'recall': 0.5211624905425253,
  'f1': 0.5852585808354054,
  'cost': -15.227232142857142,
  'y_pred': array([1, 1, 0, ..., 0, 0, 1]),
  'y_pred_proba': array([[0.14075395, 0.85924605],
         [0.33423334, 0.66576666],
         [0.61417412, 0.38582588],
         ...,
         [0.8398927 , 0.1601073 ],
         [0.65163991, 0.34836009],
         [0.30405237, 0.69594763]]),
  'elapsed_time': 6.521888971328735},
 'test_metrics': {'matrix': array([[23648,  5093],
         [ 9243, 10016]]),
  'auc': 0.6714326204801649,
  'accuracy': 0.70133,
  'precision': 0.6629161426964061,
  'recall': 0.5200685393841841,
  'f1': 0.5828677839851025,
  'cost': -15.424479166666666,
  'y_pred': array([1, 0, 0, ..., 0, 1, 0]),
  'y_pred_proba': array([[0.34566391, 0.65433

In [397]:
# class NNTuningClassificationModeling(TuningClassificationModeling):
#     TClass = None
#     all_models = [];
    
#     def __init__(self, 
#              data: pd.DataFrame, 
#              target_name: str, 
#              shuffle_splitter: BaseShuffleSplit, 
#              imputer: BaseImputer, 
#              model: BaseModel, 
#              scaler = None,
#              encoder = None,
#              beta: int = 1, 
#              classification: str = 'binary',
#                  classification_type: str = 'logistic'):
#         super().__init__(data, target_name, shuffle_splitter, imputer, model, scaler, encoder, beta, classification, classification_type)
#         if classification_type == 'neural':
#             TClass = TypeVar("TClass", bound=NNModel)
                
# #     def train(self, epoch, batch):
# #         logDir = "logs/{epoch}-{batchsize}-{time}".format(epoch=epoch, batchsize=batch, time=time.time())
# #         self.tb_callback.log_dir = logDir
# #         self._model.fit(self.X_train, self.y_train, batch_size=batch, epochs=epoch, validation_data=(self.X_test, self.y_test), callbacks=[self.tb_callback])
# #         self._y_preds = self._model.predict(self.X_train)
# #         return self.metrics(self.y_train, self.y_preds)
    
# #     def metrics(self, y_true = None, y_pred = None):
# #         if y_true is None and y_pred is None:
# #             y_true = self.y_train
# #             y_pred = self.y_preds
            
# #         y_pred_proba= pd.Series(y_pred.reshape((y_pred.shape[1], y_pred.shape[0]))[0], index=y_true.index)
# #         y_pred = pd.Series( (y_pred_proba>0.5).astype(int), index=y_true.index)
# #         return super().metrics(y_true,y_pred, y_pred_proba)   

In [575]:
class NNModel:
    model = None
    epoch = 50
    batch_size = 32
    loss = 'BinaryCrossentropy',
    metric = 'accuracy'
    optimizer = 'adam'
    
    def __init__(self,**inputs):
        self.model = tf.keras.Sequential()
        for arg, content in inputs.items():
            if arg.startswith('input'):
                self.model.add( tf.keras.layers.Input( shape=(content,) ) )
            if arg.startswith('layer'):
                self.model.add( tf.keras.layers.Dense(content['s'], activation = content['activation']) )
            if arg == 'epoch':
                self.epoch = content
            if arg == 'bs':
                self.batch_size = content
            if arg == 'optimizer':
                self.optimizer = content
            if arg == 'loss':
                self.loss = content
            if arg == 'metric':
                self.metric = content
        self.model.compile(optimizer=self.optimizer, loss=self.loss, metrics=[self.metric])
        print(self.model)
    
    def fit(self, X, y):
        self.model.fit(X, y, batch_size=self.batch_size, epochs=self.epoch)
    
    def predict(self, X):
        y_pred_proba = self.predict_proba(X)
        return pd.Series( (y_pred_proba>0.5).astype(int))
        
    
    def predict_proba(self, X):
        y_pred_proba = self.model.predict(X)
        return pd.Series(y_pred_proba.reshape((y_pred_proba.shape[1], y_pred_proba.shape[0]))[0])
        

In [399]:
mynn= NNModel(input=67,
        layer1={'s':300, 'activation': 'relu'}, 
        layer2={'s':200, 'activation': 'relu'}, 
        layer3={'s':100, 'activation': 'relu'},
        layer4={'s':1, 'activation':'sigmoid'},
        loss='BinaryCrossentropy',
        metric='accuracy',
        epoch=30,
        bs=100, 
        optimizer='adam')

<keras.engine.sequential.Sequential object at 0x7ff6c56de490>


In [356]:
mynn.model.summary()

Model: "sequential_14"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_42 (Dense)             (None, 300)               20400     
_________________________________________________________________
dense_43 (Dense)             (None, 200)               60200     
_________________________________________________________________
dense_44 (Dense)             (None, 100)               20100     
_________________________________________________________________
dense_45 (Dense)             (None, 1)                 101       
Total params: 100,801
Trainable params: 100,801
Non-trainable params: 0
_________________________________________________________________


In [576]:
nn_modeling = TuningClassificationModeling(loader.get_df(),'y',
                                           StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=12343),
                                           SimpleImputer(missing_values=np.nan, strategy='mean'), NNModel, None, LabelEncoder(), beta=1,classification_type='neural' )

In [577]:
nn_modeling.prepare()

In [578]:
nn_model_tunning = nn_modeling.parameter_tuning( { 
        'input':50,
        'layer1':{'s':300, 'activation': 'relu'}, 
        'layer2':{'s':200, 'activation': 'relu'}, 
        'layer3':{'s':100, 'activation': 'relu'},
        'layer4':{'s':1, 'activation':'sigmoid'},
        'loss':'BinaryCrossentropy',
        'metric':'accuracy',
        'epoch':10,
        'bs':[100,1000], 
        'optimizer':'adam'
 }, NNModel)        

{'input': 50, 'layer1': {'s': 300, 'activation': 'relu'}, 'layer2': {'s': 200, 'activation': 'relu'}, 'layer3': {'s': 100, 'activation': 'relu'}, 'layer4': {'s': 1, 'activation': 'sigmoid'}, 'loss': 'BinaryCrossentropy', 'metric': 'accuracy', 'epoch': 10, 'optimizer': 'adam'}
<keras.engine.sequential.Sequential object at 0x7ff69437a700>
<keras.engine.sequential.Sequential object at 0x7ff69434f0a0>
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [579]:
nn_model_tunning

[{'model': <__main__.NNModel at 0x7ff69437abb0>,
  'train_metrics': {'matrix': array([[64138,  2924],
          [ 2132, 42806]]),
   'auc': 0.9544777063362342,
   'accuracy': 0.95486,
   'precision': 0.9360594795539033,
   'recall': 0.9525568561128667,
   'f1': 0.9442361141747916,
   'cost': -3.086607142857143,
   'y_pred': 0         0
   1         0
   2         1
   3         0
   4         0
            ..
   111995    1
   111996    1
   111997    0
   111998    0
   111999    1
   Length: 112000, dtype: int64,
   'y_pred_proba': 0         0.019525
   1         0.021994
   2         0.873751
   3         0.001637
   4         0.066766
               ...   
   111995    0.955188
   111996    0.968965
   111997    0.001305
   111998    0.004798
   111999    0.998538
   Length: 112000, dtype: float32,
   'elapsed_time': 29.768935918807983},
  'test_metrics': {'matrix': array([[27309,  1432],
          [ 1090, 18169]]),
   'auc': 0.9467893957250937,
   'accuracy': 0.94746,
   'precisio

In [418]:
nn_modeling.find_best_model()['model'].batch_size

100

In [580]:
nn_modeling.find_best_model('auc')

{'model': <__main__.NNModel at 0x7ff69437abb0>,
 'train_metrics': {'matrix': array([[64138,  2924],
         [ 2132, 42806]]),
  'auc': 0.9544777063362342,
  'accuracy': 0.95486,
  'precision': 0.9360594795539033,
  'recall': 0.9525568561128667,
  'f1': 0.9442361141747916,
  'cost': -3.086607142857143,
  'y_pred': 0         0
  1         0
  2         1
  3         0
  4         0
           ..
  111995    1
  111996    1
  111997    0
  111998    0
  111999    1
  Length: 112000, dtype: int64,
  'y_pred_proba': 0         0.019525
  1         0.021994
  2         0.873751
  3         0.001637
  4         0.066766
              ...   
  111995    0.955188
  111996    0.968965
  111997    0.001305
  111998    0.004798
  111999    0.998538
  Length: 112000, dtype: float32,
  'elapsed_time': 29.768935918807983},
 'test_metrics': {'matrix': array([[27309,  1432],
         [ 1090, 18169]]),
  'auc': 0.9467893957250937,
  'accuracy': 0.94746,
  'precision': 0.9269425029335238,
  'recall': 0.9

In [581]:
def tune_cost_proba(train_proba, test_proba, y_train, y_test, conf_train, conf_test):
    cost_results = pd.DataFrame()
    thresh = 0
    for i in range(11):
        yhat_train = pd.Series(train_proba < thresh).astype(int)
        yhat_test = pd.Series(test_proba < thresh).astype(int)
        conf_train = confusion_matrix(y_train, yhat_train)
        conf_test = confusion_matrix(y_test, yhat_test)
        cost_results = cost_results.append({"Threshold": thresh,
                                        "Train Cost": -TuningClassificationModeling.cost_calc(conf_train),
                                        "Test Cost":  -TuningClassificationModeling.cost_calc(conf_test)},
                                        ignore_index=True)
        thresh = thresh + 0.05
    return cost_results

In [545]:
linear_modeling.find_best_model('auc')['train_metrics']['y_pred_proba']

array([[0.14075395, 0.85924605],
       [0.33423334, 0.66576666],
       [0.61417412, 0.38582588],
       ...,
       [0.8398927 , 0.1601073 ],
       [0.65163991, 0.34836009],
       [0.30405237, 0.69594763]])

In [546]:
linear_modeling.find_best_model('auc')['test_metrics']['y_pred_proba']

array([[0.34566391, 0.65433609],
       [0.56366816, 0.43633184],
       [0.89095928, 0.10904072],
       ...,
       [0.7267635 , 0.2732365 ],
       [0.08251836, 0.91748164],
       [0.86682084, 0.13317916]])

In [551]:
linear_modeling.y_test

67652     0
73756     1
137380    1
8186      1
25244     0
         ..
148081    1
52534     0
89420     0
70022     1
92035     1
Name: y, Length: 48000, dtype: int64

In [570]:
train_proba = linear_modeling.find_best_model('auc')['train_metrics']['y_pred_proba']
test_proba = linear_modeling.find_best_model('auc')['test_metrics']['y_pred_proba']
conf_train = linear_modeling.find_best_model('auc')['train_metrics']['matrix']
conf_test = linear_modeling.find_best_model('auc')['test_metrics']['matrix']
   
cost_results = tune_cost_proba(train_proba[:,0], test_proba[:,0], linear_modeling.y_train, linear_modeling.y_test, conf_train, conf_test)

In [584]:
train_proba = nn_modeling.find_best_model('auc')['train_metrics']['y_pred_proba']
test_proba = nn_modeling.find_best_model('auc')['test_metrics']['y_pred_proba']
conf_train = nn_modeling.find_best_model('auc')['train_metrics']['matrix']
conf_test = nn_modeling.find_best_model('auc')['test_metrics']['matrix']

cost_results = tune_cost_proba(1-train_proba, 1-test_proba, nn_modeling.y_train, nn_modeling.y_test, conf_train, conf_test)

In [585]:
cost_results

Unnamed: 0,Test Cost,Threshold,Train Cost
0,10.030729,0.0,10.030804
1,3.471354,0.05,3.322545
2,2.850521,0.1,2.56317
3,2.590625,0.15,2.26942
4,2.560417,0.2,2.185491
5,2.596875,0.25,2.180357
6,2.739063,0.3,2.260937
7,2.885417,0.35,2.387946
8,3.064062,0.4,2.583705
9,3.301042,0.45,2.826116


In [305]:
sample_multiargument(input=67,layer1=300, layer2=200, layer3=100, epoch=30, bs=100, optimizer='adam')

add input
add layer
add layer
add layer
set epoch
set bs


In [502]:
xgb_classifier = TuningClassificationModeling(loader.get_df(),'y',
                                           StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=12343),
                                           None, XGBClassifier, None, LabelEncoder(), beta=1,classification_type = 'xgb' )

In [503]:
xgb_classifier.prepare()

In [504]:
xgb_results = xgb_classifier.parameter_tuning( { 
    'max_depth': [3],
    'learning_rate': [0.1],
    'n_estimators': [100],
    'colsample_bytree': [0.3],
 }, XGBClassifier)

{}


In [505]:
xgb_results

[{'model': XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                colsample_bynode=1, colsample_bytree=0.3,
                enable_categorical=False, gamma=0, gpu_id=-1,
                importance_type=None, interaction_constraints='',
                learning_rate=0.1, max_delta_step=0, max_depth=3,
                min_child_weight=1, missing=nan, monotone_constraints='()',
                n_estimators=100, n_jobs=8, num_parallel_tree=1, predictor='auto',
                random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                subsample=1, tree_method='exact', validate_parameters=1,
                verbosity=None),
  'train_metrics': {'matrix': array([[60663,  6399],
          [12781, 32157]]),
   'accuracy': 0.82875,
   'precision': 0.8340336134453782,
   'recall': 0.7155859183764297,
   'f1': 0.7702828945792514,
   'cost': -8.566294642857143,
   'y_pred': array([1, 0, 0, ..., 0, 0, 0]),
   'y_pred_proba': array([[0.44765896, 0.5523

In [445]:
xgb_results[0]['model'].predict_proba(xgb_classifier.X_test)

array([[0.31724763, 0.6827524 ],
       [0.4503014 , 0.5496986 ],
       [0.380547  , 0.619453  ],
       ...,
       [0.70762724, 0.29237276],
       [0.17255515, 0.82744485],
       [0.86919796, 0.13080207]], dtype=float32)