In [2]:
#!/usr/bin/env python3
'''Load inventory data, then build several predictive models
   and compare their performance on that data.'''

__author__ = 'Sam M. Mfalila'
__email__ = 'sam.mfalila@gmail.com'

In [3]:
import numpy as np 
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline
import sklearn.preprocessing
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.model_selection import train_test_split, RandomizedSearchCV

from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

import warnings
warnings.filterwarnings('ignore')

In [4]:
# Path of the input CSV and name of the target column.
# Previously used input file, kept for reference:
#file = 'derived_data/train_data_abtv4_undersampled.csv'
file = 'derived_data/balanced.csv'
target = 'went_on_backorder_Yes'
# Random seed shared by the train/test split and the estimators defined below.
seed = 777

class Data():
    '''Loads the data set, optionally samples it, and hands out the
    features frame and the target array.

    The loader methods are static, so they are called on the class itself,
    e.g. ``Data.get_data(file, sample=True, frac=0.1)``.
    '''
    def __init__(self, file, target, sample=False, n_samples=None, frac=None):
        '''Store the loading options.

        Args:
            file: path (or file-like object) of the CSV to load.
            target: name of the target column.
            sample: whether the data should be sampled after loading.
            n_samples: total number of rows to sample.
            frac: fraction of each target class to sample.
        '''
        self.file = file
        self.sample = sample
        self.n_samples = n_samples
        # Alias for the attribute name this class originally tried to set.
        self.n_sample = n_samples
        self.target = target
        self.frac = frac

    @staticmethod
    def get_data(file, sample=False, n_samples=None, frac=None,
                 target_col=None, random_state=None):
        '''Load the data with an option to sample it.

        Args:
            file: path (or file-like object) handed to ``pd.read_csv``.
            sample: when True, return a sample instead of the full data.
            n_samples: total number of rows to draw; used only when
                ``frac`` is None.
            frac: fraction of rows to draw from every class of the target
                column (stratified sampling). Takes precedence over
                ``n_samples``.
            target_col: column to stratify on. Defaults to the
                notebook-level ``target`` variable.
            random_state: seed for the sampling; None draws a different
                sample on every call.

        Returns:
            The loaded (and possibly sampled) DataFrame with a fresh
            0..n-1 index.

        Raises:
            ValueError: if ``sample`` is True but neither ``frac`` nor
                ``n_samples`` is given.
        '''
        data = pd.read_csv(file)
        if sample:
            # Sample the data because of resource limitations.
            if frac is not None:
                if target_col is None:
                    target_col = target  # notebook-level default
                # GroupBy.sample keeps every column (including the grouping
                # column) and preserves the class balance.
                data = data.groupby(target_col, group_keys=False).sample(
                    frac=frac, random_state=random_state)
            elif n_samples is not None:
                data = data.sample(n=n_samples, random_state=random_state)
            else:
                raise ValueError(
                    'sample=True requires either frac or n_samples')
            data = data.reset_index(drop=True)
        print(data.shape, 'data loaded\n')
        return data

    @staticmethod
    def get_features(df=None):
        '''Return the features frame.

        Args:
            df: frame to use; defaults to the notebook-level ``data``.

        Returns:
            The frame itself, unchanged. It still contains the target
            column; nothing is dropped here.
        '''
        if df is None:
            df = data  # notebook-level frame produced by get_data
        features_df = df
        print(features_df.shape, 'features assigned\n')
        return features_df

    @staticmethod
    def get_target(df=None, target_col=None):
        '''Return the target values as a NumPy array.

        Args:
            df: frame to read from; defaults to the notebook-level ``data``.
            target_col: target column name; defaults to the notebook-level
                ``target``.
        '''
        if df is None:
            df = data  # notebook-level frame produced by get_data
        if target_col is None:
            target_col = target  # notebook-level default
        target_df = df[target_col].values
        print(target_df.shape, '...target rows loaded\n')
        return target_df


    
class FeatureSelector(BaseEstimator, TransformerMixin):
    '''Custom transformer that keeps only the columns named at construction.

    Args:
        feature_names: list of column names to select from the input frame.
    '''

    def __init__(self, feature_names):
        # scikit-learn's get_params()/clone()/repr look up each constructor
        # argument as an attribute of the same name, so it must be stored
        # without renaming.
        self.feature_names = feature_names

    def fit(self, features_df, target = None):
        '''Nothing to learn; returns self.'''
        return self

    def transform(self, features_df, target = None):
        '''Return the selected columns of ``features_df`` as a DataFrame.'''
        return features_df[self.feature_names]
    


class DropMissing(BaseEstimator, TransformerMixin):
    '''Transformer that drops every row containing at least one missing value.

    Args:
        df: not used by the transformer; accepted (and stored) only so that
            existing ``DropMissing(df)`` calls keep working.
    '''
    def __init__(self, df=None):
        self.df = df

    def fit(self, df, target=None):
        '''Nothing to learn; returns self.'''
        return self

    def transform(self, df, target=None):
        '''Return a new frame without the rows that contain missing values.

        The input frame is left untouched, so re-running a cell cannot
        silently change the caller's data.
        '''
        return df.dropna(axis=0, how='any')
    
    

class CategoricalFeatsAdded(BaseEstimator, TransformerMixin):
    '''Custom transformer that appends 0/1 indicator features.

    Indicators (each can be switched off through the constructor):
        neg_inv_balance: 'national_inv' is negative.
        low_inventory:   'national_inv' is below its median.
        low_intransit:   'in_transit_qty' is below its mean.
        high_forecast:   'forecast_3_month' is above its mean.

    A source column is read from the frame passed to ``transform`` when it
    is present there; otherwise it is looked up in the notebook-level
    ``data`` frame and aligned on the index.
    '''
    def __init__ (self, neg_inv_balance=True, low_inventory=True,
                  low_intransit=True, high_forecast=True):
        # Stored under the constructor argument names so that scikit-learn's
        # get_params()/clone()/repr work.
        self.neg_inv_balance = neg_inv_balance
        self.low_inventory = low_inventory
        self.low_intransit = low_intransit
        self.high_forecast = high_forecast

    def fit( self, features_df, target = None):
        '''Nothing to learn; returns self.'''
        return self

    def _source_column(self, features_df, name):
        '''Return column ``name`` from the input frame, else from global ``data``.'''
        if name in features_df.columns:
            return features_df[name]
        try:
            return data[name]  # notebook-level frame
        except NameError:
            raise KeyError(
                "column %r is not in the input frame and no notebook-level "
                "'data' frame is defined" % name) from None

    def transform(self, features_df, target = None):
        '''Append the enabled indicator columns and return a NumPy array.

        The input frame is not modified; the indicators are added to a copy.
        Column order of the result: the input columns followed by the
        enabled indicators in the order listed in the class docstring.
        '''
        augmented = features_df.copy()

        if self.neg_inv_balance:
            augmented['neg_inv_balance'] = (features_df['national_inv'] < 0).astype(int)

        if self.low_inventory:
            inventory = self._source_column(features_df, 'national_inv')
            augmented['low_inventory'] = (inventory < inventory.median()).astype(int)

        if self.low_intransit:
            in_transit = self._source_column(features_df, 'in_transit_qty')
            augmented['low_intransit'] = (in_transit < in_transit.mean()).astype(int)

        if self.high_forecast:
            forecast = self._source_column(features_df, 'forecast_3_month')
            # Column label kept as originally spelled; only the values are
            # returned, so the label never leaves this method.
            augmented['high_forcast'] = (forecast > forecast.mean()).astype(int)

        return augmented.values

    
    
class RemoveNegativeValues(BaseEstimator, TransformerMixin):
    '''Transformer that replaces every value with its absolute value.'''

    def __init__(self, features_df):
        # The frame is only stored; fit and transform never read it.
        self.features_df = features_df

    def fit(self, features_df, target=None):
        '''Stateless transformer: nothing is learned, self is returned.'''
        return self

    def transform(self, features_df, target=None):
        '''Return ``features_df.abs()``, i.e. the frame with all signs dropped.'''
        return features_df.abs()


    
class SimpleImputerTransformer(BaseEstimator, TransformerMixin):
    '''Transformer that replaces missing values with the column mean.

    The means are learned in ``fit`` and reused in ``transform``, so data
    transformed later is imputed with the statistics of the fitted data.

    Args:
        features_df, target: not used by the transformer; accepted (and
            stored) only so that existing constructor calls keep working.
    '''
    def __init__(self, features_df=None, target=None):
        # Every constructor argument is stored under its own name, as
        # scikit-learn's get_params()/clone() require.
        self.features_df = features_df
        self.target = target

    def fit(self, features_df, target=None):
        '''Learn the per-column means and return self.'''
        # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
        self.imputer_ = SimpleImputer(missing_values=np.nan,
                                      strategy='mean').fit(features_df)
        return self

    def transform(self, features_df, target=None):
        '''Impute missing values and return the result as a DataFrame.

        The returned frame has default integer column labels and index.
        '''
        if not hasattr(self, 'imputer_'):
            # Backward compatibility: transform used to work without fit.
            self.fit(features_df)

        imputed = self.imputer_.transform(features_df)
        return pd.DataFrame(data=imputed)
    
    
    
class CapOutliers(BaseEstimator, TransformerMixin):
    '''Transformer that caps numeric columns at their 1st and q-th percentiles.

    Args:
        features_df: not used by the transformer; accepted (and stored) only
            so that existing ``CapOutliers(features_df)`` calls keep working.
    '''
    def __init__(self, features_df=None):
        self.features_df = features_df

    def fit(self, features_df, Target=None):
        '''Nothing to learn; returns self.'''
        return self

    def transform(self, features_df, q=0.90, target=None):
        '''Return a copy of ``features_df`` with outliers capped.

        Args:
            features_df: DataFrame to cap.
            q: upper quantile used as the cap (the lower cap is fixed at 0.01).

        Only float64 and int64 columns are capped; other columns are
        returned unchanged. The percentiles are computed on the frame passed
        in, and that frame itself is not modified.
        '''
        capped = features_df.copy()
        for col in capped.columns:
            dtype = capped[col].dtype
            if dtype == 'float64' or dtype == 'int64':
                lower, upper = capped[col].quantile([0.01, q]).values
                # clip replaces the chained assignment
                # ``df[col][mask] = value``, which writes to a temporary
                # object and is not guaranteed to reach the frame.
                capped[col] = capped[col].clip(lower=lower, upper=upper)
        return capped
    
    
    
class StandardScalerTransformer(BaseEstimator, TransformerMixin):
    '''Transformer that standardizes the numerical features.

    The scaler is learned in ``fit`` and reused in ``transform``.

    Args:
        features_df, target: not used by the transformer; accepted (and
            stored) only so that existing constructor calls keep working.
        columns: names of the columns to scale. Defaults to the
            notebook-level ``numerical_features`` list.
    '''
    def __init__(self, features_df=None, target=None, columns=None):
        # Every constructor argument is stored under its own name, as
        # scikit-learn's get_params()/clone() require.
        self.features_df = features_df
        self.target = target
        self.columns = columns

    def _selected(self, features_df):
        '''Return the columns of ``features_df`` that are to be scaled.'''
        col_names = self.columns if self.columns is not None else numerical_features
        return features_df[col_names]

    def fit(self, features_df, target=None):
        '''Learn the scaling statistics and return self.'''
        self.scaler_ = StandardScaler().fit(self._selected(features_df))
        return self

    def transform(self, features_df, target=None):
        '''Return the standardized columns as a NumPy array.'''
        if not hasattr(self, 'scaler_'):
            # Backward compatibility: transform used to work without fit.
            self.fit(features_df)
        return self.scaler_.transform(self._selected(features_df))

    


class DelUnusedCols(BaseEstimator, TransformerMixin):
    '''Transformer that deletes the first column of the array it receives.

    In the categorical pipeline, column 0 holds 'national_inv', which is
    carried along only so the indicator features can be derived from it.
    The column is no longer needed once those features exist.

    Args:
        features_df, target: not used by the transformer; accepted (and
            stored) only so that existing constructor calls keep working.
    '''
    def __init__(self, features_df=None, target=None):
        # Every constructor argument is stored under its own name, as
        # scikit-learn's get_params()/clone() require.
        self.features_df = features_df
        self.target = target

    def fit(self, features_df, target=None):
        '''Nothing to learn; returns self.'''
        return self

    def transform(self, features_df, target=None):
        '''Return a copy of the input array without its first column.'''
        return np.delete(features_df, 0, axis=1)


    
class SplitData(object):
    '''Takes the prepared data and performs the train/test split.

    All state lives on the class itself, so the methods are called as
    ``SplitData.get_dataframe(...)`` followed by ``SplitData.split_data(...)``.
    '''
    prepared_features_df = pd.DataFrame()
    feats = None
    target = None
    train_feats = None
    test_feats = None
    train_target = None
    test_target = None

    @staticmethod
    def get_dataframe(prepared_features):
        '''Wrap the prepared array in a DataFrame and separate the target.

        Args:
            prepared_features: 2-D array whose column 0 is the target and
                whose remaining columns are the features.
        '''
        SplitData.prepared_features_df = pd.DataFrame(data=prepared_features)

        # Column 0 is the target; everything else is a feature.
        SplitData.feats = SplitData.prepared_features_df.drop([0], axis=1)
        SplitData.target = SplitData.prepared_features_df[0]

    @staticmethod
    def split_data(test_frac, random_state=None):
        '''Make a stratified train/test split and store it on the class.

        Args:
            test_frac: fraction of the rows held out as the test set.
            random_state: seed for the split; defaults to the notebook-level
                ``seed``.

        Raises:
            RuntimeError: if ``get_dataframe`` has not been called yet.
        '''
        if SplitData.feats is None or SplitData.target is None:
            raise RuntimeError(
                'No data to split: call SplitData.get_dataframe() first')
        if random_state is None:
            random_state = seed  # notebook-level default

        X_train, X_test, y_train, y_test = train_test_split(
            SplitData.feats, SplitData.target,
            test_size=test_frac, random_state=random_state,
            stratify=SplitData.target)

        # Save the four data sets for the modeling step.
        SplitData.train_feats = X_train
        SplitData.test_feats = X_test
        SplitData.train_target = y_train
        SplitData.test_target = y_test

        print('\nTrain and test data assigned\n')
  
    
        
class Models(object):
    '''Holds the estimators, their hyperparameter grids, the fitted searches
    and the evaluation results.

    All state lives on the class itself and the methods are static, so they
    are called as ``Models.hyperparameters(...)``, ``Models.fit_models(...)``
    and so on. Entries registered by earlier calls stay registered.

    Note: instantiate any additional models to test as class variables below.
    '''
    estimators = {}
    fitted_grid = {}
    param_grids = {}
    best_models = {}
    best_score = None
    best_model = []
    best_params = {}
    best_model_params = []
    best_model_list = []
    best_estimator = {}
    lr = LogisticRegression(random_state=seed)
    sv = SVC(random_state=seed)
    ld = LinearDiscriminantAnalysis()
    sg = SGDClassifier(random_state=seed)
    rf = RandomForestClassifier(random_state=seed)
    gb = GradientBoostingClassifier(random_state = seed)

    results_dict={}

    def __init__(self, master, logreg, svc,lda, sgd, randforest, gradboost,
                 n_iter, scoring, n_jobs, train_features):
        '''Store the given options on the instance.

        The static methods below do not read these attributes.
        '''
        self.master = master
        self.logreg = logreg
        self.svc = svc
        self.lda = lda
        self.sgd = sgd
        self.randforest= randforest
        self.gradboost = gradboost
        self.n_iter = n_iter
        self.scoring = scoring
        self.n_jobs = n_jobs
        self.train_features = train_features

    @staticmethod
    def hyperparameters(logreg=True, svc=True, lda=True,
                        sgd=True, randforest=True, gradboost=True):
        '''Register the hyperparameter grid of every enabled model.

        Each flag enables one model; its grid is stored in
        ``Models.param_grids`` under the keys 'lr', 'sv', 'ld', 'sg', 'rf'
        and 'gb' respectively. Add the grids of additional models here.
        '''
        if logreg:
            Models.param_grids['lr'] = dict(solver=['saga'],
                                            C=[0.001, 0.01, 0.1, 1, 10, 100])

        if svc:
            Models.param_grids['sv'] = dict(
                kernel=['linear', 'rbf', 'poly', 'sigmoid'],
                C=[0.001, 0.01, 0.1, 1, 10, 100])

        if lda:
            Models.param_grids['ld'] = dict(solver=['svd', 'lsqr', 'eigen'])

        if sgd:
            # max_iter must be an integer; string values are rejected when
            # the estimator is fitted.
            Models.param_grids['sg'] = dict(max_iter=[1000, 100000],
                                            tol=[1e-3])

        if randforest:
            # 'auto' is not listed: for classifiers it was an alias of
            # 'sqrt' and newer scikit-learn releases no longer accept it.
            Models.param_grids['rf'] = dict(
                n_estimators=[10, 75, 100, 150, 200, 1000],
                max_features=['sqrt', 'log2', 0.33])

        if gradboost:
            Models.param_grids['gb'] = dict(
                learning_rate=[0.001, 0.01, 0.05, 0.1, 0.2],
                n_estimators=[10, 100, 200, 1000],
                subsample=[0.5, 0.7, 1.0],
                max_depth=[1, 3, 5, 7, 9])

        print('Hyperparameter grid is set\n')

    @staticmethod
    def setting_gridsearch(logreg=False, svc=False, lda=False, sgd=False,
                           randforest=False, gradboost=False):
        '''Select the estimators to tune.

        Every enabled model is added to ``Models.estimators`` under the same
        key its grid uses in ``Models.param_grids``. Add additional models
        here as needed.
        '''
        selection = (('lr', logreg, Models.lr),
                     ('sv', svc, Models.sv),
                     ('ld', lda, Models.ld),
                     ('sg', sgd, Models.sg),
                     ('rf', randforest, Models.rf),
                     ('gb', gradboost, Models.gb))
        for key, enabled, estimator in selection:
            if enabled:
                Models.estimators[key] = estimator

        print('\nGridSearch object set and ready for fitting\n')

    @staticmethod
    def check_hyperparams_settings():
        '''Print, for every known model key, whether a grid is registered.'''
        print('Validate gridsearch object set correctly...\n')
        for key in ['lr','sv','ld','sg','rf', 'gb']:
            if key in Models.param_grids:
                if type(Models.param_grids[key]) is dict:
                    print( key, 'was found in hyperparameters, and it is a grid.' )
                else:
                    print( key, 'was found in hyperparameters, but it is not a grid.' )
            else:
                print( key, 'was not found in hyperparameters')

    @staticmethod
    def fit_models(n_iter, scoring, n_jobs, train_features):
        '''Tune every selected estimator with a cross-validated randomized search.

        Args:
            n_iter: number of parameter settings sampled per model.
            scoring: scoring metric handed to RandomizedSearchCV.
            n_jobs: number of parallel jobs (-1 means using all processors).
            train_features: accepted for backward compatibility but not
                used; the training data is read from ``SplitData.train_feats``
                and ``SplitData.train_target``.

        Notes:
            Each fitted search object is stored in ``Models.fitted_grid``
            under the model key, so its attributes stay accessible after the
            loop has moved on to the next model.
        '''
        print('\nModel fitting started...')
        for name, estimator in Models.estimators.items():
            # random_state makes the sampled parameter settings reproducible.
            search = RandomizedSearchCV(estimator, Models.param_grids[name],
                                        n_iter=n_iter, scoring=scoring,
                                        n_jobs=n_jobs, random_state=seed)

            search.fit(SplitData.train_feats, SplitData.train_target)

            Models.fitted_grid[name] = search

            print(name,'has been fitted,')

        print('\nModel fitting completed.\n')

    @staticmethod
    def summarize_classification(y_test, y_pred):
        '''Return accuracy, precision, recall and the count of correct predictions.

        Args:
            y_test: true labels.
            y_pred: predicted labels.

        Returns:
            dict with the keys 'accuracy:', 'precision:', 'recall:' (rounded
            to 4 decimals) and 'accuracy_count:' (number of correct
            predictions).
        '''
        acc = accuracy_score(y_test, y_pred, normalize=True)
        num_acc = accuracy_score(y_test, y_pred, normalize=False)

        prec = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)

        return {'accuracy:': round(acc,4),
                'precision:': round(prec, 4),
                'recall:':round(recall, 4),
                'accuracy_count:':num_acc
               }

    @staticmethod
    def get_predictions():
        '''Score every fitted model on the train and test sets.

        For each entry of ``Models.fitted_grid`` the train summary, the test
        summary and a predicted-vs-actual crosstab of the test set are stored
        in ``Models.results_dict`` under the model key.
        '''
        for name, model in Models.fitted_grid.items():

            y_pred_train = model.predict(SplitData.train_feats)
            y_pred_test = model.predict(SplitData.test_feats)

            train_summary = Models.summarize_classification(SplitData.train_target, y_pred_train)
            test_summary = Models.summarize_classification(SplitData.test_target, y_pred_test)

            pred_results = pd.DataFrame({'y_test': SplitData.test_target,
                                         'y_pred': y_pred_test})

            model_crosstab = pd.crosstab(pred_results.y_pred, pred_results.y_test)

            Models.results_dict[name] = {'training': train_summary,
                                         'test': test_summary,
                                         'confussion_matrix': model_crosstab
                                        }

        print('Predictions done...\n')

    @staticmethod
    def compare_results():
        '''Print the stored train/test summaries and crosstab of every model.'''
        print('\n{0:*^80}\n'.format(' Model Results '))

        for key in Models.results_dict:
            result = Models.results_dict[key]

            print('Classification: ', key)

            print()
            print('Training Data')
            for score in result['training']:
                print(score, result['training'][score])

            print()
            print('Test Data')
            for score in result['test']:
                print(score, result['test'][score])

            print()
            print('Confussion Matrix')
            print(result['confussion_matrix'])

            print()
            print()
            print()
           

In [5]:
if __name__ == '__main__':
    # End-to-end run: load -> transform -> split -> tune -> evaluate.

    # --- Load data -------------------------------------------------------
    # Keep a 10% sample of the data because of resource limitations.
    data = Data.get_data(file, sample=True, frac=0.1)
    features_df = Data.get_features()
    target_df = Data.get_target()

    # --- Data transformation pipelines -----------------------------------
    # Columns sent down the categorical pipeline. 'national_inv' is only the
    # input of the engineered indicators; the target column rides along so
    # that it ends up as column 0 of the prepared array.
    categorical_features = ['national_inv', 'went_on_backorder_Yes']

    # Columns sent down the numerical pipeline.
    numerical_features = ['national_inv', 'lead_time', 'in_transit_qty',
                          'forecast_3_month']

    categorical_pipeline = Pipeline(steps=[
        ('cat_selector', FeatureSelector(categorical_features)),
        ('cat_feats_add', CategoricalFeatsAdded()),
        ('delete_unused', DelUnusedCols(features_df)),
    ])

    numerical_pipeline = Pipeline(steps=[
        ('num_selector', FeatureSelector(numerical_features)),
        ('remove_negative_values', RemoveNegativeValues(features_df)),
        ('standard_trans', StandardScalerTransformer(features_df)),
        ('impute_missing', SimpleImputerTransformer(features_df)),
        ('cap_outliers', CapOutliers(features_df)),
    ])

    # Combine both pipelines horizontally: categorical columns first,
    # numerical columns after them.
    full_pipeline = FeatureUnion(transformer_list=[
        ('categorical_pipeline', categorical_pipeline),
        ('numerical_pipeline', numerical_pipeline),
    ])

    # Disable the pandas chained_assignment warning.
    pd.options.mode.chained_assignment = None

    # NOTE(review): the pipeline is fitted on all rows before the train/test
    # split below, so the statistics it computes also include the rows that
    # later form the test set.
    prepared_features = full_pipeline.fit_transform(features_df)

    # --- Split data ------------------------------------------------------
    # Done once; repeating these two calls only reproduces the same split.
    SplitData.get_dataframe(prepared_features)
    SplitData.split_data(0.2)

    # --- Data modeling ---------------------------------------------------
    # Register the hyperparameter grids and check that they are set.
    Models.hyperparameters(logreg=True, svc=False, lda=True, sgd=False,
                           randforest=False, gradboost=False)
    Models.check_hyperparams_settings()

    # Select the models to tune and run the randomized search for each.
    Models.setting_gridsearch(logreg=True, svc=False, lda=True, sgd=False,
                              randforest=False, gradboost=False)
    Models.fit_models(n_iter=10, scoring='average_precision', n_jobs=-1,
                      train_features=SplitData.prepared_features_df)

    # --- Modeling results ------------------------------------------------
    Models.get_predictions()
    Models.compare_results()

(45267, 9) data loaded

(45267, 9) features assigned

(45267,) ...target rows loaded


Train and test data assigned


Train and test data assigned

Hyperparameter grid is set

Validate gridsearch object set correctly...

lr was found in hyperparameters, and it is a grid.
sv was not found in hyperparameters
ld was found in hyperparameters, and it is a grid.
sg was not found in hyperparameters
rf was not found in hyperparameters
gb was not found in hyperparameters

GridSearch object set and ready for fitting


Model fitting started...
lr has been fitted,
ld has been fitted,

Model fitting completed.

Predictions done...


******************************** Model Results *********************************

Classification:  lr

Training Data
accuracy: 0.8068
precision: 0.7709
recall: 0.5981
accuracy_count: 29216

Test Data
accuracy: 0.8088
precision: 0.7765
recall: 0.5987
accuracy_count: 7323

Confussion Matrix
y_test   0.0   1.0
y_pred            
0.0     5516  1211
1.0      520  1807



Cla

In [6]:
!free -m

              total        used        free      shared  buff/cache   available
Mem:           9041        2410        4480         140        2150        6229
Swap:          2047           0        2047


# 1. Loading Data

In [7]:
# Load the data, keeping a 10% sample of each target class
data = Data.get_data(file, sample=True, frac=0.1)

(45267, 9) data loaded



In [8]:
# Get the features frame (Data.get_features returns the loaded `data` frame as-is)
features_df = Data.get_features()

(45267, 9) features assigned



In [9]:
# Get the target values as an array, read from the `target` column of `data`
target_df = Data.get_target()

(45267,) ...target rows loaded



# 2. Transforming Data

In [10]:
# Columns to pass down the categorical pipeline: 'national_inv' is the input of
# the engineered indicator features, 'went_on_backorder_Yes' is the target column
categorical_features = ['national_inv', 'went_on_backorder_Yes']

In [11]:
# Columns to pass down the numerical pipeline
numerical_features = ['national_inv','lead_time','in_transit_qty',
              'forecast_3_month']

In [12]:
# Define the steps of the categorical pipeline:
# select the columns, add the indicator features, then delete column 0
categorical_pipeline = Pipeline( steps = [ ('cat_selector', FeatureSelector(categorical_features)),

                                           ('cat_feats_add', CategoricalFeatsAdded()),

                                           ('delete_unused', DelUnusedCols(features_df))

                                         ])

In [13]:
# Define the steps of the numerical pipeline:
# select the columns, take absolute values, standardize, impute, cap outliers
numerical_pipeline = Pipeline( steps = [ ('num_selector', FeatureSelector(numerical_features)),

                                         ('remove_negative_values', RemoveNegativeValues(features_df)),

                                         ('standard_trans', StandardScalerTransformer(features_df)),

                                         ('impute_missing', SimpleImputerTransformer(features_df)),

                                         ('cap_outliers', CapOutliers(features_df))

                                       ] )

In [14]:
# Combine the categorical and numerical pipelines horizontally into one full
# pipeline using FeatureUnion (categorical output columns come first)
full_pipeline = FeatureUnion(transformer_list = [('categorical_pipeline', categorical_pipeline),

                                                 ('numerical_pipeline', numerical_pipeline)])

In [15]:
# Disable the pandas chained_assignment warning
# (the custom transformers assign into the frames they receive)
pd.options.mode.chained_assignment = None

In [16]:
# Fit the data transformation pipeline on the features frame and transform it
prepared_features = full_pipeline.fit_transform(features_df)

# 3. Split Data

In [17]:
# Wrap the prepared array in a DataFrame: column 0 becomes the target,
# the remaining columns become the features
SplitData.get_dataframe(prepared_features)

In [18]:
# Stratified train/test split with 20% of the rows held out as the test set
SplitData.split_data(0.2)


Train and test data assigned



# 4. Model Data

## Set Hyperparameter Grids

In [19]:
# Set the hyperparameter grids (logistic regression and LDA only)
Models.hyperparameters(logreg=True, svc=False, lda=True, sgd=False,\
                       randforest=False, gradboost=False)

Hyperparameter grid is set



In [20]:
# Validate that the hyperparameter grids are set correctly
Models.check_hyperparams_settings()

Validate gridsearch object set correctly...

lr was found in hyperparameters, and it is a grid.
sv was not found in hyperparameters
ld was found in hyperparameters, and it is a grid.
sg was not found in hyperparameters
rf was not found in hyperparameters
gb was not found in hyperparameters


## Set GridSearch

In [21]:
# Select the models to fit (logistic regression and LDA only)
Models.setting_gridsearch(logreg=True, svc=False, lda=True, sgd=False,
                          randforest=False, gradboost=False)


GridSearch object set and ready for fitting



In [22]:
!free -m

              total        used        free      shared  buff/cache   available
Mem:           9041        2429        4447         153        2163        6197
Swap:          2047           0        2047


## Model Fitting

In [23]:
# Fit the selected models with a randomized hyperparameter search (RandomizedSearchCV).
# fit_models reads the training split from SplitData; the train_features
# argument is passed but not used by it.
Models.fit_models(n_iter=3, scoring= 'average_precision', n_jobs = -1,
                  train_features = SplitData.prepared_features_df )


Model fitting started...
lr has been fitted,
ld has been fitted,

Model fitting completed.



## Model Results

In [24]:
# Get train and test predictions from every fitted model and store their summaries
Models.get_predictions()

Predictions done...



In [25]:
# Print the performance results of every model
Models.compare_results()


******************************** Model Results *********************************

Classification:  lr

Training Data
accuracy: 0.7973
precision: 0.7424
recall: 0.6004
accuracy_count: 28874

Test Data
accuracy: 0.7893
precision: 0.7297
recall: 0.5842
accuracy_count: 7146

Confussion Matrix
y_test   0.0   1.0
y_pred            
0.0     5383  1255
1.0      653  1763



Classification:  ld

Training Data
accuracy: 0.7832
precision: 0.733
recall: 0.5499
accuracy_count: 28362

Test Data
accuracy: 0.775
precision: 0.7162
recall: 0.5384
accuracy_count: 7017

Confussion Matrix
y_test   0.0   1.0
y_pred            
0.0     5392  1393
1.0      644  1625





# 5. Analysis Summary

Our results show that the Support Vector Classifier (svc) model had the best accuracy count on the test set (3077) without showing indications of overfitting.

While the Random Forest (rf) and Gradient Boosting (gb) classifiers had higher accuracy counts, these models appear to be overfitting the training data because their training accuracies are higher than their test set accuracies.

However, all three of the above models are resource-hungry, and therefore <b>we'll focus on training the simpler logistic regression and linear discriminant analysis (lda) models</b>, which have also provided decent results.