In [4]:
import collections
import matplotlib.pyplot as plt
from scipy import stats
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from category_encoders import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.model_selection import KFold, StratifiedKFold, RandomizedSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, accuracy_score, make_scorer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, StackingClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from xgboost import XGBClassifier, plot_importance as plot_importance_xgb
from lightgbm import LGBMClassifier, plot_importance as plot_importance_lgbm

In [10]:
import os, sys
import pandas as pd
import numpy as np
from os.path import join

DataPath = r'D:\GitWork\titanic\data'
TranFile = 'train.csv'
TestFile = 'test.csv'
OutFile  = 'gender_submission.csv'

absp_train = join(DataPath, TranFile)
df = pd.read_csv(absp_train)
print(df.info())
display(df.head(10))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [11]:
# Creating a categorical variable for Ages
df['AgeCat'] = ''
df['AgeCat'].loc[(df['Age'] < 18)] = 'young'
df['AgeCat'].loc[(df['Age'] >= 18) & (df['Age'] < 56)] = 'mature'
df['AgeCat'].loc[(df['Age'] >= 56)] = 'senior'


# Creating a categorical variable for Family Sizes
df['FamilySize'] = ''
df['FamilySize'].loc[(df['SibSp'] <= 2)] = 'small'
df['FamilySize'].loc[(df['SibSp'] > 2) & (df['SibSp'] <= 5 )] = 'medium'
df['FamilySize'].loc[(df['SibSp'] > 5)] = 'large'


# Creating a categorical variable to tell if the passenger is alone
df['IsAlone'] = ''
df['IsAlone'].loc[((df['SibSp'] + df['Parch']) > 0)] = 'no'
df['IsAlone'].loc[((df['SibSp'] + df['Parch']) == 0)] = 'yes'


# Creating a categorical variable to tell if the passenger is a Young/Mature/Senior male or a Young/Mature/Senior female
df['SexCat'] = ''
df['SexCat'].loc[(df['Sex'] == 'male') & (df['Age'] <= 21)] = 'youngmale'
df['SexCat'].loc[(df['Sex'] == 'male') & ((df['Age'] > 21) & (df['Age']) < 50)] = 'maturemale'
df['SexCat'].loc[(df['Sex'] == 'male') & (df['Age'] > 50)] = 'seniormale'
df['SexCat'].loc[(df['Sex'] == 'female') & (df['Age'] <= 21)] = 'youngfemale'
df['SexCat'].loc[(df['Sex'] == 'female') & ((df['Age'] > 21) & (df['Age']) < 50)] = 'maturefemale'
df['SexCat'].loc[(df['Sex'] == 'female') & (df['Age'] > 50)] = 'seniorfemale'

# Creating a categorical variable for the passenger's title
# Title is created by extracting the prefix before "Name" feature
# This title needs to be a feature because all female titles are grouped with each other
# Also, creating a column to tell if the passenger is married or not
# "Is_Married" is a binary feature based on the Mrs title. Mrs title has the highest survival rate among other female titles
df['Title'] = df['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]
df['Is_Married'] = 0
df['Is_Married'].loc[df['Title'] == 'Mrs'] = 1
df['Title'] = df['Title'].replace(['Miss', 'Mrs','Ms', 'Mlle', 'Lady', 'Mme', 'the Countess', 'Dona'], 'Miss/Mrs/Ms')
df['Title'] = df['Title'].replace(['Dr', 'Col', 'Major', 'Jonkheer', 'Capt', 'Sir', 'Don', 'Rev'], 'Dr/Military/Noble/Clergy')


# Creating "Ticket Frequency" Feature
# There are too many unique Ticket values to analyze, so grouping them up by their frequencies makes things easier
df['Ticket_Frequency'] = df.groupby('Ticket')['Ticket'].transform('count')

df.head(10)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeCat,FamilySize,IsAlone,SexCat,Title,Is_Married,Ticket_Frequency
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,mature,small,no,maturemale,Mr,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,mature,small,no,maturefemale,Miss/Mrs/Ms,1,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,mature,small,yes,maturefemale,Miss/Mrs/Ms,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,mature,small,no,maturefemale,Miss/Mrs/Ms,1,2
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,mature,small,yes,maturemale,Mr,0,1
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,,small,yes,maturemale,Mr,0,1
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,mature,small,yes,seniormale,Mr,0,1
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S,young,medium,no,maturemale,Master,0,4
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,mature,small,no,maturefemale,Miss/Mrs/Ms,1,3
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,young,small,no,maturefemale,Miss/Mrs/Ms,1,2


In [12]:
def get_feature_names(df):
    # Splitting the target
    target = df['Survived']

    # Dropping unused columns from the feature set
    df.drop(['PassengerId', 'Survived', 'Ticket', 'Name', 'Cabin'], axis=1, inplace=True)

    # Splitting categorical and numerical column dataframes
    categorical_df = df.select_dtypes(include=['object'])
    numeric_df = df.select_dtypes(exclude=['object'])

    # And then, storing the names of categorical and numerical columns.
    categorical_columns = list(categorical_df.columns)
    numeric_columns = list(numeric_df.columns)
    
    print("Categorical columns:\n", categorical_columns)
    print("\nNumeric columns:\n", numeric_columns)

    return target, categorical_columns, numeric_columns

target, categorical_columns, numeric_columns = get_feature_names(df)

Categorical columns:
 ['Sex', 'Embarked', 'AgeCat', 'FamilySize', 'IsAlone', 'SexCat', 'Title']

Numeric columns:
 ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Is_Married', 'Ticket_Frequency']


In [14]:
# Balancing Data

# You can call any of the functions below, if you wish, inside the "defineBestModelPipeline()" function

def balancingClassesRus(x_train, y_train):
    
    # Using RandomUnderSampler to balance our training data points
    rus = RandomUnderSampler(random_state=7)
    features_balanced, target_balanced = rus.fit_resample(x_train, y_train)
    
    print("Count for each class value after RandomUnderSampler:", collections.Counter(target_balanced))
    
    return features_balanced, target_balanced


def balancingClassesSmoteenn(x_train, y_train):
    
    # Using SMOTEEN to balance our training data points
    smn = SMOTEENN(random_state=7)
    features_balanced, target_balanced = smn.fit_resample(x_train, y_train)
    
    print("Count for each class value after SMOTEEN:", collections.Counter(target_balanced))
    
    return features_balanced, target_balanced


def balancingClassesSmote(x_train, y_train):

    # Using SMOTE to to balance our training data points
    sm = SMOTE(random_state=7)
    features_balanced, target_balanced = sm.fit_resample(x_train, y_train)

    print("Count for each class value after SMOTE:", collections.Counter(target_balanced))

    return features_balanced, target_balanced

In [15]:
# Function responsible for checking our model's performance on the test data
def testSetResultsClassifier(classifier, x_test, y_test):
    predictions = classifier.predict(x_test)
    
    results = []
    f1 = f1_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    roc_auc = roc_auc_score(y_test, predictions)
    accuracy = accuracy_score(y_test, predictions)
    
    results.append(f1)
    results.append(precision)
    results.append(recall)
    results.append(roc_auc)
    results.append(accuracy)
    
    print("\n\n#---------------- Test set results (Best Classifier) ----------------#\n")
    print("F1 score, Precision, Recall, ROC_AUC score, Accuracy:")
    print(results)
    
    return results

In [16]:
# Now, we are going to create our Pipeline, fitting several different data preprocessing, feature selection 
# and modeling techniques inside a RandomSearchCV, to check which group of techniques has better performance.

# Building a Pipeline inside RandomSearchCV, responsible for finding the best model and it's parameters
def defineBestModelPipeline(df, target, categorical_columns, numeric_columns):
    
    # Splitting original data into Train and Test BEFORE applying transformations
    # Later in RandomSearchCV, x_train will be splitted into train/val sets
    # The transformations are going to be fitted specifically on the train set,
    # and then applied to both train/test sets. This way, information leakage is avoided!
    x_train, x_test, y_train, y_test = train_test_split(df, target, test_size=0.10, random_state=42)
    y_train = y_train.to_numpy() # Transforming training targets into numpy arrays
    y_test = y_test.to_numpy() # Transforming test targets into numpy arrays
    
    
    # # If desired, we can balance training classes using one of the functions below
    # # Obtaining balanced data for modeling using Random Under Sampling
    #x_train, y_train = balancingClassesRus(x_train, y_train)

    # # Obtaining balanced data for modeling using SMOTEENN
    #x_train, y_train = balancingClassesSmoteenn(x_train, y_train)

    # # Obtaining balanced data for modeling using SMOTE
    #x_train, y_train = balancingClassesSmote(x_train, y_train)
    
    
    # 1st -> Numeric Transformers
    # Here, we are creating different several different data transformation pipelines 
    # to be applied in our numeric features
    numeric_transformer_1 = Pipeline(steps=[('imp', IterativeImputer(max_iter=30, random_state=42)),
                                            ('scaler', MinMaxScaler())])
    
    numeric_transformer_2 = Pipeline(steps=[('imp', IterativeImputer(max_iter=20, random_state=42)),
                                            ('scaler', StandardScaler())])
    
    numeric_transformer_3 = Pipeline(steps=[('imp', SimpleImputer(strategy='mean')),
                                            ('scaler', MinMaxScaler())])
    
    numeric_transformer_4 = Pipeline(steps=[('imp', SimpleImputer(strategy='median')),
                                            ('scaler', StandardScaler())])
    
    
    # 2nd -> Categorical Transformer
    # Despite my option of not doing it, you can also choose to create different 
    # data transformation pipelines for your categorical features.
    categorical_transformer = Pipeline(steps=[('frequent', SimpleImputer(strategy='most_frequent')),
                                              ('onehot', OneHotEncoder(use_cat_names=True))])
    
    
    # 3rd -> Combining both numerical and categorical pipelines
    # Here, we are creating different ColumnTransformers, each one with a different numerical transformation
    data_transformations_1 = ColumnTransformer(transformers=[
        ('num', numeric_transformer_1, numeric_columns),
        ('cat', categorical_transformer, categorical_columns)
    ])
    
    data_transformations_2 = ColumnTransformer(transformers=[
        ('num', numeric_transformer_2, numeric_columns),
        ('cat', categorical_transformer, categorical_columns)
    ])
    
    data_transformations_3 = ColumnTransformer(transformers=[
        ('num', numeric_transformer_3, numeric_columns),
        ('cat', categorical_transformer, categorical_columns)
    ])
    
    data_transformations_4 = ColumnTransformer(transformers=[
        ('num', numeric_transformer_4, numeric_columns),
        ('cat', categorical_transformer, categorical_columns)
    ])

    
    # And finally, we are going to apply these different data transformations to RandomSearchCV,
    # trying to find the best imputing strategy, the best feature engineering strategy
    # and the best model with it's respective parameters.
    # Below, we just need to initialize a Pipeline object with any transformations we want, on each of the steps.
    pipe = Pipeline(steps=[('data_transformations', data_transformations_1), # Initializing data transformation step by choosing any of the above
                           ('feature_eng', PCA()), # Initializing feature engineering step by choosing any desired method
                           ('clf', SVC())]) # Initializing modeling step of the pipeline with any model object
                           #memory='cache_folder') -> Used to optimize memory when needed
    
    
    # Now, we define the grid of parameters that RandomSearchCV will use. It will randomly chose
    # options for each step inside the dictionaries ('data transformations', 'feature_eng', 'clf'
    # and 'clf parameters'). In the end of it's iterations, RandomSearchCV will return the best options.
    params_grid = [{
        'data_transformations': [
            data_transformations_1, data_transformations_2, data_transformations_3, data_transformations_4
        ],
        'feature_eng': [
            None, 
            PCA(n_components=round(x_train.shape[1]*0.9)),
            PCA(n_components=round(x_train.shape[1]*0.8)),
            PCA(n_components=round(x_train.shape[1]*0.7)),
            PolynomialFeatures(degree=2), PolynomialFeatures(degree=3)
        ],
        'clf': [KNeighborsClassifier()],
        'clf__n_neighbors': stats.randint(1, 50),
        'clf__metric': ['minkowski', 'euclidean']
    },{
        'data_transformations': [
            data_transformations_1, data_transformations_2, data_transformations_3, data_transformations_4
        ],
        'feature_eng': [
            None, 
            PCA(n_components=round(x_train.shape[1]*0.9)),
            PCA(n_components=round(x_train.shape[1]*0.8)),
            PCA(n_components=round(x_train.shape[1]*0.7)),
            PolynomialFeatures(degree=2), PolynomialFeatures(degree=3)
        ],
        'clf': [LogisticRegression()],
        'clf__penalty': ['l1', 'l2'],
        'clf__C': stats.uniform(0.01, 10)
    }, {
        'data_transformations': [
            data_transformations_1, data_transformations_2, data_transformations_3, data_transformations_4
        ],
        'feature_eng': [
            None, 
            PCA(n_components=round(x_train.shape[1]*0.9)),
            PCA(n_components=round(x_train.shape[1]*0.8)),
            PCA(n_components=round(x_train.shape[1]*0.7)),
            PolynomialFeatures(degree=2), PolynomialFeatures(degree=3)
        ],
        'clf': [SVC()],
        'clf__C': stats.uniform(0.01, 1),
        'clf__gamma': stats.uniform(0.01, 1)
    }, {
        'data_transformations': [
            data_transformations_1, data_transformations_2, data_transformations_3, data_transformations_4
        ],
        'feature_eng': [
            None, 
            PCA(n_components=round(x_train.shape[1]*0.9)),
            PCA(n_components=round(x_train.shape[1]*0.8)),
            PCA(n_components=round(x_train.shape[1]*0.7)),
            PolynomialFeatures(degree=2), PolynomialFeatures(degree=3)
        ],
        'clf': [DecisionTreeClassifier()],
        'clf__criterion': ['gini', 'entropy'],
        'clf__max_features': [None, "auto", "log2"],
        'clf__max_depth': [None, stats.randint(1, 5)]
    }, {
        'data_transformations': [
            data_transformations_1, data_transformations_2, data_transformations_3, data_transformations_4
        ],
        'feature_eng': [
            None, 
            PCA(n_components=round(x_train.shape[1]*0.9)),
            PCA(n_components=round(x_train.shape[1]*0.8)),
            PCA(n_components=round(x_train.shape[1]*0.7)),
            PolynomialFeatures(degree=2), PolynomialFeatures(degree=3)
        ],
        'clf': [RandomForestClassifier()],
        'clf__n_estimators': stats.randint(10, 175),
        'clf__max_features': [None, "auto", "log2"],
        'clf__max_depth': [None, stats.randint(1, 5)],
        'clf__random_state': stats.randint(1, 49)
    }, {
        'data_transformations': [
            data_transformations_1, data_transformations_2, data_transformations_3, data_transformations_4
        ],
        'feature_eng': [
            None, 
            PCA(n_components=round(x_train.shape[1]*0.9)),
            PCA(n_components=round(x_train.shape[1]*0.8)),
            PCA(n_components=round(x_train.shape[1]*0.7)),
            PolynomialFeatures(degree=2), PolynomialFeatures(degree=3)
        ],
        'clf': [ExtraTreesClassifier()],
        'clf__n_estimators': stats.randint(10, 150),
        'clf__max_features': [None, "auto", "log2"],
        'clf__max_depth': [None, stats.randint(1, 6)]
    }, {
        'data_transformations': [
            data_transformations_1, data_transformations_2, data_transformations_3, data_transformations_4
        ],
        'feature_eng': [
            None, 
            PCA(n_components=round(x_train.shape[1]*0.9)),
            PCA(n_components=round(x_train.shape[1]*0.8)),
            PCA(n_components=round(x_train.shape[1]*0.7)),
            PolynomialFeatures(degree=2), PolynomialFeatures(degree=3)
        ],
        'clf': [GradientBoostingClassifier()],
        'clf__n_estimators': stats.randint(10, 100),
        'clf__learning_rate': stats.uniform(0.01, 0.7),
        'clf__max_depth': [None, stats.randint(1, 6)]
    }, {
        'data_transformations': [
            data_transformations_1, data_transformations_2, data_transformations_3, data_transformations_4
        ],
        'feature_eng': [
            None,
            PCA(n_components=round(x_train.shape[1]*0.9)),
            PCA(n_components=round(x_train.shape[1]*0.8)),
            PCA(n_components=round(x_train.shape[1]*0.7)),
            PolynomialFeatures(degree=2), PolynomialFeatures(degree=3)
        ],
        'clf': [LGBMClassifier()],
        'clf__n_estimators': stats.randint(1, 100),
        'clf__learning_rate': stats.uniform(0.01, 0.7),
        'clf__max_depth': [None, stats.randint(1, 6)]
    }, {
        'data_transformations': [
            data_transformations_1, data_transformations_2, data_transformations_3, data_transformations_4
        ],
        'feature_eng': [
            None, 
            PCA(n_components=round(x_train.shape[1]*0.9)),
            PCA(n_components=round(x_train.shape[1]*0.8)),
            PCA(n_components=round(x_train.shape[1]*0.7)),
            PolynomialFeatures(degree=2), PolynomialFeatures(degree=3)
        ],
        'clf': [XGBClassifier()],
        'clf__n_estimators': stats.randint(5, 125),
        'clf__eta': stats.uniform(0.01, 1),
        'clf__max_depth': [None, stats.randint(1, 6)],
        'clf__gamma': stats.uniform(0.01, 1)
    }, {
        'data_transformations': [
            data_transformations_1, data_transformations_2, data_transformations_3, data_transformations_4
        ],
        'feature_eng': [
            None, 
            PCA(n_components=round(x_train.shape[1]*0.9)),
            PCA(n_components=round(x_train.shape[1]*0.8)),
            PCA(n_components=round(x_train.shape[1]*0.7)),
            PolynomialFeatures(degree=2), PolynomialFeatures(degree=3)
        ],
        'clf': [StackingClassifier(estimators=[
                ('svc', SVC(C=1, gamma=1)),
                ('rf', RandomForestClassifier(max_depth=7, max_features=None, n_estimators=60, n_jobs=-1, random_state=28)),
                ('xgb', XGBClassifier(eta=0.6, gamma=0.7, max_depth=None, n_estimators=30))
            ], final_estimator=LogisticRegression(C=1)
        )]
    }, {
        'data_transformations': [
            data_transformations_1, data_transformations_2, data_transformations_3, data_transformations_4
        ],
        'feature_eng': [
            None,
            PCA(n_components=round(x_train.shape[1]*0.9)),
            PCA(n_components=round(x_train.shape[1]*0.8)),
            PCA(n_components=round(x_train.shape[1]*0.7)),
            PolynomialFeatures(degree=2), PolynomialFeatures(degree=3)
        ],
        'clf': [VotingClassifier(estimators=[
                ('gbt', GradientBoostingClassifier(learning_rate=0.8, max_depth=None, n_estimators=30)),
                ('lgbm', LGBMClassifier(n_estimators=30, learning_rate=0.6, max_depth=None)),
                ('xgb', XGBClassifier(eta=0.8, gamma=0.8, max_depth=None, n_estimators=40))
            ], voting='soft')
        ]
    }]
    
    
    # Now, we fit a RandomSearchCV to search over the grid of parameters defined above
    metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
    
    best_model_pipeline = RandomizedSearchCV(pipe, params_grid, n_iter=300, 
                                             scoring=metrics, refit='accuracy', 
                                             n_jobs=-1, cv=5, random_state=21)

    best_model_pipeline.fit(x_train, y_train)
    
        
    # At last, we check the final results
    outstr1 = "\n\n#---------------- Best Data Pipeline found in RandomSearchCV  ----------------#\n\n"
    print(outstr1, best_model_pipeline.best_estimator_[0])
    
    outstr2 = "\n\n#---------------- Best Feature Engineering technique found in RandomSearchCV  ----------------#\n\n"
    print(outstr2, best_model_pipeline.best_estimator_[1])
    
    outstr3 = "\n\n#---------------- Best Classifier found in RandomSearchCV  ----------------#\n\n"
    print(outstr3, best_model_pipeline.best_estimator_[2])
    
    outstr4 = "\n\n#---------------- Best Estimator's average Accuracy Score on CV (validation set) ----------------#\n\n"
    print(outstr4, best_model_pipeline.best_score_)
    
    return x_train, x_test, y_train, y_test, best_model_pipeline

In [None]:
# Calling the function above, returing train/test data and best model's pipeline
x_train, x_test, y_train, y_test, best_model_pipeline = defineBestModelPipeline(
    df, target, categorical_columns, numeric_columns
)

# Checking best model's performance on test data
test_set_results = testSetResultsClassifier(best_model_pipeline, x_test, y_test)

In [None]:
# Visualizing all results and metrics, from all models, obtained by the RandomSearchCV steps
df_results = pd.DataFrame(best_model_pipeline.cv_results_)

display(df_results)

In [None]:
# Now visualizing all results and metrics obtained only by the best classifier
display(df_results[df_results['rank_test_accuracy'] == 1])

In [None]:
# Here, we access the categorical feature names generated by OneHotEncoder, and then concatenate them
# with the numerical feature names, in the same order our pipeline is applying data transformations.
categorical_features_after_onehot = best_model_pipeline.best_estimator_.named_steps['data_transformations']\
                                        .transformers_[1][1].named_steps['onehot'].get_feature_names()

feature_names_in_order = numeric_columns + categorical_features_after_onehot

print(feature_names_in_order)

In [None]:
# # Plotting feature importances of the best model, if sklearn tree-based (top 5 features)
#print("\n#---------------- Bar plot with feature importances ----------------#")
#feat_importances = pd.Series(best_model_pipeline.best_estimator_.named_steps['clf'].feature_importances_, index=feature_names_in_order)
#feat_importances.nlargest(5).plot(kind='barh')


# # Plotting feature importances of the best model, if linear regression-based (top 5 features)
#print("\n#---------------- Bar plot with feature importances ----------------#")
#feat_importances = pd.Series(best_model_pipeline.best_estimator_.named_steps['clf'].coef_, index=feature_names_in_order)
#feat_importances.nlargest(5).plot(kind='barh')


# # Plotting feature importances for XGB Model
#plot_importance_xgb(best_model_pipeline.best_estimator_.named_steps['clf'], height=0.4, 
#title='Feature Importances for XGB Classifier', importance_type='gain')


# # Plotting feature importances for LGBM Model
#plot_importance_lgbm(best_model_pipeline.best_estimator_.named_steps['clf'], 
#                     figsize=(10, 4), title='Feature importances for LGBM Classifier',
#                     importance_type='gain', max_num_features=10)