In [231]:
# importing libraries
import pandas            as pd                       # data science essentials
import matplotlib.pyplot as plt                      # data visualization
import seaborn           as sns                      # enhanced data viz
from sklearn.model_selection import train_test_split # train-test split
from sklearn.linear_model import LogisticRegression  # logistic regression
import statsmodels.formula.api as smf                # logistic regression
from sklearn.metrics import confusion_matrix         # confusion matrix
from sklearn.metrics import roc_auc_score            # auc score
from sklearn.neighbors import KNeighborsClassifier   # KNN for classification
from sklearn.neighbors import KNeighborsRegressor    # KNN for regression
from sklearn.preprocessing import StandardScaler     # standard scaler
from sklearn.tree import DecisionTreeClassifier      # classification trees
from sklearn.tree import plot_tree                   # tree plots
from sklearn.model_selection import RandomizedSearchCV     # hyperparameter tuning
from sklearn.metrics import make_scorer 
from sklearn.ensemble import RandomForestClassifier     # random forest
from sklearn.ensemble import GradientBoostingClassifier # gbm


file="./GOT_character_predictions.xlsx"

GOT=pd.read_excel(io=file,
                 header=0,
                 sheet_name=0)
GOT.head(n=5)

Unnamed: 0,S.No,name,title,culture,dateOfBirth,mother,father,heir,house,spouse,...,isAliveMother,isAliveFather,isAliveHeir,isAliveSpouse,isMarried,isNoble,age,numDeadRelations,popularity,isAlive
0,1,Viserys II Targaryen,,,,Rhaenyra Targaryen,Daemon Targaryen,Aegon IV Targaryen,,,...,1.0,0.0,0.0,,0,0,,11,0.605351,0
1,2,Walder Frey,Lord of the Crossing,Rivermen,208.0,,,,House Frey,Perra Royce,...,,,,1.0,1,1,97.0,1,0.896321,1
2,3,Addison Hill,Ser,,,,,,House Swyft,,...,,,,,0,1,,0,0.267559,1
3,4,Aemma Arryn,Queen,,82.0,,,,House Arryn,Viserys I Targaryen,...,,,,0.0,1,1,23.0,0,0.183946,0
4,5,Sylva Santagar,Greenstone,Dornish,276.0,,,,House Santagar,Eldon Estermont,...,,,,1.0,1,1,29.0,0,0.043478,1


In [232]:
#Getting the info of the data frame
GOT.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1946 entries, 0 to 1945
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   S.No                        1946 non-null   int64  
 1   name                        1946 non-null   object 
 2   title                       938 non-null    object 
 3   culture                     677 non-null    object 
 4   dateOfBirth                 433 non-null    float64
 5   mother                      21 non-null     object 
 6   father                      26 non-null     object 
 7   heir                        23 non-null     object 
 8   house                       1519 non-null   object 
 9   spouse                      276 non-null    object 
 10  book1_A_Game_Of_Thrones     1946 non-null   int64  
 11  book2_A_Clash_Of_Kings      1946 non-null   int64  
 12  book3_A_Storm_Of_Swords     1946 non-null   int64  
 13  book4_A_Feast_For_Crows     1946 

In [233]:
# Checking for missing values
GOT.isnull().any()

S.No                          False
name                          False
title                          True
culture                        True
dateOfBirth                    True
mother                         True
father                         True
heir                           True
house                          True
spouse                         True
book1_A_Game_Of_Thrones       False
book2_A_Clash_Of_Kings        False
book3_A_Storm_Of_Swords       False
book4_A_Feast_For_Crows       False
book5_A_Dance_with_Dragons    False
isAliveMother                  True
isAliveFather                  True
isAliveHeir                    True
isAliveSpouse                  True
isMarried                     False
isNoble                       False
age                            True
numDeadRelations              False
popularity                    False
isAlive                       False
dtype: bool

In [234]:
# creating a loop to split the name

# placeholder list
placeholder_lst_names = []


for index, col in GOT.iterrows():
    

    split_name = GOT.loc[index, 'name'].split(sep =' ')
    
   
    placeholder_lst_names.append(split_name)
    

# converting placeholder_lst into a DataFrame 
names_df = pd.DataFrame(placeholder_lst_names)


# displaying the results
names_df

Unnamed: 0,0,1,2,3,4,5
0,Viserys,II,Targaryen,,,
1,Walder,Frey,,,,
2,Addison,Hill,,,,
3,Aemma,Arryn,,,,
4,Sylva,Santagar,,,,
...,...,...,...,...,...,...
1941,Luwin,,,,,
1942,Reek,,,,,
1943,Symeon,Star-Eyes,,,,
1944,Coldhands,,,,,


In [235]:
# STEP 2: concatenating with original DataFrame

# safety measure in case of multiple concatenations
GOT2 = pd.DataFrame.copy(GOT)


# renaming column to concatenate
names_df.columns = ['First_Name','Lastname1','Lastname2','Lastname3','Lastname4','Lastname5']


# concatenating names_df with GOT2 DataFrame
GOT2 = pd.concat([GOT2, names_df],
                   axis = 1)


# printing value counts of personal_email_domain
print(GOT2.loc[: ,'Lastname1'].value_counts())
print(GOT2.loc[: ,'Lastname2'].value_counts())
print(GOT2.loc[: ,'First_Name'].value_counts())

GOT2.head(n=5)

Frey         92
Targaryen    41
Stark        40
Tyrell       34
Lannister    31
             ..
Mole          1
Boggs         1
Tumitis       1
Mertyns       1
Star-Eyes     1
Name: Lastname1, Length: 460, dtype: int64
(son         26
Targaryen    16
(daughter    13
Loraq         6
the           5
             ..
Whittler      1
Gloom         1
Lannister     1
Port)         1
Tongue        1
Name: Lastname2, Length: 81, dtype: int64
Jon          21
Jeyne        14
Aegon        12
Walder       10
Alyn         10
             ..
Kyleg         1
Kurz          1
Kromm         1
Koss          1
Coldhands     1
Name: First_Name, Length: 1442, dtype: int64


Unnamed: 0,S.No,name,title,culture,dateOfBirth,mother,father,heir,house,spouse,...,age,numDeadRelations,popularity,isAlive,First_Name,Lastname1,Lastname2,Lastname3,Lastname4,Lastname5
0,1,Viserys II Targaryen,,,,Rhaenyra Targaryen,Daemon Targaryen,Aegon IV Targaryen,,,...,,11,0.605351,0,Viserys,II,Targaryen,,,
1,2,Walder Frey,Lord of the Crossing,Rivermen,208.0,,,,House Frey,Perra Royce,...,97.0,1,0.896321,1,Walder,Frey,,,,
2,3,Addison Hill,Ser,,,,,,House Swyft,,...,,0,0.267559,1,Addison,Hill,,,,
3,4,Aemma Arryn,Queen,,82.0,,,,House Arryn,Viserys I Targaryen,...,23.0,0,0.183946,0,Aemma,Arryn,,,,
4,5,Sylva Santagar,Greenstone,Dornish,276.0,,,,House Santagar,Eldon Estermont,...,29.0,0,0.043478,1,Sylva,Santagar,,,,


In [236]:
#Imputing missing values
age_median=GOT2['age'].median()
GOT2['age'].fillna(value=age_median, inplace=True)


dateOfBirth_median=GOT2['dateOfBirth'].median()
GOT2['dateOfBirth'].fillna(value=dateOfBirth_median, inplace=True)


GOT2['isAliveMother'].fillna(value=0, inplace=True)
GOT2['isAliveFather'].fillna(value=0, inplace=True)
GOT2['isAliveHeir'].fillna(value=0, inplace=True)
GOT2['isAliveSpouse'].fillna(value=0, inplace=True)
GOT2['title'].fillna(value='Unknown', inplace=True)

GOT2.isnull().any()

S.No                          False
name                          False
title                         False
culture                        True
dateOfBirth                   False
mother                         True
father                         True
heir                           True
house                          True
spouse                         True
book1_A_Game_Of_Thrones       False
book2_A_Clash_Of_Kings        False
book3_A_Storm_Of_Swords       False
book4_A_Feast_For_Crows       False
book5_A_Dance_with_Dragons    False
isAliveMother                 False
isAliveFather                 False
isAliveHeir                   False
isAliveSpouse                 False
isMarried                     False
isNoble                       False
age                           False
numDeadRelations              False
popularity                    False
isAlive                       False
First_Name                    False
Lastname1                      True
Lastname2                   

In [237]:
#Last letter of first name column to guess gender
GOT2['first_name_last_letter']=0

for row, col in GOT2.iterrows():
     GOT2.loc[row,'first_name_last_letter']=GOT2.loc[row,'First_Name'][-1]
    
GOT2['first_name_last_letter'].head(n=5) 


0    s
1    r
2    n
3    a
4    a
Name: first_name_last_letter, dtype: object

In [238]:
#
GOT2['House2']=0

for row, col in GOT2.iterrows():
    if GOT2.loc[row,'Lastname1']=='Lannister' or GOT2.loc[row,'Lastname2']=='Lannister':
        GOT2.loc[row,'House2']='Lannister'
        
    elif GOT2.loc[row,'Lastname1']=='Baratheon' or GOT2.loc[row,'Lastname2']=='Baratheon':
        GOT2.loc[row,'House2']='Baratheon'
        
    elif GOT2.loc[row,'Lastname1']=='Targaryen' or GOT2.loc[row,'Lastname2']=='Targaryen':
        GOT2.loc[row,'House2']='Targaryen'
    
    elif GOT2.loc[row,'Lastname1']=='Stark' or GOT2.loc[row,'Lastname2']=='Stark':
        GOT2.loc[row,'House2']='Stark'
        
    elif GOT2.loc[row,'Lastname1']=='Tyrell' or GOT2.loc[row,'Lastname2']=='Tyrell':
        GOT2.loc[row,'House2']='Tyrell'
        
    elif GOT2.loc[row,'Lastname1']=='Frey' or GOT2.loc[row,'Lastname2']=='Frey':
        GOT2.loc[row,'House2']='Frey'
        
    elif GOT2.loc[row,'Lastname1']=='Arryn' or GOT2.loc[row,'Lastname2']=='Arryn':
        GOT2.loc[row,'House2']='Arryn'
        
    elif GOT2.loc[row,'Lastname1']=='Greyjoy' or GOT2.loc[row,'Lastname2']=='Greyjoy':
        GOT2.loc[row,'House2']='Greyjoy'
        
    else:
        GOT2.loc[row,'House2']='Other'

#Gender column        
GOT2['Gender']=0 

for row, col in GOT2.iterrows():
    if GOT2.loc[row,'first_name_last_letter']=='a' or GOT2.loc[row,'first_name_last_letter']=='s' or GOT2.loc[row,'first_name_last_letter']=='i' or 'Princess' in GOT2.loc[row,'title'] or 'Queen' in GOT2.loc[row,'title'] or 'Lady' in GOT2.loc[row,'title']:
        GOT2.loc[row,'Gender']='Female'
        
    else:
        GOT2.loc[row,'Gender']='Male'

#Dummy for gender
GOT2['IsMale']=0 

for row, col in GOT2.iterrows():
    if GOT2.loc[row,'Gender']=='Male':
        GOT2.loc[row,'IsMale']=1 

        
#Dummies for houses        
GOT2['House_Lannister']=0       
for row, col in GOT2.iterrows():
    if GOT2.loc[row,'House2']=='Lannister':
        GOT2.loc[row,'House_Lannister']=1
    else:
        GOT2.loc[row,'House_Lannister']=0
        
GOT2['House_Baratheon']=0        
for row, col in GOT2.iterrows():
    if GOT2.loc[row,'House2']=='Baratheon':
        GOT2.loc[row,'House_Baratheon']=1
    else:
        GOT2.loc[row,'House_Baratheon']=0
        
GOT2['House_Targaryen']=0        
for row, col in GOT2.iterrows():
    if GOT2.loc[row,'House2']=='Targaryen':
        GOT2.loc[row,'House_Targaryen']=1
    else:
        GOT2.loc[row,'House_Targaryen']=0
        
GOT2['House_Stark']=0        
for row, col in GOT2.iterrows():
    if GOT2.loc[row,'House2']=='Stark':
        GOT2.loc[row,'House_Stark']=1
    else:
        GOT2.loc[row,'House_Stark']=0
        
GOT2['House_Tyrell']=0        
for row, col in GOT2.iterrows():
    if GOT2.loc[row,'House2']=='Tyrell':
        GOT2.loc[row,'House_Tyrell']=1
    else:
        GOT2.loc[row,'House_Tyrell']=0
        
GOT2['House_Frey']=0        
for row, col in GOT2.iterrows():
    if GOT2.loc[row,'House2']=='Frey':
        GOT2.loc[row,'House_Frey']=1
    else:
        GOT2.loc[row,'House_Frey']=0
        
GOT2['House_Arryn']=0        
for row, col in GOT2.iterrows():
    if GOT2.loc[row,'House2']=='Arryn':
        GOT2.loc[row,'House_Arryn']=1
    else:
        GOT2.loc[row,'House_Arryn']=0
        
GOT2['House_Greyjoy']=0        
for row, col in GOT2.iterrows():
    if GOT2.loc[row,'House2']=='Greyjoy':
        GOT2.loc[row,'House_Greyjoy']=1
    else:
        GOT2.loc[row,'House_Greyjoy']=0

GOT2['House_Other']=0 
for row, col in GOT2.iterrows():
    if GOT2.loc[row,'House2']=='Other':
        GOT2.loc[row,'House_Other']=1
    else:
        GOT2.loc[row,'House_Other']=0
        
#Dummy for title
GOT2['title_dummy']=0
for row, col in GOT2.iterrows():
    if GOT2.loc[row,'title']=='Unknown':
        GOT2.loc[row,'title_dummy']=1
        
#Dummy for age and year of birth
GOT2['age+yob']=0
for row, col in GOT2.iterrows():
    GOT2.loc[row,'age+yob']=GOT2.loc[row,'age']+GOT2.loc[row,'dateOfBirth']


GOT2.head(n=10)
   

Unnamed: 0,S.No,name,title,culture,dateOfBirth,mother,father,heir,house,spouse,...,House_Baratheon,House_Targaryen,House_Stark,House_Tyrell,House_Frey,House_Arryn,House_Greyjoy,House_Other,title_dummy,age+yob
0,1,Viserys II Targaryen,Unknown,,268.0,Rhaenyra Targaryen,Daemon Targaryen,Aegon IV Targaryen,,,...,0,1,0,0,0,0,0,0,1,295
1,2,Walder Frey,Lord of the Crossing,Rivermen,208.0,,,,House Frey,Perra Royce,...,0,0,0,0,1,0,0,0,0,305
2,3,Addison Hill,Ser,,268.0,,,,House Swyft,,...,0,0,0,0,0,0,0,1,0,295
3,4,Aemma Arryn,Queen,,82.0,,,,House Arryn,Viserys I Targaryen,...,0,0,0,0,0,1,0,0,0,105
4,5,Sylva Santagar,Greenstone,Dornish,276.0,,,,House Santagar,Eldon Estermont,...,0,0,0,0,0,0,0,1,0,305
5,6,Tommen Baratheon,Unknown,,268.0,Cersei Lannister,Robert Baratheon,Myrcella Baratheon,,,...,1,0,0,0,0,0,0,0,1,295
6,7,Valarr Targaryen,Hand of the King,Valyrian,183.0,,,,House Targaryen,Kiera of Tyrosh,...,0,1,0,0,0,0,0,0,0,209
7,8,Viserys I Targaryen,Unknown,,268.0,Alyssa Targaryen,Baelon Targaryen,Rhaenyra Targaryen,,,...,0,1,0,0,0,0,0,0,1,295
8,9,Wilbert,Ser,,268.0,,,,,,...,0,0,0,0,0,0,0,1,0,295
9,10,Wilbert Osgrey,Ser,,268.0,,,,House Osgrey,,...,0,0,0,0,0,0,0,1,0,295


In [239]:
#Creating a correlation matrix of the features with the target variable.
GOT3=pd.DataFrame.copy(GOT2)

GOT_corr=GOT3.corr().round(2)

GOT_corr['isAlive'].sort_values(ascending=False)

isAlive                       1.00
age+yob                       0.31
book4_A_Feast_For_Crows       0.27
House_Other                   0.11
House_Tyrell                  0.06
age                           0.05
House_Frey                    0.05
title_dummy                   0.04
book5_A_Dance_with_Dragons    0.03
House_Stark                   0.02
book3_A_Storm_Of_Swords       0.01
isAliveSpouse                -0.01
House_Greyjoy                -0.01
IsMale                       -0.02
House_Baratheon              -0.04
isNoble                      -0.04
isAliveFather                -0.04
House_Lannister              -0.05
dateOfBirth                  -0.05
isMarried                    -0.05
House_Arryn                  -0.06
book2_A_Clash_Of_Kings       -0.07
isAliveHeir                  -0.08
isAliveMother                -0.12
S.No                         -0.13
book1_A_Game_Of_Thrones      -0.15
popularity                   -0.18
numDeadRelations             -0.19
House_Targaryen     

<strong>LOGISTIC REGRESSION</strong><br>

In [240]:
# train/test split with the full model
GOT_x_data   = GOT2.loc[:, ['book1_A_Game_Of_Thrones', 
       'book4_A_Feast_For_Crows','popularity','age+yob','House_Targaryen',
        'numDeadRelations']]

GOT_target = GOT2.loc[: , 'isAlive']


# This is the exact code we were using before
X_train, X_test, y_train, y_test = train_test_split(
            GOT_x_data ,
            GOT_target,
            test_size    = 0.10,
            random_state = 219,
            stratify     = GOT_target) #very important for 0 and 1 or multinomial


# INSTANTIATING a logistic regression model
logreg = LogisticRegression(solver = 'lbfgs',
                            C = 1,
                            random_state = 219)


# FITTING the training data
logreg_fit = logreg.fit(X_train, y_train)


# PREDICTING based on the testing set
logreg_pred = logreg_fit.predict(X_test)


# SCORING the results
print('Training ACCURACY:', logreg_fit.score(X_train, y_train).round(4))
print('Testing  ACCURACY:', logreg_fit.score(X_test, y_test).round(4))
print('AUC Score:', roc_auc_score(y_true  = y_test,
                    y_score = logreg_pred).round(decimals = 4))


# saving scoring data for future use
logreg_train_score = logreg_fit.score(X_train, y_train).round(4) # accuracy
logreg_test_score  = logreg_fit.score(X_test, y_test).round(4) # accuracy
# saving AUC score for future use
logreg_auc_score = roc_auc_score(y_true  = y_test,
                                 y_score = logreg_pred).round(decimals = 4)

Training ACCURACY: 0.7955
Testing  ACCURACY: 0.8667
AUC Score: 0.7793


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [241]:
# unpacking the confusion matrix
logreg_tn, \
logreg_fp, \
logreg_fn, \
logreg_tp = confusion_matrix(y_true = y_test, y_pred = logreg_pred).ravel()


# printing each result one-by-one
print(f"""
True Negatives : {logreg_tn}
False Positives: {logreg_fp}
False Negatives: {logreg_fn}
True Positives : {logreg_tp}
""")


True Negatives : 30
False Positives: 20
False Negatives: 6
True Positives : 139



<strong>LOGISTIC TUNED</strong><br>

In [242]:
########################################
# RandomizedSearchCV
########################################

# declaring a hyperparameter space
"""
C_range          = np.arange(0.1, 11.0, 0.1)
warm_start_range = [True, False]
solver_range     = ['newton-cg', 'sag', 'lbfgs']


# creating a hyperparameter grid
param_grid = {'C'          : C_range,
              'warm_start' : warm_start_range,
              'solver'     : solver_range}


# INSTANTIATING the model object without hyperparameters
lr_tuned = LogisticRegression(random_state = 219,
                              max_iter     = 1000) # increased for convergence


# GridSearchCV object
lr_tuned_cv = RandomizedSearchCV(estimator           = lr_tuned,   # the model object
                                 param_distributions = param_grid, # parameters to tune
                                 cv                  = 3,          # how many folds in cross-validation
                                 n_iter              = 250,        # number of combinations of hyperparameters to try
                                 random_state        = 219,        # starting point for random sequence
                                 scoring = make_scorer(
                                           roc_auc_score,
                                           needs_threshold = False)) # scoring criteria (AUC)


# FITTING to the FULL DATASET (due to cross-validation)
lr_tuned_cv.fit(GOT_x_data, GOT_target)


# PREDICT step is not needed


# printing the optimal parameters and best score
print("Tuned Parameters  :", lr_tuned_cv.best_params_)
print("Tuned CV AUC      :", lr_tuned_cv.best_score_.round(4))

"""

'\nC_range          = np.arange(0.1, 11.0, 0.1)\nwarm_start_range = [True, False]\nsolver_range     = [\'newton-cg\', \'sag\', \'lbfgs\']\n\n\n# creating a hyperparameter grid\nparam_grid = {\'C\'          : C_range,\n              \'warm_start\' : warm_start_range,\n              \'solver\'     : solver_range}\n\n\n# INSTANTIATING the model object without hyperparameters\nlr_tuned = LogisticRegression(random_state = 219,\n                              max_iter     = 1000) # increased for convergence\n\n\n# GridSearchCV object\nlr_tuned_cv = RandomizedSearchCV(estimator           = lr_tuned,   # the model object\n                                 param_distributions = param_grid, # parameters to tune\n                                 cv                  = 3,          # how many folds in cross-validation\n                                 n_iter              = 250,        # number of combinations of hyperparameters to try\n                                 random_state        = 219,       

In [243]:
# building a model based on hyperparameter tuning results

# INSTANTIATING a logistic regression model with tuned values
lr_tuned = LogisticRegression(C            = 10.9,
                              warm_start   = False,
                              solver       = 'lbfgs',
                              max_iter     = 1000,
                              random_state = 219)


# FITTING the model to the full dataset
lr_tuned.fit(GOT_x_data, GOT_target) # this is ok because already tuned


# PREDICTING based on the testing set
lr_tuned_pred = lr_tuned.predict(X_test)


# SCORING the results
print('Training ACCURACY:', lr_tuned.score(X_train, y_train).round(4))
print('Testing  ACCURACY:', lr_tuned.score(X_test, y_test).round(4))
print('AUC Score        :', roc_auc_score(y_true  = y_test,
                                          y_score = lr_tuned_pred).round(4))


# saving scoring data for future use
lr_tuned_train_score = lr_tuned.score(X_train, y_train).round(4) # accuracy
lr_tuned_test_score  = lr_tuned.score(X_test, y_test).round(4)   # accuracy


# saving the AUC score
lr_tuned_auc_score   = roc_auc_score(y_true  = y_test,
                                     y_score = lr_tuned_pred).round(4) # auc

Training ACCURACY: 0.7984
Testing  ACCURACY: 0.8513
AUC Score        : 0.7428


In [244]:
# unpacking the confusion matrix
lr_tuned_tn, \
lr_tuned_fp, \
lr_tuned_fn, \
lr_tuned_tp = confusion_matrix(y_true = y_test, y_pred = lr_tuned_pred).ravel()


# printing each result one-by-one
print(f"""
True Negatives : {lr_tuned_tn}
False Positives: {lr_tuned_fp}
False Negatives: {lr_tuned_fn}
True Positives : {lr_tuned_tp}
""")


True Negatives : 26
False Positives: 24
False Negatives: 5
True Positives : 140



<strong>FULL TREE</strong><br>

In [245]:
# INSTANTIATING a classification tree object
full_tree = DecisionTreeClassifier()


# FITTING the training data
full_tree_fit = full_tree.fit(X_train, y_train)


# PREDICTING on new data
full_tree_pred = full_tree_fit.predict(X_test)


# SCORING the model
print('Training ACCURACY:', full_tree_fit.score(X_train, y_train).round(4))
print('Testing ACCURACY :', full_tree_fit.score(X_test, y_test).round(4))


# AUC score
print('AUC Score:', roc_auc_score(y_true  = y_test, y_score = full_tree_pred).round(4))


# saving scoring data for future use
full_tree_train_score = full_tree_fit.score(X_train, y_train).round(4) # accuracy
full_tree_test_score  = full_tree_fit.score(X_test, y_test).round(4)   # accuracy


# saving AUC
full_tree_auc_score   = roc_auc_score(y_true  = y_test,
                                      y_score = full_tree_pred).round(4) # auc

Training ACCURACY: 0.8818
Testing ACCURACY : 0.8974
AUC Score: 0.8328


In [246]:
# unpacking the confusion matrix
full_tree_tn, \
full_tree_fp, \
full_tree_fn, \
full_tree_tp = confusion_matrix(y_true = y_test, y_pred = full_tree_pred).ravel()


# printing each result one-by-one
print(f"""
True Negatives : {full_tree_tn}
False Positives: {full_tree_fp}
False Negatives: {full_tree_fn}
True Positives : {full_tree_tp}
""")


True Negatives : 35
False Positives: 15
False Negatives: 5
True Positives : 140



<strong>TUNED FULL TREE</strong><br>

In [247]:
"""
# declaring a hyperparameter space
criterion_range = ['gini', 'entropy']
splitter_range  = ['best', 'random']
depth_range     = np.arange(1, 25, 1)
leaf_range      = np.arange(1, 100, 1)


# creating a hyperparameter grid
param_grid = {'criterion'        : criterion_range,
              'splitter'         : splitter_range,
              'max_depth'        : depth_range,
              'min_samples_leaf' : leaf_range}


# INSTANTIATING the model object without hyperparameters
tuned_tree = DecisionTreeClassifier(random_state = 219)


# RandomizedSearchCV object
tuned_tree_cv = RandomizedSearchCV(estimator             = tuned_tree,
                                   param_distributions   = param_grid,
                                   cv                    = 3,
                                   n_iter                = 1000,
                                   random_state          = 219,
                                   scoring = make_scorer(roc_auc_score,
                                             needs_threshold = False))


# FITTING to the FULL DATASET (due to cross-validation)
tuned_tree_cv.fit(GOT_x_data, GOT_target)


# PREDICT step is not needed


# printing the optimal parameters and best score
print("Tuned Parameters  :", tuned_tree_cv.best_params_)
print("Tuned Training AUC:", tuned_tree_cv.best_score_.round(4))
"""

'\n# declaring a hyperparameter space\ncriterion_range = [\'gini\', \'entropy\']\nsplitter_range  = [\'best\', \'random\']\ndepth_range     = np.arange(1, 25, 1)\nleaf_range      = np.arange(1, 100, 1)\n\n\n# creating a hyperparameter grid\nparam_grid = {\'criterion\'        : criterion_range,\n              \'splitter\'         : splitter_range,\n              \'max_depth\'        : depth_range,\n              \'min_samples_leaf\' : leaf_range}\n\n\n# INSTANTIATING the model object without hyperparameters\ntuned_tree = DecisionTreeClassifier(random_state = 219)\n\n\n# RandomizedSearchCV object\ntuned_tree_cv = RandomizedSearchCV(estimator             = tuned_tree,\n                                   param_distributions   = param_grid,\n                                   cv                    = 3,\n                                   n_iter                = 1000,\n                                   random_state          = 219,\n                                   scoring = make_scorer(ro

In [248]:
# building a model based on hyperparameter tuning results

# INSTANTIATING a logistic regression model with tuned values
tuned_tree = DecisionTreeClassifier(splitter         = 'best',
                                    min_samples_leaf = 4,
                                    max_depth        = 8,
                                    criterion        = 'entropy',
                                    random_state     = 219)


# FITTING to the FULL DATASET (due to cross-validation)
tuned_tree_fit = tuned_tree.fit(GOT_x_data, GOT_target)


# PREDICTING based on the testing set
tuned_tree_pred = tuned_tree.predict(X_test)


# SCORING the results
print('Training ACCURACY:', tuned_tree.score(X_train, y_train).round(4))
print('Testing  ACCURACY:', tuned_tree.score(X_test, y_test).round(4))
print('AUC Score        :', roc_auc_score(y_true  = y_test,
                                          y_score = tuned_tree_pred).round(4))


# saving scoring data for future use
tuned_tree_train_score = tuned_tree.score(X_train, y_train).round(4) # accuracy
tuned_tree_test_score  = tuned_tree.score(X_test, y_test).round(4)   # accuracy


# saving the AUC score
tuned_tree_auc_score   = roc_auc_score(y_true  = y_test,
                                       y_score = tuned_tree_pred).round(4) # auc

Training ACCURACY: 0.8567
Testing  ACCURACY: 0.9231
AUC Score        : 0.8631


In [249]:
# unpacking the confusion matrix
tuned_tree_tn, \
tuned_tree_fp, \
tuned_tree_fn, \
tuned_tree_tp = confusion_matrix(y_true = y_test, y_pred = tree_tuned_pred).ravel()


# printing each result one-by-one
print(f"""
True Negatives : {tuned_tree_tn}
False Positives: {tuned_tree_fp}
False Negatives: {tuned_tree_fn}
True Positives : {tuned_tree_tp}
""")


True Negatives : 37
False Positives: 13
False Negatives: 2
True Positives : 143



<strong>PRUNED TREE</strong><br>

In [250]:
# INSTANTIATING a classification tree object
pruned_tree = DecisionTreeClassifier(max_depth=4,
                    min_samples_split=25,
                    random_state = 219)


# FITTING the training data
pruned_tree_fit = pruned_tree.fit(X_train, y_train)


# PREDICTING on new data
pruned_tree_pred = pruned_tree_fit.predict(X_test)


# SCORING the model
print('Training ACCURACY:', pruned_tree_fit.score(X_train, y_train).round(4))
print('Testing  ACCURACY:', pruned_tree_fit.score(X_test, y_test).round(4))


# AUC score
print('AUC Score        :', roc_auc_score(y_true  = y_test,
                                          y_score = pruned_tree_pred).round(4))


# saving scoring data for future use
pruned_tree_train_score = pruned_tree_fit.score(X_train, y_train).round(4) # accuracy
pruned_tree_test_score  = pruned_tree_fit.score(X_test, y_test).round(4) # accuracy


# saving auc score
pruned_tree_auc_score   = roc_auc_score(y_true  = y_test,
                                        y_score = pruned_tree_pred).round(4) # auc

Training ACCURACY: 0.8281
Testing  ACCURACY: 0.8513
AUC Score        : 0.71


In [251]:
# unpacking the confusion matrix
pruned_tree_tn, \
pruned_tree_fp, \
pruned_tree_fn, \
pruned_tree_tp = confusion_matrix(y_true = y_test, y_pred = pruned_tree_pred).ravel()


# printing each result one-by-one
print(f"""
True Negatives : {pruned_tree_tn}
False Positives: {pruned_tree_fp}
False Negatives: {pruned_tree_fn}
True Positives : {pruned_tree_tp}
""")


True Negatives : 21
False Positives: 29
False Negatives: 0
True Positives : 145



<strong>KNN</strong><br>

In [252]:
########################################
# optimal_neighbors
########################################
def optimal_neighbors(x_data,
                      y_data,
                      standardize = True,
                      pct_test=0.10,
                      seed=219,
                      response_type='class',
                      max_neighbors=20,
                      show_viz=True):
    """
Exhaustively compute training and testing results for KNN across
[1, max_neighbors]. Outputs the maximum test score and (by default) a
visualization of the results.
PARAMETERS
----------
x_data        : explanatory variable data
y_data        : response variable
standardize   : whether or not to standardize the x data, default True
pct_test      : test size for training and validation from (0,1), default 0.25
seed          : random seed to be used in algorithm, default 219
response_type : type of neighbors algorithm to use, default 'reg'
    Use 'reg' for regression (KNeighborsRegressor)
    Use 'class' for classification (KNeighborsClassifier)
max_neighbors : maximum number of neighbors in exhaustive search, default 20
show_viz      : display or surpress k-neigbors visualization, default True
"""    
    
    
    if standardize == True:
        # optionally standardizing x_data
        scaler             = StandardScaler()
        scaler.fit(x_data)
        x_scaled           = scaler.transform(x_data)
        x_scaled_df        = pd.DataFrame(x_scaled)
        x_data             = x_scaled_df



    # train-test split
    x_train, x_test, y_train, y_test = train_test_split(x_data,
                                                        y_data,
                                                        test_size = pct_test,
                                                        random_state = seed)


    # creating lists for training set accuracy and test set accuracy
    training_accuracy = []
    test_accuracy = []
    
    
    # setting neighbor range
    neighbors_settings = range(1, max_neighbors + 1)


    for n_neighbors in neighbors_settings:
        # building the model based on response variable type
        if response_type == 'reg':
            clf = KNeighborsRegressor(n_neighbors = n_neighbors)
            clf.fit(x_train, y_train)
            
        elif response_type == 'class':
            clf = KNeighborsClassifier(n_neighbors = n_neighbors)
            clf.fit(x_train, y_train)            
            
        else:
            print("Error: response_type must be 'reg' or 'class'")
        
        
        # recording the training set accuracy
        training_accuracy.append(clf.score(x_train, y_train))
    
        # recording the generalization accuracy
        test_accuracy.append(clf.score(x_test, y_test))
    
    
    # returning optimal number of neighbors
    print(f"The optimal number of neighbors is: {test_accuracy.index(max(test_accuracy))+1}")
    return test_accuracy.index(max(test_accuracy))+1


########################################
# visual_cm
########################################
def visual_cm(true_y, pred_y, labels = None):
    """
Creates a visualization of a confusion matrix.

PARAMETERS
----------
true_y : true values for the response variable
pred_y : predicted values for the response variable
labels : , default None
    """
    # visualizing the confusion matrix

    # setting labels
    lbls = labels
    

    # declaring a confusion matrix object
    cm = confusion_matrix(y_true = true_y,
                          y_pred = pred_y)



In [253]:
# determining the optimal number of neighbors
opt_neighbors = optimal_neighbors(x_data        = GOT_x_data,
                                  y_data        = GOT_target,
                                  response_type = 'class')

The optimal number of neighbors is: 4


In [254]:
# INSTANTIATING StandardScaler()
scaler = StandardScaler()


# FITTING the data
scaler.fit(GOT_x_data)


# TRANSFORMING the data
x_scaled     = scaler.transform(GOT_x_data)


# converting to a DataFrame
x_scaled_df  = pd.DataFrame(x_scaled) 


# train-test split with the scaled data
x_train_scaled, x_test_scaled, y_train_scaled, y_test_scaled = train_test_split(
            x_scaled_df,
            GOT_target,
            random_state = 219,
            test_size    = 0.10,
            stratify     = GOT_target)


# INSTANTIATING a KNN classification model with optimal neighbors
knn_opt = KNeighborsClassifier(n_neighbors = opt_neighbors)


# FITTING the training data
knn_fit = knn_opt.fit(x_train_scaled, y_train_scaled)


# PREDICTING based on the testing set
knn_pred = knn_fit.predict(x_test_scaled)


# SCORING the results
print('Training ACCURACY:', knn_fit.score(x_train_scaled, y_train_scaled).round(4))
print('Testing  ACCURACY:', knn_fit.score(x_test_scaled, y_test_scaled).round(4))
print('AUC Score        :', roc_auc_score(y_true  = y_test,
                                          y_score = knn_pred).round(4))


# saving scoring data
knn_train_score = knn_fit.score(x_train_scaled, y_train_scaled).round(4)
knn_test_score  = knn_fit.score(x_test_scaled, y_test_scaled).round(4)


# saving AUC score
knn_auc_score   = roc_auc_score(y_true  = y_test_scaled,
                                          y_score = knn_pred).round(4)

Training ACCURACY: 0.8087
Testing  ACCURACY: 0.7641
AUC Score        : 0.73


In [255]:
# unpacking the confusion matrix
knn_tree_tn, \
knn_tree_fp, \
knn_tree_fn, \
knn_tree_tp = confusion_matrix(y_true = y_test_scaled, y_pred = knn_pred).ravel()


# printing each result one-by-one
print(f"""
True Negatives : {knn_tree_tn}
False Positives: {knn_tree_fp}
False Negatives: {knn_tree_fn}
True Positives : {knn_tree_tp}
""")


True Negatives : 33
False Positives: 17
False Negatives: 29
True Positives : 116



<strong>RANDOM FOREST</strong><br>

In [256]:
# train/test split
X_train_RF, X_test_RF, y_train_RF, y_test_RF = train_test_split(
            GOT_x_data,
            GOT_target,
            random_state = 219,
            test_size    = 0.10,
            stratify     = GOT_target)

# INSTANTIATING a random forest model with default values
rf_default = RandomForestClassifier(n_estimators     = 100,
                                    criterion        = 'gini',
                                    max_depth        = None,
                                    min_samples_leaf = 1,
                                    bootstrap        = True,
                                    warm_start       = False,
                                    random_state     = 219)

# FITTING the training data
rf_default_fit = rf_default.fit(X_train_RF, y_train_RF)


# PREDICTING based on the testing set
rf_default_fit_pred = rf_default_fit.predict(X_test_RF)


# SCORING the results
print('Training ACCURACY:', rf_default_fit.score(X_train_RF, y_train_RF).round(4))
print('Testing  ACCURACY:', rf_default_fit.score(X_test_RF, y_test_RF).round(4))


# AUC score
print('AUC Score        :', roc_auc_score(y_true  = y_test_RF,
                                          y_score = rf_default_fit_pred).round(4))

# saving scoring data
RF_train_score=rf_default_fit.score(X_train_RF, y_train_RF).round(4)
RF_test_score=rf_default_fit.score(X_test_RF, y_test_RF).round(4)

# saving AUC score
RF_auc_score=roc_auc_score(y_true  = y_test_RF,
                           y_score = rf_default_fit_pred).round(4)

Training ACCURACY: 0.8818
Testing  ACCURACY: 0.8923
AUC Score        : 0.8162


In [257]:
# unpacking the confusion matrix
rf_tn, \
rf_fp, \
rf_fn, \
rf_tp = confusion_matrix(y_true = y_test, y_pred = rf_default_fit_pred).ravel()


# printing each result one-by-one
print(f"""
True Negatives : {rf_tn}
False Positives: {rf_fp}
False Negatives: {rf_fn}
True Positives : {rf_tp}
""")


True Negatives : 33
False Positives: 17
False Negatives: 4
True Positives : 141



<strong>RANDOM FOREST TUNED</strong><br>

In [258]:
"""
# FITTING the training data
rf_default_fit = rf_default.fit(X_train, y_train)


# PREDICTING based on the testing set
rf_default_fit_pred = rf_default_fit.predict(X_test)


# declaring a hyperparameter space
estimator_range  = np.arange(100, 1100, 250)
leaf_range       = np.arange(1, 31, 10)
criterion_range  = ['gini', 'entropy']
bootstrap_range  = [True, False]
warm_start_range = [True, False]


# creating a hyperparameter grid
param_grid = {'n_estimators'     : estimator_range,
              'min_samples_leaf' : leaf_range,
              'criterion'        : criterion_range,
              'bootstrap'        : bootstrap_range,
              'warm_start'       : warm_start_range}


# INSTANTIATING the model object without hyperparameters
forest_grid = RandomForestClassifier(random_state = 219)


# GridSearchCV object
forest_cv = RandomizedSearchCV(estimator           = forest_grid,
                               param_distributions = param_grid,
                               cv         = 3,
                               n_iter     = 1000,
                               scoring    = make_scorer(roc_auc_score,
                                            needs_threshold = False))


# FITTING to the FULL DATASET (due to cross-validation)
forest_cv.fit(GOT_x_data, GOT_target)


# PREDICT step is not needed


# printing the optimal parameters and best score
print("Tuned Parameters  :", forest_cv.best_params_)
print("Tuned Training AUC:", forest_cv.best_score_.round(4))
"""

'\n# FITTING the training data\nrf_default_fit = rf_default.fit(X_train, y_train)\n\n\n# PREDICTING based on the testing set\nrf_default_fit_pred = rf_default_fit.predict(X_test)\n\n\n# declaring a hyperparameter space\nestimator_range  = np.arange(100, 1100, 250)\nleaf_range       = np.arange(1, 31, 10)\ncriterion_range  = [\'gini\', \'entropy\']\nbootstrap_range  = [True, False]\nwarm_start_range = [True, False]\n\n\n# creating a hyperparameter grid\nparam_grid = {\'n_estimators\'     : estimator_range,\n              \'min_samples_leaf\' : leaf_range,\n              \'criterion\'        : criterion_range,\n              \'bootstrap\'        : bootstrap_range,\n              \'warm_start\'       : warm_start_range}\n\n\n# INSTANTIATING the model object without hyperparameters\nforest_grid = RandomForestClassifier(random_state = 219)\n\n\n# GridSearchCV object\nforest_cv = RandomizedSearchCV(estimator           = forest_grid,\n                               param_distributions = param

In [259]:
# building a model based on hyperparameter tuning results

# INSTANTIATING with best_estimator
forest_tuned = RandomForestClassifier(criterion        = 'entropy',
                                      min_samples_leaf = 1,
                                      n_estimators     = 350,
                                      warm_start       = True,
                                      bootstrap        = False,
                                      random_state     = 219)


# FITTING to the FULL DATASET (due to cross-validation)
forest_tuned_fit = forest_tuned.fit(GOT_x_data, GOT_target)


# PREDICTING based on the testing set
forest_tuned_pred = forest_tuned_fit.predict(X_test)


# SCORING the results
print('Forest Tuned Training ACCURACY:', forest_tuned.score(X_train, y_train).round(4))
print('Forest Tuned Testing  ACCURACY:', forest_tuned.score(X_test, y_test).round(4))
print('Forest Tuned AUC Score        :', roc_auc_score(y_true  = y_test,
                                                       y_score = forest_tuned_pred).round(4))


# saving scoring data for future use
forest_tuned_train_score = forest_tuned.score(X_train, y_train).round(4) # accuracy
forest_tuned_test_score  = forest_tuned.score(X_test, y_test).round(4)   # accuracy


# saving the AUC score
forest_tuned_auc_score = roc_auc_score(y_true  = y_test,
                                 y_score = forest_tuned_pred).round(4) # auc

Forest Tuned Training ACCURACY: 0.8806
Forest Tuned Testing  ACCURACY: 0.9333
Forest Tuned AUC Score        : 0.8897


In [260]:
# unpacking the confusion matrix
tuned_rf_tn, \
tuned_rf_fp, \
tuned_rf_fn, \
tuned_rf_tp = confusion_matrix(y_true = y_test, y_pred = forest_tuned_pred).ravel()


# printing each result one-by-one
print(f"""
True Negatives : {tuned_rf_tn}
False Positives: {tuned_rf_fp}
False Negatives: {tuned_rf_fn}
True Positives : {tuned_rf_tp}
""")


True Negatives : 40
False Positives: 10
False Negatives: 3
True Positives : 142



<strong>GBM</strong><br>

In [261]:
# train/test split
X_train_GBM, X_test_GBM, y_train_GBM, y_test_GBM = train_test_split(
            GOT_x_data,
            GOT_target,
            random_state = 219,
            test_size    = 0.10,
            stratify     = GOT_target)

# INSTANTIATING the model object without hyperparameters
full_gbm_default = GradientBoostingClassifier(loss          = 'deviance',
                                              learning_rate = 0.1,
                                              n_estimators  = 100,
                                              criterion     = 'friedman_mse',
                                              max_depth     = 3,
                                              warm_start    = False,
                                              random_state  = 219)


# FIT step is needed as we are not using .best_estimator
full_gbm_default_fit = full_gbm_default.fit(X_train_GBM, y_train_GBM)


# PREDICTING based on the testing set
full_gbm_default_pred = full_gbm_default_fit.predict(X_test_GBM)


# SCORING the results
print('Training ACCURACY:', full_gbm_default_fit.score(X_train_GBM, y_train_GBM).round(4))
print('Testing ACCURACY :', full_gbm_default_fit.score(X_test_GBM, y_test_GBM).round(4))


# AUC score
print('AUC Score        :', roc_auc_score(y_true  = y_test_GBM,
                                          y_score = full_gbm_default_pred).round(4))

# saving scoring data
GBM_train_score=rf_default_fit.score(X_train_GBM, y_train_GBM).round(4)
GBM_test_score=rf_default_fit.score(X_test_GBM, y_test_GBM).round(4)

# saving AUC score
GBM_auc_score=roc_auc_score(y_true  = y_test_GBM,
                           y_score = rf_default_fit_pred).round(4)

Training ACCURACY: 0.8607
Testing ACCURACY : 0.9026
AUC Score        : 0.8231


In [262]:
# unpacking the confusion matrix
gbm_default_tn, \
gbm_default_fp, \
gbm_default_fn, \
gbm_default_tp = confusion_matrix(y_true = y_test_GBM, y_pred = full_gbm_default_pred).ravel()


# printing each result one-by-one
print(f"""
True Negatives : {gbm_default_tn}
False Positives: {gbm_default_fp}
False Negatives: {gbm_default_fn}
True Positives : {gbm_default_tp}
""")


True Negatives : 33
False Positives: 17
False Negatives: 2
True Positives : 143



<strong>GBM TUNED</strong><br>

In [263]:
"""
# declaring a hyperparameter space
learn_range        = np.arange(0.1, 2.2, 0.5)
estimator_range    = np.arange(100, 501, 25)
depth_range        = np.arange(2, 11, 2)
warm_start_range   = [True, False]

# creating a hyperparameter grid
param_grid = {'learning_rate' : learn_range,
              'max_depth'     : depth_range,
              'n_estimators'  : estimator_range,
              'warm_start'    : warm_start_range}


# INSTANTIATING the model object without hyperparameters
full_gbm_grid = GradientBoostingClassifier(random_state = 219)


# GridSearchCV object
full_gbm_cv = RandomizedSearchCV(estimator     = full_gbm_grid,
                           param_distributions = param_grid,
                           cv                  = 3,
                           n_iter              = 500,
                           random_state        = 219,
                           scoring             = make_scorer(roc_auc_score,
                                                 needs_threshold = False))


# FITTING to the FULL DATASET (due to cross-validation)
full_gbm_cv.fit(GOT_x_data, GOT_target)


# PREDICT step is not needed


# printing the optimal parameters and best score
print("Tuned Parameters  :", full_gbm_cv.best_params_)
print("Tuned Training AUC:", full_gbm_cv.best_score_.round(4))
"""

'\n# declaring a hyperparameter space\nlearn_range        = np.arange(0.1, 2.2, 0.5)\nestimator_range    = np.arange(100, 501, 25)\ndepth_range        = np.arange(2, 11, 2)\nwarm_start_range   = [True, False]\n\n# creating a hyperparameter grid\nparam_grid = {\'learning_rate\' : learn_range,\n              \'max_depth\'     : depth_range,\n              \'n_estimators\'  : estimator_range,\n              \'warm_start\'    : warm_start_range}\n\n\n# INSTANTIATING the model object without hyperparameters\nfull_gbm_grid = GradientBoostingClassifier(random_state = 219)\n\n\n# GridSearchCV object\nfull_gbm_cv = RandomizedSearchCV(estimator     = full_gbm_grid,\n                           param_distributions = param_grid,\n                           cv                  = 3,\n                           n_iter              = 500,\n                           random_state        = 219,\n                           scoring             = make_scorer(roc_auc_score,\n                                 

In [264]:
# INSTANTIATING with best_estimator
gbm_tuned = GradientBoostingClassifier(learning_rate = 1.1,
                                       max_depth     = 2,
                                       n_estimators  = 175,
                                       warm_start    = False,
                                       random_state  = 219)


# FITTING to the FULL DATASET (due to cross-validation)
gbm_tuned_fit = gbm_tuned.fit(GOT_x_data, GOT_target)


# PREDICTING based on the testing set
gbm_tuned_pred = gbm_tuned_fit.predict(X_test)


# SCORING the results
print('Training ACCURACY:', gbm_tuned_fit.score(X_train, y_train).round(4))
print('Testing  ACCURACY:', gbm_tuned_fit.score(X_test, y_test).round(4))
print('AUC Score        :', roc_auc_score(y_true  = y_test,
                                          y_score = gbm_tuned_pred).round(4))

# saving scoring data
GBM_tuned_train_score=gbm_tuned_fit.score(X_train, y_train).round(4)
GBM_tuned_test_score=gbm_tuned_fit.score(X_test, y_test).round(4)

# saving AUC score
GBM_tuned_auc_score=roc_auc_score(y_true  = y_test,
                           y_score = rf_default_fit_pred).round(4)

Training ACCURACY: 0.8658
Testing  ACCURACY: 0.9231
AUC Score        : 0.8697


In [265]:
# unpacking the confusion matrix
gbm_tuned_tn, \
gbm_tuned_fp, \
gbm_tuned_fn, \
gbm_tuned_tp = confusion_matrix(y_true = y_test, y_pred = gbm_tuned_pred).ravel()


# printing each result one-by-one
print(f"""
True Negatives : {gbm_tuned_tn}
False Positives: {gbm_tuned_fp}
False Negatives: {gbm_tuned_fn}
True Positives : {gbm_tuned_tp}
""")


True Negatives : 38
False Positives: 12
False Negatives: 3
True Positives : 142



In [266]:
#Comparing results in dynamic string
print(f"""
Model                               Train Score      Test Score      AUC Score       Confussion Matrix       GAP     
-----                               -----------       ----------     ----------      ------------------      ----
Logistic Regression                   {logreg_train_score}            {logreg_test_score}        {logreg_auc_score}          {logreg_tn}, {logreg_fp}, {logreg_fn}, {logreg_tp}         {abs(logreg_train_score-logreg_test_score).round(2)}
Tuned Logistic Regression             {lr_tuned_train_score}            {lr_tuned_test_score}        {lr_tuned_auc}          {lr_tuned_tn}, {lr_tuned_fp}, {lr_tuned_fn}, {lr_tuned_tp}         {abs(lr_tuned_train_score-lr_tuned_test_score).round(2)}
Full Tree Regression                  {full_tree_train_score}            {full_tree_test_score}        {full_tree_auc_score}          {full_tree_tn}, {full_tree_fp}, {full_tree_fn}, {full_tree_tp}         {abs(full_tree_train_score-full_tree_test_score).round(2)}
Full Tree Regression Tuned            {tuned_tree_train_score}            {tuned_tree_test_score}        {tuned_tree_auc_score}          {tuned_tree_tn}, {tuned_tree_fp}, {tuned_tree_fn}, {tuned_tree_tp}         {abs(tuned_tree_train_score-tuned_tree_test_score).round(2)}
Pruned Tree Regression                {pruned_tree_train_score}            {pruned_tree_test_score}        {pruned_tree_auc_score}            {pruned_tree_tn}, {pruned_tree_fp}, {pruned_tree_fn}, {pruned_tree_tp}         {abs(pruned_tree_train_score-pruned_tree_test_score).round(2)}
KNN Regression                        {knn_train_score}            {knn_test_score}        {knn_auc_score}            {knn_tree_tn}, {knn_tree_fp}, {knn_tree_fn},{knn_tree_tp}         {abs(knn_train_score-knn_test_score).round(2)}
Random Forest                         {RF_train_score}            {RF_test_score}        {RF_auc_score}          {rf_tn}, {rf_fp}, {rf_fn}, {rf_tp}         {abs(RF_train_score-RF_test_score).round(2)}
Random Forest Tuned (FINAL MODEL!!!)  {forest_tuned_train_score}            {forest_tuned_test_score}        {forest_tuned_auc_score}          {tuned_rf_tn}, {tuned_rf_fp}, {tuned_rf_fn}, {tuned_rf_tp}         {abs(forest_tuned_train_score-forest_tuned_test_score).round(2)}
GBM Regression                        {GBM_train_score}            {GBM_test_score}        {GBM_auc_score}          {gbm_default_tn}, {gbm_default_fp}, {gbm_default_fn}, {gbm_default_tp}         {abs(GBM_train_score-GBM_test_score).round(2)}
GBM Regression Tuned                  {GBM_tuned_train_score}            {GBM_tuned_test_score}        {GBM_tuned_auc_score}          {gbm_tuned_tn}, {gbm_tuned_fp}, {gbm_tuned_fn}, {gbm_tuned_tp}         {abs(GBM_tuned_train_score-GBM_tuned_test_score).round(2)}
""")


Model                               Train Score      Test Score      AUC Score       Confussion Matrix       GAP     
-----                               -----------       ----------     ----------      ------------------      ----
Logistic Regression                   0.7955            0.8667        0.7793          30, 20, 6, 139         0.07
Tuned Logistic Regression             0.7984            0.8513        0.7428          26, 24, 5, 140         0.05
Full Tree Regression                  0.8818            0.8974        0.8328          35, 15, 5, 140         0.02
Full Tree Regression Tuned            0.8567            0.9231        0.8631          37, 13, 2, 143         0.07
Pruned Tree Regression                0.8281            0.8513        0.71            21, 29, 0, 145         0.02
KNN Regression                        0.8087            0.7641        0.73            33, 17, 29,116         0.04
Random Forest                         0.8818            0.8923        0.8162       