Using `encodedData_01.csv` dataset

In [19]:
# First code block is the import libraries  
import numpy as np
import pandas as pd
from sklearn import metrics
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score

import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)


In [2]:
dataPath = '../../preprocess_train_dataset/encodedData_01.csv'

df_spaceship = pd.read_csv(dataPath)

df_spaceship.shape

(8693, 36)

In [3]:
train_x, train_y = df_spaceship.drop(columns=['Transported']), df_spaceship['Transported']

In [4]:
# drop categorical data, remain the one hot encoded ones 
train_x.drop(['PassengerId', 'HomePlanet', 'Cabin_deck', 'Cabin_side', 'Destination', 'First_Name', 'First_Name_le', 'Last_Name'], axis=1, inplace=True)



In [5]:
train_x.columns

Index(['Earth', 'Europa', 'Mars', 'CryoSleep', 'Cabin_deck_A', 'Cabin_deck_B',
       'Cabin_deck_C', 'Cabin_deck_D', 'Cabin_deck_E', 'Cabin_deck_F',
       'Cabin_deck_G', 'Cabin_deck_T', 'Cabin_num', 'Cabin_side_P',
       'Cabin_side_S', '55 Cancri e', 'PSO J318.5-22', 'TRAPPIST-1e', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Total_spending', 'Last_Name_le'],
      dtype='object')

In [6]:
k_folds = KFold(n_splits = 8)

logisticReg_model = LogisticRegression(solver = 'liblinear', class_weight = 'balanced', max_iter =300, penalty = 'l1')

scores = cross_val_score(logisticReg_model, train_x, train_y, cv = k_folds)

print("Cross Validation Scores: ", scores)
print("\nAverage CV Score: ", scores.mean())

Cross Validation Scores:  [0.78196872 0.75712971 0.79300828 0.80588776 0.79484821 0.77348066
 0.81860037 0.7771639 ]

Average CV Score:  0.7877609527294783


In [7]:
# drop the spending activities, only keep the total spending
train_x1 = train_x.drop(['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'], axis=1, inplace=False)
train_x1

Unnamed: 0,Earth,Europa,Mars,CryoSleep,Cabin_deck_A,Cabin_deck_B,Cabin_deck_C,Cabin_deck_D,Cabin_deck_E,Cabin_deck_F,...,Cabin_num,Cabin_side_P,Cabin_side_S,55 Cancri e,PSO J318.5-22,TRAPPIST-1e,Age,VIP,Total_spending,Last_Name_le
0,False,True,False,False,False,True,False,False,False,False,...,0,True,False,False,False,True,39.0,False,0.0,1431
1,True,False,False,False,False,False,False,False,False,True,...,0,False,True,False,False,True,24.0,False,736.0,2109
2,False,True,False,False,True,False,False,False,False,False,...,0,False,True,False,False,True,58.0,True,10383.0,1990
3,False,True,False,False,True,False,False,False,False,False,...,0,False,True,False,False,True,33.0,False,5176.0,1990
4,True,False,False,False,False,False,False,False,False,True,...,1,False,True,False,False,True,16.0,False,1091.0,1778
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,False,True,False,False,True,False,False,False,False,False,...,98,True,False,True,False,False,41.0,True,8536.0,1416
8689,True,False,False,True,False,False,False,False,False,False,...,1499,False,True,False,True,False,18.0,False,0.0,1341
8690,True,False,False,False,False,False,False,False,False,False,...,1500,False,True,False,False,True,26.0,False,1873.0,470
8691,False,True,False,False,False,False,False,False,True,False,...,608,False,True,True,False,False,32.0,False,4637.0,996


In [8]:
# result of dropping the consumption details
scores = cross_val_score(logisticReg_model, train_x1, train_y, cv = k_folds)

print("Cross Validation Scores: ", scores)
print("\nAverage CV Score: ", scores.mean())

Cross Validation Scores:  [0.71941122 0.71573137 0.75436983 0.72033119 0.71113155 0.71086556
 0.75690608 0.72836096]

Average CV Score:  0.7271384697098304


In [10]:
# use the normalized data for cross validation

normalizedDataPath = '../../preprocess_train_dataset/normalizedData_01.csv'

df_normalizedSpaceship = pd.read_csv(normalizedDataPath)

df_normalizedSpaceship.shape

(8693, 36)

In [11]:
df_normalizedSpaceship

Unnamed: 0,PassengerId,HomePlanet,Earth,Europa,Mars,CryoSleep,Cabin_deck,Cabin_deck_A,Cabin_deck_B,Cabin_deck_C,...,FoodCourt,ShoppingMall,Spa,VRDeck,Total_spending,First_Name,Last_Name,First_Name_le,Last_Name_le,Transported
0,0001_01,Europa,False,True,False,False,B,False,True,False,...,0.000000,0.000000,0.000000,0.000000,0.000000,Maham,Ofracculy,1614,1431,False
1,0002_01,Earth,True,False,False,False,F,False,False,False,...,0.000302,0.001064,0.024500,0.001823,0.020452,Juanna,Vines,1407,2109,True
2,0003_01,Europa,False,True,False,False,A,True,False,False,...,0.119948,0.000000,0.299670,0.002030,0.288521,Altark,Susent,156,1990,False
3,0003_02,Europa,False,True,False,False,A,True,False,False,...,0.043035,0.015793,0.148563,0.007997,0.143830,Solam,Susent,2276,1990,False
4,0004_01,Earth,True,False,False,False,F,False,False,False,...,0.002348,0.006428,0.025214,0.000083,0.030317,Willy,Santantines,2642,1778,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,True,False,False,A,True,False,False,...,0.228726,0.000000,0.073322,0.003066,0.237197,Gravior,Noxnuther,1135,1416,False
8689,9278_01,Earth,True,False,False,True,G,False,False,False,...,0.000000,0.000000,0.000000,0.000000,0.000000,Kurta,Mondalley,1492,1341,False
8690,9279_01,Earth,True,False,False,False,G,False,False,False,...,0.000000,0.079687,0.000045,0.000000,0.052047,Fayey,Connon,955,470,True
8691,9280_01,Europa,False,True,False,False,E,False,False,False,...,0.035186,0.000000,0.015753,0.134049,0.128852,Celeon,Hontichre,526,996,False


In [12]:
# drop categorical data, remain the encoded ones 
train_nx, train_ny = df_normalizedSpaceship.drop(columns=['Transported']), df_spaceship['Transported']
train_nx.drop(['PassengerId', 'HomePlanet', 'Cabin_deck', 'Cabin_side', 'Destination', 'First_Name', 'First_Name_le', 'Last_Name'], axis=1, inplace=True)


In [13]:
scores = cross_val_score(logisticReg_model, train_nx, train_ny, cv = k_folds)

print("Cross Validation Scores: ", scores)
print("\nAverage CV Score: ", scores.mean())

Cross Validation Scores:  [0.77092916 0.75712971 0.79944802 0.80404784 0.79300828 0.76519337
 0.8213628  0.77532228]

Average CV Score:  0.7858051838147468


In [14]:
# drop the spending activities, only keep the total spending
train_nx1 = train_nx.drop(['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'], axis=1, inplace=False)
train_nx1

Unnamed: 0,Earth,Europa,Mars,CryoSleep,Cabin_deck_A,Cabin_deck_B,Cabin_deck_C,Cabin_deck_D,Cabin_deck_E,Cabin_deck_F,...,Cabin_num,Cabin_side_P,Cabin_side_S,55 Cancri e,PSO J318.5-22,TRAPPIST-1e,Age,VIP,Total_spending,Last_Name_le
0,False,True,False,False,False,True,False,False,False,False,...,0,True,False,False,False,True,0.493671,False,0.000000,1431
1,True,False,False,False,False,False,False,False,False,True,...,0,False,True,False,False,True,0.303797,False,0.020452,2109
2,False,True,False,False,True,False,False,False,False,False,...,0,False,True,False,False,True,0.734177,True,0.288521,1990
3,False,True,False,False,True,False,False,False,False,False,...,0,False,True,False,False,True,0.417722,False,0.143830,1990
4,True,False,False,False,False,False,False,False,False,True,...,1,False,True,False,False,True,0.202532,False,0.030317,1778
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,False,True,False,False,True,False,False,False,False,False,...,98,True,False,True,False,False,0.518987,True,0.237197,1416
8689,True,False,False,True,False,False,False,False,False,False,...,1499,False,True,False,True,False,0.227848,False,0.000000,1341
8690,True,False,False,False,False,False,False,False,False,False,...,1500,False,True,False,False,True,0.329114,False,0.052047,470
8691,False,True,False,False,False,False,False,False,True,False,...,608,False,True,True,False,False,0.405063,False,0.128852,996


In [15]:
# result of dropping the consumption details
scores = cross_val_score(logisticReg_model, train_nx1, train_ny, cv = k_folds)

print("Cross Validation Scores: ", scores)
print("\nAverage CV Score: ", scores.mean())

Cross Validation Scores:  [0.71941122 0.71481141 0.75436983 0.71941122 0.71021159 0.70994475
 0.75690608 0.72559853]

Average CV Score:  0.7263330783527406


In [20]:
param_grid = {
    'penalty': ['l1', 'l2', 'elasticnet'],
    'C': np.logspace(-4, 4, 20),
    'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'], 
    'max_iter': [100, 200, 300]
    
}
# param_grid = {
#     'penalty': ['l1', 'l2', 'elasticnet', 'none'],
#     'C': np.logspace(-4, 4, 20),
#     'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
#     'max_iter': [100, 1000,2500, 5000]
# }

grid_search = GridSearchCV(estimator=LogisticRegression(), param_grid=param_grid, cv=8)


X_train = train_nx
y_train = train_ny
grid_search.fit(X_train, y_train)

# Get the best parameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

4800 fits failed out of a total of 8640.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
480 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\lucas\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\lucas\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\lucas\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1168, in fit
    solver = _check_solve

In [21]:
# Print the cross-validation score of the best model
print("Best Cross-Validation Score:", grid_search.best_score_)

print("\n\nBest Model parameters:", best_params) 

Best Cross-Validation Score: 0.7890266433541553


Best Model parameters: {'C': 545.5594781168514, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}


In [22]:
fullParamGrid_result = pd.DataFrame(grid_search.cv_results_)
fullParamGrid_result.columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_C', 'param_max_iter', 'param_penalty', 'param_solver', 'params',
       'split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'split5_test_score',
       'split6_test_score', 'split7_test_score', 'mean_test_score',
       'std_test_score', 'rank_test_score'],
      dtype='object')

In [23]:
fullParamGrid_result.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_max_iter,param_penalty,param_solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,mean_test_score,std_test_score,rank_test_score
0,0.000609,0.000571,0.0,0.0,0.0001,100,l1,lbfgs,"{'C': 0.0001, 'max_iter': 100, 'penalty': 'l1'...",,,,,,,,,,,481
1,0.006422,0.000602,0.003439,0.002144,0.0001,100,l1,liblinear,"{'C': 0.0001, 'max_iter': 100, 'penalty': 'l1'...",0.49954,0.50138,0.513339,0.4885,0.49586,0.513812,0.57919,0.546961,0.517323,0.028714,475
2,0.000646,0.000502,0.0,0.0,0.0001,100,l1,newton-cg,"{'C': 0.0001, 'max_iter': 100, 'penalty': 'l1'...",,,,,,,,,,,481
3,0.000372,0.00048,0.0,0.0,0.0001,100,l1,newton-cholesky,"{'C': 0.0001, 'max_iter': 100, 'penalty': 'l1'...",,,,,,,,,,,481
4,0.000252,0.000437,0.0,0.0,0.0001,100,l1,sag,"{'C': 0.0001, 'max_iter': 100, 'penalty': 'l1'...",,,,,,,,,,,481


In [24]:
fullParamGrid_result.to_csv('logModel_gridResult.csv', index=True, index_label=True)

In [25]:
df_paramGrid = fullParamGrid_result[['param_C', 'param_max_iter', 'param_penalty',
    'param_solver', 'mean_test_score', 'rank_test_score']]

In [27]:
df_paramGrid.sort_values(by='mean_test_score', ascending=False,)

Unnamed: 0,param_C,param_max_iter,param_penalty,param_solver,mean_test_score,rank_test_score
907,545.559478,300,l2,liblinear,0.789027,1
889,545.559478,200,l2,liblinear,0.789027,1
871,545.559478,100,l2,liblinear,0.789027,1
739,29.763514,300,l1,liblinear,0.788911,4
757,78.475997,100,l1,liblinear,0.788911,4
...,...,...,...,...,...,...
1075,10000.0,300,elasticnet,liblinear,,481
1076,10000.0,300,elasticnet,newton-cg,,481
1077,10000.0,300,elasticnet,newton-cholesky,,481
1078,10000.0,300,elasticnet,sag,,481


In [33]:
# Drop rows that has NaN values on selected columns
df_paramGrid = df_paramGrid.dropna(subset=['mean_test_score'])

In [35]:
df_paramGrid.sort_values(by='mean_test_score', ascending=False,).head(20)

Unnamed: 0,param_C,param_max_iter,param_penalty,param_solver,mean_test_score,rank_test_score
907,545.559478,300,l2,liblinear,0.789027,1
889,545.559478,200,l2,liblinear,0.789027,1
871,545.559478,100,l2,liblinear,0.789027,1
739,29.763514,300,l1,liblinear,0.788911,4
757,78.475997,100,l1,liblinear,0.788911,4
721,29.763514,200,l1,liblinear,0.788911,4
775,78.475997,200,l1,liblinear,0.788911,4
793,78.475997,300,l1,liblinear,0.788911,4
908,545.559478,300,l2,newton-cg,0.788911,9
891,545.559478,200,l2,newton-cholesky,0.788911,9


Therefore, the perfered parameters of the model are:
|   param_solver       |    param_penalty	|   param_C             |   |   |
|----------------------|--------------------|-----------------------|---|---|
|       liblinear      |        l1          |   545.559478	        |   |   |
|                      |        l2          |   78.475997	        |   |   |
|                      |                    |   29.763514	        |   |   |

In [37]:
# run the second time of param_grid search
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [545.559478, 500, 350, 200, 100, 78.475997, 29.763514, 10, 1],
    'solver': ['liblinear'], 
    'max_iter': [100, 200, 300, 500, 1000, 3000]
    
}

grid_search = GridSearchCV(estimator=LogisticRegression(), param_grid=param_grid, cv=8)


X_train = train_nx
y_train = train_ny
grid_search.fit(X_train, y_train)

# Get the best parameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

In [38]:
# Print the cross-validation score of the best model
print("Best Cross-Validation Score:", grid_search.best_score_)

print("\n\nBest Model parameters:", best_params) 

Best Cross-Validation Score: 0.7893714177768064


Best Model parameters: {'C': 350, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}


In [43]:
pd.DataFrame(grid_search.cv_results_).sort_values(by='mean_test_score', ascending=False,).head(30)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_max_iter,param_penalty,param_solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,mean_test_score,std_test_score,rank_test_score
31,0.052693,0.00324,0.00174,0.000779,350.0,500,l2,liblinear,"{'C': 350, 'max_iter': 500, 'penalty': 'l2', '...",0.784729,0.76173,0.804048,0.791168,0.799448,0.778085,0.810313,0.785451,0.789371,0.014522,1
25,0.056412,0.003966,0.002237,0.000797,350.0,100,l2,liblinear,"{'C': 350, 'max_iter': 100, 'penalty': 'l2', '...",0.784729,0.76173,0.804048,0.791168,0.799448,0.778085,0.810313,0.785451,0.789371,0.014522,1
35,0.055916,0.00237,0.002711,0.001019,350.0,3000,l2,liblinear,"{'C': 350, 'max_iter': 3000, 'penalty': 'l2', ...",0.784729,0.76173,0.804048,0.791168,0.799448,0.778085,0.810313,0.785451,0.789371,0.014522,1
27,0.054564,0.002264,0.00192,0.000684,350.0,200,l2,liblinear,"{'C': 350, 'max_iter': 200, 'penalty': 'l2', '...",0.784729,0.76173,0.804048,0.791168,0.799448,0.778085,0.810313,0.785451,0.789371,0.014522,1
29,0.054975,0.003349,0.002158,0.000548,350.0,300,l2,liblinear,"{'C': 350, 'max_iter': 300, 'penalty': 'l2', '...",0.784729,0.76173,0.804048,0.791168,0.799448,0.778085,0.810313,0.785451,0.789371,0.014522,1
33,0.058093,0.003564,0.00284,0.000615,350.0,1000,l2,liblinear,"{'C': 350, 'max_iter': 1000, 'penalty': 'l2', ...",0.784729,0.76173,0.804048,0.791168,0.799448,0.778085,0.810313,0.785451,0.789371,0.014522,1
66,0.065179,0.066476,0.001676,0.00066,78.475997,500,l1,liblinear,"{'C': 78.475997, 'max_iter': 500, 'penalty': '...",0.786569,0.76173,0.804048,0.787489,0.797608,0.779006,0.810313,0.78453,0.788911,0.014259,7
60,0.068205,0.053608,0.002076,0.000604,78.475997,100,l1,liblinear,"{'C': 78.475997, 'max_iter': 100, 'penalty': '...",0.786569,0.76173,0.804048,0.787489,0.797608,0.779006,0.810313,0.78453,0.788911,0.014259,7
62,0.055116,0.052682,0.00227,0.000993,78.475997,200,l1,liblinear,"{'C': 78.475997, 'max_iter': 200, 'penalty': '...",0.786569,0.76173,0.804048,0.787489,0.797608,0.779006,0.810313,0.78453,0.788911,0.014259,7
64,0.031483,0.028757,0.00178,0.000517,78.475997,300,l1,liblinear,"{'C': 78.475997, 'max_iter': 300, 'penalty': '...",0.786569,0.76173,0.804048,0.787489,0.797608,0.779006,0.810313,0.78453,0.788911,0.014259,7


In [41]:
fullParamGrid_result02 = pd.DataFrame(grid_search.cv_results_)
df_paramGrid02 = fullParamGrid_result02[['param_C', 'param_max_iter', 'param_penalty',
    'param_solver', 'mean_test_score', 'rank_test_score']]
df_paramGrid02.sort_values(by='mean_test_score', ascending=False,).head(30)

Unnamed: 0,param_C,param_max_iter,param_penalty,param_solver,mean_test_score,rank_test_score
31,350.0,500,l2,liblinear,0.789371,1
25,350.0,100,l2,liblinear,0.789371,1
35,350.0,3000,l2,liblinear,0.789371,1
27,350.0,200,l2,liblinear,0.789371,1
29,350.0,300,l2,liblinear,0.789371,1
33,350.0,1000,l2,liblinear,0.789371,1
66,78.475997,500,l1,liblinear,0.788911,7
60,78.475997,100,l1,liblinear,0.788911,7
62,78.475997,200,l1,liblinear,0.788911,7
64,78.475997,300,l1,liblinear,0.788911,7


In [36]:
print(np.logspace(-4, 4, 20))

[1.00000000e-04 2.63665090e-04 6.95192796e-04 1.83298071e-03
 4.83293024e-03 1.27427499e-02 3.35981829e-02 8.85866790e-02
 2.33572147e-01 6.15848211e-01 1.62377674e+00 4.28133240e+00
 1.12883789e+01 2.97635144e+01 7.84759970e+01 2.06913808e+02
 5.45559478e+02 1.43844989e+03 3.79269019e+03 1.00000000e+04]


In [44]:
# run the second time of param_grid search
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [350, 200, 100, 78.475997, 29.763514, 10, 1],
    'solver': ['liblinear'], 
    'max_iter': [300, 500, 1000, 3000, 10000, 20000]
    
}

grid_search = GridSearchCV(estimator=LogisticRegression(), param_grid=param_grid, cv=8)


X_train = train_nx
y_train = train_ny
grid_search.fit(X_train, y_train)

# Get the best parameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_


In [45]:
fullParamGrid_result02 = pd.DataFrame(grid_search.cv_results_)
df_paramGrid02 = fullParamGrid_result02[['param_C', 'param_max_iter', 'param_penalty',
    'param_solver', 'mean_test_score', 'rank_test_score']]
df_paramGrid02.sort_values(by='mean_test_score', ascending=False,).head(30)

Unnamed: 0,param_C,param_max_iter,param_penalty,param_solver,mean_test_score,rank_test_score
3,350.0,500,l2,liblinear,0.789371,1
5,350.0,1000,l2,liblinear,0.789371,1
7,350.0,3000,l2,liblinear,0.789371,1
1,350.0,300,l2,liblinear,0.789371,1
9,350.0,10000,l2,liblinear,0.789371,1
11,350.0,20000,l2,liblinear,0.789371,1
42,78.475997,3000,l1,liblinear,0.788911,7
46,78.475997,20000,l1,liblinear,0.788911,7
32,100.0,10000,l1,liblinear,0.788911,7
36,78.475997,300,l1,liblinear,0.788911,7


Therefore, the iteration is not a big factor of the accuracy of the model.