Using `pre06.csv` dataset

In [1]:
# First code block is the import libraries  
import numpy as np
import pandas as pd
from sklearn import metrics
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score

import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)


In [8]:
# Input files: train/test feature tables and the sample submission, which is
# read only to obtain the PassengerId column for the output file.
dataPath = '../../preprocess_train_dataset/pre06_train.csv'
test_path = '../../preprocess_test_dataset/pre06_test.csv'
pid_path = '../../spaceship-titanic_rawData/sample_submission.csv'

df_spaceship, test, pid = (
    pd.read_csv(csv_file) for csv_file in (dataPath, test_path, pid_path)
)

# 'Transported' is the label; every other column is a feature.
train_y = df_spaceship['Transported']
train_x = df_spaceship.drop(columns=['Transported'])
sub_pid = pid['PassengerId']

df_spaceship

Unnamed: 0,CryoSleep,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Cabin_deck_A,Cabin_deck_B,Cabin_deck_C,Cabin_deck_D,Cabin_deck_E,Cabin_deck_F,...,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,spending_Categ_Low,spending_Categ_Medium,spending_Categ_High,Transported
0,False,False,True,False,False,True,False,False,False,False,...,39.0,0.0,0.0,0.0,0.0,0.0,True,False,False,False
1,False,True,False,False,False,False,False,False,False,True,...,24.0,109.0,9.0,25.0,549.0,44.0,False,True,False,True
2,False,False,True,False,True,False,False,False,False,False,...,58.0,43.0,3576.0,0.0,6715.0,49.0,False,False,True,False
3,False,False,True,False,True,False,False,False,False,False,...,33.0,0.0,1283.0,371.0,3329.0,193.0,False,False,True,False
4,False,True,False,False,False,False,False,False,False,True,...,16.0,303.0,70.0,151.0,565.0,2.0,False,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,False,False,True,False,True,False,False,False,False,False,...,41.0,0.0,6819.0,0.0,1643.0,74.0,False,False,True,False
8689,True,True,False,False,False,False,False,False,False,False,...,18.0,0.0,0.0,0.0,0.0,0.0,True,False,False,False
8690,False,True,False,False,False,False,False,False,False,False,...,26.0,0.0,0.0,1872.0,1.0,0.0,False,True,False,True
8691,False,False,True,False,False,False,False,False,True,False,...,32.0,0.0,1049.0,0.0,353.0,3235.0,False,False,True,False


In [9]:
# Show the feature column names of the training matrix, in positional order.
train_x.columns

Index(['CryoSleep', 'HomePlanet_Earth', 'HomePlanet_Europa', 'HomePlanet_Mars',
       'Cabin_deck_A', 'Cabin_deck_B', 'Cabin_deck_C', 'Cabin_deck_D',
       'Cabin_deck_E', 'Cabin_deck_F', 'Cabin_deck_G', 'Cabin_deck_T',
       'Cabin_side_P', 'Cabin_side_S', 'Age', 'RoomService', 'FoodCourt',
       'ShoppingMall', 'Spa', 'VRDeck', 'spending_Categ_Low',
       'spending_Categ_Medium', 'spending_Categ_High'],
      dtype='object')

Therefore, the number of iterations (`max_iter`) has little effect on the accuracy of the model.

| param_solver | param_penalty | param_C | param_max_iter |
|--------------|---------------|---------|----------------|
| liblinear    | l2            | 350     | >= 100         |


Then, we write the prediction results to a CSV file.


In [7]:
# 8-fold cross-validation of the chosen logistic-regression configuration
# on the unscaled training features.
k_folds = KFold(n_splits=8)

logisticReg_model = LogisticRegression(
    solver='liblinear',
    penalty='l2',
    C=350,
    class_weight='balanced',
    max_iter=300,
)

scores = cross_val_score(
    logisticReg_model,
    train_x,
    train_y,
    cv=k_folds,
)

# Per-fold scores followed by their mean.
print("Cross Validation Scores: ", scores)
print("\nAverage CV Score: ", scores.mean())

Cross Validation Scores:  [0.78012879 0.74701012 0.79392824 0.81784729 0.80036799 0.78637201
 0.82228361 0.78453039]

Average CV Score:  0.7915585540482617


1. Get the test data.
2. Drop the string columns, keeping only the one-hot encoded columns.
3. Train the model with the best parameters.
4. Export the prediction results.

In [11]:
# Fit on the full training set (fit() returns the estimator itself), then
# predict the test set.
pred = logisticReg_model.fit(train_x, train_y).predict(test)

# Pair every prediction with the PassengerId taken from the sample submission.
log_submission = pd.DataFrame(
    {
        'PassengerId': sub_pid,
        'Transported': pred,
    }
)

# Sanity check: number of missing values in each output column.
log_submission.isna().sum()

PassengerId    0
Transported    0
dtype: int64

In [12]:
# Write the submission without the DataFrame index; PassengerId is already a
# regular column. index_label was dropped because it only names the index
# column and therefore has no effect when index=False.
log_submission.to_csv('../../output_prediction/log02v1_submission.csv', index=False)

Norm

In [17]:
# normalize the data
from sklearn.preprocessing import MinMaxScaler


def norm(df):
    """Return a min-max scaled copy of ``df``; the input frame is left untouched.

    The ``Age`` column and the five columns at positions 15-19 are each
    rescaled with a freshly fitted ``MinMaxScaler`` (default settings). In
    ``train_x`` those positions hold RoomService, FoodCourt, ShoppingMall, Spa
    and VRDeck; any other frame passed in is assumed to share that layout.

    The scalers are fitted on ``df`` itself, so every call uses the min/max of
    the frame it is given.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame containing an ``Age`` column, with the columns to scale
        located at positions 15-19.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``df``.
    """
    # Work on a copy so the caller's frame is not modified as a side effect.
    normalized = df.copy()

    # np.ravel / np.asarray yield plain arrays, so the values are written by
    # position instead of being re-aligned on the index (alignment would give
    # NaN whenever ``df`` does not carry a default RangeIndex).
    normalized['Age'] = np.ravel(MinMaxScaler().fit_transform(df[['Age']]))
    normalized.iloc[:, 15:20] = np.asarray(
        MinMaxScaler().fit_transform(df.iloc[:, 15:20])
    )

    return normalized

In [18]:
# Scale the training and test feature frames.
# NOTE(review): norm() fits its scalers on whatever frame it receives, so the
# test set is scaled with its own min/max rather than the training set's.
norm_train_x = norm(train_x)
norm_test = norm(test)

In [19]:
# Repeat the 8-fold cross-validation with the same model configuration, this
# time on the min-max scaled training features.
k_folds = KFold(n_splits=8)

logisticReg_model = LogisticRegression(
    solver='liblinear',
    penalty='l2',
    C=350,
    class_weight='balanced',
    max_iter=300,
)

scores = cross_val_score(
    logisticReg_model,
    norm_train_x,
    train_y,
    cv=k_folds,
)

# Per-fold scores followed by their mean.
print("Cross Validation Scores: ", scores)
print("\nAverage CV Score: ", scores.mean())

Cross Validation Scores:  [0.77920883 0.74701012 0.79300828 0.81692732 0.79944802 0.7854512
 0.82044199 0.78268877]

Average CV Score:  0.7905230660018534


In [20]:
# Fit on the scaled training set, then predict the scaled test set.
logisticReg_model.fit(norm_train_x, train_y)
pred = logisticReg_model.predict(norm_test)

# Pair every prediction with its PassengerId and write the submission without
# the DataFrame index. index_label was dropped because it only names the index
# column and therefore has no effect when index=False.
log_submission = pd.DataFrame({'PassengerId': sub_pid, 'Transported': pred})
log_submission.to_csv('../../output_prediction/log02v2_submission.csv', index=False)