In [1]:
import pandas as pd
import numpy as np
from scipy import stats


import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import optuna



In [2]:
train_df = pd.read_csv('C:/Users/91950/Desktop/spaceship_titanic/train.csv')
test_df = pd.read_csv('C:/Users/91950/Desktop/spaceship_titanic/test.csv')

In [3]:
train_df.dtypes[train_df.dtypes == 'object']

PassengerId    object
HomePlanet     object
CryoSleep      object
Cabin          object
Destination    object
VIP            object
Name           object
dtype: object

In [4]:
train_df.dtypes[train_df.dtypes != 'object']

Age             float64
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Transported        bool
dtype: object

In [5]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [6]:
train_df.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [7]:
train_df.describe(include=['object'])

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,VIP,Name
count,8693,8492,8476,8494,8511,8490,8493
unique,8693,3,2,6560,3,2,8473
top,0001_01,Earth,False,G/734/S,TRAPPIST-1e,False,Gollux Reedall
freq,1,4602,5439,8,5915,8291,2


In [8]:
train_df.isnull().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [9]:
columns_to_fill = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

for index, row in train_df.iterrows():
    row_mean = row[columns_to_fill].mean()
    train_df.loc[index, columns_to_fill] = row[columns_to_fill].fillna(row_mean)
for index, row in test_df.iterrows():
    row_mean = row[columns_to_fill].mean()
    test_df.loc[index, columns_to_fill] = row[columns_to_fill].fillna(row_mean)    


In [10]:
cat_columns_to_fill = ['CryoSleep', 'HomePlanet', 'Destination', 'VIP']

train_df[cat_columns_to_fill] = train_df[cat_columns_to_fill].fillna(train_df[cat_columns_to_fill].mode().iloc[0])
test_df[cat_columns_to_fill] = test_df[cat_columns_to_fill].fillna(test_df[cat_columns_to_fill].mode().iloc[0])


In [11]:
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].mean())
test_df['Age'] = test_df['Age'].fillna(test_df['Age'].mean())

In [12]:
train_df = train_df.drop('Name', axis=1)
test_df = test_df.drop('Name', axis=1)

In [13]:
train_df['Cabin'].fillna('unknown', inplace=True)
test_df['Cabin'].fillna('unknown', inplace=True)


In [14]:
train_df.isnull().sum()

PassengerId     0
HomePlanet      0
CryoSleep       0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Transported     0
dtype: int64

In [15]:
test_df.isnull().sum()

PassengerId     0
HomePlanet      0
CryoSleep       0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
dtype: int64

In [16]:
train_df['CryoSleep'] = train_df['CryoSleep'].astype(int)
test_df['CryoSleep'] = test_df['CryoSleep'].astype(int)
train_df['VIP'] = train_df['VIP'].astype(int)
test_df['VIP'] = test_df['VIP'].astype(int)

In [17]:
combined_df = pd.concat([train_df['Cabin'], test_df['Cabin']], ignore_index=True)

combined_df, unique_cabins = pd.factorize(combined_df)

train_df['Cabin'] = combined_df[:len(train_df)]
test_df['Cabin'] = combined_df[len(train_df):]

KeyError: 'Cabin'

In [18]:
combined_homeplanet = pd.concat([train_df['HomePlanet'], test_df['HomePlanet']], ignore_index=True)
combined_destination = pd.concat([train_df['Destination'], test_df['Destination']], ignore_index=True)

combined_homeplanet, unique_homeplanet = pd.factorize(combined_homeplanet)

combined_destination, unique_destination = pd.factorize(combined_destination)

train_df['HomePlanet'] = combined_homeplanet[:len(train_df)]
test_df['HomePlanet'] = combined_homeplanet[len(train_df):]

train_df['Destination'] = combined_destination[:len(train_df)]
test_df['Destination'] = combined_destination[len(train_df):]

In [19]:
train_df = train_df.drop('PassengerId', axis=1)

In [20]:
X = train_df.drop('Transported', axis=1)
Y = train_df['Transported']

In [21]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=25)

In [22]:
# Logistic Regression
logistic_regression = LogisticRegression(max_iter=8000)
logistic_regression.fit(X_train, Y_train)
logistic_regression_pred = logistic_regression.predict(X_test)
logistic_regression_score = accuracy_score(Y_test, logistic_regression_pred)
print("Logistic Regression Accuracy:", logistic_regression_score)


Logistic Regression Accuracy: 0.7751581368602645


In [23]:
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)
decision_tree_pred = decision_tree.predict(X_test)
decision_tree_score = accuracy_score(Y_test, decision_tree_pred)
print("Decision Tree Accuracy:", decision_tree_score)

Decision Tree Accuracy: 0.7464059804485337


In [24]:
random_forest = RandomForestClassifier()
random_forest.fit(X_train, Y_train)
random_forest_pred = random_forest.predict(X_test)
random_forest_score = accuracy_score(Y_test, random_forest_pred)
print("Random Forest Accuracy:", random_forest_score)

Random Forest Accuracy: 0.7843588269120184


In [25]:
bagging_classifier = BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=10)
bagging_classifier.fit(X_train, Y_train)
bagging_classifier_pred = bagging_classifier.predict(X_test)
bagging_classifier_score = accuracy_score(Y_test, bagging_classifier_pred)
print("Bagging Classifier Accuracy:", bagging_classifier_score)



Bagging Classifier Accuracy: 0.7768832662449684


In [26]:
lgbm_classifier = LGBMClassifier()
lgbm_classifier.fit(X_train, Y_train)
lgbm_classifier_pred = lgbm_classifier.predict(X_test)
lgbm_classifier_score = accuracy_score(Y_test, lgbm_classifier_pred)
print("LightGBM Classifier Accuracy:", lgbm_classifier_score)

[LightGBM] [Info] Number of positive: 3525, number of negative: 3429
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000495 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1364
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.506903 -> initscore=0.027612
[LightGBM] [Info] Start training from score 0.027612
LightGBM Classifier Accuracy: 0.7837837837837838


In [27]:
svm_classifier = SVC()
svm_classifier.fit(X_train, Y_train)
svm_classifier_pred = svm_classifier.predict(X_test)
svm_classifier_score = accuracy_score(Y_test, svm_classifier_pred)
print("SVM Classifier Accuracy:", svm_classifier_score)

SVM Classifier Accuracy: 0.7786083956296722


In [28]:
def objective(trial):
    params = {
        'lambda': trial.suggest_float('lambda', 0, 10.0),
        'alpha': trial.suggest_float('alpha', 0, 10.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
        'subsample': trial.suggest_float('subsample', 0.2, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 0.0001, 0.1),
        'n_estimators': trial.suggest_int('n_estimators', 100, 3000),
        'max_depth': trial.suggest_categorical('max_depth', [2, 3, 4, 5, 6, 7, 8, 9, 10]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'num_parallel_tree': trial.suggest_int('num_parallel_tree', 1, 1)
    }

    model = XGBClassifier(**params)

    model.fit(X_train, Y_train)
    preds = model.predict(X_test)
    accuracy = accuracy_score(Y_test, preds)
    
    return accuracy

# Create a study object and optimize the objective function
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

# Get the best parameters from the study
best_params = study.best_params
print("Best Parameters:", best_params)

# Train the final model with the best parameters
final_model = XGBClassifier(**best_params)
final_model.fit(X_train, Y_train)

# Evaluate the final model
final_preds = final_model.predict(X_test)
final_accuracy = accuracy_score(Y_test, final_preds)
print("Final Model Accuracy:", final_accuracy)





[I 2023-12-23 14:27:51,814] A new study created in memory with name: no-name-8e93ffdc-446d-450d-be2c-c18563fc461a
[I 2023-12-23 14:27:53,463] Trial 0 finished with value: 0.7722829212190915 and parameters: {'lambda': 8.070085309026277, 'alpha': 2.2367592073650644, 'colsample_bytree': 0.16811284805329707, 'subsample': 0.4220173113290896, 'learning_rate': 0.07123465579134933, 'n_estimators': 2813, 'max_depth': 6, 'min_child_weight': 4, 'num_parallel_tree': 1}. Best is trial 0 with value: 0.7722829212190915.
[I 2023-12-23 14:27:53,789] Trial 1 finished with value: 0.777458309373203 and parameters: {'lambda': 7.488428252736844, 'alpha': 2.2855674516849867, 'colsample_bytree': 0.3030418495436804, 'subsample': 0.49450037284693726, 'learning_rate': 0.043630199250113806, 'n_estimators': 764, 'max_depth': 2, 'min_child_weight': 10, 'num_parallel_tree': 1}. Best is trial 1 with value: 0.777458309373203.
[I 2023-12-23 14:27:55,807] Trial 2 finished with value: 0.7768832662449684 and parameters: {

Best Parameters: {'lambda': 3.699552278775611, 'alpha': 1.6752521498236774, 'colsample_bytree': 0.8476243151869602, 'subsample': 0.5570766719377115, 'learning_rate': 0.01345445944656374, 'n_estimators': 991, 'max_depth': 8, 'min_child_weight': 2, 'num_parallel_tree': 1}
Final Model Accuracy: 0.7912593444508338


In [None]:
best_params = {'lambda': 3.699552278775611, 'alpha': 1.6752521498236774, 'colsample_bytree': 0.8476243151869602, 'subsample': 0.5570766719377115, 'learning_rate': 0.01345445944656374, 'n_estimators': 991, 'max_depth': 8, 'min_child_weight': 2, 'num_parallel_tree': 1}

# Create and train the final model with the best parameters
final_model = XGBClassifier(**best_params)
final_model.fit(X,Y)


In [31]:
passenger_ids = test_df['PassengerId']

# Drop PassengerId for model prediction
test_df = test_df.drop(['PassengerId'], axis=1)

# Make predictions
predictions = final_model.predict(test_df)

# Create a DataFrame for submission
submission_df = pd.DataFrame({
    'PassengerId': passenger_ids,
    'Transported': predictions.astype(bool)
})

# Save the submission DataFrame to a CSV file
submission_df.to_csv('submission1.csv', index=False)