In [33]:
# import statements for binary classification
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression


In [34]:
# read the data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [35]:
# drop the columns that are not needed
train_data = train_data.drop(['Name', 'PassengerId'], axis=1)

# save passenger ids for the test data
passenger_ids_test = test_data['PassengerId']
test_data = test_data.drop(['Name', 'PassengerId'], axis=1)

In [36]:
# put transported column in y
y = train_data['Transported']
X = train_data.drop(['Transported'], axis=1)

In [37]:
# combine RoomService,FoodCourt,ShoppingMall,Spa,VRDeck into one column named MoneySpent
X['MoneySpent'] = X['RoomService'] + X['FoodCourt'] + X['ShoppingMall'] + X['Spa'] + X['VRDeck']
X = X.drop(['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'], axis=1)

test_data['MoneySpent'] = test_data['RoomService'] + test_data['FoodCourt'] + test_data['ShoppingMall'] + test_data['Spa'] + test_data['VRDeck']
test_data = test_data.drop(['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'], axis=1)

print(X.head())


  HomePlanet CryoSleep  Cabin  Destination   Age    VIP  MoneySpent
0     Europa     False  B/0/P  TRAPPIST-1e  39.0  False         0.0
1      Earth     False  F/0/S  TRAPPIST-1e  24.0  False       736.0
2     Europa     False  A/0/S  TRAPPIST-1e  58.0   True     10383.0
3     Europa     False  A/0/S  TRAPPIST-1e  33.0  False      5176.0
4      Earth     False  F/1/S  TRAPPIST-1e  16.0  False      1091.0


In [38]:
# split the cabin column into deck, num and side columns
X[['Deck', 'Num', 'Side']] = X['Cabin'].str.split('/', expand=True)
X = X.drop(['Cabin'], axis=1)

test_data[['Deck', 'Num', 'Side']] = test_data['Cabin'].str.split('/', expand=True)
test_data = test_data.drop(['Cabin'], axis=1)

X.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,MoneySpent,Deck,Num,Side
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,B,0,P
1,Earth,False,TRAPPIST-1e,24.0,False,736.0,F,0,S
2,Europa,False,TRAPPIST-1e,58.0,True,10383.0,A,0,S
3,Europa,False,TRAPPIST-1e,33.0,False,5176.0,A,0,S
4,Earth,False,TRAPPIST-1e,16.0,False,1091.0,F,1,S


In [39]:
# find categorical columns
categorical_cols = [cname for cname in X.columns if X[cname].dtype == 'object']

# print unique values in each categorical column
for col in categorical_cols:
    print(col, X[col].unique())

HomePlanet ['Europa' 'Earth' 'Mars' nan]
CryoSleep [False True nan]
Destination ['TRAPPIST-1e' 'PSO J318.5-22' '55 Cancri e' nan]
VIP [False True nan]
Deck ['B' 'F' 'A' 'G' nan 'E' 'D' 'C' 'T']
Num ['0' '1' '2' ... '1892' '1893' '1894']
Side ['P' 'S' nan]


In [40]:
# check for missing values
missing_values = X.isnull().sum()
print(missing_values)

HomePlanet     201
CryoSleep      217
Destination    182
Age            179
VIP            203
MoneySpent     908
Deck           199
Num            199
Side           199
dtype: int64


In [41]:
# fill missing values in all columns 

# fill HomePlanet with Earth since it is the most frequent value
X['HomePlanet'] = X['HomePlanet'].fillna('Earth')
test_data['HomePlanet'] = test_data['HomePlanet'].fillna('Earth')

# fill CryoSleep with 0 since it is the most frequent value
X['CryoSleep'] = X['CryoSleep'].fillna(0)
test_data['CryoSleep'] = test_data['CryoSleep'].fillna(0)

# fill Destination with TRAPPIST-1e since it is the most frequent value
X['Destination'] = X['Destination'].fillna('TRAPPIST-1e')

# fill VIP with 0 since it is the most frequent value
X['VIP'] = X['VIP'].fillna(0)
test_data['VIP'] = test_data['VIP'].fillna(0)

# fill Age with the Median
X['Age'] = X['Age'].fillna(X['Age'].median())
test_data['Age'] = test_data['Age'].fillna(test_data['Age'].median())

# fill MoneySpent with the 0 since it is the most frequent value
X['MoneySpent'] = X['MoneySpent'].fillna(0)
test_data['MoneySpent'] = test_data['MoneySpent'].fillna(0)

# fill Side with 'Unknown' 
X['Side'] = X['Side'].fillna('Unknown')
test_data['Side'] = test_data['Side'].fillna('Unknown')

# fill Deck with 'Unknown'
X['Deck'] = X['Deck'].fillna('Unknown')
test_data['Deck'] = test_data['Deck'].fillna('Unknown')


# check for missing values
missing_values = X.isnull().sum()
print(missing_values)

HomePlanet       0
CryoSleep        0
Destination      0
Age              0
VIP              0
MoneySpent       0
Deck             0
Num            199
Side             0
dtype: int64


In [42]:
# get dtype of each column
print(X.dtypes)

HomePlanet      object
CryoSleep       object
Destination     object
Age            float64
VIP             object
MoneySpent     float64
Deck            object
Num             object
Side            object
dtype: object


In [43]:
from sklearn.preprocessing import OrdinalEncoder

# convert cateogrical columns to string
X[categorical_cols] = X[categorical_cols].astype(str)
test_data[categorical_cols] = test_data[categorical_cols].astype(str)

# encode categorical columns using ordinal encoding
encoder = OrdinalEncoder()
X[categorical_cols] = encoder.fit_transform(X[categorical_cols])
test_data[categorical_cols] = encoder.fit_transform(test_data[categorical_cols])


X.head()



Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,MoneySpent,Deck,Num,Side
0,1.0,1.0,2.0,39.0,1.0,0.0,1.0,0.0,0.0
1,0.0,1.0,2.0,24.0,1.0,736.0,5.0,0.0,1.0
2,1.0,1.0,2.0,58.0,2.0,10383.0,0.0,0.0,1.0
3,1.0,1.0,2.0,33.0,1.0,5176.0,0.0,0.0,1.0
4,0.0,1.0,2.0,16.0,1.0,1091.0,5.0,1.0,1.0


In [44]:
# encode categorical columns
X = pd.get_dummies(X)
test_data = pd.get_dummies(test_data)

print(X.head())

   CryoSleep   Age    VIP  MoneySpent  Num  HomePlanet_Earth  \
0      False  39.0  False         0.0  0.0             False   
1      False  24.0  False       736.0  0.0              True   
2      False  58.0   True     10383.0  0.0             False   
3      False  33.0  False      5176.0  0.0             False   
4      False  16.0  False      1091.0  1.0              True   

   HomePlanet_Europa  HomePlanet_Mars  Destination_55 Cancri e  \
0               True            False                    False   
1              False            False                    False   
2               True            False                    False   
3               True            False                    False   
4              False            False                    False   

   Destination_PSO J318.5-22  ...  Deck_A  Deck_B  Deck_C  Deck_D  Deck_E  \
0                      False  ...   False    True   False   False   False   
1                      False  ...   False   False   False   Fals

In [45]:
# scale Age, MoneySpent and Num columns
scaler = StandardScaler()
X[['Age', 'MoneySpent', 'Num']] = scaler.fit_transform(X[['Age', 'MoneySpent', 'Num']])
test_data[['Age', 'MoneySpent', 'Num']] = scaler.fit_transform(test_data[['Age', 'MoneySpent', 'Num']])

X.head()


Unnamed: 0,CryoSleep,Age,VIP,MoneySpent,Num,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,...,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Side_P,Side_S
0,False,0.715553,False,-0.48693,-1.149753,False,True,False,False,False,...,False,True,False,False,False,False,False,False,True,False
1,False,-0.329408,False,-0.217376,-1.149753,True,False,False,False,False,...,False,False,False,False,False,True,False,False,False,True
2,False,2.039169,True,3.315756,-1.149753,False,True,False,False,False,...,True,False,False,False,False,False,False,False,False,True
3,False,0.297569,False,1.408736,-1.149753,False,True,False,False,False,...,True,False,False,False,False,False,False,False,False,True
4,False,-0.88672,False,-0.08736,-1.1478,True,False,False,False,False,...,False,False,False,False,False,True,False,False,False,True


In [46]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# fit the logistic regression model


# Use the existing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define a pipeline step for preprocessing
preprocessing = Pipeline([
    ('scaler', StandardScaler())
])

# Define your models and their parameter grids
models_and_parameters = [
    (LogisticRegression(), {'model__C': [0.1, 1, 10]}),
    (RandomForestClassifier(), {'model__n_estimators': [10, 50, 100]}),
    (GradientBoostingClassifier(), {'model__learning_rate': [0.01, 0.1, 0.5]})
]

best_models = []
best_scores = []

for model, param_grid in models_and_parameters:
    # Create a pipeline with preprocessing and the model
    pipeline = Pipeline([
        # ('preprocessing', preprocessing),
        ('model', model)
    ])
    
    # Set up GridSearchCV
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
    
    # Fit the model
    grid_search.fit(X_train, y_train)
    
    # Store the best model and its score
    best_models.append(grid_search.best_estimator_)
    best_scores.append(grid_search.best_score_)

# Find the model with the best score
best_index = best_scores.index(max(best_scores))
best_model = best_models[best_index]

print(f"Best Model: {best_model}")
print(f"Best Cross-Validation Score: {best_scores[best_index]}")

# Optionally, evaluate the best model on the test set
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test Set Accuracy: {test_accuracy}")

Best Model: Pipeline(steps=[('preprocessing',
                 Pipeline(steps=[('scaler', StandardScaler())])),
                ('model', GradientBoostingClassifier())])
Best Cross-Validation Score: 0.7509366999570725
Test Set Accuracy: 0.7573317998849913


In [133]:
#  use standard scaler to scale the test data
scaler = StandardScaler()
test_data = scaler.fit_transform(test_data)



In [47]:
# get the predictions with the best model
predictions = best_model.predict(test_data)

# save the predictions to a submission.csv file
output = pd.DataFrame({'PassengerId': passenger_ids_test, 'Transported': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


