## Data Preprocessing

In [1]:
import numpy as np
import matplotlib.pyplot as plt 
import pandas as pd
import seaborn as sns  
from sklearn.impute import KNNImputer

In [2]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv') 
df_test['Transported'] = False
df = pd.concat([df_train, df_test], axis=0) 
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [3]:
df[['Deck','Num','Side']] = df['Cabin'].str.split('/',expand=True)
df = df.drop(columns=['Cabin','Name'])
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Deck,Num,Side
0,0001_01,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,B,0,P
1,0002_01,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,F,0,S
2,0003_01,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,A,0,S
3,0003_02,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,A,0,S
4,0004_01,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,F,1,S


In [4]:
spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
df[spend_cols] = df[spend_cols].fillna(0)

df['CryoSleep'] = df.apply(
    lambda row: True if row[spend_cols].sum() == 0 else False 
    if pd.isnull(row['CryoSleep']) else row['CryoSleep'],
    axis=1
)


In [5]:
df['Deck'] = df['Deck'].fillna('U')
df['Num'] = df['Num'].fillna(-1)
df['Side'] = df['Side'].fillna('U')

In [6]:
df['Deck'] = df['Deck'].map({'G': 8 , 'F': 1, 'E': 2, 'D': 3, 'C': 4, 'B': 5, 'A': 6, 'U': 7, 'T': 9})
df['Side'] = df['Side'].map({'P': 3, 'S': 1, 'U': 2})

In [7]:
df['Age'] = df['Age'].fillna(df['Age'].median())
df['VIP'] = df['VIP'].fillna(df['VIP'].mode()[0])
df['HomePlanet'] = df['HomePlanet'].fillna('U')
df['Destination'] = df['Destination'].fillna('U')

df.isna().sum()

  df['VIP'] = df['VIP'].fillna(df['VIP'].mode()[0])


PassengerId     0
HomePlanet      0
CryoSleep       0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Transported     0
Deck            0
Num             0
Side            0
dtype: int64

In [8]:
df.HomePlanet = df.HomePlanet.map({
    'Earth': 1,
    'Mars': 2,
    'Europa': 3,
    'U': 4
})
df.Destination = df.Destination.map({
    'TRAPPIST-1e': 1,
    '55 Cancri e': 2,
    'PSO J318.5-22': 3,
    'U': 4
})

In [9]:
from sklearn.preprocessing import StandardScaler

num_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Deck,Num,Side
0,0001_01,3,True,1,0.720932,False,-0.34029,-0.281822,-0.292365,-0.269707,-0.2571,False,5,0,3
1,0002_01,1,False,1,-0.332557,False,-0.170439,-0.276082,-0.249566,0.22104,-0.219449,True,1,0,1
2,0003_01,3,False,1,2.05535,True,-0.273285,1.998823,-0.292365,5.732776,-0.21517,False,6,0,1
3,0003_02,3,False,1,0.299536,False,-0.34029,0.536429,0.342766,2.706059,-0.091947,False,6,0,1
4,0004_01,1,False,1,-0.894417,False,0.131863,-0.237179,-0.033861,0.235342,-0.255389,True,1,1,1


In [10]:
df.isna().sum()

PassengerId     0
HomePlanet      0
CryoSleep       0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Transported     0
Deck            0
Num             0
Side            0
dtype: int64

## Creating Models

In [11]:
df_train = df.iloc[:len(df_train)]
df_test = df.iloc[len(df_train):]

X_train = df_train.drop(columns=['Transported'])
y_train = df_train['Transported']

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

### 1. Random Forest

In [13]:
from sklearn.ensemble import RandomForestClassifier
model1 = RandomForestClassifier(random_state=42)
model1.fit(X_train, y_train)

from sklearn.metrics import accuracy_score
y_pred = model1.predict(X_val)
print("Random Forest Validation Accuracy:", accuracy_score(y_val, y_pred))

Random Forest Validation Accuracy: 0.7901092581943646


### 2. Logistic Regression

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Initialize and train the model
logistic_model = LogisticRegression(random_state=42, max_iter=1000)
logistic_model.fit(X_train, y_train)

# Predict and evaluate
y_pred_logistic = logistic_model.predict(X_val)
print("Logistic Regression Validation Accuracy:", accuracy_score(y_val, y_pred_logistic))

Logistic Regression Validation Accuracy: 0.7740080506037953


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### 3. Gradient Boost

In [15]:
from sklearn.ensemble import GradientBoostingClassifier

# Initialize and train the model
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train, y_train)

# Predict and evaluate
y_pred_gb = gb_model.predict(X_val)
print("Gradient Boosting Validation Accuracy:", accuracy_score(y_val, y_pred_gb))

Gradient Boosting Validation Accuracy: 0.7901092581943646


### 4. SVM

In [16]:
# from sklearn.svm import SVC

# # Initialize and train the model
# svm_model = SVC(kernel='linear', random_state=42)
# svm_model.fit(X_train, y_train)

# # Predict and evaluate
# y_pred_svm = svm_model.predict(X_val)
# print("SVM Validation Accuracy:", accuracy_score(y_val, y_pred_svm))

## Submission 

In [17]:
df_test.drop(columns=['Transported'], inplace=True)

pred = gb_model.predict(df_test)

final = pd.DataFrame()
final['PassengerId'] = df_test['PassengerId']
final['Transported'] = pred

# Write DataFrame to a CSV file without index
final.to_csv('output gb.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test.drop(columns=['Transported'], inplace=True)
