In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('train_spaceship.csv')

In [107]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer, make_column_selector


def null_fill(x):
    for col in x.select_dtypes(include=[int, float]):
        x[col] = np.where(x[col].isna(), x[col].median(),x[col])
    for col in x.select_dtypes(include=[object]):
        x[col] = np.where(x[col].isna(), x[col].mode()[0],x[col])

    return x

def clean_df(x):
    x = x.copy()
    x['cabin'] = x['Cabin'].str.split('/', expand=True).iloc[:, 0]
    x['cabin_side'] = x['Cabin'].str.split('/', expand=True).iloc[:, 2]
    x['total'] = x[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1)
    x['total_group'] = pd.cut(x['total'], bins=[-1, 0,1000,10000 ,np.inf], labels=[1, 2,3, 4]).astype(object)

    x['Age_group'] = pd.cut(x['Age'], bins=[-1, 10, 18, 30, 40, 50, 60, np.inf], labels=[1, 2, 3, 4, 5,6,7]).astype(object)
    x['route'] = x['HomePlanet'] + ' to ' + x['Destination']
    x = x.drop(['PassengerId', 'Name', 'Transported', 'Cabin'], axis=1)
    return x

log_transformer = make_pipeline(
    FunctionTransformer(np.log1p, validate=False, feature_names_out = 'one-to-one'),
    StandardScaler()
)

cat_transformer = make_pipeline(
    OneHotEncoder()
)

preprocess = ColumnTransformer(
    transformers=[
        ('log', log_transformer, make_column_selector(dtype_include=[int, float])),
        ('cat', cat_transformer, make_column_selector(dtype_include=[object]))
    ]
)

feat_eng = Pipeline([
    ('null_fill', FunctionTransformer(func=null_fill, validate=False)),
    ('df_cleaning', FunctionTransformer(func=clean_df, validate=False)),
    ('preprocessing', preprocess)
])

In [108]:
X = feat_eng.fit_transform(df)

In [109]:
X.shape

(8693, 47)

In [110]:
preprocess.get_feature_names_out()

array(['log__Age', 'log__RoomService', 'log__FoodCourt',
       'log__ShoppingMall', 'log__Spa', 'log__VRDeck', 'log__total',
       'cat__HomePlanet_Earth', 'cat__HomePlanet_Europa',
       'cat__HomePlanet_Mars', 'cat__CryoSleep_False',
       'cat__CryoSleep_True', 'cat__Destination_55 Cancri e',
       'cat__Destination_PSO J318.5-22', 'cat__Destination_TRAPPIST-1e',
       'cat__VIP_False', 'cat__VIP_True', 'cat__cabin_A', 'cat__cabin_B',
       'cat__cabin_C', 'cat__cabin_D', 'cat__cabin_E', 'cat__cabin_F',
       'cat__cabin_G', 'cat__cabin_T', 'cat__cabin_side_P',
       'cat__cabin_side_S', 'cat__total_group_1', 'cat__total_group_2',
       'cat__total_group_3', 'cat__total_group_4', 'cat__Age_group_1',
       'cat__Age_group_2', 'cat__Age_group_3', 'cat__Age_group_4',
       'cat__Age_group_5', 'cat__Age_group_6', 'cat__Age_group_7',
       'cat__route_Earth to 55 Cancri e',
       'cat__route_Earth to PSO J318.5-22',
       'cat__route_Earth to TRAPPIST-1e',
       'cat__rou

In [111]:
from sklearn.preprocessing import LabelEncoder
def target_conv(x):
    le = LabelEncoder()
    x = le.fit_transform(x)
    return x
y = target_conv(df.Transported)
y

array([0, 1, 0, ..., 1, 0, 1], dtype=int64)

In [112]:
from sklearn.model_selection import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(X,y, test_size = 0.25, stratify = y , random_state=42)

In [113]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

lr = LogisticRegression(max_iter=1000)
lr.fit(Xtrain, ytrain)

pred = lr.predict(Xtest)

print(classification_report(ytest,pred))

              precision    recall  f1-score   support

           0       0.78      0.78      0.78      1079
           1       0.79      0.78      0.78      1095

    accuracy                           0.78      2174
   macro avg       0.78      0.78      0.78      2174
weighted avg       0.78      0.78      0.78      2174



In [114]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=42)
rf.fit(Xtrain,ytrain)

pred = rf.predict(Xtest)

print(classification_report(ytest,pred))

              precision    recall  f1-score   support

           0       0.76      0.83      0.80      1079
           1       0.82      0.74      0.78      1095

    accuracy                           0.79      2174
   macro avg       0.79      0.79      0.79      2174
weighted avg       0.79      0.79      0.79      2174



In [115]:
from sklearn.svm import SVC

svm = SVC(kernel='poly', degree=2, C=1)
svm.fit(Xtrain, ytrain)

pred = svm.predict(Xtest)

print(classification_report(ytest,pred))

              precision    recall  f1-score   support

           0       0.79      0.80      0.79      1079
           1       0.80      0.79      0.80      1095

    accuracy                           0.79      2174
   macro avg       0.79      0.79      0.79      2174
weighted avg       0.79      0.79      0.79      2174



In [116]:
import xgboost as xgb

clf = xgb.XGBClassifier(random_state=42)
clf.fit(Xtrain,ytrain)
pred = clf.predict(Xtest)

print(classification_report(ytest,pred))

              precision    recall  f1-score   support

           0       0.79      0.80      0.80      1079
           1       0.80      0.80      0.80      1095

    accuracy                           0.80      2174
   macro avg       0.80      0.80      0.80      2174
weighted avg       0.80      0.80      0.80      2174



In [117]:
from sklearn.model_selection import RandomizedSearchCV

clf1 = xgb.XGBClassifier(random_state=42)

param_distributions = {
    'n_estimators': [100, 200, 300, 400,500],
    'max_depth': [2,3, 4, 5],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.7, 0.8, 0.9, 1.0]
}

random_search = RandomizedSearchCV(estimator=clf1, param_distributions=param_distributions, n_iter=10, cv=5, scoring='accuracy', n_jobs=-1, verbose=2, random_state=42)

random_search.fit(Xtrain, ytrain)

print("Best parameters found: ", random_search.best_params_)
print("Best score found: ", random_search.best_score_)

best_clf1 = random_search.best_estimator_
pred = best_clf1.predict(Xtest)

print(classification_report(ytest, pred))

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best parameters found:  {'subsample': 0.7, 'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.1}
Best score found:  0.8044194850015772
              precision    recall  f1-score   support

           0       0.81      0.79      0.80      1079
           1       0.80      0.82      0.81      1095

    accuracy                           0.80      2174
   macro avg       0.80      0.80      0.80      2174
weighted avg       0.80      0.80      0.80      2174



In [118]:
from sklearn.model_selection import GridSearchCV
clf2 = xgb.XGBClassifier(random_state=42)

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0]
}

grid_search = GridSearchCV(estimator=clf2, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

grid_search.fit(Xtrain, ytrain)

print("Best parameters found: ", grid_search.best_params_)
print("Best score found: ", grid_search.best_score_)

best_clf2 = grid_search.best_estimator_
pred = best_clf2.predict(Xtest)

print(classification_report(ytest, pred))


Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best parameters found:  {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.8}
Best score found:  0.803805988069062
              precision    recall  f1-score   support

           0       0.82      0.79      0.80      1079
           1       0.80      0.82      0.81      1095

    accuracy                           0.81      2174
   macro avg       0.81      0.81      0.81      2174
weighted avg       0.81      0.81      0.81      2174



In [119]:
test = pd.read_csv('test_spaceship.csv')
test['Transported']=1
test.head()


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning,1
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers,1
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus,1
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter,1
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez,1


In [120]:
samp = feat_eng.transform(test)

In [121]:
samp.shape

(4277, 47)

In [122]:
prediction = best_clf2.predict(samp)

In [123]:
test['Transported']=prediction

In [124]:
sub = test[['PassengerId','Transported']]
sub

Unnamed: 0,PassengerId,Transported
0,0013_01,1
1,0018_01,0
2,0019_01,1
3,0021_01,1
4,0023_01,1
...,...,...
4272,9266_02,1
4273,9269_01,0
4274,9271_01,1
4275,9273_01,1


In [125]:
sub.Transported = sub.Transported.map({1:True, 0:False})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub.Transported = sub.Transported.map({1:True, 0:False})


In [126]:
sub.index = sub.PassengerId
sub = sub.drop('PassengerId',axis=1)
sub

Unnamed: 0_level_0,Transported
PassengerId,Unnamed: 1_level_1
0013_01,True
0018_01,False
0019_01,True
0021_01,True
0023_01,True
...,...
9266_02,True
9269_01,False
9271_01,True
9273_01,True


In [127]:
sub.to_csv('space2.csv')