In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the training CSV into a DataFrame.
train_df = pd.read_csv("train (4).csv")
# pop() removes the 'Transported' column from train_df and returns it, so the
# features and the target live in separate objects from here on.
train_labels = train_df.pop('Transported')

In [3]:
# Remove the Name column from the training frame (mutates train_df in place).
train_df.drop(columns=["Name"], inplace=True)

In [4]:
# Derive three new columns (deck, num, side) from the Cabin string.
# The pattern has no separator between its groups: it matches one word
# character, then one or more digits, then one more word character, all
# directly adjacent. Rows where no such run exists get NaN in all three columns.
# NOTE(review): if Cabin values are delimited (e.g. "deck/num/side" - TODO
# confirm against the CSV), this pattern captures pieces of the number rather
# than the intended parts. The same pattern is applied to test_df, and these
# columns are extracted as strings and later ordinal-encoded, so any change has
# to be made in both places together.
train_df[['deck', 'num', 'side']] = train_df['Cabin'].str.extract(r'(\w)(\d+)(\w)')

In [5]:
# The raw Cabin string is no longer needed once its parts have been extracted.
train_df.drop(columns=["Cabin"], inplace=True)

In [6]:
# Partition the column names by dtype: bool/object columns are treated as
# categorical, every other column as numeric.
non_numeric_dtypes = ["bool_", "object_"]
categorical_cols = train_df.select_dtypes(include=non_numeric_dtypes).columns
numeric_cols = train_df.select_dtypes(exclude=non_numeric_dtypes).columns

In [7]:
# PassengerId is kept out of the categorical list so it is not encoded or
# imputed; it is parsed separately later on.
categorical_cols = categorical_cols.drop(labels=["PassengerId"])

In [8]:
# Fit an ordinal encoder on the categorical columns and overwrite those columns
# with their encoded values. The fitted encoder is reused for the test data.
encoder = OrdinalEncoder()
encoded_categoricals = encoder.fit_transform(train_df[categorical_cols])
train_df[categorical_cols] = encoded_categoricals

In [9]:
# Total number of missing cells in the frame: per-column NaN counts, summed.
train_df.isna().sum().sum()

6515

In [10]:
# Fill missing values in the numeric columns with an iterative imputer fitted
# on the training data; the fitted imputer is reused for the test data.
iterative_imputer = IterativeImputer()
imputed_numeric = iterative_imputer.fit_transform(train_df[numeric_cols])
train_df[numeric_cols] = pd.DataFrame(imputed_numeric, columns=numeric_cols)

In [11]:
# Fill missing values in the (already encoded) categorical columns with each
# column's most frequent value.
categorical_imputer = SimpleImputer(strategy="most_frequent")
imputed_categoricals = categorical_imputer.fit_transform(train_df[categorical_cols])
train_df[categorical_cols] = pd.DataFrame(imputed_categoricals, columns=categorical_cols)

In [12]:
# Re-count missing cells after both imputation steps.
train_df.isna().sum().sum()

0

In [13]:
# The part of PassengerId before the first "_" becomes a numeric "group" column.
group_ids = train_df["PassengerId"].str.split("_").str[0]
train_df["group"] = pd.to_numeric(group_ids)

In [14]:
# PassengerId has been reduced to "group", so the raw identifier is removed.
train_df.drop(columns="PassengerId", inplace=True)

In [15]:
# Standardise the numeric columns and store the result in new "<name>_scaled"
# columns; the original columns are left in place next to them.
scaler = StandardScaler()
new_col_names = [f"{col}_scaled" for col in numeric_cols]

scaled_values = scaler.fit_transform(train_df[numeric_cols])
train_df[new_col_names] = scaled_values

Score all features, including the new scaled columns, by mutual information with the target

In [16]:
# Estimate the mutual information between every feature column and the target.
# random_state pins the estimator's internal randomness so the ranking is the
# same on every run.
mi_scores = mutual_info_classif(train_df, train_labels, random_state=42)
# Label the scores with their column names and order them from most to least
# informative.
mi_scores = pd.Series(mi_scores, name="MI Scores", index=train_df.columns)
mi_scores = mi_scores.sort_values(ascending=False)
mi_scores

CryoSleep              0.118112
Spa                    0.079575
RoomService            0.075638
Spa_scaled             0.074728
RoomService_scaled     0.072656
VRDeck                 0.068725
ShoppingMall_scaled    0.056922
VRDeck_scaled          0.056830
ShoppingMall           0.054691
FoodCourt              0.046991
FoodCourt_scaled       0.046441
HomePlanet             0.031857
group                  0.022129
Age_scaled             0.016785
Destination            0.014282
Age                    0.011469
deck                   0.006273
num                    0.004650
VIP                    0.001721
side                   0.000000
Name: MI Scores, dtype: float64

In [17]:
# Remove the Destination and VIP columns from the feature set (in place).
train_df.drop(columns=["Destination", "VIP"], inplace=True)

In [18]:
# Hold out 20% of the rows for validation. The split is seeded so that every
# model below is trained and scored on the same rows from run to run, which
# keeps their accuracy figures comparable.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_df, train_labels, train_size=0.8, random_state=42
)

In [19]:
# Random forest baseline. random_state fixes the forest's internal randomness
# so the reported validation accuracy is reproducible.
rf = RandomForestClassifier(n_estimators=1000, random_state=42)
rf.fit(X_train, y_train)

print(accuracy_score(y_valid, rf.predict(X_valid)))

0.7947096032202415


In [20]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression baseline, trained on the training split and scored on the
# validation split.
lr = LogisticRegression(max_iter=3000, random_state=42)
lr.fit(X_train, y_train)

lr_valid_preds = lr.predict(X_valid)
print(accuracy_score(y_valid, lr_valid_preds))

0.7929844738355377


In [21]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosting baseline. random_state is set so repeated runs give the
# same model and the same validation accuracy.
gbc = GradientBoostingClassifier(n_estimators=200, random_state=42)
gbc.fit(X_train, y_train)

print(accuracy_score(y_valid, gbc.predict(X_valid)))

0.8148361127084531


In [22]:
from xgboost import XGBClassifier

class CustomXGBClassifier(XGBClassifier):
    """XGBClassifier that stores an evaluation set and uses it on every fit.

    This lets the estimator be fitted through the plain ``fit(X, y)`` call,
    while still supplying an ``eval_set`` to the underlying XGBoost ``fit``.

    Parameters
    ----------
    **params
        Keyword arguments forwarded unchanged to ``XGBClassifier``. An optional
        ``eval_set`` entry is additionally kept on the instance and passed to
        ``fit``.
    """

    def __init__(self, **params):
        # All keywords, eval_set included, go to the parent constructor exactly
        # as before. XGBoost reports eval_set as an unused parameter when
        # training starts.
        # NOTE(review): keeping eval_set among the parent's parameters is
        # presumably what lets sklearn's clone() carry it into the copies that
        # a VotingClassifier fits - confirm before removing it from this call.
        super().__init__(**params)
        # .get() makes eval_set optional instead of raising a bare KeyError.
        self.eval_set = params.get('eval_set')

    def fit(self, X, y):
        """Fit on ``(X, y)`` using the stored ``eval_set``.

        Progress is logged every 100 boosting rounds.

        Returns
        -------
        self
            The fitted estimator, as the scikit-learn ``fit`` contract expects.
        """
        super().fit(X, y, eval_set=self.eval_set, verbose=100)
        return self

In [23]:
# XGBoost model with a small learning rate; the validation split is supplied as
# the evaluation set so early stopping can monitor it.
xgb = CustomXGBClassifier(
    n_estimators=1000,
    learning_rate=0.01,
    eval_set=[(X_valid, y_valid)],
    early_stopping_rounds=40,
    objective='binary:logistic',
)
xgb.fit(X_train, y_train)

xgb_valid_preds = xgb.predict(X_valid)
print(accuracy_score(y_valid, xgb_valid_preds))

Parameters: { "eval_set" } are not used.

[0]	validation_0-logloss:0.68904


[100]	validation_0-logloss:0.49035
[200]	validation_0-logloss:0.43830
[300]	validation_0-logloss:0.41916
[400]	validation_0-logloss:0.41343
[500]	validation_0-logloss:0.41094
[600]	validation_0-logloss:0.40940
[700]	validation_0-logloss:0.40851
[800]	validation_0-logloss:0.40756
[900]	validation_0-logloss:0.40725
[999]	validation_0-logloss:0.40702
0.8085106382978723


In [24]:
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble over the four classifiers defined in the cells above.
base_estimators = [
    ('xgb', xgb),
    ('rf', rf),
    ('lr', lr),
    ('gbc', gbc),
]
model = VotingClassifier(estimators=base_estimators, voting='soft')

model.fit(X_train, y_train)

ensemble_valid_preds = model.predict(X_valid)
print(accuracy_score(y_valid, ensemble_valid_preds))

Parameters: { "eval_set" } are not used.

[0]	validation_0-logloss:0.68904


[100]	validation_0-logloss:0.49035
[200]	validation_0-logloss:0.43830
[300]	validation_0-logloss:0.41916
[400]	validation_0-logloss:0.41343
[500]	validation_0-logloss:0.41094
[600]	validation_0-logloss:0.40940
[700]	validation_0-logloss:0.40851
[800]	validation_0-logloss:0.40756
[900]	validation_0-logloss:0.40725
[999]	validation_0-logloss:0.40702
0.8108108108108109


**Submit**

In [25]:
# Read the test CSV; the cells below repeat the training preprocessing on it
# using the transformers that were already fitted on the training data.
test_df = pd.read_csv("test (2).csv")

In [26]:
# Derive deck / num / side from Cabin with the same pattern used for train_df.
# The pattern has no separator between its groups, so it only matches one word
# character, one or more digits and one more word character that are directly
# adjacent; non-matching rows get NaN.
# NOTE(review): if Cabin values are delimited (e.g. "deck/num/side" - TODO
# confirm against the CSV), this does not capture the intended parts. Any change
# must be mirrored in the training cell, because the encoder applied next was
# fitted on the training version of these columns.
test_df[['deck', 'num', 'side']] = test_df['Cabin'].str.extract(r'(\w)(\d+)(\w)')

In [27]:
# The raw Cabin string is no longer needed once its parts have been extracted.
test_df.drop(columns=["Cabin"], inplace=True)

In [28]:
# Apply the encoder fitted on the training data (transform only, no refit).
encoded_test_categoricals = encoder.transform(test_df[categorical_cols])
test_df[categorical_cols] = encoded_test_categoricals

In [29]:
# Fill missing test values with the imputers fitted on the training data.
imputed_test_numeric = iterative_imputer.transform(test_df[numeric_cols])
test_df[numeric_cols] = pd.DataFrame(imputed_test_numeric, columns=numeric_cols)

imputed_test_categoricals = categorical_imputer.transform(test_df[categorical_cols])
test_df[categorical_cols] = pd.DataFrame(imputed_test_categoricals, columns=categorical_cols)

In [30]:
# Same "group" feature as in training: the numeric prefix of PassengerId.
test_group_ids = test_df["PassengerId"].str.split("_").str[0]
test_df["group"] = pd.to_numeric(test_group_ids)

In [31]:
# Add the "<name>_scaled" columns using the scaler fitted on the training data.
scaled_test_values = scaler.transform(test_df[numeric_cols])
test_df[new_col_names] = scaled_test_values

In [32]:
# Select exactly the training feature columns, in the training order, then turn
# each predicted label into a boolean.
test_features = test_df[train_df.columns]
preds = [label == 1 for label in model.predict(test_features)]

In [33]:
# Two-column submission table: the passenger identifier and its prediction.
submission_columns = {
    "PassengerId": test_df["PassengerId"],
    "Transported": preds,
}
submission_df = pd.DataFrame(submission_columns)

In [34]:
# Write the submission to disk; index=False keeps the DataFrame index out of
# the file so it contains only the two named columns.
submission_df.to_csv("submission_v2.csv", index=False)