In [55]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import (
    RandomForestClassifier,
    AdaBoostClassifier
)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay
)
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
pd.set_option('future.no_silent_downcasting', True)

In [56]:

class AgeTransfomer(BaseEstimator, TransformerMixin): 
    def __init__(self): 
        pass   
    def fit(self, X, y=None):
        return self  
    def transform(self, X):
        df=X.copy() 
        required_cols = ['Age']
        if 'Age' not in df.columns:
            raise ValueError("what are you doing man?")
        if  df["Age"].isna().sum()!=0: 
            df['Age'] = df['Age'].fillna(df['Age'].mean()) 
        df["Is.Minor"]=df["Age"].apply(lambda x : 1 if x<18 else 0) 
        df["Is.Senior"]=df["Age"].apply(lambda x : 1 if x>60 else 0) 
        Age_normalized = (df["Age"] - df["Age"].mean()) / df["Age"].std()

        OnehotAge_df = pd.concat([df[["Is.Minor"]], df[["Is.Senior"]]], axis=1)
        combined_df = pd.concat([Age_normalized, OnehotAge_df], axis=1)
        return combined_df.to_numpy()

class SpendTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    
    def fit(self, X, y=None):
        self.means_ = X[self.cols].mean()
        return self

    def transform(self, X):
        df = X.copy()

        for col in self.cols:
            if col not in df.columns:
                raise ValueError(f"Missing column: {col}")
        
        for col in self.cols:
            df[col] = df[col].fillna(self.means_[col])

        df['TotalSpend'] = df[self.cols].sum(axis=1)
        total_mean = df['TotalSpend'].mean()
        total_std = df['TotalSpend'].std()
        df['TotalSpendStd'] = (df['TotalSpend'] - total_mean) / total_std

        return df[['TotalSpendStd']].to_numpy()

class CategoricalTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.cols = ['VIP', 'HomePlanet', 'CryoSleep', 'Destination']
        self.fill_values = {}

    def fit(self, X, y=None):
        X_copy = X.copy()

        for col in self.cols:
            if col not in X_copy.columns:
                raise ValueError(f"Missing column: {col}")
            

            if X_copy[col].dtype == 'bool':
                self.fill_values[col] = X_copy[col].median()
            else:
                self.fill_values[col] = X_copy[col].mode()[0]
        return self

    def transform(self, X):
        df = X.copy()
        for col in self.cols:
            df[col] = df[col].fillna(self.fill_values[col])
        df_encoded = pd.get_dummies(df[self.cols], drop_first=True)

        return df_encoded.to_numpy()

class CabinTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.onehot = OneHotEncoder(drop='first', sparse_output=False)
        self.scaler = StandardScaler()
        
    def fit(self, X, y=None):
        df = X.copy()
        df['Cabin'] = df['Cabin'].fillna("0/0/0")
        df[['Deck', 'Num', 'Side']] = df['Cabin'].apply(lambda x: pd.Series(str(x).split('/')))
        df['Num'] = df['Num'].astype(float)
        
        self.onehot.fit(df[['Deck', 'Side']])
        self.scaler.fit(df[['Num']])
        return self

    def transform(self, X):
        df = X.copy()
        df['Cabin'] = df['Cabin'].fillna("0/0/0")
        df[['Deck', 'Num', 'Side']] = df['Cabin'].apply(lambda x: pd.Series(str(x).split('/')))
        df['Num'] = df['Num'].astype(float)
        onehot_encoded = self.onehot.transform(df[['Deck', 'Side']])
        num_scaled = self.scaler.transform(df[['Num']])
        return np.hstack([num_scaled, onehot_encoded])

        
def TotalTransformer():
        column_transformer = ColumnTransformer([
        ('cabin', CabinTransformer(),['Cabin']), 
        ("cat",CategoricalTransformer(),['VIP', 'HomePlanet', 'CryoSleep', 'Destination']),
        ("spend",SpendTransformer(),['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']),
        ("age", AgeTransfomer(),['Age'])], remainder='drop')
        return column_transformer

In [57]:

def RF_Search(X,y):
    trans=TotalTransformer()
    trans.fit(X) 
    x=trans.transform(X)
    param_grid_rf = {
    'n_estimators': [40,60,80],
    'max_depth': [5,6,7],
    'class_weight': ['balanced'], 
    'max_features': ['sqrt','log2']
    }
    model=RandomForestClassifier()
    grid_search_rf = GridSearchCV(model, param_grid_rf, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search_rf.fit(x,y)
    best_params=grid_search_rf.best_params_
    return best_params

In [58]:
df=pd.read_csv("train.csv") 
X=df.drop(columns=["Transported"],axis=1).copy() 
y=df["Transported"] 
X_train,X_test,y_train,y_test=train_test_split(X,y,stratify=y)
print(RF_Search(X_train,y_train))

{'class_weight': 'balanced', 'max_depth': 7, 'max_features': 'sqrt', 'n_estimators': 40}


In [59]:
trans=TotalTransformer()
model=RandomForestClassifier(class_weight="balanced",max_depth=7,max_features='sqrt',n_estimators=40)
Model=Pipeline([ ("transformer",trans),("mode",model)])
Model.fit(X_train,y_train) 
y_pred=Model.predict(X_test) 
print("Classification Report:\n", classification_report(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

       False       0.72      0.82      0.77      1079
        True       0.79      0.69      0.74      1095

    accuracy                           0.75      2174
   macro avg       0.76      0.75      0.75      2174
weighted avg       0.76      0.75      0.75      2174



In [60]:

def Ada_Search(X,y):
    trans=TotalTransformer()
    trans.fit(X) 
    x=trans.transform(X)
    param_grid_ada = {
    'n_estimators': [80,120],
    'learning_rate': [0.5,0.1],
    'estimator': [DecisionTreeClassifier(max_depth=4), 
                          DecisionTreeClassifier(max_depth=5),DecisionTreeClassifier(max_depth=5)]
    }
    model=AdaBoostClassifier()
    grid_search_rf = GridSearchCV(model, param_grid_ada, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search_rf.fit(x,y)
    best_params=grid_search_rf.best_params_
    return best_params
print(Ada_Search(X_train,y_train))


{'estimator': DecisionTreeClassifier(max_depth=5), 'learning_rate': 0.1, 'n_estimators': 120}


In [61]:
model=AdaBoostClassifier(learning_rate= 0.1, n_estimators=80,estimator=DecisionTreeClassifier(max_depth=5))
Model=Pipeline([ ("transformer",trans),("mode",model)])
Model.fit(X_train,y_train) 
y_pred=Model.predict(X_test) 
print("Classification Report:\n", classification_report(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

       False       0.72      0.84      0.77      1079
        True       0.81      0.67      0.73      1095

    accuracy                           0.75      2174
   macro avg       0.76      0.75      0.75      2174
weighted avg       0.76      0.75      0.75      2174



In [62]:


def Logistic_Search(X, y):
    trans = TotalTransformer()
    trans.fit(X)
    x = trans.transform(X)
    
    param_grid_log = {
        'C': [0.01, 0.1, 1.0, 10.0],           
        'penalty': ['l2', 'l1'],                
        'solver': ['liblinear'],       
        'max_iter': [100, 200]                 
    }
    
    model = LogisticRegression()
    grid_search_log = GridSearchCV(model, param_grid_log, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search_log.fit(x, y)
    best_params = grid_search_log.best_params_
    
    return best_params

In [63]:
model=LogisticRegression(C= 0.01,max_iter=100,penalty='l2')
Model=Pipeline([ ("transformer",trans),("mode",model)])
Model.fit(X_train,y_train) 
y_pred=Model.predict(X_test) 
print("Classification Report:\n", classification_report(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

       False       0.70      0.83      0.76      1079
        True       0.79      0.65      0.72      1095

    accuracy                           0.74      2174
   macro avg       0.75      0.74      0.74      2174
weighted avg       0.75      0.74      0.74      2174

