In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

#Data processing
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
import datetime
import math
import statistics
from sklearn.metrics import mean_squared_error
from tqdm import tqdm
from itertools import product
from math import sqrt
import json
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.base import TransformerMixin #gives fit_transform method for free

#Models
from tensorflow import keras
from tensorflow.keras.models import Sequential, model_from_json
from tensorflow.keras.layers import Dense, LSTM, Dropout,Reshape
from sklearn.ensemble import GradientBoostingRegressor, RandomForestClassifier
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import clone_model
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import matthews_corrcoef, balanced_accuracy_score
from sklearn.feature_selection import RFE

#Graphs
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.signal import periodogram
import statsmodels.api as sm
from statsmodels.graphics.tsaplots import plot_pacf

In [None]:
# Seed NumPy's global random number generator so that shuffles and random draws
# made through np.random are reproducible. (Assigning to `np.random_seed` only
# creates an unused attribute on the numpy module and seeds nothing.)
np.random.seed(42)

In [None]:
# Read the three competition files from the Kaggle input directory.
train = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')
test = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')
sample = pd.read_csv('/kaggle/input/spaceship-titanic/sample_submission.csv')

In [None]:
# Separate the target from the features, then hold out 30% of the labelled rows
# as a validation set (fixed random_state so the split is repeatable).
y = train['Transported']
X = train.drop(columns='Transported')
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Number of rows in the training split
len(X_train)

In [None]:
#Next,lets have a look at our data

def data_sum(dataframe):
    """Build a one-row-per-column overview of `dataframe`.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        The frame to describe.

    Returns
    -------
    pandas.DataFrame
        One row per column of `dataframe` with the column name, number of nulls,
        number of non-null values, number of distinct non-null values, percentage
        of nulls and the column dtype.
    """
    row_total = dataframe.shape[0]
    series_per_column = [dataframe[name] for name in dataframe.columns]
    # Null counts are reused for both the count and the percentage columns.
    missing_counts = [series.isnull().sum() for series in series_per_column]

    return pd.DataFrame({
        'Column': dataframe.columns,
        'Nulls': missing_counts,
        'Non-Null Count': [series.count() for series in series_per_column],
        'Unique Values': [series.nunique() for series in series_per_column],
        'Null Percentage': [missing * 100 / row_total for missing in missing_counts],
        'Data Type': [series.dtype for series in series_per_column],
    })

# Summarise the training features (nulls, counts, cardinality, dtypes) and print the table.
X_train_summary=data_sum(X_train)
print(X_train_summary)

In [None]:
# Drop the columns we won't be using from every frame.
unused_columns = ['PassengerId', 'Name']
X_train = X_train.drop(columns=unused_columns)
X_valid = X_valid.drop(columns=unused_columns)
test = test.drop(columns=unused_columns)






In [None]:
# X_train['TotalSpending'] = X_train[['FoodCourt', 'ShoppingMall','Spa', 'VRDeck', 'RoomService']].sum()

In [None]:
# Display the training features after the unused columns were dropped.
X_train

In [None]:
# Pairwise correlations between the numeric features only. X_train still contains
# string columns at this point, and DataFrame.corr() raises on non-numeric data
# in pandas >= 2.0 instead of silently skipping it.
X_train.select_dtypes(include='number').corr()

First, let's split `Cabin` into its deck, num and side components.

In [None]:
# Split the Cabin string on "/" and store the three pieces as Deck, Num and Side.
X_train[["Deck","Num","Side"]]=X_train["Cabin"].str.split("/",expand = True)


In [None]:
# Split the Cabin string on "/" and store the three pieces as Deck, Num and Side.
X_valid[["Deck","Num","Side"]]=X_valid["Cabin"].str.split("/",expand = True)

In [None]:
# Split the Cabin string on "/" and store the three pieces as Deck, Num and Side.
test[["Deck","Num","Side"]]=test["Cabin"].str.split("/",expand = True)

In [None]:
# Cabin has been expanded into Deck/Num/Side, so the raw column is no longer needed.
X_train = X_train.drop(columns='Cabin')

In [None]:
# Cabin has been expanded into Deck/Num/Side, so the raw column is no longer needed.
X_valid = X_valid.drop(columns='Cabin')

In [None]:
# Cabin has been expanded into Deck/Num/Side, so the raw column is no longer needed.
test = test.drop(columns='Cabin')

In [None]:
# Impute Age and the amenity spending columns by linear interpolation between
# neighbouring rows. limit_direction='both' also fills NaNs at the very start of a
# column; the default forward-only direction would leave those missing.
# NOTE(review): interpolation follows row order, it does not model the assumed
# "older passengers spend more" relationship - consider a median or model-based
# imputer if that relationship should drive the fill.
interpolated_columns = ['Age', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'RoomService']
X_train[interpolated_columns] = X_train[interpolated_columns].interpolate(
    method='linear', limit_direction='both'
)

In [None]:
# Impute Age and the amenity spending columns by linear interpolation between
# neighbouring rows. limit_direction='both' also fills NaNs at the very start of a
# column; the default forward-only direction would leave those missing.
# NOTE(review): interpolation follows row order, it does not model the assumed
# "older passengers spend more" relationship - consider a median or model-based
# imputer if that relationship should drive the fill.
interpolated_columns = ['Age', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'RoomService']
test[interpolated_columns] = test[interpolated_columns].interpolate(
    method='linear', limit_direction='both'
)

In [None]:
# Impute Age and the amenity spending columns by linear interpolation between
# neighbouring rows. limit_direction='both' also fills NaNs at the very start of a
# column; the default forward-only direction would leave those missing.
# NOTE(review): interpolation follows row order, it does not model the assumed
# "older passengers spend more" relationship - consider a median or model-based
# imputer if that relationship should drive the fill.
interpolated_columns = ['Age', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'RoomService']
X_valid[interpolated_columns] = X_valid[interpolated_columns].interpolate(
    method='linear', limit_direction='both'
)

In [None]:
# Re-run the column summary on X_train to see what the frame looks like now,
# after the Cabin split and the numeric interpolation.
X_train_2_summary=data_sum(X_train)
X_train_2_summary

Looks like we have completed the imputation of the numerical features. Now let's work on the categorical features.



Note: `SimpleImputer` (`from sklearn.impute import SimpleImputer`) with the constant strategy is a better fit for columns that will be one-hot encoded.


In [None]:
# Boolean flag per column: True where the column is stored with dtype 'object'.
categorical = X_train.dtypes.eq('object')
categorical

Look at the columns with categorical data.
Starting from HomePlanet.

Aus help to put some imputation for Arden
Arden - Above
Austin - Below

In [None]:
# New feature: the row-wise sum of the five amenity spending columns.
spending_columns = ["RoomService", "Spa", "FoodCourt", "VRDeck", 'ShoppingMall']
for frame in (X_train, X_valid, test):
    frame["TotalSpending"] = frame[spending_columns].sum(axis=1)

Let's impute HomePlanet

In [None]:
# Impute missing HomePlanet values with an ordered list of rules.
# Each rule reads: for rows where <column> == <value>, fill missing HomePlanet with <planet>.
# Rules are applied top to bottom, so an earlier rule wins when several match;
# whatever is still missing afterwards defaults to "Earth".
home_planet_rules = [
    ("Destination", "PSO J318.5-22", "Earth"),
    ("Destination", "55 Cancri e", "Europa"),
    ("Destination", "TRAPPIST-1e", "Earth"),
    ("Deck", "A", "Europa"),
    ("Deck", "B", "Europa"),
    ("Deck", "C", "Europa"),
    ("Deck", "T", "Europa"),
    ("Deck", "G", "Earth"),
    ("Destination", "TRAPPIST-1e", "Earth"),
]

# The same rules are applied to the training, validation and test frames.
for frame in (X_train, X_valid, test):
    for key_column, key_value, planet in home_planet_rules:
        matching_rows = frame[key_column] == key_value
        frame.loc[matching_rows, "HomePlanet"] = frame.loc[matching_rows, "HomePlanet"].fillna(planet)
    frame["HomePlanet"] = frame["HomePlanet"].fillna("Earth")



Let's impute CryoSleep

In [None]:
# Impute CryoSleep from TotalSpending, identically for all three frames:
#   * total spending of 5000 or more -> not in cryosleep,
#   * no spending at all             -> in cryosleep,
#   * anything still missing         -> not in cryosleep.
for frame in (X_train, X_valid, test):
    heavy_spender = frame["TotalSpending"] >= 5000
    frame.loc[heavy_spender, "CryoSleep"] = frame.loc[heavy_spender, "CryoSleep"].fillna(False)

    no_spending = frame["TotalSpending"] == 0
    frame.loc[no_spending, "CryoSleep"] = frame.loc[no_spending, "CryoSleep"].fillna(True)

    frame["CryoSleep"] = frame["CryoSleep"].fillna(False)

Let's impute Destination

In [None]:
# Impute missing Destination values, applying the same ordered steps to every frame.
# Compared with the previous copy-pasted version, the deprecated
# `fillna(method='ffill'/'bfill')` calls are replaced by `.ffill().bfill()` and the
# duplicated trailing Earth rule is dropped; the fill results are unchanged.
for frame in (X_train, X_valid, test):
    # NOTE(review): VIP is compared with the boolean False further down and is later
    # filled with booleans, so this string comparison presumably never matches any
    # row - confirm and remove, or change it to the intended condition.
    vip_is_false_string = frame['VIP'] == 'FALSE'
    frame.loc[vip_is_false_string, 'Destination'] = (
        frame.loc[vip_is_false_string, 'Destination'].fillna('TRAPPIST-1e')
    )

    # Earth passengers: copy the destination of the previous Earth row, then of the
    # next one for any leading gaps. The result depends on row order.
    from_earth = frame['HomePlanet'] == 'Earth'
    frame.loc[from_earth, 'Destination'] = frame.loc[from_earth, 'Destination'].ffill().bfill()

    # Fixed destination per home planet; the Earth entry only matters when no Earth
    # row had a known destination to propagate.
    for planet, destination in (
        ('Europa', '55 Cancri e'),
        ('Mars', 'TRAPPIST-1e'),
        ('Earth', 'TRAPPIST-1e'),
    ):
        from_planet = frame['HomePlanet'] == planet
        frame.loc[from_planet, 'Destination'] = frame.loc[from_planet, 'Destination'].fillna(destination)

    # Element-wise comparison with False is intentional here (`is False` would not
    # work on a Series).
    not_vip = frame['VIP'] == False
    frame.loc[not_vip, 'Destination'] = frame.loc[not_vip, 'Destination'].fillna('TRAPPIST-1e')

Let's impute VIP

In [None]:
# Impute VIP from TotalSpending in every frame: passengers who spent 5000 or more
# are assumed to be VIPs, everyone else is assumed not to be.
for frame in (X_train, X_valid, test):
    big_spender = frame['TotalSpending'] >= 5000
    frame.loc[big_spender, 'VIP'] = frame.loc[big_spender, 'VIP'].fillna(True)

    modest_spender = frame['TotalSpending'] < 5000
    frame.loc[modest_spender, 'VIP'] = frame.loc[modest_spender, 'VIP'].fillna(False)

Let's impute Deck

In [None]:
# Deck is unevenly distributed, with decks G and F far more common than the rest,
# so every missing Deck is simply set to 'F'.
for frame in (X_train, X_valid, test):
    frame['Deck'] = frame['Deck'].fillna('F')

Let's change the data type of Num, then impute it

In [None]:
# Convert Num to a numeric feature and impute its nulls.
#
# Num has too many distinct values for one constant to be a sensible fill, so each
# missing entry receives a value drawn at random from the Num values observed in
# the training split.
#
# Why the previous version left nulls behind:
#   * `Series.fillna(other_series)` aligns on index labels, so only nulls whose
#     label happened to exist in the shuffled Series' 0..n-1 index were filled;
#   * `num_values` was overwritten three times, so the pool that was actually used
#     came from the test set for every frame;
#   * the shuffle was unseeded, so results changed from run to run.
# Here the draws are assigned positionally to the null rows, come from the training
# split only, and use a seeded generator.
num_rng = np.random.default_rng(42)

# Strip stray spaces and convert the string cabin numbers to floats.
for frame in (X_train, X_valid, test):
    frame['Num'] = frame['Num'].str.replace(' ', '').astype(float)

# Pool of observed values to draw replacements from (training split only).
num_values = X_train['Num'].dropna().values

for frame_name, frame in (('X_train', X_train), ('X_valid', X_valid), ('test', test)):
    missing = frame['Num'].isna()
    print(f"{frame_name}: there are initially {missing.sum()} nulls")
    if missing.any():
        if num_values.size:
            frame.loc[missing, 'Num'] = num_rng.choice(num_values, size=int(missing.sum()))
        else:
            # No observed value to sample from: fall back to the constant used before.
            frame.loc[missing, 'Num'] = 82
    print(f"{frame_name}: now there are {frame['Num'].isna().sum()} nulls")
    



Let's impute Side

In [None]:
# For Side, 'S' and 'P' are almost equally common, so the nulls are split evenly
# between the two values.
#
# The number of nulls is measured per frame instead of being hard-coded (71/28/50):
# a fixed sample size raises when fewer nulls are present and leaves nulls behind
# when more are present. A seeded generator makes the assignment reproducible.
side_rng = np.random.default_rng(42)

for frame in (X_train, X_valid, test):
    # permutation() returns a shuffled copy, so the frame's index is not modified.
    missing_labels = side_rng.permutation(frame.index[frame['Side'].isna()].to_numpy())
    # With an odd number of nulls the extra row goes to 'S'.
    half = (len(missing_labels) + 1) // 2
    frame.loc[missing_labels[:half], 'Side'] = 'S'
    frame.loc[missing_labels[half:], 'Side'] = 'P'

In [None]:
# Any Side value still missing in the validation frame is set to "S".
X_valid.loc[X_valid["Side"].isna(), "Side"] = "S"

In [None]:
# Column summary after imputation. Note that, despite the variable name, this
# summarises the validation frame (X_valid), not X_train.
X_train_3_summary=data_sum(X_valid)
X_train_3_summary

Imputation complete
<br>
Now lets encode
<br>
Cardinality:
<br>
Low: 2-5
<br>
Moderate: 6-46
<br>
High: 50 and above

HomePlanet:Label Encoding
<br>
CryoSleep:Label Encoding
<br>
Destination:Label Encoding
<br>
VIP:Label Encoding
<br>
Deck:OHE
<br>
Side:Label Encoding
<br>
Remaining numerical features:Standardization

In [None]:

class MyLabelEncoder(TransformerMixin):
    """Label-encode every column of a 2-D input independently.

    `LabelEncoder` only accepts 1-D input, so this wrapper keeps one fitted
    `LabelEncoder` per column. The encoders learned in `fit` are reused by
    `transform`, which allows the mapping learned on one dataset to be applied
    to another.

    Parameters
    ----------
    *args, **kwargs
        Forwarded to every `LabelEncoder` that is created.

    Attributes
    ----------
    encoders_ : list of LabelEncoder
        One fitted encoder per input column, in column order (empty before `fit`).
    encoder : LabelEncoder
        Kept for backward compatibility; after `fit` it is the encoder of the
        last column.
    """

    def __init__(self, *args, **kwargs):
        self._encoder_args = args
        self._encoder_kwargs = kwargs
        self.encoder = LabelEncoder(*args, **kwargs)
        self.encoders_ = []

    @staticmethod
    def _split_columns(x):
        """Return the columns of a DataFrame or array-like `x` as a list of 1-D arrays."""
        if hasattr(x, 'iloc'):
            return [x.iloc[:, position].to_numpy() for position in range(x.shape[1])]
        values = np.asarray(x)
        if values.ndim == 1:
            # Treat a 1-D input as a single column.
            values = values.reshape(-1, 1)
        return [values[:, position] for position in range(values.shape[1])]

    def fit(self, x, y=0):
        """Fit one `LabelEncoder` per column of `x`; `y` is ignored. Returns `self`."""
        self.encoders_ = [
            LabelEncoder(*self._encoder_args, **self._encoder_kwargs).fit(column)
            for column in self._split_columns(x)
        ]
        if self.encoders_:
            self.encoder = self.encoders_[-1]
        return self

    def transform(self, x, y=0):
        """Encode `x` with the fitted per-column encoders.

        Returns an integer array of shape (n_rows, n_columns).

        Raises
        ------
        ValueError
            If `x` does not have the same number of columns as the data passed to
            `fit`, or (from `LabelEncoder`) if a column holds a label not seen
            during `fit`.
        """
        columns = self._split_columns(x)
        if len(columns) != len(self.encoders_):
            raise ValueError(
                f"Expected {len(self.encoders_)} columns (as seen in fit), got {len(columns)}"
            )
        if not columns:
            return np.empty((len(x), 0), dtype=int)
        return np.column_stack([
            encoder.transform(column) for encoder, column in zip(self.encoders_, columns)
        ])

    def fit_transform(self, x, y=0):
        """Fit the per-column encoders on `x`, then return the encoded `x`."""
        return self.fit(x, y).transform(x)
        
        
            
            
            
            
        
            
        

In [None]:
# OrdinalEncoder produces the same sorted integer codes as a per-column LabelEncoder,
# but remembers the mapping so it can be applied to the validation and test frames.
from sklearn.preprocessing import OrdinalEncoder

# Feature groups:
#   l_c_c_f - low-cardinality categoricals    -> integer codes
#   m_c_c_f - moderate-cardinality categorical -> one-hot encoding
#   n_f     - numerical features               -> standardisation
l_c_c_f=["HomePlanet", "CryoSleep", "Destination", "VIP", "Side"]
m_c_c_f=['Deck']
n_f=X_train.select_dtypes(include=['float64', 'int64']).columns.tolist()

# Create transformers for categorical features
l_c_categorical_transformer = Pipeline(steps=[
    # Categories never seen in the training data are encoded as -1 instead of raising.
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

m_c_categorical_transformer = Pipeline(steps=[
    ('encoder',OneHotEncoder(handle_unknown='ignore'))
])

# Create transformers for numerical features
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Combine transformers using ColumnTransformer.
# sparse_threshold=0 forces a dense array so it can be wrapped in a DataFrame.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, n_f),
        ('m_c_cat', m_c_categorical_transformer, m_c_c_f),
        ('l_c_cat', l_c_categorical_transformer, l_c_c_f)
    ],
    sparse_threshold=0)

# Create the preprocessing pipeline
preprocessing_pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit the preprocessing on the training data only.
encoded_data_X_train = preprocessing_pipeline.fit_transform(X_train)
print(encoded_data_X_train.shape)

# Name the one-hot columns after the deck categories the encoder actually learned,
# rather than hard-coding the list of decks.
fitted_preprocessor = preprocessing_pipeline.named_steps['preprocessor']
deck_categories = fitted_preprocessor.named_transformers_['m_c_cat'].named_steps['encoder'].categories_[0]
columns=np.concatenate((n_f, [str(deck) for deck in deck_categories], l_c_c_f))

# Convert the transformed data back to a DataFrame
X_train_encoded = pd.DataFrame(encoded_data_X_train,columns=columns)

# Apply the already-fitted pipeline to X_valid and test. Re-fitting on each frame
# (as before) scales and encodes them differently from the training data and
# yields a different number of deck columns when a deck is absent.
encoded_data_X_valid = preprocessing_pipeline.transform(X_valid)
print(encoded_data_X_valid.shape)
X_valid = pd.DataFrame(encoded_data_X_valid,columns=columns)

encoded_data_test = preprocessing_pipeline.transform(test)
test = pd.DataFrame(encoded_data_test,columns=columns)

In [None]:
# Display the encoded training features.
X_train_encoded

Fantastic, we are now done with encoding, so let's move on to polynomial features. We don't know which degree is optimal, so let's create feature sets for several degrees and see which degree performs best.

In [None]:
# Search over polynomial degree and, within each degree, over the number of features kept.
#
# For every degree the features are expanded with PolynomialFeatures, then recursive
# feature elimination (RFE) with a random forest removes one feature at a time. After
# each removal the forest fitted on the surviving features is scored on the validation
# set with the Matthews correlation coefficient (MCC).
#
# Changes from the previous version of this cell:
#   * the loop body existed twice (degree 3 alone, then degrees 1-3 after resetting
#     the result lists); it now runs once for degrees 1..max_degree;
#   * the plot's x-axis was shifted by one (a score for k features was drawn at k-1);
#   * RFE already fits a forest on the selected features (`rfe.estimator_`), so the
#     extra `rf_classifier.fit` on the same data with the same seed was redundant.
# NOTE(review): one elimination step per feature is very expensive at degree 3;
# consider RFE's `step` parameter if the runtime is a problem.
optimal_num_of_features=[]
best_scores_for_each_degree=[]

max_degree=3 #Theres already 2024 features when degree is 3

for degree in range(1,max_degree+1):
    poly = PolynomialFeatures(degree=degree)
    X_train_poly=poly.fit_transform(X_train_encoded)
    X_valid_poly=poly.transform(X_valid)
    max_num_of_features=X_train_poly.shape[1]
    rf_classifier = RandomForestClassifier(random_state=42)
    val_scores=[]
    new_X_train=X_train_poly
    new_X_valid=X_valid_poly
    # Feature counts that get scored, in evaluation order: from all-but-one column
    # down to two columns.
    full_features=list(range(max_num_of_features-1, 1, -1))

    for num_of_features in tqdm(full_features, desc=f"Degree {degree}"):
        rfe = RFE(estimator=rf_classifier, n_features_to_select=num_of_features)
        rfe.fit(new_X_train, y_train)
        selected_features = rfe.support_
        # Keep only the surviving columns for the next elimination round.
        new_X_train=new_X_train[:,selected_features]
        new_X_valid=new_X_valid[:,selected_features]
        # rfe.estimator_ is the forest RFE fitted on exactly these selected features.
        y_pred=rfe.estimator_.predict(new_X_valid)
        mcc = matthews_corrcoef(y_valid, y_pred)
        val_scores.append(mcc)

    fig, ax = plt.subplots()
    sns.scatterplot(x=full_features, y=val_scores, ax=ax)
    ax.set_xlabel('Number of features')
    ax.set_ylabel('Validation Scores')
    ax.set_title('Variation of MCC with number of features')
    plt.show()

    # First occurrence of the best score, i.e. the largest feature set achieving it.
    max_index = val_scores.index(max(val_scores))
    best_number_of_features=full_features[max_index]
    optimal_num_of_features.append(best_number_of_features)
    best_scores_for_each_degree.append(max(val_scores))
    print(f"Degree {degree} done")

print(optimal_num_of_features, best_scores_for_each_degree)

Let's start choosing our models. I think random forest is one good option to try out; the cells below first tune a logistic regression with a grid search.

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyper-parameter grid explored by the grid search.
param_grid = dict(
    C=[0.001, 0.01, 0.1, 1, 10, 100],  # regularization parameter
    penalty=['l1', 'l2'],              # regularization type
    solver=['liblinear', 'saga'],      # optimization algorithm
)

In [None]:
# Grid-search logistic regression with 5-fold cross-validated accuracy.
# random_state makes the stochastic solvers in the grid repeatable, and the larger
# max_iter gives them more room to converge than the default of 100 iterations.
logreg = LogisticRegression(max_iter=1000, random_state=42)
grid_search = GridSearchCV(logreg, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_encoded, y_train)

In [None]:
# Pull the winning estimator and its parameters out of the finished grid search.
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
best_model

In [None]:
from sklearn.metrics import accuracy_score

# Compare logistic regression across polynomial expansions of the encoded features
# and remember the degree with the highest validation accuracy.
degrees = [1, 2, 3]  # polynomial degrees to try

best_degree, best_accuracy = None, 0

for degree in degrees:
    # Expand the features to the current degree (fitted on the training data only).
    poly = PolynomialFeatures(degree=degree)
    X_train_poly = poly.fit_transform(X_train_encoded)
    X_valid_poly = poly.transform(X_valid)

    # Train on the expanded training set and score on the validation set.
    model = LogisticRegression().fit(X_train_poly, y_train)
    y_pred = model.predict(X_valid_poly)
    accuracy = accuracy_score(y_valid, y_pred)

    # Only a strictly better score replaces the current best, so ties keep the lower degree.
    if best_accuracy < accuracy:
        best_degree, best_accuracy = degree, accuracy

    print(f"Degree {degree} Accuracy: {accuracy}")

print(f"Best Degree: {best_degree} (Accuracy: {best_accuracy})")


So, according to the polynomial-degree comparison above, degree 2 yields the highest accuracy.