# Import libraries

In [24]:
# Importing necessary libraries
import numpy as np
import pandas as pd
import re
import requests
import zipfile
from io import BytesIO
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.experimental import enable_iterative_imputer  # Enable experimental feature
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from scipy.stats import chi2_contingency
from sklearn.linear_model import Ridge

# Data understanding

In [25]:
# Import dataset
train_df = pd.read_csv("C:/Git_files/Obesity_ML_case/train.csv")
test_df = pd.read_csv("C:/Git_files/Obesity_ML_case/test.csv")

print(train_df.shape)
print(test_df.shape)

print("Train dataframe columns: ", train_df.columns)
print("Test dataframe columns: ", test_df.columns)

(891, 12)
(418, 11)
Train dataframe columns:  Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
Test dataframe columns:  Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


In [26]:
# Create separate survived from dataset
y_df = train_df[['Survived']].copy()
train_only_df = train_df.drop(columns='Survived')
print(train_only_df.shape)
print(test_df.shape)

(891, 11)
(418, 11)


In [27]:
# Concatenate train and test datasets
full_df = pd.concat([train_only_df, test_df], ignore_index=False)

print(full_df.shape)

print(full_df.head())

(1309, 11)
   PassengerId  Pclass                                               Name  \
0            1       3                            Braund, Mr. Owen Harris   
1            2       1  Cumings, Mrs. John Bradley (Florence Briggs Th...   
2            3       3                             Heikkinen, Miss. Laina   
3            4       1       Futrelle, Mrs. Jacques Heath (Lily May Peel)   
4            5       3                           Allen, Mr. William Henry   

      Sex   Age  SibSp  Parch            Ticket     Fare Cabin Embarked  
0    male  22.0      1      0         A/5 21171   7.2500   NaN        S  
1  female  38.0      1      0          PC 17599  71.2833   C85        C  
2  female  26.0      0      0  STON/O2. 3101282   7.9250   NaN        S  
3  female  35.0      1      0            113803  53.1000  C123        S  
4    male  35.0      0      0            373450   8.0500   NaN        S  


In [28]:
# Explore the dataframes

print(full_df.columns)
print(full_df.info())

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
Index: 1309 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Pclass       1309 non-null   int64  
 2   Name         1309 non-null   object 
 3   Sex          1309 non-null   object 
 4   Age          1046 non-null   float64
 5   SibSp        1309 non-null   int64  
 6   Parch        1309 non-null   int64  
 7   Ticket       1309 non-null   object 
 8   Fare         1308 non-null   float64
 9   Cabin        295 non-null    object 
 10  Embarked     1307 non-null   object 
dtypes: float64(2), int64(4), object(5)
memory usage: 122.7+ KB
None


## Data Preparation

In [29]:
# Split name into "Last Name" and "Rest" using ", " as the delimiter
full_df[['lname', 'rest']] = full_df['Name'].str.split(", ", expand=True)

# Further split "Rest" into "Title" and "First Name(s)" using the first space
full_df[['title', 'fnames']] = full_df['rest'].str.split(" ", n=1, expand=True)

# Drop the original "Name" and "Rest" columns (if not needed)
full_df = full_df.drop(columns=['Name', 'rest'])

print(full_df.head())


   PassengerId  Pclass     Sex   Age  SibSp  Parch            Ticket     Fare  \
0            1       3    male  22.0      1      0         A/5 21171   7.2500   
1            2       1  female  38.0      1      0          PC 17599  71.2833   
2            3       3  female  26.0      0      0  STON/O2. 3101282   7.9250   
3            4       1  female  35.0      1      0            113803  53.1000   
4            5       3    male  35.0      0      0            373450   8.0500   

  Cabin Embarked      lname  title                                 fnames  
0   NaN        S     Braund    Mr.                            Owen Harris  
1   C85        C    Cumings   Mrs.  John Bradley (Florence Briggs Thayer)  
2   NaN        S  Heikkinen  Miss.                                  Laina  
3  C123        S   Futrelle   Mrs.          Jacques Heath (Lily May Peel)  
4   NaN        S      Allen    Mr.                          William Henry  


In [30]:
# Get the unique titles
title_counts = full_df['title'].value_counts()
title_df = pd.DataFrame({
    'Title': title_counts.index,
    'Count': title_counts.values})

print(title_df)


        Title  Count
0         Mr.    757
1       Miss.    260
2        Mrs.    197
3     Master.     61
4        Rev.      8
5         Dr.      8
6        Col.      4
7       Mlle.      2
8      Major.      2
9         Ms.      2
10      Lady.      1
11       Sir.      1
12       Mme.      1
13       Don.      1
14      Capt.      1
15        the      1
16  Jonkheer.      1
17      Dona.      1


In [31]:
# Get the unique last names
name_counts = full_df['lname'].value_counts()
name_counts_df = pd.DataFrame({
    'lname': name_counts.index,
    'Count': name_counts.values})

print(name_counts_df[name_counts_df['Count'] == 2])

              lname  Count
105         Larsson      2
106           Angle      2
107            Carr      2
108          Betros      2
109          Snyder      2
..              ...    ...
233         Spencer      2
234            Cook      2
235           Pears      2
236  de Messemaeker      2
237         Christy      2

[133 rows x 2 columns]


In [32]:

# Group by 'lname' and 'Ticket' to find family groups
family_groups = full_df.groupby(['lname', 'Ticket'])

# Create a 'family_size' column to record the size of the family group
full_df['family_size'] = family_groups['lname'].transform('count')  # Selecting 'lname' or any other column

print(full_df.head())

   PassengerId  Pclass     Sex   Age  SibSp  Parch            Ticket     Fare  \
0            1       3    male  22.0      1      0         A/5 21171   7.2500   
1            2       1  female  38.0      1      0          PC 17599  71.2833   
2            3       3  female  26.0      0      0  STON/O2. 3101282   7.9250   
3            4       1  female  35.0      1      0            113803  53.1000   
4            5       3    male  35.0      0      0            373450   8.0500   

  Cabin Embarked      lname  title                                 fnames  \
0   NaN        S     Braund    Mr.                            Owen Harris   
1   C85        C    Cumings   Mrs.  John Bradley (Florence Briggs Thayer)   
2   NaN        S  Heikkinen  Miss.                                  Laina   
3  C123        S   Futrelle   Mrs.          Jacques Heath (Lily May Peel)   
4   NaN        S      Allen    Mr.                          William Henry   

   family_size  
0            1  
1            2  

In [33]:
# Create cabin number for missing values
full_df.loc[full_df.Cabin.isnull(), 'Cabin'] = 'U0'

# Create deck and room number columns
# Extracting the first letter as 'Deck' and the rest as 'Room Number'
full_df['deck'] = full_df['Cabin'].str[0]
full_df['rest2'] = full_df['Cabin'].str[1:]

print(full_df[['deck','rest2']].head(10))

  deck rest2
0    U     0
1    C    85
2    U     0
3    C   123
4    U     0
5    U     0
6    E    46
7    U     0
8    U     0
9    U     0


In [34]:
# add additional elements rest column
full_df['rest2'] = full_df['rest2'] + ' nostr'

# Replace irregular numbers with real number
full_df['rest2'] = full_df['rest2'].str.replace(' G73', '373')
full_df['rest2'] = full_df['rest2'].str.replace(' E69', '269')
full_df['rest2'] = full_df['rest2'].str.replace(' G63', '363')
full_df['rest2'] = full_df['rest2'].str.replace(' E46', '246')
full_df['rest2'] = full_df['rest2'].str.replace(' E57', '257')

# Remove spaces
full_df['rest2'] = full_df['rest2'].str.replace(' ', '')

# Define a function to extract the first numeric element from a string
def extract_first_numeric(s):
    match = re.search(r'\d+', s)
    if match:
        return int(match.group())
    else:
        return 0

# Apply the function to the 'rest2' column to create the 'room' column
full_df['room'] = full_df['rest2'].apply(extract_first_numeric).astype(int)



print(full_df.head())

   PassengerId  Pclass     Sex   Age  SibSp  Parch            Ticket     Fare  \
0            1       3    male  22.0      1      0         A/5 21171   7.2500   
1            2       1  female  38.0      1      0          PC 17599  71.2833   
2            3       3  female  26.0      0      0  STON/O2. 3101282   7.9250   
3            4       1  female  35.0      1      0            113803  53.1000   
4            5       3    male  35.0      0      0            373450   8.0500   

  Cabin Embarked      lname  title                                 fnames  \
0    U0        S     Braund    Mr.                            Owen Harris   
1   C85        C    Cumings   Mrs.  John Bradley (Florence Briggs Thayer)   
2    U0        S  Heikkinen  Miss.                                  Laina   
3  C123        S   Futrelle   Mrs.          Jacques Heath (Lily May Peel)   
4    U0        S      Allen    Mr.                          William Henry   

   family_size deck     rest2  room  
0           

# Group low-occurring, related titles together
full_df.loc[full_df['title'] == 'Jonkheer.', 'title'] = 'Master.'
full_df.loc[full_df['title'].isin(['Ms.', 'Mlle.']), 'title'] = 'Miss.'
full_df.loc[full_df['title'] == 'Mme.', 'title'] = 'Mrs.'
full_df.loc[full_df['title'].isin(['Capt.', 'Don.', 'Major.', 'Col.', 'Sir.']), 'title'] = 'Sir.'
full_df.loc[full_df['title'].isin(['Dona.', 'Lady.', 'the']), 'title'] = 'Lady.'

# Print categories of title
print(pd.DataFrame({
    'Title': full_df['title'].value_counts().index,
    'Count': full_df['title'].value_counts().values
}))

In [35]:
print(full_df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 1309 entries, 0 to 417
Data columns (total 17 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Pclass       1309 non-null   int64  
 2   Sex          1309 non-null   object 
 3   Age          1046 non-null   float64
 4   SibSp        1309 non-null   int64  
 5   Parch        1309 non-null   int64  
 6   Ticket       1309 non-null   object 
 7   Fare         1308 non-null   float64
 8   Cabin        1309 non-null   object 
 9   Embarked     1307 non-null   object 
 10  lname        1309 non-null   object 
 11  title        1309 non-null   object 
 12  fnames       1309 non-null   object 
 13  family_size  1309 non-null   int64  
 14  deck         1309 non-null   object 
 15  rest2        1309 non-null   object 
 16  room         1309 non-null   int32  
dtypes: float64(2), int32(1), int64(5), object(9)
memory usage: 179.0+ KB
None


In [36]:
# Create group IDs based on Ticket number
# Group by 'Ticket' and assign group IDs
full_df['group_id'] = full_df.groupby('Ticket').ngroup().add(1)

# Count the occurrences of each group_id
group_id_counts = full_df['group_id'].value_counts()

# Get the group_ids where count is 1
group_ids_to_replace = group_id_counts[group_id_counts == 1].index.tolist()


print(full_df[['group_id']])

     group_id
0         721
1         817
2         915
3          66
4         650
..        ...
413       712
414       835
415       873
416       580
417       263

[1309 rows x 1 columns]


In [37]:
# Check NAs in each column
print("Number of NAs per column, full_df: ", full_df.isna().sum())
print("Percentage of NAs per column, full_df: ", full_df.isna().mean() * 100)

Number of NAs per column, full_df:  PassengerId      0
Pclass           0
Sex              0
Age            263
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin            0
Embarked         2
lname            0
title            0
fnames           0
family_size      0
deck             0
rest2            0
room             0
group_id         0
dtype: int64
Percentage of NAs per column, full_df:  PassengerId     0.000000
Pclass          0.000000
Sex             0.000000
Age            20.091673
SibSp           0.000000
Parch           0.000000
Ticket          0.000000
Fare            0.076394
Cabin           0.000000
Embarked        0.152788
lname           0.000000
title           0.000000
fnames          0.000000
family_size     0.000000
deck            0.000000
rest2           0.000000
room            0.000000
group_id        0.000000
dtype: float64


In [38]:
# Fill NAs except Age
group_fare = full_df.groupby('group_id')['Fare'].transform('mean')

# Fill missing values in the 'Fare' column with the group fare
full_df['Fare'] = full_df['Fare'].fillna(group_fare)

# Display the DataFrame after filling missing values
print("Number of NAs per column, full_df: ", full_df.isna().sum())
print("Percentage of NAs per column, full_df: ", full_df.isna().mean() * 100)

Number of NAs per column, full_df:  PassengerId      0
Pclass           0
Sex              0
Age            263
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin            0
Embarked         2
lname            0
title            0
fnames           0
family_size      0
deck             0
rest2            0
room             0
group_id         0
dtype: int64
Percentage of NAs per column, full_df:  PassengerId     0.000000
Pclass          0.000000
Sex             0.000000
Age            20.091673
SibSp           0.000000
Parch           0.000000
Ticket          0.000000
Fare            0.076394
Cabin           0.000000
Embarked        0.152788
lname           0.000000
title           0.000000
fnames          0.000000
family_size     0.000000
deck            0.000000
rest2           0.000000
room            0.000000
group_id        0.000000
dtype: float64


In [39]:
# Check Age NAs
age_nas = full_df[full_df['Age'].isna()]
print(age_nas.head(100))

     PassengerId  Pclass     Sex  Age  SibSp  Parch  Ticket     Fare Cabin  \
5              6       3    male  NaN      0      0  330877   8.4583    U0   
17            18       2    male  NaN      0      0  244373  13.0000    U0   
19            20       3  female  NaN      0      0    2649   7.2250    U0   
26            27       3    male  NaN      0      0    2631   7.2250    U0   
28            29       3  female  NaN      0      0  330959   7.8792    U0   
..           ...     ...     ...  ...    ...    ...     ...      ...   ...   
470          471       3    male  NaN      0      0  323592   7.2500    U0   
475          476       1    male  NaN      0      0  110465  52.0000   A14   
481          482       2    male  NaN      0      0  239854   0.0000    U0   
485          486       3  female  NaN      3      1    4133  25.4667    U0   
490          491       3    male  NaN      1      0   65304  19.9667    U0   

    Embarked       lname  title                   fnames  famil

In [40]:
# Drop irrelevant columns
# Create list of columns to drop
columns_to_drop = ['PassengerId', 'Ticket', 'Cabin', 'lname', 'fnames', 'rest2']

# Drop columns
full_df1 = full_df.drop(columns=columns_to_drop)

print(full_df1.info())

<class 'pandas.core.frame.DataFrame'>
Index: 1309 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Pclass       1309 non-null   int64  
 1   Sex          1309 non-null   object 
 2   Age          1046 non-null   float64
 3   SibSp        1309 non-null   int64  
 4   Parch        1309 non-null   int64  
 5   Fare         1308 non-null   float64
 6   Embarked     1307 non-null   object 
 7   title        1309 non-null   object 
 8   family_size  1309 non-null   int64  
 9   deck         1309 non-null   object 
 10  room         1309 non-null   int32  
 11  group_id     1309 non-null   int64  
dtypes: float64(2), int32(1), int64(5), object(4)
memory usage: 127.8+ KB
None


In [41]:
# Fill NAs in Embarked feature with the most frequent value
most_frequent_embarked = full_df1['Embarked'].mode().iloc[0]

# Fill missing values in the 'Embarked' column with the most frequent value
full_df1['Embarked'] = full_df1['Embarked'].fillna(most_frequent_embarked)

# Check NAs in each column
print("Number of NAs per column, full_df: ", full_df1.isna().sum())

Number of NAs per column, full_df:  Pclass           0
Sex              0
Age            263
SibSp            0
Parch            0
Fare             1
Embarked         0
title            0
family_size      0
deck             0
room             0
group_id         0
dtype: int64


In [42]:
print(full_df.columns)
print(full_df.info())

Index(['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket',
       'Fare', 'Cabin', 'Embarked', 'lname', 'title', 'fnames', 'family_size',
       'deck', 'rest2', 'room', 'group_id'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
Index: 1309 entries, 0 to 417
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Pclass       1309 non-null   int64  
 2   Sex          1309 non-null   object 
 3   Age          1046 non-null   float64
 4   SibSp        1309 non-null   int64  
 5   Parch        1309 non-null   int64  
 6   Ticket       1309 non-null   object 
 7   Fare         1308 non-null   float64
 8   Cabin        1309 non-null   object 
 9   Embarked     1307 non-null   object 
 10  lname        1309 non-null   object 
 11  title        1309 non-null   object 
 12  fnames       1309 non-null   object 
 13  family_size  1309 non-null   int64  
 14  de

#### Create Dummies for categorical columns"

In [43]:
# Create dummy variables for Sex column
full_df2 = pd.get_dummies(full_df1, columns=["Sex"], drop_first=True)

print(full_df2.columns)


Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'title',
       'family_size', 'deck', 'room', 'group_id', 'Sex_male'],
      dtype='object')


In [44]:
# Create dummy variables for the specified columns
full_df3 = pd.get_dummies(full_df2, columns=["Embarked", "title", "deck"], drop_first=False)



In [45]:
# Print the column names to see the dummy variables
print(full_df3.head())


   Pclass   Age  SibSp  Parch     Fare  family_size  room  group_id  Sex_male  \
0       3  22.0      1      0   7.2500            1     0       721      True   
1       1  38.0      1      0  71.2833            2    85       817     False   
2       3  26.0      0      0   7.9250            1     0       915     False   
3       1  35.0      1      0  53.1000            2   123        66     False   
4       3  35.0      0      0   8.0500            1     0       650      True   

   Embarked_C  ...  title_the  deck_A  deck_B  deck_C  deck_D  deck_E  deck_F  \
0       False  ...      False   False   False   False   False   False   False   
1        True  ...      False   False   False    True   False   False   False   
2       False  ...      False   False   False   False   False   False   False   
3       False  ...      False   False   False    True   False   False   False   
4       False  ...      False   False   False   False   False   False   False   

   deck_G  deck_T  deck_U 

In [47]:
# Create afunction for impute NAs

def impute_na(data):
    # Create an IterativeImputer with default estimator (BayesianRidge)
    imputer = IterativeImputer(max_iter=20, random_state=42)

    # Fit the imputer to the DataFrame and transform it to fill in missing values
    data_imputed = imputer.fit_transform(data)

    # Create a new DataFrame with the imputed values, keeping the original column names
    data_imputed_df = pd.DataFrame(data_imputed, columns=data.columns)
    
    return data_imputed_df

In [48]:
full_df4= impute_na(full_df3)
print(full_df4.isna().mean())

Pclass             0.0
Age                0.0
SibSp              0.0
Parch              0.0
Fare               0.0
family_size        0.0
room               0.0
group_id           0.0
Sex_male           0.0
Embarked_C         0.0
Embarked_Q         0.0
Embarked_S         0.0
title_Capt.        0.0
title_Col.         0.0
title_Don.         0.0
title_Dona.        0.0
title_Dr.          0.0
title_Jonkheer.    0.0
title_Lady.        0.0
title_Major.       0.0
title_Master.      0.0
title_Miss.        0.0
title_Mlle.        0.0
title_Mme.         0.0
title_Mr.          0.0
title_Mrs.         0.0
title_Ms.          0.0
title_Rev.         0.0
title_Sir.         0.0
title_the          0.0
deck_A             0.0
deck_B             0.0
deck_C             0.0
deck_D             0.0
deck_E             0.0
deck_F             0.0
deck_G             0.0
deck_T             0.0
deck_U             0.0
dtype: float64


In [50]:
# Split full_df into training and testing sets
X_train_df1 = full_df4.iloc[:891]  # First 891 rows for training
X_test_df1 = full_df4.iloc[891:]   # Last 418 rows for testing

# Reset index of X_test_df1
X_test_df1 = X_test_df1.reset_index(drop=True)

# Verify the shapes of the resulting DataFrames
print("Shape of X_train_df1:", X_train_df1.shape)
print("Shape of X_test_df1:", X_test_df1.shape)

Shape of X_train_df1: (891, 39)
Shape of X_test_df1: (418, 39)


In [51]:
print(X_test_df1)
print(test_df)

     Pclass        Age  SibSp  Parch      Fare  family_size   room  group_id  \
0       3.0  34.500000    0.0    0.0    7.8292          1.0    0.0     377.0   
1       3.0  47.000000    1.0    0.0    7.0000          1.0    0.0     583.0   
2       2.0  62.000000    0.0    0.0    9.6875          1.0    0.0     185.0   
3       3.0  27.000000    0.0    0.0    8.6625          1.0    0.0     367.0   
4       3.0  22.000000    1.0    1.0   12.2875          2.0    0.0     339.0   
..      ...        ...    ...    ...       ...          ...    ...       ...   
413     3.0  28.927151    0.0    0.0    8.0500          1.0    0.0     712.0   
414     1.0  39.000000    0.0    0.0  108.9000          1.0  105.0     835.0   
415     3.0  38.500000    0.0    0.0    7.2500          1.0    0.0     873.0   
416     3.0  28.643903    0.0    0.0    8.0500          1.0    0.0     580.0   
417     3.0   3.421430    1.0    1.0   22.3583          3.0    0.0     263.0   

     Sex_male  Embarked_C  ...  title_t

In [52]:
#### Create X_train, y-train and X_test narrays ready for scaling and modelling
# Convert DataFrames to numpy arrays
X_train = X_train_df1.values
X_test = X_test_df1.values
y_train = np.ravel(y_df)

# Verify the conversion
print("X_train:", X_train.shape)
print("X_test:", X_test.shape)
print("y_train:", y_train.shape)

X_train: (891, 39)
X_test: (418, 39)
y_train: (891,)


In [53]:

# Create a function for scaling the DataFrame
def scale_dataset1(data):
    # Initialize the MinMaxScaler with the desired feature range
    scaler = MinMaxScaler(feature_range=(0, 1))
    
    # Fit the scaler to the data and transform it
    X_scaled = scaler.fit_transform(data)
    
    # Return the scaled data
    return X_scaled

In [54]:
# scale the training and test dataset
X_train_scaled = scale_dataset1(X_train)
X_test_scaled = scale_dataset1(X_test)

### Model Evaluation

In [59]:
# Define KFold cross-validation
kfold = KFold(n_splits=5, shuffle=True, random_state=0)

# Define the models to be used
models = {
    'KNN': KNeighborsClassifier(n_neighbors=1),
    'Random Forest': RandomForestClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'SVM': SVC(), 
    'Logistic regression' : LogisticRegression(max_iter=5000),
    'Naive Bayes' : GaussianNB()
}

In [60]:
# Evaluate the models with unscaled variables
model_results = {}
for model_name, model in models.items():
    # Use cross_val_score to evaluate the pipeline with KFold
    scores = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')
    model_results[model_name] = scores.mean()  # Store the mean accuracy of the model

print("Cross-validation results (mean accuracy):")
for model_name, mean_score in model_results.items():
    print(f"{model_name}: {mean_score:.4f}")

Cross-validation results (mean accuracy):
KNN: 0.6936
Random Forest: 0.8283
Decision Tree: 0.7812
SVM: 0.6745
Logistic regression: 0.8249
Naive Bayes: 0.7576


In [61]:
# Evaluate the models with scaled variables
model_results = {}
for model_name, model in models.items():
    # Use cross_val_score to evaluate the pipeline with KFold
    scores = cross_val_score(model, X_train_scaled, y_train, cv=kfold, scoring='accuracy')
    model_results[model_name] = scores.mean()  # Store the mean accuracy of the model

print("Cross-validation results (mean accuracy):")
for model_name, mean_score in model_results.items():
    print(f"{model_name}: {mean_score:.4f}")

Cross-validation results (mean accuracy):
KNN: 0.7777
Random Forest: 0.8260
Decision Tree: 0.7778
SVM: 0.8171
Logistic regression: 0.8260
Naive Bayes: 0.7296


### Model parameters tuning

In [63]:
# Define a Random Forest model
rf = RandomForestClassifier()

# Define a parameter grid for tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt'],
}
# Define a GridSearchCV instance
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)

# Fit the grid search to your data
grid_search.fit(X_train, y_train)

# Output the best parameters
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

243 fits failed out of a total of 486.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
184 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Majeed Win10\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Majeed Win10\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "c:\Users\Majeed Win10\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\Majeed Win10\AppData\Local

Best parameters: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}
Best score: 0.8439955106621774


In [64]:
# Fit the grid-search to scaled data
grid_search.fit(X_train_scaled, y_train)

# Output the best parameters
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

243 fits failed out of a total of 486.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
100 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Majeed Win10\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Majeed Win10\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "c:\Users\Majeed Win10\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\Majeed Win10\AppData\Local

Best parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}
Best score: 0.8417508417508417


In [65]:
# Define a basic Logistic Regression model
logreg = LogisticRegression()

# Define a parameter grid for tuning
param_grid = {
    'penalty': ['l1', 'l2', 'elasticnet', None],  # Penalties supported by different solvers
    'C': [0.01, 0.1, 1, 10, 100],  # Regularization strength
    'solver': ['liblinear', 'lbfgs', 'saga'],  # Compatible solvers for different penalties
    'l1_ratio': [0, 0.5, 1],  # Used with 'elasticnet' penalty
    'class_weight': [None, 'balanced'],  # For class imbalance
    'max_iter': [100, 200]  # Maximum iterations
}

# Set up GridSearchCV for Logistic Regression
grid_search = GridSearchCV(estimator=logreg, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)

# Fit the grid search to the unscaled training data
grid_search.fit(X_train, y_train)

# Output the best parameters and the best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best parameters:", best_params)
print("Best score:", best_score)

Best parameters: {'C': 1, 'class_weight': None, 'l1_ratio': 0.5, 'max_iter': 200, 'penalty': 'l1', 'solver': 'liblinear'}
Best score: 0.8249158249158249


720 fits failed out of a total of 2160.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
180 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Majeed Win10\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Majeed Win10\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Majeed Win10\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1172, in fit
    s

In [66]:
# Fit the grid search to the scaled training data
grid_search.fit(X_train_scaled, y_train)

# Output the best parameters and the best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best parameters:", best_params)
print("Best score:", best_score)

Best parameters: {'C': 1, 'class_weight': None, 'l1_ratio': 0, 'max_iter': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
Best score: 0.8305274971941637


720 fits failed out of a total of 2160.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
180 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Majeed Win10\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Majeed Win10\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Majeed Win10\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1172, in fit
    s

### Build Best Models

In [73]:
# Train the Chosen final model
best_model = RandomForestClassifier(max_depth= 20, max_features= 'sqrt', min_samples_leaf=1, min_samples_split= 10,
                                       n_estimators=200)
# fit
best_model.fit(X_train, y_train)

# predict the test set and store in a dataframe
survived_pred = pd.DataFrame(best_model.predict(X_test), columns=['Survived'])

# Create df for PassengerID
passid_df = test_df.loc[:, ['PassengerId']]

# create submission df by merging survived and passenderID dfs

submission_df = pd.concat([passid_df, survived_pred], axis=1)

# Verify the result
print(submission_df.head())

# Save predictions to a CSV file
submission_df.to_csv('submission.csv',index=False )


   PassengerId  Survived
0          892         0
1          893         0
2          894         0
3          895         0
4          896         1


# THE END