Step 1: Install Necessary Libraries

If you don't have the necessary libraries, install them using pip:

In [1]:
pip install pandas scikit-learn matplotlib seaborn


Note: you may need to restart the kernel to use updated packages.


Step 2: Import Libraries

Now, let's import the necessary libraries for data manipulation, visualization, and machine learning:

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns


Step 3: Load the Titanic Dataset

You can download the Titanic dataset from Kaggle's Titanic Competition. For this guide, let's assume the dataset is saved as titanic.csv.

In [4]:
# Load the Titanic dataset
df = pd.read_csv('Titanic-Dataset (1).csv')

# Display the first few rows of the dataset
print(df.head())


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


Step 4: Data Exploration

Before building the model, let's explore the dataset to understand its structure and check for missing values.

In [6]:
# Check basic info of the dataset
print(df.info())

# Check for missing values
print(df.isnull().sum())

# Display descriptive statistics of numeric columns
print(df.describe())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int6

Step 5: Data Preprocessing

5.1: Handle Missing Data

For columns like Age, Embarked, we need to handle missing values. We can fill missing Age with the median, and Embarked with the most frequent value.

In [7]:
# Fill missing 'Age' with the median
df['Age'].fillna(df['Age'].median(), inplace=True)

# Fill missing 'Embarked' with the most frequent value
df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)

# Check if there are any remaining missing values
print(df.isnull().sum())


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64


5.2: Feature Encoding

For categorical variables like Sex, Embarked, we need to convert them into numeric format using Label Encoding or One-Hot Encoding.

In [8]:
# Encode 'Sex' (Male = 0, Female = 1)
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

# Encode 'Embarked' (C = 0, Q = 1, S = 2)
df['Embarked'] = df['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})

# Check the transformed dataset
print(df.head())


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name  Sex   Age  SibSp  Parch  \
0                            Braund, Mr. Owen Harris    0  22.0      1      0   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...    1  38.0      1      0   
2                             Heikkinen, Miss. Laina    1  26.0      0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)    1  35.0      1      0   
4                           Allen, Mr. William Henry    0  35.0      0      0   

             Ticket     Fare Cabin  Embarked  
0         A/5 21171   7.2500   NaN         2  
1          PC 17599  71.2833   C85         0  
2  STON/O2. 3101282   7.9250   NaN         2  
3            113803  53.1000  C123         2  
4            373450   8.0500   NaN         2  


5.3: Drop Irrelevant Features

Some features like Name, Ticket, PassengerId may not provide useful information for the model. We can drop them:

In [9]:
# Drop irrelevant columns
df.drop(columns=['Name', 'Ticket', 'PassengerId'], inplace=True)

# Display the cleaned dataset
print(df.head())


   Survived  Pclass  Sex   Age  SibSp  Parch     Fare Cabin  Embarked
0         0       3    0  22.0      1      0   7.2500   NaN         2
1         1       1    1  38.0      1      0  71.2833   C85         0
2         1       3    1  26.0      0      0   7.9250   NaN         2
3         1       1    1  35.0      1      0  53.1000  C123         2
4         0       3    0  35.0      0      0   8.0500   NaN         2


Step 6: Split the Data

Now that the data is clean, we can split it into features (X) and target (y), and further split it into training and testing sets.

In [10]:
# Split the data into features (X) and target (y)
X = df.drop(columns=['Survived'])
y = df['Survived']

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the shape of the resulting datasets
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)


(712, 8) (179, 8) (712,) (179,)


In [None]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid = {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20]}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42), param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters
print(f'Best Hyperparameters: {grid_search.best_params_}')

# Re-train the model with the best parameters
best_model = grid_search.best_estimator_

# Evaluate the best model
y_pred_best = best_model.predict(X_test)
accuracy_best = accuracy_score(y_test, y_pred_best)
print(f'Best Model Accuracy: {accuracy_best * 100:.2f}%')


Fitting 3 folds for each of 9 candidates, totalling 27 fits
