In [2]:
import pandas as pd

# Load the dataset
df = pd.read_csv('titanic_dataset.csv')

# Display the first few rows of the dataset
print(df.head())


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


Import the necessary libraries

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler


Handle missing values

In [4]:
# Check for missing values
print(df.isnull().sum())

# Fill missing values in the 'Age' column with the median
df['Age'].fillna(df['Age'].median(), inplace=True)

# Fill missing values in the 'Embarked' column with the mode
df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)

# Drop the 'Cabin' column due to high number of missing values
df.drop('Cabin', axis=1, inplace=True)


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


Encode categorical variables

In [5]:
# Encode the 'Sex' column
label_encoder = LabelEncoder()
df['Sex'] = label_encoder.fit_transform(df['Sex'])

# Encode the 'Embarked' column
df['Embarked'] = label_encoder.fit_transform(df['Embarked'])


Feature scaling:

In [6]:
# Scale the numerical features using StandardScaler
scaler = StandardScaler()
df[['Age', 'Fare']] = scaler.fit_transform(df[['Age', 'Fare']])


Split the dataset into features and target variable:

In [7]:
X = df.drop('Survived', axis=1)  # Features
y = df['Survived']               # Target variable


Feature Engineering

In [8]:
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
df['Title'] = df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)


Handling Outliers

In [9]:
Q1 = df['Fare'].quantile(0.25)
Q3 = df['Fare'].quantile(0.75)
IQR = Q3 - Q1

# Remove outliers below the lower bound
df = df[df['Fare'] >= (Q1 - 1.5 * IQR)]

# Remove outliers above the upper bound
df = df[df['Fare'] <= (Q3 + 1.5 * IQR)]


In [10]:
Q1 = df['Age'].quantile(0.25)
Q3 = df['Age'].quantile(0.75)
IQR = Q3 - Q1

# Replace outliers below the lower bound with the median age
df.loc[df['Age'] < (Q1 - 1.5 * IQR), 'Age'] = df['Age'].median()

# Replace outliers above the upper bound with the median age
df.loc[df['Age'] > (Q3 + 1.5 * IQR), 'Age'] = df['Age'].median()


In [14]:
# Load the pre-processed dataset
df = pd.read_csv('titanic_dataset.csv')

# Remove the 'Name' column
df.drop('Name', axis=1, inplace=True)

# Split the dataset into features (X) and target variable (y)
X = df.drop('Survived', axis=1)
y = df['Survived']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Continue with the kNN and SVM model creation and evaluation as shown in the previous example


knn and svm models


In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Load the pre-processed dataset
df = pd.read_csv('titanic_dataset.csv')

# Handle missing values
df.fillna(df.median(), inplace=True)

# Identify non-numeric columns
non_numeric_columns = df.select_dtypes(exclude=['int', 'float']).columns.tolist()

# Encode non-numeric columns
label_encoder = LabelEncoder()
for column in non_numeric_columns:
    df[column] = label_encoder.fit_transform(df[column])

# Perform one-hot encoding on remaining categorical columns
categorical_columns = ['Pclass', 'SibSp', 'Parch']
onehot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
df_encoded = pd.DataFrame(onehot_encoder.fit_transform(df[categorical_columns]))

# Create column names for the one-hot encoded features
encoded_columns = []
for i, col in enumerate(categorical_columns):
    for category in onehot_encoder.categories_[i]:
        encoded_columns.append(f"{col}_{category}")

df_encoded.columns = encoded_columns

# Concatenate the encoded features with the original dataset and drop the original categorical columns
df = pd.concat([df, df_encoded], axis=1).drop(categorical_columns, axis=1)

# Split the dataset into features (X) and target variable (y)
X = df.drop('Survived', axis=1)
y = df['Survived']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# kNN model
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

# Evaluate model performance
accuracy_knn = accuracy_score(y_test, y_pred_knn)

print("kNN Accuracy:", accuracy_knn)


kNN Accuracy: 0.6703910614525139


  df.fillna(df.median(), inplace=True)


In [23]:
# Evaluate model performance
accuracy_knn = accuracy_score(y_test, y_pred_knn)

print("kNN Accuracy:", accuracy_knn)

kNN Accuracy: 0.6703910614525139


In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler

# SVM model
svm = SVC(kernel='rbf', random_state=42)
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)

# Evaluate model performance
accuracy_svm = accuracy_score(y_test, y_pred_svm)

print("SVM Accuracy:", accuracy_svm)


SVM Accuracy: 0.6424581005586593
