In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Fetch the "titanic" example dataset through seaborn's loader
df = sns.load_dataset("titanic")

# Quick look: overall dimensions, then the first rows
shape = df.shape
preview = df.head()
print("Dataset shape:", shape)
print(preview)

In [None]:
# Remove the columns that are not used as features.
# errors="ignore" lets the cell be re-run after the columns are already gone.
unused_cols = ["deck", "embark_town", "alive", "class", "who", "adult_male", "alone"]
df = df.drop(columns=unused_cols, errors="ignore")

# Handle missing values: median for age, most frequent value for embarked.
# NOTE: both statistics are taken from the full frame, i.e. before the
# train/test split that happens later in the notebook, so test rows
# contribute to the fill values.
age_fill = df["age"].median()
embarked_fill = df["embarked"].mode()[0]
df = df.assign(
    age=df["age"].fillna(age_fill),
    embarked=df["embarked"].fillna(embarked_fill).astype(str),  # ensure categorical
)


In [None]:
# Split the frame into the prediction target and the predictor columns

y = df["survived"]                # target: the "survived" column
X = df.drop(columns="survived")   # features: every remaining column

In [None]:
# Identify categorical & numeric columns.
# "number" matches every numeric dtype rather than only int64/float64, and
# everything that is not numeric (object, category, string, bool, ...) is
# treated as categorical. Selecting by exclusion guarantees that each feature
# lands in exactly one of the two lists, so no column is left out of both the
# encoding and the scaling step.
num_cols = X.select_dtypes(include="number").columns.tolist()
cat_cols = X.select_dtypes(exclude="number").columns.tolist()

print("Categorical:", cat_cols)
print("Numerical:", num_cols)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Label Encoding Pipeline

# Work on copies so the raw train/test frames stay available to other cells
X_train_le = X_train.copy()
X_test_le = X_test.copy()

# Replace each category with an integer code.
# OrdinalEncoder is the feature-matrix counterpart of LabelEncoder (which is
# intended for the target): it assigns the same sorted integer codes, encodes
# all categorical columns in one call, keeps every column's mapping in
# `categories_`, and maps categories that were not seen during fit to -1
# instead of raising on the test set.
if cat_cols:  # nothing to encode when there are no categorical columns
    ordinal_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
    X_train_le[cat_cols] = ordinal_encoder.fit_transform(X_train_le[cat_cols])
    X_test_le[cat_cols] = ordinal_encoder.transform(X_test_le[cat_cols])

# Scale numeric data (statistics are learned on the training rows only)
scaler = StandardScaler()
X_train_le[num_cols] = scaler.fit_transform(X_train_le[num_cols])
X_test_le[num_cols] = scaler.transform(X_test_le[num_cols])

# Train model
model_le = LogisticRegression(max_iter=1000)
model_le.fit(X_train_le, y_train)
y_pred_le = model_le.predict(X_test_le)

acc_le = accuracy_score(y_test, y_pred_le)
print("Accuracy with Label Encoding:", acc_le)

In [None]:
# One-Hot Encoding Pipeline

# One transformer handles both column groups: categorical columns are one-hot
# encoded (first level of each dropped), numeric columns are standardised.
# Columns listed in neither group are discarded, because ColumnTransformer's
# `remainder` defaults to "drop".
onehot_step = (
    "onehot",
    OneHotEncoder(drop="first", sparse_output=False, handle_unknown="ignore"),
    cat_cols,
)
scale_step = ("scale", StandardScaler(), num_cols)
ct = ColumnTransformer(transformers=[onehot_step, scale_step])

# Learn the encoding/scaling on the training rows, then reuse it on the test rows
X_train_ohe = ct.fit_transform(X_train)
X_test_ohe = ct.transform(X_test)

# `fit` returns the estimator, so construction and training can be chained
model_ohe = LogisticRegression(max_iter=1000).fit(X_train_ohe, y_train)
y_pred_ohe = model_ohe.predict(X_test_ohe)

acc_ohe = accuracy_score(y_true=y_test, y_pred=y_pred_ohe)
print("Accuracy with One-Hot Encoding:", acc_ohe)

In [None]:
# Compare accuracies of the two logistic-regression models.
# (matplotlib is imported in the notebook's top import cell.)
acc_label = acc_le  # from Label Encoding part
acc_onehot = acc_ohe  # from One-Hot Encoding part

# Bar chart, drawn through the explicit Figure/Axes interface so the cell does
# not depend on pyplot's implicit "current figure" state.
fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(["Label Encoding", "One-Hot Encoding"], [acc_label, acc_onehot], color=["skyblue", "lightgreen"])
ax.set_ylabel("Accuracy")
ax.set_title("Comparison of Encoding Techniques on Titanic Dataset")
ax.set_ylim(0, 1)  # accuracy is a fraction, so show the full 0-1 range
plt.show()


In [None]:
# Decision trees on both encodings.
# (DecisionTreeClassifier is imported in the notebook's top import cell.)
#
# The results get their own names: reusing `model_ohe`, `acc_ohe` and
# `acc_label` here would overwrite the logistic-regression results, so
# re-running the earlier bar-chart cell would silently plot a decision-tree
# accuracy under the logistic-regression comparison.

# Label Encoding model
tree_label = DecisionTreeClassifier(random_state=42)
tree_label.fit(X_train_le, y_train)
acc_tree_le = tree_label.score(X_test_le, y_test)

# One-Hot Encoding model
tree_ohe = DecisionTreeClassifier(random_state=42)
tree_ohe.fit(X_train_ohe, y_train)
acc_tree_ohe = tree_ohe.score(X_test_ohe, y_test)

print("Decision Tree Accuracy (Label Encoding):", acc_tree_le)
print("Decision Tree Accuracy (One-Hot Encoding):", acc_tree_ohe)


In [None]:
# Train each classifier on each encoding and record its test accuracy.
# `fit` returns the estimator itself, so construction and training are chained.

# Logistic Regression
log_label = LogisticRegression(max_iter=1000, random_state=42).fit(X_train_le, y_train)
log_ohe = LogisticRegression(max_iter=1000, random_state=42).fit(X_train_ohe, y_train)
acc_log_le = log_label.score(X_test_le, y_test)
acc_log_ohe = log_ohe.score(X_test_ohe, y_test)

# Decision Tree
tree_label = DecisionTreeClassifier(random_state=42).fit(X_train_le, y_train)
tree_ohe = DecisionTreeClassifier(random_state=42).fit(X_train_ohe, y_train)
acc_tree_le = tree_label.score(X_test_le, y_test)
acc_tree_ohe = tree_ohe.score(X_test_ohe, y_test)

# Plot results: one group of bars per model, one bar per encoding
models = ["Logistic Regression", "Decision Tree"]
encodings = ["Label Encoding", "One-Hot Encoding"]

# Rows follow `models`, columns follow `encodings`
acc_matrix = np.array([
    [acc_log_le, acc_log_ohe],
    [acc_tree_le, acc_tree_ohe],
])

x = np.arange(len(models))  # model positions
width = 0.35  # bar width
half_width = width / 2

fig, ax = plt.subplots(figsize=(8, 5))
# Shift the two bars of a group half a bar-width left/right of the tick
rects1 = ax.bar(x - half_width, acc_matrix[:, 0], width, label=encodings[0])
rects2 = ax.bar(x + half_width, acc_matrix[:, 1], width, label=encodings[1])

# Labels and formatting
ax.set_ylabel("Accuracy")
ax.set_title("Model Accuracy: Label Encoding vs One-Hot Encoding")
ax.set_xticks(x)
ax.set_xticklabels(models)
ax.legend()

# Show values above bars: text is centred on the bar and anchored at its top
for bar in (*rects1, *rects2):
    bar_top = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    ax.text(bar_centre, bar_top, f'{bar_top:.2f}', ha='center', va='bottom')

plt.show()
