In [4]:
# Question 5: Label Encoding vs One-Hot Encoding
# Task: Show the difference between Label Encoding and One-Hot Encoding on the Titanic dataset for the 'Sex' feature.





# Question 6: Combining Feature Scaling Techniques
# Task: Demonstrate combining Min-Max Scaling and Standardization for the same datasetand explain the results.





# Question 7: Handling Multiple Categorical Features
# Task: Handle multiple categorical features ('Sex', 'Embarked') from the Titanic dataset using One-Hot Encoding.




# Question 8: Ordinal Encoding for Ranked Categories
# Task: Ordinal encode 'Pclass' (Passenger class) from the Titanic dataset considering passenger class as a ranked feature.





# Question 9: Impact of Scaling on Different Algorithms
# Task: Investigate the impact of different scaling techniques on a decision tree model and compare it with a SVM.



# Question 10: Custom Transformations for Categorical Features
# Task: Implement a custom transformation function for encoding high cardinality categorical features efficiently.




import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import numpy as np

# Load Titanic dataset from OpenML
data = fetch_openml(name='titanic', version=2)
df = pd.DataFrame(data=data['data'], columns=data['feature_names'])

# Display first few rows of the dataset
df.head()

# --- Question 5: Label Encoding vs One-Hot Encoding ---
# Label Encoding for 'Sex' feature
label_encoder = LabelEncoder()
df['Sex_LabelEncoded'] = label_encoder.fit_transform(df['sex'])

# One-Hot Encoding for 'Sex' feature
onehot_encoder = pd.get_dummies(df['sex'], prefix='Sex')

# Show Label Encoding and One-Hot Encoding results
print("Label Encoded 'Sex' feature:\n", df[['sex', 'Sex_LabelEncoded']].head())
print("One-Hot Encoded 'Sex' feature:\n", onehot_encoder.head())

# --- Question 6: Combining Feature Scaling Techniques ---
# Selecting features and target
X = df[['age', 'fare', 'pclass']]
y = df['survived']

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Apply Min-Max Scaling
min_max_scaler = MinMaxScaler()
X_train_min_max = min_max_scaler.fit_transform(X_train)
X_test_min_max = min_max_scaler.transform(X_test)

# Apply Standardization (Z-score scaling)
standard_scaler = StandardScaler()
X_train_standard = standard_scaler.fit_transform(X_train)
X_test_standard = standard_scaler.transform(X_test)

# --- Question 7: Handling Multiple Categorical Features ---
# One-Hot Encoding for 'Embarked'
onehot_embarked = pd.get_dummies(df['embarked'], prefix='Embarked')

# Ordinal Encoding for 'Pclass' (passenger class)
ordinal_encoder = OrdinalEncoder(categories=[['1', '2', '3']])
df['Pclass_OrdinalEncoded'] = ordinal_encoder.fit_transform(df[['pclass']])

# Show One-Hot Encoding for 'Embarked' and Ordinal Encoding for 'Pclass'
print("One-Hot Encoded 'Embarked':\n", onehot_embarked.head())
print("Ordinal Encoded 'Pclass':\n", df[['pclass', 'Pclass_OrdinalEncoded']].head())

# --- Question 9: Impact of Scaling on Different Algorithms ---
# Decision Tree Model without scaling
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)
dt_accuracy = accuracy_score(y_test, y_pred_dt)

# Decision Tree Model with scaling (Min-Max Scaling)
dt_model.fit(X_train_min_max, y_train)
y_pred_dt_min_max = dt_model.predict(X_test_min_max)
dt_accuracy_min_max = accuracy_score(y_test, y_pred_dt_min_max)

# SVM Model without scaling
svm_model = SVC()
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)
svm_accuracy = accuracy_score(y_test, y_pred_svm)

# SVM Model with scaling (Standardization)
svm_model.fit(X_train_standard, y_train)
y_pred_svm_standard = svm_model.predict(X_test_standard)
svm_accuracy_standard = accuracy_score(y_test, y_pred_svm_standard)

# Print accuracy comparison
print(f"Decision Tree Accuracy (no scaling): {dt_accuracy:.4f}")
print(f"Decision Tree Accuracy (Min-Max Scaling): {dt_accuracy_min_max:.4f}")
print(f"SVM Accuracy (no scaling): {svm_accuracy:.4f}")
print(f"SVM Accuracy (Standardization): {svm_accuracy_standard:.4f}")

# --- Question 10: Custom Transformation for High Cardinality Categorical Features ---
# Custom Transformation for high cardinality feature 'ticket' (example)
ticket_data = df['ticket'].apply(lambda x: len(str(x)))  # Encoding by the length of the ticket number

# Show the transformed 'ticket' feature
print("Custom transformed 'ticket' feature:\n", ticket_data.head())

KeyError: 'sex'