In [1]:
# Mount Google Drive inside the Colab runtime so files stored there can be
# opened through ordinary filesystem paths.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# data analysis and wrangling
import pandas as pd
import numpy as np

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [3]:
# Read the train and test CSVs (in that order) from the mounted Drive folder.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/Colab Notebooks/data-mining-cse572/titanic-dataset-hw2/train.csv',
        '/content/drive/MyDrive/Colab Notebooks/data-mining-cse572/titanic-dataset-hw2/test.csv',
    )
)

In [4]:
# Preview the first five training rows to check the columns that were loaded.
train_df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Family-size bin edges (right-inclusive, as pd.cut does by default) and the
# label assigned to each bin.
FAM_SIZE_BINS = [0, 1, 4, 7, 11]
FAM_TYPE_LABELS = ['Solo', 'Small', 'Big', 'Very big']

# Placeholder used for Cabin_lletter when Cabin is missing, so that "no cabin
# recorded" becomes its own category.
# NOTE(review): assumed not to collide with a real cabin prefix - confirm against the data.
MISSING_CABIN_LETTER = 'U'


def _extract_title(name):
    """Return the text between the first comma and the following period of ``name``.

    Example: 'Braund, Mr. Owen Harris' -> 'Mr'.
    """
    return name.split(',')[1].split('.')[0].strip()


def _count_cabins(cabin):
    """Return the number of whitespace-separated entries in ``cabin`` (0 if missing)."""
    return len(cabin.split()) if isinstance(cabin, str) else 0


def _cabin_leading_letter(cabin):
    """Return the first character of ``cabin``, or the placeholder if it is missing."""
    return cabin[:1] if isinstance(cabin, str) else MISSING_CABIN_LETTER


def add_engineered_features(df):
    """Return a new frame equal to ``df`` plus the engineered feature columns.

    Columns added (in this order):
      * ``Title``          - honorific parsed out of ``Name``.
      * ``Ticket_2letter`` - first two characters of ``Ticket``.
      * ``Ticket_len``     - number of characters in ``Ticket``.
      * ``Cabin_num``      - number of cabins listed in ``Cabin`` (0 when missing).
      * ``Cabin_lletter``  - first character of ``Cabin`` (placeholder when missing).
      * ``Fam_size``       - ``SibSp + Parch + 1`` (the passenger counts as one).
      * ``Fam_type``       - ``Fam_size`` binned with ``FAM_SIZE_BINS``.

    The two Cabin features are read from the ``Cabin`` column; they were
    previously computed from ``Ticket`` by mistake. ``Cabin`` contains missing
    values, so those are handled explicitly instead of raising.

    ``df`` itself is not modified, which keeps this cell safe to re-run.
    """
    fam_size = df['SibSp'] + df['Parch'] + 1
    return df.assign(
        Title=df['Name'].apply(_extract_title),
        # Ticket-derived features: leading two characters and total length.
        Ticket_2letter=df['Ticket'].str[:2],
        Ticket_len=df['Ticket'].apply(len),
        # Cabin-derived features: number of cabins and leading letter.
        Cabin_num=df['Cabin'].apply(_count_cabins),
        Cabin_lletter=df['Cabin'].apply(_cabin_leading_letter),
        Fam_size=fam_size,
        Fam_type=pd.cut(fam_size, FAM_SIZE_BINS, labels=FAM_TYPE_LABELS),
    )


# Apply the same transformation to both splits so their columns stay in sync.
train_df = add_engineered_features(train_df)
test_df = add_engineered_features(test_df)

In [6]:
# Column groups routed to each preprocessing branch.
numerical_cols = ['Fare']
categorical_cols = ['Pclass', 'Title', 'Embarked', 'Fam_type', 'Ticket_len', 'Ticket_2letter']

# Numeric branch: fill gaps with the column median, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(numeric_steps)

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(categorical_steps)

# Single transformer that applies each branch to its own column group.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

In [7]:
# Split the training frame into the model inputs (X) and the target (y),
# then preview the first five input rows.
features = ['Pclass', 'Fare', 'Title', 'Embarked', 'Fam_type', 'Ticket_len', 'Ticket_2letter']
X = train_df.loc[:, features]
y = train_df.loc[:, 'Survived']
X.head(n=5)

Unnamed: 0,Pclass,Fare,Title,Embarked,Fam_type,Ticket_len,Ticket_2letter
0,3,7.25,Mr,S,Small,9,A/
1,1,71.2833,Mrs,C,Small,8,PC
2,3,7.925,Miss,S,Solo,16,ST
3,1,53.1,Mrs,S,Small,6,11
4,3,8.05,Mr,S,Solo,6,37


In [8]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# split identical between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# One seed for every estimator that accepts it, so a fresh Restart & Run All
# reproduces the same validation scores. LinearSVC, Perceptron and
# SGDClassifier were previously unseeded and could score differently per run.
RANDOM_STATE = 42

# Candidate classifiers, keyed by the label used in the report below.
models = {
    "LogisticRegression": LogisticRegression(max_iter=2000, solver="liblinear"),  # works with sparse OHE
    "Support Vector Machines": SVC(),  # default RBF, fine for .score
    "LinearSVC": LinearSVC(max_iter=2000, random_state=RANDOM_STATE),
    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE),
    "KNeighbors": KNeighborsClassifier(n_neighbors=3),
    "Perceptron": Perceptron(max_iter=1000, random_state=RANDOM_STATE),
    "SGDClassifier": SGDClassifier(max_iter=1000, random_state=RANDOM_STATE),
    "DecisionTree": DecisionTreeClassifier(random_state=RANDOM_STATE),
}

# Fit each model behind the shared preprocessor and record its accuracy on the
# held-out validation split as (name, accuracy) tuples.
results = []
for name, model in models.items():
    pipe = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    pipe.fit(X_train, y_train)
    acc = pipe.score(X_val, y_val)
    results.append((name, acc))

# Sort best-first and print. The name column is sized to the longest label so
# the accuracies line up (a fixed width of 16 was too narrow for some names).
results = sorted(results, key=lambda item: item[1], reverse=True)
name_width = max((len(label) for label, _ in results), default=0)
print("Validation accuracy results:")
for name, acc in results:
    print(f"{name:>{name_width}}: {acc:.4f}")


Validation accuracy results:
       LinearSVC: 0.8268
    RandomForest: 0.8156
   SGDClassifier: 0.8156
LogisticRegression: 0.8101
Support Vector Machines: 0.8101
      KNeighbors: 0.8101
    DecisionTree: 0.7989
      Perceptron: 0.6480
