In [None]:
import pandas as pd      
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Read the training and test CSV files from the working directory
train_pd, test_pd = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))
display(train_pd)

In [None]:
# Pull the Survived label out of the training frame and keep the rest as features
survived = train_pd['Survived']
train_pd = train_pd.drop(columns='Survived')

display(train_pd)

In [None]:
# Remember the PassengerId column of each split so rows can be realigned later
train_idx, test_idx = (frame['PassengerId'] for frame in (train_pd, test_pd))

In [None]:
# Stack the training rows on top of the test rows (fresh 0..n-1 index)
# so both splits go through the same preprocessing
combines_pd = pd.concat([train_pd, test_pd], ignore_index=True)

display(combines_pd)

In [None]:
# Report how many entries are missing in each column
null_counts = combines_pd.isna().sum()
print('Null Values:')
print(null_counts)

In [None]:
# --- Fill Missing Age Values Based on Pclass Median ---

# Bar chart of how many passengers share each recorded age (all classes together)
y = combines_pd['Age'].value_counts()
x = y.index.values
fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(x, y)
ax.set_title('All Ages')
plt.show()


In [None]:
# Same age bar chart, drawn once for each passenger class 1, 2 and 3
class_age = combines_pd[['Pclass', 'Age']]
for i in (1, 2, 3):
    y = class_age.loc[class_age['Pclass'] == i, 'Age'].value_counts()
    x = y.index.values
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.set_title('Class ' + str(i))
    ax.bar(x, y)
    plt.show()


In [None]:
# Fill missing Age with the median age of the passenger's class.
# `medians` has one row per Pclass value. Mapping every passenger's Pclass onto
# its 'Age' column looks the median up by class label, so the fill does not
# depend on the classes being exactly 1..3 or on the frame having a 0..n-1 index.
medians = class_age.groupby('Pclass').median()
class_median_age = combines_pd['Pclass'].map(medians['Age'])
combines_pd['Age'] = combines_pd['Age'].fillna(class_median_age)

display(combines_pd)


In [None]:
# Fill missing Fare with the median fare of comparable passengers.
# The rows lacking a fare are displayed first. The comparison group below
# (Pclass 3, Embarked 'S') is hard-coded - presumably read off that display;
# verify it still matches if the input data changes.
display(combines_pd[combines_pd['Fare'].isnull()])
sim_fares = combines_pd[(combines_pd['Pclass'] == 3) & (combines_pd['Embarked'] == 'S')]['Fare']
print('Median:', sim_fares.median())
# Drop NaN first so the histogram is computed from real fares only.
plt.hist(sim_fares.dropna())
# Render the histogram (the earlier plt.plot() call drew nothing).
plt.show()
combines_pd['Fare'] = combines_pd['Fare'].fillna(sim_fares.median())


In [None]:
# Show the rows with no port of embarkation, then count the ports used by
# first-class passengers whose fare lies in [70, 90]; blanks are set to 'C'
display(combines_pd[combines_pd['Embarked'].isnull()])
is_first_class = combines_pd['Pclass'] == 1
fare_in_range = combines_pd['Fare'].between(70, 90)
sim_emb = combines_pd.loc[is_first_class & fare_in_range, 'Embarked']
print(sim_emb.value_counts())
combines_pd['Embarked'] = combines_pd['Embarked'].fillna('C')

In [None]:
# Reduce Cabin to a single letter: missing cabins become 'M', every other value
# is cut down to its first character, and the letter 'T' is folded into 'M' too
combines_pd['Cabin'] = combines_pd['Cabin'].fillna('M').str[0]
idx = np.flatnonzero((combines_pd['Cabin'] == 'T').to_numpy())
combines_pd.loc[idx, 'Cabin'] = 'M'

for summary in (combines_pd.isnull().sum(), combines_pd.nunique()):
    print(summary)


In [None]:
# ----------------------------- Feature Engineering -----------------------------

# Extract Titles from Names.
# Each name is split as "<last>,<title>.<first>": the text before the first comma
# is the surname, the text up to the first period after that is the title, and
# the remainder is the given name. A part that cannot be found is stored as '',
# so the three lists always have one entry per name and stay aligned.
# NOTE: parts are not whitespace-stripped here - a title parsed from
# "Last, Mr. First" is ' Mr' with a leading space.
names = combines_pd['Name']
last_names, titles, first_names = [], [], []

for full_name in names:
    last, comma, rest = full_name.partition(',')
    if not comma:
        # No comma: there is no surname part and the whole string is parsed for a title.
        last, rest = '', full_name
    title, period, first = rest.partition('.')
    if not period:
        # No period: neither a title nor a given name can be separated out.
        title, first = '', ''
    last_names.append(last)
    titles.append(title)
    # Always append an entry; skipping it for names without a period would let
    # first_names drift out of step with titles and last_names.
    first_names.append(first)

In [None]:
# Group less common titles into a few categories; titles not listed are kept as-is.
# NOTE(review): 'Master' is grouped under 'Nobility' as in the original list -
# confirm that this is the intended category.
title_groups = {
    'Military': ['Capt', 'Col', 'Major'],
    'Nobility': ['Don', 'Dona', 'Jonkheer', 'Lady', 'Sir', 'Master', 'the Countess'],
    'Ms': ['Miss', 'Mlle', 'Ms'],
    'Mrs': ['Mrs', 'Mme'],  # 'Mr' is deliberately not merged into 'Mrs'
}
title_to_group = {raw: group for group, members in title_groups.items() for raw in members}

# Strip surrounding whitespace before matching: a title cut out of
# "Last, Mr. First" still carries the space after the comma (' Mr') and would
# otherwise match none of the names above. An object-dtype Series is used
# instead of a fixed-width numpy string array, which can truncate longer
# replacement values.
cleaned_titles = pd.Series(list(titles), dtype=object).str.strip()
titles = cleaned_titles.map(lambda t: title_to_group.get(t, t)).to_numpy()
combines_pd['Title'] = titles


In [None]:
# Bucket Age into ten-year intervals with edges 0, 10, ..., 80
bins = np.arange(0, 81, 10)
combines_pd['Age_Bin'] = pd.cut(combines_pd['Age'], bins=bins)


In [None]:
# Estimate the fare per person: count how many rows share each Ticket value
# and divide the row's Fare by that count.
ticke_dict = dict(combines_pd['Ticket'].value_counts())
combines_pd['tkt_count'] = combines_pd['Ticket'].map(ticke_dict)
combines_pd['Fare_per_Ticket'] = combines_pd['Fare'] / combines_pd['tkt_count']
# pd.cut intervals are open on the left by default, so a fare of exactly 0 would
# fall outside the first bin and become NaN; include_lowest=True keeps it.
# The top edge is open-ended so that fares above the last finite edge are
# binned instead of silently turning into NaN.
bins = [0, 20, 40, 60, 80, np.inf]
combines_pd['Fare_Bin'] = pd.cut(combines_pd['Fare_per_Ticket'], bins, include_lowest=True)


In [None]:
# Family size on board: SibSp + Parch, plus one for the passenger
combines_pd['Num_Family'] = combines_pd['SibSp'].add(combines_pd['Parch']).add(1)


In [None]:
# Remove the raw and helper columns that are no longer needed as features
dropped_cols = ['Name', 'Age', 'Ticket', 'Fare', 'tkt_count', 'Fare_per_Ticket']
combines_pd.drop(columns=dropped_cols, inplace=True)


In [None]:
# ----------------------------- Data Encoding & Modeling -----------------------------

# Encode categorical values.
# Only non-numeric columns are encoded; numeric columns keep their real values.
# (Casting every column to str before label-encoding ranks numbers
# alphabetically, e.g. '11' sorts before '2', and also scrambles PassengerId.)
from sklearn.preprocessing import LabelEncoder
label_enc = combines_pd.copy()
# PassengerId identifies a row and is not a feature: move it to the index so
# one_hot can be looked up by passenger id and the id is not fed to the model.
if 'PassengerId' in label_enc.columns:
    label_enc = label_enc.set_index('PassengerId')
for col in label_enc.select_dtypes(exclude='number').columns:
    if isinstance(label_enc[col].dtype, pd.CategoricalDtype):
        # Binned columns from pd.cut: category codes follow the bin order,
        # and -1 marks a value that fell into no bin.
        label_enc[col] = label_enc[col].cat.codes
    else:
        label_enc[col] = LabelEncoder().fit_transform(label_enc[col].astype(str))
# Nominal columns are expanded into one indicator column per code
# (e.g. 'Sex_0', 'Sex_1'); dtype=int keeps the whole frame numeric.
one_hot = pd.get_dummies(label_enc, columns=['Sex', 'Embarked', 'Title'], dtype=int)


In [None]:
# Set PassengerId as index.
# one_hot was derived from combines_pd without adding, removing or reordering
# rows, so it is given the same index. Label lookups such as
# one_hot.loc[<passenger ids>] then select the intended passengers instead of
# matching ids against row positions.
if 'PassengerId' in combines_pd.columns:  # guard keeps the cell safe to re-run
    combines_pd.set_index('PassengerId', inplace=True)
one_hot.index = combines_pd.index

In [None]:
# Prepare training data.
# combines_pd was built as [training rows, then test rows] and one_hot keeps
# that row order, so the training rows are the first len(train_idx) rows.
# Selecting by position avoids looking PassengerId values up as index labels,
# which shifts the features relative to `survived` whenever the ids differ
# from the row positions (e.g. ids that start at 1).
train_features = one_hot.iloc[:len(train_idx)]
# The passenger id is an identifier, not a predictor; drop it if it is still
# present as a column.
train_features = train_features.drop(columns=['PassengerId'], errors='ignore')
x = train_features.to_numpy(dtype=float)
y = survived.values
assert len(x) == len(y), 'feature rows and labels are out of step'


In [None]:
# Min-max scale every feature column (fit and applied on the same matrix)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler().fit(x)
x_scaled = scaler.transform(x)


In [None]:
# Hold out 20% of the labelled rows for evaluation, with a fixed seed
from sklearn.model_selection import train_test_split
split_parts = train_test_split(x_scaled, y, test_size=0.2, random_state=0)
x_train, x_test, y_train, y_test = split_parts

print(*(part.shape for part in split_parts))

In [None]:
# ---- Train with Random Forest Classifier ----
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV


In [None]:
# Baseline: a random forest with default hyperparameters, scored on the hold-out set
clf = RandomForestClassifier(random_state=0).fit(x_train, y_train)
initial_accuracy = clf.score(x_test, y_test)
print("Initial model accuracy:", initial_accuracy)

In [None]:
# Randomized search over a small, hand-picked set of hyperparameter values
params = dict(
    n_estimators=[10, 50, 100],
    max_depth=[10, 30, 40, 45],
    min_samples_split=[5, 10, 30, 50, 56],
    max_features=['sqrt', 'log2'],
    max_samples=[0.1, 0.15, 0.18, 0.2],
    criterion=['gini', 'entropy'],
)

base_forest = RandomForestClassifier(random_state=0)
rand_search = RandomizedSearchCV(
    estimator=base_forest,
    param_distributions=params,
    scoring='accuracy',
    cv=5,
    random_state=0,
)
rand_search.fit(x_train, y_train)

# Report the winning combination, its cross-validated score and the hold-out score
rand_params = rand_search.best_params_
print(rand_params)
print('Train Acc (CV):', rand_search.best_score_)
preds = rand_search.predict(x_test)
print('Test Acc:', accuracy_score(preds, y_test))

In [None]:
# Second randomized search, this time sampling from much wider value ranges
params = dict(
    criterion=['gini', 'entropy'],
    n_estimators=[20, 50, 100, 200, 300, 400, 500, 800, 1000],
    max_depth=np.arange(3, 50),
    min_samples_split=np.arange(2, 1000),
    max_features=['sqrt', 'log2'],
    max_samples=np.linspace(0.1, 0.9, 10),
)
base_forest = RandomForestClassifier(random_state=0)
rand_search = RandomizedSearchCV(
    estimator=base_forest,
    param_distributions=params,
    scoring='accuracy',
    cv=5,
    random_state=0,
)
rand_search.fit(x_train, y_train)

# Winning combination and its cross-validated score
rand_params = rand_search.best_params_
print(rand_params, '\n')
print('Train Acc (CV):', rand_search.best_score_)

# Score the refitted best model on the hold-out set
preds = rand_search.predict(x_test)
print('Test Acc:', accuracy_score(preds, y_test))

In [None]:
# Grid search that varies only n_estimators (best value and +/- 10) while every
# other hyperparameter is pinned to the value chosen by the random search
best_n = rand_params['n_estimators']
n_estimators = [best_n + step for step in (-10, 0, 10)]
params = {'n_estimators': n_estimators}
for pinned in ('max_depth', 'min_samples_split', 'max_features', 'max_samples', 'criterion'):
    params[pinned] = [rand_params[pinned]]

base_forest = RandomForestClassifier(random_state=0)
grid_search = GridSearchCV(estimator=base_forest, param_grid=params, scoring='accuracy', cv=5)
grid_search.fit(x_train, y_train)

# Report the winning combination, its cross-validated score and the hold-out score
grid_params = grid_search.best_params_
print(grid_params)
print('Train Acc (CV):', grid_search.best_score_)
preds = grid_search.predict(x_test)
print('Test Acc:', accuracy_score(preds, y_test))