## Import and Load Data




In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import mutual_info_classif
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans


In [None]:
# The CSVs live on Google Drive, so Drive has to be mounted before they can
# be read; mounting here keeps the notebook runnable top-to-bottom on a fresh
# Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Single place to change if the data folder moves
DATA_DIR = '/content/drive/MyDrive/Advanced ML'

train = pd.read_csv(f'{DATA_DIR}/titanictrain.csv')
test = pd.read_csv(f'{DATA_DIR}/titanictest.csv')

## Data Exploration

In [None]:
# Preview the first five training rows
train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# Report the dtype pandas inferred for every training column
train_dtypes = train.dtypes
print("Train dtypes:\n", train_dtypes)

Train dtypes:
 PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object


In [None]:
# Summary statistics for all columns (numeric and non-numeric alike)
train_summary = train.describe(include='all')
print("\nTrain Description:\n", train_summary)


Train Description:
         PassengerId    Survived      Pclass                     Name   Sex  \
count    891.000000  891.000000  891.000000                      891   891   
unique          NaN         NaN         NaN                      891     2   
top             NaN         NaN         NaN  Braund, Mr. Owen Harris  male   
freq            NaN         NaN         NaN                        1   577   
mean     446.000000    0.383838    2.308642                      NaN   NaN   
std      257.353842    0.486592    0.836071                      NaN   NaN   
min        1.000000    0.000000    1.000000                      NaN   NaN   
25%      223.500000    0.000000    2.000000                      NaN   NaN   
50%      446.000000    0.000000    3.000000                      NaN   NaN   
75%      668.500000    1.000000    3.000000                      NaN   NaN   
max      891.000000    1.000000    3.000000                      NaN   NaN   

               Age       SibSp       Parch

In [None]:
# Column labels available in the training frame
train_columns = train.columns
train_columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [None]:
# Count missing values per column. The raw boolean mask from isna() has one
# row per passenger and is unreadable at a glance; the per-column totals show
# directly which features need imputation.
train.isna().sum()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,False,False,False,False,False,False,False,False,False,False,True,False
1,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,True,False
3,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...
886,False,False,False,False,False,False,False,False,False,False,True,False
887,False,False,False,False,False,False,False,False,False,False,False,False
888,False,False,False,False,False,True,False,False,False,False,True,False
889,False,False,False,False,False,False,False,False,False,False,False,False


## Feature Preparation and Encoding

In [None]:
# Feature frame: everything except the label column
X = train.drop(columns=['Survived'], errors='ignore')
# Target vector: the survival label
y = train['Survived']

In [None]:
# Remove the identifier and free-text columns from the training features.
# (Save X['PassengerId'] before this line if the training IDs are ever needed.)
non_feature_cols = ['PassengerId', 'Name', 'Ticket', 'Cabin']
X = X.drop(columns=non_feature_cols, errors='ignore')

In [None]:
# Inspect which columns the test frame currently holds
test_columns = test.columns
print(test_columns)

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'], dtype='object')


In [None]:
# Save the test-set passenger IDs for the submission file before the column
# is dropped. The membership check keeps this cell safe to re-run once
# PassengerId has already been removed from `test`.
if 'PassengerId' in test.columns:
    test_ids = test['PassengerId'].copy()

# Remove the identifier and free-text columns; errors='ignore' makes columns
# that are already absent a no-op instead of a KeyError.
test = test.drop(['PassengerId','Name','Ticket','Cabin'], axis=1, errors='ignore')

In [None]:
# Idempotent cleanup: strip identifier/text columns from the test frame when
# they are present (errors='ignore' turns missing columns into a no-op).
unused_cols = ['PassengerId', 'Name', 'Ticket', 'Cabin']
test = test.drop(columns=unused_cols, errors='ignore')

# Sanity-check the dimensions of features, target and test set
print("X shape:", X.shape)
print("y shape:", y.shape)
print("test shape:", test.shape)

X shape: (891, 11)
y shape: (891,)
test shape: (418, 7)


In [None]:
# Concatenate X and test for consistent feature engineering
combined = pd.concat([X, test], axis=0).reset_index(drop=True)
print("Combined shape:", combined.shape)
combined.head()

Combined shape: (1309, 11)


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# === Part 5: Handle Missing Values ===

# Numeric gaps: replace missing 'Age' and 'Fare' with the column median
for numeric_col in ('Age', 'Fare'):
    col_median = combined[numeric_col].median()
    combined[numeric_col] = combined[numeric_col].fillna(col_median)

# Categorical gap: replace missing 'Embarked' with the most frequent port
if 'Embarked' in combined.columns:
    most_common_port = combined['Embarked'].mode()[0]
    combined['Embarked'] = combined['Embarked'].fillna(most_common_port)

# Report whatever nulls are still left, per column
print(combined.isnull().sum())
combined.head()

PassengerId     418
Pclass            0
Name            418
Sex               0
Age               0
SibSp             0
Parch             0
Ticket          418
Fare              0
Cabin          1105
Embarked          0
dtype: int64


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# === Part 6: Encode Categorical Variables & Create Additional Features ===

# One-hot encode whichever of 'Sex' / 'Embarked' are present; drop_first
# removes one level per feature so the indicators are not redundant.
categorical_cols = [name for name in ['Sex', 'Embarked'] if name in combined.columns]
combined = pd.get_dummies(combined, columns=categorical_cols, drop_first=True)

# FamilySize = siblings/spouses + parents/children + the passenger
if 'SibSp' in combined.columns and 'Parch' in combined.columns:
    relatives = combined['SibSp'] + combined['Parch']
    combined['FamilySize'] = relatives + 1

# IsAlone = 1 when the passenger has no family aboard, else 0
if 'FamilySize' in combined.columns:
    travels_alone = combined['FamilySize'].eq(1)
    combined['IsAlone'] = travels_alone.astype(int)

combined.head()

Unnamed: 0,PassengerId,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Sex_male,Embarked_Q,Embarked_S,FamilySize,IsAlone
0,1.0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,True,False,True,2,0
1,2.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,False,False,False,2,0
2,3.0,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,False,False,True,1,1
3,4.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,False,False,True,2,0
4,5.0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,True,False,True,1,1


In [None]:
# === Part 7: Optional KMeans or PCA Features ===

# pd.get_dummies can emit boolean indicator columns (e.g. Sex_male,
# Embarked_*). np.number does not match bool, so cast them to int first;
# otherwise select_dtypes would silently drop the one-hot features.
bool_cols = combined.select_dtypes(include='bool').columns
combined = combined.astype({col: int for col in bool_cols})

# Keep only numeric columns (drops any remaining text columns)
combined = combined.select_dtypes(include=[np.number])

# Impute leftover gaps with the column median *before* clustering/scaling,
# since KMeans, StandardScaler and PCA are all fitted on these values.
combined = combined.fillna(combined.median())

# KMeans on 'Fare' (if it exists): bucket fares into 3 clusters
if 'Fare' in combined.columns:
    kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
    combined['FareCluster'] = kmeans.fit_predict(combined[['Fare']])

# PCA
# 1) Scale the combined data. The scaler is created here, before its first
#    use, so the cell runs on a fresh kernel.
scaler = StandardScaler()
combined_scaled = scaler.fit_transform(combined)

# 2) Apply PCA with 5 components
n_components = 5
pca = PCA(n_components=n_components, random_state=42)
combined_pca = pca.fit_transform(combined_scaled)
pca_cols = [f'PCA_{i}' for i in range(n_components)]
combined_pca_df = pd.DataFrame(combined_pca, columns=pca_cols)

# 3) Append the components as extra columns; both frames get a fresh
#    0..n-1 index so the column-wise concat lines up row by row.
combined = combined.reset_index(drop=True)
combined_pca_df = combined_pca_df.reset_index(drop=True)
combined = pd.concat([combined, combined_pca_df], axis=1)
print("Combined shape after PCA:", combined.shape)
combined.head()

Combined shape after PCA: (1309, 14)


Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,FamilySize,IsAlone,FareCluster,PCA_0,PCA_1,PCA_2,PCA_3,PCA_4
0,1.0,3,22.0,1,0,7.25,2,0,0,0.326028,1.353299,-2.12793,-0.142865,-0.481526
1,2.0,1,38.0,1,0,71.2833,2,0,2,1.666466,-2.270289,-2.268219,-0.286179,-0.76434
2,3.0,3,26.0,0,0,7.925,1,1,0,-1.3606,0.759687,-2.073615,-0.05119,0.247287
3,4.0,1,35.0,1,0,53.1,2,0,2,1.579776,-2.025917,-2.257287,-0.397106,-0.81099
4,5.0,3,35.0,0,0,8.05,1,1,0,-1.39823,0.473329,-2.06114,0.55522,0.120458


## Model Training

In [None]:
# === Part 8: Split Combined Back Into X & Test ===

# The training rows were stacked first, so the first len(X) rows of
# `combined` are the training features and the remainder is the test set.
ntrain = X.shape[0]
print("ntrain:", ntrain)

final_X = combined.iloc[:ntrain].copy()
final_test = combined.iloc[ntrain:].copy()

print("final_X shape:", final_X.shape)
print("final_test shape:", final_test.shape)

ntrain: 891
final_X shape: (891, 14)
final_test shape: (418, 14)


In [None]:
# === Part 9: Train/Test Split & Model Training ===

# Hold out 20% of the labelled rows as a validation set
X_train, X_val, y_train, y_val = train_test_split(
    final_X, y, test_size=0.2, random_state=42
)

# Base estimator: gradient-boosted trees with a fixed seed
gbc = GradientBoostingClassifier(random_state=42)

# Hyper-parameter grid: 2 x 2 x 2 = 8 candidate settings
param_grid = dict(
    n_estimators=[100, 200],
    learning_rate=[0.1, 0.05],
    max_depth=[3, 5],
)

# 5-fold cross-validated search scored by accuracy, using all CPU cores
search_options = dict(scoring='accuracy', cv=5, n_jobs=-1, verbose=1)
grid_search = GridSearchCV(gbc, param_grid, **search_options)
grid_search.fit(X_train, y_train)

# Keep the estimator selected by the search
best_model = grid_search.best_estimator_

print("Best Params:", grid_search.best_params_)
print("Best CV Score:", grid_search.best_score_)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best Params: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100}
Best CV Score: 0.6797892248596474


## Submission

In [None]:
# === Part 10: Validation & Final Predictions ===

# 1) Check accuracy on the held-out validation set
y_pred_val = best_model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_pred_val)
print("Validation Accuracy:", val_accuracy)

# 2) Predict on the test set
final_preds = best_model.predict(final_test)

# 3) Create submission DataFrame.
#    test_ids must hold the PassengerId values of the test rows, saved
#    before that column is dropped from `test`.
submission = pd.DataFrame({
    'PassengerId': test_ids,
    'Survived': final_preds
})

# 4) Save to CSV (for Kaggle)
submission.to_csv('submission.csv', index=False)
print("Submission file created: 'submission.csv'")

# Preview goes last: only a cell's final expression is rendered, so a bare
# submission.head() in the middle of the cell would be silently discarded.
submission.head()

Validation Accuracy: 0.7318435754189944
Submission file created: 'submission.csv'


In [None]:
# Mount Google Drive at /content/drive. The CSV paths in the data-loading
# cell point under this mount point, so Drive has to be mounted before that
# cell can read them.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:

# Download the generated submission.csv through Colab's `files` helper.
from google.colab import files
files.download("submission.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>