In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report


sns.set(style='darkgrid')
%matplotlib inline
# Cell 2: load dataset (assumes you've downloaded train.csv from Kaggle)
from google.colab import files

# Upload file manually
uploaded = files.upload()

train = pd.read_csv('train.csv')
# optional: test = pd.read_csv('test.csv')
train.shape, train.columns
# Cell 3: head & info
# A bare expression in the middle of a cell is evaluated but never rendered,
# so display() is used to actually show the preview.
display(train.head())


# Cell 4: missing values
display(train.isnull().sum())


# Cell 5: distributions and survival by sex/class
print(train['Survived'].value_counts(normalize=True))
print('\nSurvival rate by Sex:')
print(train.groupby('Sex')['Survived'].mean())
print('\nSurvival rate by Pclass:')
print(train.groupby('Pclass')['Survived'].mean())


# Visuals (histogram + countplots)
# Explicit figure/axes objects keep each plot on its own subplot instead of
# relying on pyplot's "current axes" state.
fig, (ax_age, ax_sex) = plt.subplots(1, 2, figsize=(10, 4))
train['Age'].hist(bins=30, ax=ax_age)
ax_age.set_title('Age distribution')
sns.countplot(x='Survived', hue='Sex', data=train, ax=ax_sex)
ax_sex.set_title('Survival by Sex')
fig.tight_layout()
# Render the figure now so that later pyplot/seaborn calls in the same cell
# do not draw on top of the last subplot.
plt.show()
# Cell 6: select features and target
# We'll use common features: Pclass, Sex, Age, SibSp, Parch, Fare, Embarked
features = ['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']
X = train[features]
y = train['Survived']


# Column lists
numeric_features = ['Age','SibSp','Parch','Fare']
categorical_features = ['Pclass','Sex','Embarked']


# Use sklearn ColumnTransformer with OneHotEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline


# Pipelines
# Numeric columns: fill gaps with the median, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])


# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. A real OneHotEncoder is required here: a plain function such as
# pd.get_dummies is not an estimator and cannot be used as a Pipeline step.
# handle_unknown='ignore' keeps transform() from failing on unseen categories.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])


# Step names ('num', 'cat', 'onehot') are looked up again when feature
# importances are reported, so they must stay as they are.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])


# Split data (stratified so both parts keep the same survival ratio)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# Cell 7: model pipelines (shared preprocessing + one classifier each)
pipe_lr = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000)),
])
pipe_rf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=200, random_state=42)),
])


# Train both pipelines on the training split
pipe_lr.fit(X_train, y_train)
pipe_rf.fit(X_train, y_train)


# Predict on the held-out validation split
y_pred_lr, y_pred_rf = pipe_lr.predict(X_val), pipe_rf.predict(X_val)


# Metrics function
def print_metrics(y_true, y_pred, label='Model'):
    """Print a one-line metric summary followed by the classification report.

    Parameters
    ----------
    y_true : true target values.
    y_pred : predicted target values, aligned with ``y_true``.
    label : text placed at the start of the summary line.
    """
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    summary = (
        f"{label} - Accuracy: {accuracy:.4f}, "
        f"Precision: {precision:.4f}, "
        f"Recall: {recall:.4f}, "
        f"F1: {f1:.4f}"
    )
    print(summary)
    print(classification_report(y_true, y_pred))



print_metrics(y_val, y_pred_lr, label='Logistic Regression')
print_metrics(y_val, y_pred_rf, label='Random Forest')
# Cell 8: cross-validation scores (5-fold accuracy on the full feature frame)
from sklearn.model_selection import cross_val_score
cv_scores_lr = cross_val_score(estimator=pipe_lr, X=X, y=y, scoring='accuracy', cv=5)
cv_scores_rf = cross_val_score(estimator=pipe_rf, X=X, y=y, scoring='accuracy', cv=5)
print('LR CV mean:', cv_scores_lr.mean())
print('RF CV mean:', cv_scores_rf.mean())


# If you want to tune RandomForest quickly
from sklearn.model_selection import GridSearchCV

# Keys use the '<step>__<param>' form so they reach the pipeline's
# 'classifier' step.
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 5, 8],
}
gs = GridSearchCV(
    estimator=pipe_rf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=4,
    n_jobs=-1,
)
gs.fit(X, y)
print('Best params', gs.best_params_)
print('Best CV score', gs.best_score_)
# Cell 9: feature importance (after preprocessing)
# Use the tuned forest when a grid search object exists, otherwise fall back
# to the plain fitted random-forest pipeline. Picking the pipeline once keeps
# the encoder and the importances coming from the same model.
best_pipe = gs.best_estimator_ if 'gs' in globals() else pipe_rf

# To get feature names after OneHotEncoder, reconstruct them from the fitted encoder
onehot = best_pipe.named_steps['preprocessor'].named_transformers_['cat'].named_steps['onehot']
num_cols = numeric_features
cat_cols = onehot.get_feature_names_out(categorical_features)
# Numeric names first, then one-hot names: the same order in which the
# 'num' and 'cat' transformers are listed in the preprocessor.
all_cols = list(num_cols) + list(cat_cols)
importances = best_pipe.named_steps['classifier'].feature_importances_
feat_imp = pd.Series(importances, index=all_cols).sort_values(ascending=False)
print(feat_imp)


# Draw on a dedicated figure; without it the bars are drawn on whatever axes
# are still current from earlier plotting in this cell.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=feat_imp.values, y=feat_imp.index, ax=ax)
ax.set_title('Feature importances')
ax.set_xlabel('Importance')
fig.tight_layout()
plt.show()

Saving titanic.zip to titanic (1).zip


FileNotFoundError: [Errno 2] No such file or directory: 'train.csv'

In [None]:
# Quick text summary of the training frame.
# Requires `train` to have been loaded already; otherwise this raises NameError.

# Look at first 5 rows
print(train.head())

# Check missing values (count of nulls per column)
print("\nMissing values:\n", train.isnull().sum())

# Check survival rate (raw counts per class)
print("\nSurvival counts:\n", train['Survived'].value_counts())

# Mean of the 0/1 target per group = survival rate of that group
survived_by_sex = train.groupby('Sex')['Survived'].mean()
survived_by_class = train.groupby('Pclass')['Survived'].mean()

# Survival by gender
print("\nSurvival by Gender:\n", survived_by_sex)

# Survival by class
print("\nSurvival by Passenger Class:\n", survived_by_class)


NameError: name 'train' is not defined

In [None]:
from google.colab import files
import zipfile
import pandas as pd
import os

# Directory the archive is unpacked into; named once so the extract step and
# every read below stay in sync.
data_dir = "titanic_data"

# Step 1: Upload the Titanic zip file
uploaded = files.upload()

# Fail with a clear message instead of an opaque IndexError when the upload
# produced no files.
if not uploaded:
    raise FileNotFoundError("No file was uploaded; expected the Titanic zip archive.")

# Get the uploaded filename automatically (first uploaded entry)
zip_file = next(iter(uploaded))
if not zipfile.is_zipfile(zip_file):
    raise ValueError(f"Uploaded file {zip_file!r} is not a zip archive.")

# Step 2: Extract all files from the zip
with zipfile.ZipFile(zip_file, "r") as zip_ref:
    zip_ref.extractall(data_dir)

# Step 3: Check extracted files
print("Extracted files:", os.listdir(data_dir))

# Step 4: Load datasets
train = pd.read_csv(os.path.join(data_dir, "train.csv"))
test = pd.read_csv(os.path.join(data_dir, "test.csv"))
gender_submission = pd.read_csv(os.path.join(data_dir, "gender_submission.csv"))

print("Train shape:", train.shape)
print("Test shape:", test.shape)
print("Gender submission shape:", gender_submission.shape)


Saving titanic.zip to titanic (2).zip
Extracted files: ['train.csv', 'test.csv', 'gender_submission.csv']
Train shape: (891, 12)
Test shape: (418, 11)
Gender submission shape: (418, 2)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Work on a new frame without identifier / free-text columns; `train` itself
# is left untouched so this cell can be re-run safely.
df = train.drop(columns=['PassengerId','Name','Ticket','Cabin'])

# Decide which rows are train/validation BEFORE computing any imputation
# statistics, so that the validation rows cannot influence them (no leakage).
# Splitting the row labels uses the same test_size/random_state as before,
# and the shuffle depends only on the number of rows and the seed, so the
# same rows end up in each part as when splitting X and y directly.
train_idx, val_idx = train_test_split(df.index.to_numpy(), test_size=0.2, random_state=42)

# Handle missing values using statistics learned from the training rows only
imputer = SimpleImputer(strategy='median')
imputer.fit(df.loc[train_idx, ['Age']])
# transform() returns a 2-D (n, 1) array; ravel() makes it a proper 1-D column
df['Age'] = imputer.transform(df[['Age']]).ravel()
embarked_mode = df.loc[train_idx, 'Embarked'].mode()[0]
df['Embarked'] = df['Embarked'].fillna(embarked_mode)

# Encode categorical variables as integer codes.
# NOTE(review): integer codes impose an arbitrary order on Embarked; consider
# one-hot encoding if the linear model matters.
df['Sex'] = LabelEncoder().fit_transform(df['Sex'])
df['Embarked'] = LabelEncoder().fit_transform(df['Embarked'])

# Features and target
X = df.drop('Survived', axis=1)
y = df['Survived']

# Train-test split (rows chosen above)
X_train, X_val = X.loc[train_idx], X.loc[val_idx]
y_train, y_val = y.loc[train_idx], y.loc[val_idx]

# Models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42)
}

# Training and evaluation: fit each model on the training part and report
# validation metrics.
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    print(f"\n{name}:")
    print(f"Accuracy: {accuracy_score(y_val, preds):.4f}")
    print(f"Precision: {precision_score(y_val, preds):.4f}")
    print(f"Recall: {recall_score(y_val, preds):.4f}")
    print(f"F1 Score: {f1_score(y_val, preds):.4f}")
    print(classification_report(y_val, preds))



Logistic Regression:
Accuracy: 0.8101
Precision: 0.7857
Recall: 0.7432
F1 Score: 0.7639
              precision    recall  f1-score   support

           0       0.83      0.86      0.84       105
           1       0.79      0.74      0.76        74

    accuracy                           0.81       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179


Random Forest:
Accuracy: 0.8212
Precision: 0.8088
Recall: 0.7432
F1 Score: 0.7746
              precision    recall  f1-score   support

           0       0.83      0.88      0.85       105
           1       0.81      0.74      0.77        74

    accuracy                           0.82       179
   macro avg       0.82      0.81      0.81       179
weighted avg       0.82      0.82      0.82       179

