In [3]:
!pip install pandas numpy scikit-learn xgboost



In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import warnings
warnings.filterwarnings('ignore')

# Load data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
train.head()

Train shape: (891, 12)
Test shape: (418, 11)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# ===================
# ENHANCED FEATURE ENGINEERING (FIXED)
# ===================

def engineer_features(df):
    """Apply all feature engineering to a dataframe"""
    
    # 1. Title extraction
    df['Title'] = df['Name'].str.extract(' ([A-Za-z]+)\.')
    df['Title'] = df['Title'].replace(['Dr', 'Rev', 'Col', 'Major', 'Mlle', 
                                        'Ms', 'Mme', 'Don', 'Lady', 'Sir', 
                                        'Capt', 'Countess', 'Jonkheer', 'Dona'], 'Rare')
    df['Title'] = df['Title'].map({'Mr': 0, 'Miss': 1, 'Mrs': 2, 'Master': 3, 'Rare': 4})
    df['Title'].fillna(4, inplace=True)  # Fill any unmapped titles as Rare
    
    # 2. Family features
    df['Family_Size'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['Family_Size'] == 1).astype(int)
    
    # 3. Age - fill FIRST, then bin
    df['Age'].fillna(df['Age'].median(), inplace=True)
    df['AgeBin'] = pd.cut(df['Age'], bins=[0, 12, 18, 35, 60, 100], 
                          labels=[0, 1, 2, 3, 4])
    df['AgeBin'] = df['AgeBin'].astype(float).fillna(2).astype(int)  # Fill edge cases
    
    # 4. Fare - fill FIRST, then bin
    df['Fare'].fillna(df['Fare'].median(), inplace=True)
    df['FareBin'] = pd.qcut(df['Fare'].rank(method='first'), q=4, labels=[0, 1, 2, 3])
    df['FareBin'] = df['FareBin'].astype(float).fillna(1).astype(int)  # Fill edge cases
    
    # 5. Cabin - has cabin or not
    df['HasCabin'] = df['Cabin'].notna().astype(int)
    
    # 6. Encode categorical
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
    df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)
    df['Embarked'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
    
    # 7. Drop unnecessary columns
    df.drop(['Name', 'Ticket', 'Cabin', 'Age', 'Fare'], axis=1, inplace=True)
    
    return df

# Reload fresh data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Apply to train
train_id = train['PassengerId']
train.drop('PassengerId', axis=1, inplace=True)
y = train['Survived']
train.drop('Survived', axis=1, inplace=True)

train = engineer_features(train)

print("Features created!")
print(train.head())
print(f"\nShape: {train.shape}")
print(f"Columns: {list(train.columns)}")

Features created!
   Pclass  Sex  SibSp  Parch  Embarked  Title  Family_Size  IsAlone  AgeBin  \
0       3    0      1      0       0.0      0            2        0       2   
1       1    1      1      0       1.0      2            2        0       3   
2       3    1      0      0       0.0      1            1        1       2   
3       1    1      1      0       0.0      2            2        0       2   
4       3    0      0      0       0.0      0            1        1       2   

   FareBin  HasCabin  
0        0         0  
1        3         1  
2        1         0  
3        3         1  
4        1         0  

Shape: (891, 11)
Columns: ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked', 'Title', 'Family_Size', 'IsAlone', 'AgeBin', 'FareBin', 'HasCabin']


In [5]:
# Check for NaNs
print("NaN counts per column:")
print(train.isna().sum())
print(f"\nTotal NaNs: {train.isna().sum().sum()}")

NaN counts per column:
Pclass         0
Sex            0
SibSp          0
Parch          0
Embarked       2
Title          0
Family_Size    0
IsAlone        0
AgeBin         0
FareBin        0
HasCabin       0
dtype: int64

Total NaNs: 2


In [6]:
# Fill Embarked NaNs with mode (most common value)
train['Embarked'].fillna(train['Embarked'].mode()[0], inplace=True)

# Confirm no NaNs
print(f"Total NaNs: {train.isna().sum().sum()}")

Total NaNs: 2


In [7]:
# Check what's in Embarked
print(train['Embarked'].value_counts(dropna=False))

Embarked
0.0    644
1.0    168
2.0     77
NaN      2
Name: count, dtype: int64


In [8]:
# Check what's in Embarked
print(train['Embarked'].value_counts(dropna=False))

Embarked
0.0    644
1.0    168
2.0     77
NaN      2
Name: count, dtype: int64


In [9]:
# Fill NaNs with 0 (Southampton - most common)
train['Embarked'] = train['Embarked'].fillna(0)

# Confirm no NaNs
print(f"Total NaNs: {train.isna().sum().sum()}")

Total NaNs: 0


In [11]:
# ===================
# MODEL TRAINING & COMPARISON (without XGBoost)
# ===================

from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

# Prepare data
X = train
print(f"Features: {X.shape[1]}, Samples: {X.shape[0]}")

# Define models (no XGBoost)
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42)
}

# Compare with 5-Fold Cross Validation
print("\n===== MODEL COMPARISON (5-Fold CV) =====\n")

results = {}
for name, model in models.items():
    scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    results[name] = scores.mean()
    print(f"{name}:")
    print(f"  CV Scores: {scores.round(3)}")
    print(f"  Mean: {scores.mean():.4f} (+/- {scores.std():.4f})\n")

# Best model
best_model = max(results, key=results.get)
print(f"üèÜ Best Model: {best_model} ({results[best_model]:.4f})")

Features: 11, Samples: 891

===== MODEL COMPARISON (5-Fold CV) =====

Logistic Regression:
  CV Scores: [0.816 0.798 0.792 0.809 0.837]
  Mean: 0.8103 (+/- 0.0157)

Random Forest:
  CV Scores: [0.81  0.781 0.837 0.803 0.848]
  Mean: 0.8159 (+/- 0.0241)

Gradient Boosting:
  CV Scores: [0.799 0.787 0.837 0.781 0.86 ]
  Mean: 0.8126 (+/- 0.0306)

üèÜ Best Model: Random Forest (0.8159)


In [12]:
# ===================
# HYPERPARAMETER TUNING - Random Forest
# ===================

from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [4, 6, 8, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Grid Search with 5-fold CV
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
grid_search.fit(X, y)

print(f"\nüèÜ Best Parameters: {grid_search.best_params_}")
print(f"üéØ Best CV Score: {grid_search.best_score_:.4f}")

Fitting 5 folds for each of 135 candidates, totalling 675 fits

üèÜ Best Parameters: {'max_depth': 4, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}
üéØ Best CV Score: 0.8328


In [13]:
# ===================
# PREPARE TEST DATA & SUBMIT
# ===================

# Reload test data fresh
test = pd.read_csv('test.csv')
passenger_ids = test['PassengerId']

# Apply SAME feature engineering
test['Title'] = test['Name'].str.extract(' ([A-Za-z]+)\.')
test['Title'] = test['Title'].replace(['Dr', 'Rev', 'Col', 'Major', 'Mlle', 
                                        'Ms', 'Mme', 'Don', 'Lady', 'Sir', 
                                        'Capt', 'Countess', 'Jonkheer', 'Dona'], 'Rare')
test['Title'] = test['Title'].map({'Mr': 0, 'Miss': 1, 'Mrs': 2, 'Master': 3, 'Rare': 4})
test['Title'].fillna(4, inplace=True)

test['Family_Size'] = test['SibSp'] + test['Parch'] + 1
test['IsAlone'] = (test['Family_Size'] == 1).astype(int)

test['Age'].fillna(test['Age'].median(), inplace=True)
test['AgeBin'] = pd.cut(test['Age'], bins=[0, 12, 18, 35, 60, 100], labels=[0, 1, 2, 3, 4])
test['AgeBin'] = test['AgeBin'].astype(float).fillna(2).astype(int)

test['Fare'].fillna(test['Fare'].median(), inplace=True)
test['FareBin'] = pd.qcut(test['Fare'].rank(method='first'), q=4, labels=[0, 1, 2, 3])
test['FareBin'] = test['FareBin'].astype(float).fillna(1).astype(int)

test['HasCabin'] = test['Cabin'].notna().astype(int)

test['Sex'] = test['Sex'].map({'male': 0, 'female': 1})
test['Embarked'].fillna(0, inplace=True)
test['Embarked'] = test['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
test['Embarked'].fillna(0, inplace=True)

test.drop(['PassengerId', 'Name', 'Ticket', 'Cabin', 'Age', 'Fare'], axis=1, inplace=True)

# Check for NaNs
print(f"Test NaNs: {test.isna().sum().sum()}")
print(f"Test shape: {test.shape}")
print(f"Train shape: {X.shape}")

Test NaNs: 0
Test shape: (418, 11)
Train shape: (891, 11)


In [14]:
# ===================
# TRAIN FINAL MODEL & PREDICT
# ===================

# Train with best parameters on ALL training data
best_rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=4,
    min_samples_split=10,
    min_samples_leaf=1,
    random_state=42
)
best_rf.fit(X, y)

# Predict on test
predictions = best_rf.predict(test)

# Create submission file
submission = pd.DataFrame({
    'PassengerId': passenger_ids,
    'Survived': predictions
})

submission.to_csv('submission_improved.csv', index=False)
print("‚úÖ Submission file created!")
print(submission.head(10))

‚úÖ Submission file created!
   PassengerId  Survived
0          892         0
1          893         1
2          894         0
3          895         0
4          896         1
5          897         0
6          898         1
7          899         0
8          900         1
9          901         0
