In [1]:
import numpy as np # linear algebra
import pandas as pd

# Read the training and test CSVs (paths are relative to the working directory).
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv")
)

In [59]:
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV


In [13]:
# Preview the first ten rows of the training frame to inspect the available columns.
train_data.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [71]:

# Engineered feature: first character of Cabin, with missing cabins mapped to 'U'
train_data['CabinLetter'] = train_data['Cabin'].fillna('U').str[0]

# Column groups: numeric columns are only imputed; categorical columns are
# imputed and then one-hot encoded
numeric_features = ["Age", "Fare", "SibSp", "Parch", "Pclass"]
catagorical_features = ["Sex", "Embarked", "CabinLetter"]
features = ["Pclass", "Sex", "SibSp", "Parch", "Age", "Fare", "Embarked", "CabinLetter"]

# Model inputs and target
X = train_data[features]
y = train_data['Survived']

# Hold out 20% of the rows for validation (fixed seed for reproducibility)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=1)

# Numeric columns: fill missing values with the column median
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
])

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode (handle_unknown='ignore' so categories unseen during fit do
# not raise at predict time)
catagorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', catagorical_transformer, catagorical_features),
])

# Gradient-boosted classifier used as the final pipeline step
xgb_model = XGBClassifier(
    n_estimators=100,
    max_depth=4,
    learning_rate=0.1,
    eval_metric='logloss',
    random_state=1,
)

# Full pipeline: preprocessing followed by the classifier. The step names are
# what 'classifier__*' hyperparameter keys refer to.
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', xgb_model),
])
clf.fit(X_train, y_train)

# Accuracy on the held-out split, plus 5-fold cross-validated accuracy on all rows
val_score = clf.score(X_val, y_val)
cv_scores = cross_val_score(clf, X, y, scoring='accuracy', cv=5)



In [75]:
# Report the per-fold accuracies and their mean
mean_cv_accuracy = cv_scores.mean()
print("Cross-validation scores:", cv_scores)
print("Average CV accuracy: {:.4f}".format(mean_cv_accuracy))

Cross-validation scores: [0.79888268 0.82022472 0.87640449 0.80898876 0.86516854]
Average CV accuracy: 0.8339


In [73]:
# Candidate hyperparameters; the 'classifier__' prefix targets the pipeline's
# 'classifier' step
param_grid = {
    'classifier__n_estimators': [150, 175],
    'classifier__max_depth': [5, 10, 15],
}

# Exhaustive search over the grid with 5-fold CV, scored by accuracy, on all of X, y
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='accuracy', cv=5)
grid_search.fit(X, y)

# Report the winning combination and its mean cross-validated accuracy
print("Best params:", grid_search.best_params_)
print("Best cross-validation accuracy: {:.4f}".format(grid_search.best_score_))

Best params: {'classifier__max_depth': 5, 'classifier__n_estimators': 150}
Best cross-validation accuracy: 0.8339


In [None]:
# === Optional: Final submission on real test_data ===
# Build the same engineered feature used for training: first character of
# Cabin, with missing cabins mapped to 'U'.
test_data['CabinLetter'] = test_data['Cabin'].fillna('U').str[0]
X_test = test_data[features]

# Predict with the tuned pipeline instead of `clf`. `clf` was last fitted on
# the 80% training split only, whereas GridSearchCV (refit=True by default)
# refits the best parameter combination on all of X, y, so the submission uses
# both the selected hyperparameters and every labelled row.
final_model = grid_search.best_estimator_
pred = final_model.predict(X_test)

# Save submission: one row per test passenger with its predicted label
output = pd.DataFrame({'PassengerId': test_data['PassengerId'], 'Survived': pred})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")