In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
#import random as rnd

In [2]:
# Read the three input CSVs: the Titanic train and test splits plus the
# separate test_augmented file (bound to `reat`).
train, test, reat = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/titanic/train.csv',
        '/kaggle/input/titanic/test.csv',
        '/kaggle/input/test-dataset/test_augmented.csv',
    )
)

In [3]:
# CabinType is the first character of Cabin; rows without a cabin value
# get the placeholder 'U'.
for frame in (train, test):
    frame['CabinType'] = frame['Cabin'].str[0].fillna('U')

In [4]:
# Fit a single encoder on the CabinType values of both frames so that the
# same letter maps to the same integer code in train and test.
combined = pd.concat([train['CabinType'], test['CabinType']])
le = LabelEncoder()
le.fit(combined)

for frame in (train, test):
    frame['CabinType'] = le.transform(frame['CabinType'])

In [5]:
# Encode Sex as an integer: male -> 1, female -> 2.
sex_codes = {'male': 1, 'female': 2}
for frame in (train, test):
    frame['Sex'] = frame['Sex'].map(sex_codes).astype(int)

In [6]:
# Extract the title from Name: the run of letters that follows a space and
# is immediately followed by a period.
# The pattern is a raw string so that `\.` reaches the regex engine as an
# escaped dot without tripping Python's invalid-escape-sequence warning.
title_pattern = r' ([A-Za-z]+)\.'
for frame in (train, test):
    frame['Title'] = frame['Name'].str.extract(title_pattern, expand=False)

In [7]:
# Mean Age per Title, computed from the training frame only.
title_ages = train['Age'].groupby(train['Title']).mean()

In [8]:
# Impute missing ages with the training-set mean age of the passenger's title.
#
# A title that is missing from `title_ages` (or whose mean is NaN) would
# otherwise leave Age as NaN, and the later integer cast of AgeGroup cannot
# handle NaN. Those rows fall back to the overall training mean, which is
# taken here before any training ages are filled in.
fallback_age = train['Age'].mean()

for frame in (train, test):
    # Look up each row's title mean; titles absent from title_ages give NaN.
    title_mean_age = frame['Title'].map(title_ages)
    frame['Age'] = frame['Age'].fillna(title_mean_age).fillna(fallback_age)

In [9]:
# Collapse the uncommon titles into 'Rare' and fold the alternate spellings
# into 'Miss' / 'Mrs', using one replacement table for both frames.
title_aliases = {
    rare_title: 'Rare'
    for rare_title in ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
                       'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
}
title_aliases.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

for frame in (train, test):
    frame['Title'] = frame['Title'].replace(title_aliases)

In [10]:
# Turn the consolidated titles into numeric codes; any title that is not a
# key of the mapping becomes 0.
title_mapping = {"Mr": 1, "Miss": 4, "Mrs": 5, "Master": 3, "Rare": 2}
for frame in (train, test):
    frame['Title'] = frame['Title'].map(title_mapping).fillna(0)

In [11]:
# Mean Fare per Pclass, taken from the training frame.
pclass_fare_means = train['Fare'].groupby(train['Pclass']).mean()

# Fill each missing Fare with the training mean of that row's passenger class.
for frame in (train, test):
    fare_missing = frame['Fare'].isna()
    for pclass in (1, 2, 3):
        in_class = frame['Pclass'] == pclass
        frame.loc[fare_missing & in_class, 'Fare'] = pclass_fare_means[pclass]


In [12]:
# FamilySize = siblings/spouses + parents/children + 1.
for frame in (train, test):
    frame['FamilySize'] = frame['SibSp'].add(frame['Parch']).add(1)

In [13]:
# Bucket Age into five bands. The band labels are numeric codes that are
# deliberately not in age order.
age_bin_edges = [0, 16, 32, 48, 64, 100]
age_bin_codes = [5, 3, 2, 4, 1]
for frame in (train, test):
    frame['AgeGroup'] = pd.cut(frame['Age'], bins=age_bin_edges, labels=age_bin_codes)

In [14]:
# Convert the AgeGroup column produced by pd.cut into plain integers.
for frame in (train, test):
    frame['AgeGroup'] = frame['AgeGroup'].astype(int)

In [15]:
# Interaction feature: AgeGroup code multiplied by Title code.
for frame in (train, test):
    frame['A*T'] = frame['AgeGroup'].mul(frame['Title'])

In [16]:
# Drop the same set of columns from both frames.
unused_columns = ['Parch', 'SibSp', 'AgeGroup', 'Name', 'Cabin', 'Ticket', 'Embarked']
train = train.drop(columns=unused_columns)
test = test.drop(columns=unused_columns)

In [17]:
# Remove PassengerId from the training frame only; `test` keeps its copy.
train = train.drop(columns=['PassengerId'])

In [20]:
# Split the training frame into target and features.
Y_train = train["Survived"]
X_train = train.drop(columns=["Survived"])

# Test features: everything in `test` except PassengerId.
X_test = test.drop(columns=['PassengerId']).copy()

# 'Survived' column of the augmented test file.
y_true = reat['Survived']

# Display the shapes as the cell output.
X_train.shape, Y_train.shape, X_test.shape

((891, 8), (891,), (418, 8))

In [21]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier,XGBRFClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [22]:
# Hold out 20% of the labelled rows for validation.
X_tr, X_ts, y_tr, y_ts = train_test_split(
    X_train,
    Y_train,
    test_size=0.2,
    random_state=0,
)

# --- Base learners -------------------------------------------------------

# Random forest (scikit-learn).
rf = RandomForestClassifier(
    criterion='gini',
    n_estimators=500,
    max_depth=8,
    random_state=28,
)

# Gradient-boosted trees (XGBoost).
xgb = XGBClassifier(
    n_estimators=150,
    max_depth=8,
    learning_rate=0.01,
    subsample=0.5,
    random_state=3,
)

# Logistic regression on standardized features.
lr = make_pipeline(
    StandardScaler(),
    LogisticRegression(solver='liblinear', max_iter=1000, random_state=0),
)

# Random forest (XGBoost implementation).
rf_model2 = XGBRFClassifier(
    n_estimators=150,
    max_depth=6,
    learning_rate=1.0,
    subsample=0.2,
    random_state=27,
)

# --- Weighted hard-voting ensemble ---------------------------------------
ensemble_members = [('rf', rf), ('xgb', xgb), ('lr', lr), ('rf2', rf_model2)]
member_weights = [4, 2, 1, 3]
voting_clf = VotingClassifier(
    estimators=ensemble_members,
    voting='hard',
    weights=member_weights,
)

# Fit on the training part of the split and score on the held-out part.
voting_clf.fit(X_tr, y_tr)
y_pred = voting_clf.predict(X_ts)
acc = accuracy_score(y_ts, y_pred)

print(f'VotingClassifier Accuracy: {acc:.4f}')

VotingClassifier Accuracy: 0.8380


In [318]:
# Predict the test-set labels here so this cell does not depend on a
# `Y_test_pred` left over from a cell that is no longer in the notebook
# (that would raise NameError on a fresh "Restart & Run All").
# NOTE(review): `voting_clf` was fitted on the X_tr/y_tr split only; refit
# it on X_train/Y_train first if the submission should use every labelled row.
Y_test_pred = voting_clf.predict(X_test)

# Write the submission file: one row per test passenger with its prediction.
submission = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Survived": Y_test_pred,
})
submission.to_csv('final.csv', index=False)