In [74]:
import numpy as np
import pandas as pd

import os
# Print the full path of every file found under the Kaggle input directory.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(root_dir, file_name)
        print(full_path)

/kaggle/input/test-argumented-csv/test_augmented.csv
/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [75]:
# Load the competition data.
train = pd.read_csv("/kaggle/input/titanic/train.csv")
test = pd.read_csv("/kaggle/input/titanic/test.csv")

# Keep an independent copy of the raw test frame. A plain `test2 = test` would
# only alias it, so the in-place column edits made to `test` during feature
# engineering would silently show up in `test2` as well. `test2` is what the
# submission reads `PassengerId` from.
test2 = test.copy()

# Target vector, kept separately because "Survived" is dropped from `train`
# once the features have been built.
train_res = train["Survived"].reset_index(drop=True)

In [76]:
# ---------------------------------------------------------------------------
# Feature engineering for the training frame.
# Reads the raw `train` frame and rebinds `train` to a model-ready frame.
# ---------------------------------------------------------------------------

# Embarked: missing ports are filled with "S"
train['Embarked'] = train['Embarked'].fillna("S")

# Alone: 1 when SibSp + Parch is not greater than 0, otherwise 0
train['T_partner'] = train["SibSp"] + train["Parch"]
train['Alone'] = np.where(train['T_partner'] > 0, 0, 1)

# Words_Count: number of whitespace-separated tokens in the name
train['Words_Count'] = train['Name'].apply(lambda x: len(x.split()))

# Title: the word that precedes the first "." in the name.
# The pattern is a raw string so that "\." is a regex escape and not an
# invalid Python string escape (which newer Python versions warn about).
train['Title'] = train['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
train['Title'] = train['Title'].replace(['Lady', 'Countess','Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
# Fold spelling variants into the common titles
train['Title'] = train['Title'].replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
# Titles outside the mapping (or names with no title) become 0
train['Title'] = train['Title'].map(title_mapping).fillna(0)

# Age: fill with the median age of the passenger's (Title, Sex, Pclass) group
train['Age'] = train['Age'].fillna(
    train.groupby(['Title', 'Sex', 'Pclass'])['Age'].transform('median')
)
# A group in which every age is missing has no median and would leave NaN
# behind; fall back to the overall median so no NaN reaches the models.
train['Age'] = train['Age'].fillna(train['Age'].median())

# Cabin: keep the first run of letters ("U" stands in for a missing cabin)
# and map it to a numeric group.
import re
cabin_letters = re.compile(r"([a-zA-Z]+)")  # compiled once, not once per row
train['Cabin'] = train['Cabin'].fillna('U')
train['Cabin'] = train['Cabin'].map(lambda x: cabin_letters.search(x).group())
cabin_category = {'A':9, 'B':8, 'C':7, 'D':6, 'E':5, 'F':4, 'G':3, 'T':2, 'U':1}
train['Cabin'] = train['Cabin'].map(cabin_category)

# One-hot encode Sex, Embarked and Pclass (first level of each is dropped)
g = pd.get_dummies(train['Sex'], drop_first=True)
e = pd.get_dummies(train['Embarked'], drop_first=True)
p = pd.get_dummies(train['Pclass'], drop_first=True)
train = pd.concat([g, train, e, p], axis=1)

# Ticket_type: integer code of the first three characters of the ticket
train['Ticket_type'] = train['Ticket'].str[:3].astype('category').cat.codes

# Pclass*Sex: one-hot encode the Sex/Pclass combination
sex_pclass = train['Sex'].astype(str) + '_' + train['Pclass'].astype(str)
sex_pclass_dummies = pd.get_dummies(sex_pclass, prefix='SexPclass')
train = pd.concat([train, sex_pclass_dummies], axis=1)

# Drop helper columns, raw columns that have been encoded, and the target
train = train.drop(
    columns=[
        'SibSp', 'Parch', 'T_partner',
        "PassengerId", "Name", "Ticket", 'Pclass', 'Sex', 'Embarked', "Survived",
    ]
)

In [77]:
# Embarked
test['Embarked'] = test['Embarked'].fillna("S")
test["Fare"] = test["Fare"].fillna(14.45)

# Alone
test['T_partner']=test["SibSp"]+test["Parch"]
test['Alone']=np.where(test['T_partner']>0, 0, 1)

# Name Length 
test['Words_Count'] = test['Name'].apply(lambda x: len(x.split()))

# Title
test['Title'] = test['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
test['Title'] = test['Title'].replace(['Lady', 'Countess','Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
test['Title'] = test['Title'].replace('Mlle', 'Miss')
test['Title'] = test['Title'].replace('Ms', 'Miss')
test['Title'] = test['Title'].replace('Mme', 'Mrs')
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
test['Title'] = test['Title'].map(title_mapping)
test['Title'] = test['Title'].fillna(0)

# Age (Title + Sex + Pclass | median)
test['Age'] = test['Age'].fillna(
    test.groupby(['Title', 'Sex', 'Pclass'])['Age'].transform('median')
)

# Extract and map cabin number
test['Cabin'] = test['Cabin'].fillna('U')
import re
# Extract first letter
test['Cabin'] = test['Cabin'].map(lambda x: re.compile("([a-zA-Z]+)").search(x).group())
cabin_category = {'A':9, 'B':8, 'C':7, 'D':6, 'E':5, 'F':4, 'G':3, 'T':2, 'U':1}
# Mapping 'Cabin' to group
test['Cabin'] = test['Cabin'].map(cabin_category)

# Convert nonnumerical data to numerical
g = pd.get_dummies(test['Sex'], drop_first=True)
e = pd.get_dummies(test['Embarked'], drop_first=True)
p = pd.get_dummies(test['Pclass'], drop_first=True)
test = pd.concat([g, test, e, p], axis=1)

# Ticket_type
test['Ticket_type'] = test['Ticket'].apply(lambda x: x[0:3])
test['Ticket_type'] = test['Ticket_type'].astype('category')
test['Ticket_type'] = test['Ticket_type'].cat.codes

# Pclass*Sex
test['Sex_Pclass'] = test['Sex'].astype(str) + '_' + test['Pclass'].astype(str)
sex_pclass_dummies = pd.get_dummies(test['Sex_Pclass'], prefix='SexPclass')
test = pd.concat([test, sex_pclass_dummies], axis=1)
test.drop(['Sex_Pclass'], axis=1, inplace=True)

# Drop extra columns
test.drop('SibSp', axis=1, inplace=True)
test.drop('Parch', axis=1, inplace=True)
test.drop('T_partner', axis=1, inplace=True)
test.drop(["PassengerId", "Name", "Ticket", 'Pclass', 'Sex', 'Embarked'], axis=1, inplace=True)

In [78]:
from sklearn.model_selection import train_test_split
# Hold out part of the labelled data for validation (train_test_split's default
# split size is used; random_state fixes which rows land on each side).
train_x, val_x, train_y, val_y = train_test_split(
    train,
    train_res,
    random_state=0,
)

# Give both feature frames string column labels.
for split_frame in (train_x, val_x):
    split_frame.columns = split_frame.columns.astype(str)

In [79]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier, plot_importance
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from xgboost import XGBRFClassifier
from sklearn.tree import DecisionTreeClassifier

In [80]:
# Base learners; every one gets a fixed seed.
rf_model = RandomForestClassifier(random_state=0, n_estimators=500, max_depth=5)
rf_model2 = XGBRFClassifier(random_state=777, n_estimators=300, max_depth=8)
xgb_model = XGBClassifier(random_state=0, n_estimators=300, max_depth=5)
lr_model = LogisticRegression(random_state=0, max_iter=1000)

# (label, estimator, vote weight) for each member of the ensemble.
ensemble_members = [
    ('rf', rf_model, 4),
    ('rf2', rf_model2, 1),
    ('xgb', xgb_model, 4),
    ('lr', lr_model, 1),
]

# Hard (majority) voting over the predicted labels, weighted per member.
voting_model = VotingClassifier(
    estimators=[(label, estimator) for label, estimator, _weight in ensemble_members],
    weights=[weight for _label, _estimator, weight in ensemble_members],
    voting='hard',
)

In [81]:
# The estimators need uniform string column labels; some dummy columns
# (presumably the Pclass ones) carry non-string labels.
train.columns = train.columns.astype(str)
test.columns = test.columns.astype(str)

best_model = voting_model

# 1) Fit on the training split and report accuracy on the held-out split.
#    The split was created earlier but never scored, so the submission used to
#    be produced with no estimate of how well the model generalises.
best_model.fit(train_x, train_y)
print(f"Validation accuracy: {best_model.score(val_x, val_y):.4f}")

# 2) Refit on every labelled row before predicting, so the submitted model
#    does not throw away the rows that were held out for validation.
best_model.fit(train, train_res)

# Select the test columns by the training column list so both frames present
# the features in the same order.
X_test = test[train.columns]
predictions = best_model.predict(X_test)

# `test2` still carries PassengerId, which was dropped from `test`.
submission = pd.DataFrame({
    'PassengerId': test2['PassengerId'],
    'Survived': predictions
})
submission.to_csv('submission.csv', index=False)
print("Saved!")

Saved!
