In [1]:
import random
import re

import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import seaborn as sns
import spacy
import tensorflow as tf
# Apply seaborn's dark-grid style and the "husl" palette to subsequent plots.
sns.set(palette='husl', style='darkgrid')

In [2]:
# Setting a seed for reproducibility.
SEED = 1002


def seed_everything(seed):
    """Seed the random number generators used in this notebook.

    Seeds Python's built-in ``random`` module, NumPy's legacy global
    generator and TensorFlow's global generator with the same value.

    Args:
        seed: Integer seed applied to all three generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)


seed_everything(SEED)

In [3]:
# Load the Titanic training and test CSV files into pandas DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("../input/titanic/train.csv", "../input/titanic/test.csv")
)

In [4]:
# Display the first rows of the training frame as loaded from disk.
train.head()

In [5]:
def _title_from_name(full_name):
    """Return the text between the first ", " and the next "." in a name."""
    after_comma = full_name.split(", ")[1]
    return after_comma.split(".")[0]


# Replace each full name by the title extracted from it, in both frames.
train["Name"] = train["Name"].apply(_title_from_name)
test["Name"] = test["Name"].apply(_title_from_name)

In [6]:
# Summarise the training frame's columns, dtypes and non-null counts.
train.info()

In [7]:
!pip install pycaret
from pycaret.classification import *

In [8]:
# Display the first five rows after the Name column was reduced to the title.
train.head(5)

In [9]:
# Fill missing Age and Fare values with the median of the same column in the
# same frame (train uses train medians, test uses test medians).
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on a column selection is chained assignment, which recent pandas versions
# warn about and which leaves the parent frame unchanged under Copy-on-Write.
for frame in (train, test):
    for column in ("Age", "Fare"):
        frame[column] = frame[column].fillna(frame[column].median())

In [10]:
# Remove the identifier column from the modelling frames. `test` itself keeps
# PassengerId because the submission cell at the end reads it.
train = train.drop(columns=["PassengerId"])
test_new = test.drop(columns=["PassengerId"])

In [11]:
def _blank_missing(column):
    """Replace missing values in one column with a single space."""
    return column.fillna(" ")


# Drop repeated training rows (keeping the first occurrence), then replace
# every remaining missing value in both frames with " ".
train = train.drop_duplicates(keep='first').apply(_blank_missing)
test_new = test_new.apply(_blank_missing)

In [12]:
# Display the first five rows after imputation, ID removal and de-duplication.
train.head(5)

In [13]:
def take_first_element(text):
    """Return the first element of ``text``.

    Args:
        text: An indexable, sized value such as a string.

    Returns:
        ``text[0]`` for non-empty input. Empty input is returned unchanged
        instead of raising ``IndexError``.
    """
    if len(text) == 0:
        return text
    return text[0]

In [14]:
# Reduce Ticket and Cabin to their first character in both modelling frames.
for frame in (train, test_new):
    for column in ('Ticket', 'Cabin'):
        frame[column] = frame[column].apply(take_first_element)

In [15]:
# Initialise the PyCaret classification experiment on the cleaned training
# frame. session_id is a literal of its own here, separate from SEED above.
experiment = setup(
    train,
    target='Survived',
    session_id=42,
    experiment_name='kaggle-nlp1',
)

In [16]:
# Display the table returned by PyCaret's models() for this experiment.
models()

In [17]:
# Collect the index labels of the models() table into a plain Python list.
models_list = models().index.tolist()

In [33]:
# Compare the available models, rank them by AUC and keep the best five.
top5 = compare_models(
    n_select=5,
    sort="AUC",
)

In [None]:
# Stack the selected models: the first entry of top5 is used as the meta-model
# and the remaining entries as the base estimators.
# Disabled alternative: my_model = create_model('catboost')
meta_learner, *base_learners = top5
my_model = stack_models(estimator_list=base_learners, meta_model=meta_learner)

In [21]:
# Bagging and boosting both reduce variance and give more stable predictions,
# but only boosting also tries to reduce bias.
# On the other hand, bagging may mitigate over-fitting, while boosting can increase it.
# Tune my_model with PyCaret's tune_model.
tuned_model = tune_model(my_model)
# Disabled alternatives that ensemble my_model instead of tuning it:
#tuned_model = ensemble_model(my_model, method = 'Bagging')
#tuned_model = ensemble_model(my_model, method = 'Boosting', n_estimators = 100)

In [22]:
# Plot the ROC / AUC curve of the tuned model.
plot_model(tuned_model, plot = 'auc')

In [23]:
# Plot the precision-recall curve of the tuned model.
plot_model(tuned_model, plot = 'pr')

In [24]:
# Plot feature importance for the tuned model.
# NOTE(review): this plot presumably needs an estimator exposing coef_ or
# feature_importances_; a stacked model may not - confirm it runs after
# Restart & Run All.
plot_model(tuned_model, plot='feature')

In [25]:
# Plot the confusion matrix of the tuned model.
plot_model(tuned_model, plot = 'confusion_matrix')

In [26]:
# Open PyCaret's evaluation view for the tuned model.
evaluate_model(tuned_model)

In [27]:
# Finalise the tuned model with PyCaret's finalize_model before predicting.
# NOTE(review): presumably this refits on the full dataset including the
# hold-out split - confirm against the PyCaret documentation.
final_model = finalize_model(tuned_model)

In [28]:
# Run predict_model without explicit data on the finalised model.
# NOTE(review): if finalize_model refits on the hold-out rows, the scores
# shown here are likely optimistic - confirm.
predict_model(final_model)

In [29]:
# Score the prepared test frame with the finalised model and preview the result.
unseen_predictions = predict_model(final_model, data=test_new)
unseen_predictions.head()

In [30]:
# predict_model stores the predicted class in "Label" (PyCaret 2.x) or in
# "prediction_label" (PyCaret 3.x); use whichever column is present.
label_column = "Label" if "Label" in unseen_predictions.columns else "prediction_label"
# Vectorised form of the `1 if x >= 0.5 else 0` rule. to_numeric also accepts
# labels that come back as strings, which a direct `>=` comparison would reject.
test_result = (pd.to_numeric(unseen_predictions[label_column]) >= 0.5).astype(int).to_numpy()

In [31]:
# Count how many test rows were predicted as 1.
test_result.sum()

In [32]:
# Attach the predictions to the test frame and write the two submission
# columns to final.csv without the index.
test["Survived"] = test_result
submission_columns = ['PassengerId', 'Survived']
csv_data = test.loc[:, submission_columns]
csv_data.to_csv('final.csv', index=False)