In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import spacy
import tensorflow as tf
sns.set(style='darkgrid', palette='husl')

In [2]:
# Setting a seed for reproducibility.
SEED = 1002


def seed_everything(seed):
    """Seed the global random number generators used in this notebook.

    Seeds Python's built-in ``random`` module, NumPy's legacy global RNG and
    TensorFlow's global generator with the same value.

    Args:
        seed: Integer seed applied to every generator.
    """
    # Imported locally so this cell does not depend on an edit to the
    # notebook's import cell.
    import random

    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)


seed_everything(SEED)

In [3]:
# Load the train and test CSVs into pandas DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("../input/titanic/train.csv", "../input/titanic/test.csv")
)

In [4]:
# Preview the first rows of the freshly loaded training frame.
train.head()

In [5]:
# Print column dtypes and non-null counts to spot columns with missing values.
train.info()

In [6]:
def _split_name_and_ticket(df):
    """Derive the surname initial, title and numeric ticket number in place.

    Columns written on ``df``:
      * ``Surname`` - first character of the family name (the text before ", ").
      * ``Name``    - overwritten with the title (the text between ", " and
        the first ".").
      * ``Ticket``  - last space-separated token converted to float; the
        literal token "LINE" becomes NaN.

    Args:
        df: DataFrame with string ``Name`` and ``Ticket`` columns, still in
            their raw form. Running this twice on the same frame fails,
            because the columns no longer hold the raw strings.
    """
    full_name = df["Name"]
    df["Surname"] = full_name.apply(lambda x: x.split(", ")[0][0])
    df["Name"] = full_name.apply(lambda x: x.split(", ")[1].split(".")[0])

    ticket_token = df["Ticket"].apply(lambda x: x.split(" ")[-1])
    # Series.mask returns a new Series with the matches set to NaN. This
    # replaces the chained `df["Ticket"].loc[mask] = NaN` assignment, which
    # triggers SettingWithCopyWarning and does not modify the frame at all
    # when pandas Copy-on-Write is active (the following float cast would
    # then fail on "LINE").
    df["Ticket"] = ticket_token.mask(ticket_token == "LINE").astype(float)


# Apply the identical transformation to both splits.
for _frame in (train, test):
    _split_name_and_ticket(_frame)

In [7]:
!pip install pycaret
from pycaret.classification import *

In [8]:
# Inspect the first five rows after the Name / Ticket transformations.
train.head(5)

In [9]:
# Impute missing Age / Fare / Ticket values with the column median of the
# frame they belong to.
# The result is assigned back to the column instead of calling
# `df[col].fillna(..., inplace=True)`: that form mutates an intermediate
# Series, which newer pandas versions warn about and which leaves the frame
# unchanged when Copy-on-Write is enabled.
# NOTE(review): the test split is imputed with its own medians rather than the
# training medians - confirm this is intended.
for frame in (train, test):
    for column in ("Age", "Fare", "Ticket"):
        frame[column] = frame[column].fillna(frame[column].median())

In [10]:
# Remove the PassengerId column from the modelling frames. `test` itself keeps
# the column (it is needed for the submission file); the stripped copy is
# stored as `test_new`.
train = train.drop(columns=["PassengerId"])
test_new = test.drop(columns=["PassengerId"])

In [11]:
# Drop exact duplicate training rows (keeping the first occurrence), then
# replace every remaining missing value in both frames with the placeholder "X".
train = train.drop_duplicates(keep='first').fillna("X")
test_new = test_new.fillna("X")

In [12]:
train.head(5)

In [13]:
def take_first_element(text):
    """Return the item at index 0 of ``text`` (the first character of a string).

    Raises:
        IndexError: if ``text`` is an empty sequence.
    """
    head = text[0]
    return head

In [14]:
# Binary indicator columns, built identically for both frames:
#   cabin_0_or_1    - 0 when Cabin holds the "X" missing-value placeholder, else 1
#   Embarked_0_or_1 - 0 when Embarked is "S", else 1
for frame in (train, test_new):
    frame['cabin_0_or_1'] = (frame['Cabin'] != "X").astype("int64")
    frame['Embarked_0_or_1'] = (frame['Embarked'] != "S").astype("int64")

In [15]:
# Ticket was converted to float in an earlier cell, so the first-element
# reduction below stays disabled for it (indexing a float would fail).
#train['Ticket'] = train['Ticket'].apply(lambda x: take_first_element(x))
#test_new['Ticket'] = test_new['Ticket'].apply(lambda x: take_first_element(x))
# Reduce every Cabin value to its first character; the "X" placeholder for
# missing cabins is a single character and stays "X".
for frame in (train, test_new):
    frame['Cabin'] = frame['Cabin'].map(take_first_element)

In [16]:
# Preview the training frame with the engineered columns in place.
train.head()

In [17]:
# Pairwise correlation of the numeric columns only. The frame still contains
# string columns (e.g. Name, Sex, Cabin), and a bare `train.corr()` raises on
# them in pandas >= 2.0, so the numeric subset is selected explicitly.
train.select_dtypes(include="number").corr()

In [18]:
# One panel per feature: (column, Axes method name, keyword arguments).
# The order fills the 4x3 grid row by row.
panels = [
    ("Pclass", "hist", {"bins": 5}),
    ("Sex", "hist", {"bins": 5}),
    ("SibSp", "boxplot", {}),
    ("Parch", "boxplot", {}),
    ("Age", "boxplot", {}),
    ("Fare", "boxplot", {}),
    ("Embarked", "hist", {"bins": 10}),
    ("cabin_0_or_1", "hist", {}),
    ("Cabin", "hist", {}),
    ("Surname", "hist", {"bins": 10}),
    ("Embarked_0_or_1", "hist", {}),
]

fig, axs = plt.subplots(4, 3, sharey=False, tight_layout=True, squeeze=False, figsize=(15, 15))
all_axes = axs.ravel()
for ax, (column, plot_kind, plot_kwargs) in zip(all_axes, panels):
    getattr(ax, plot_kind)(train[column], **plot_kwargs)
    # Title every panel so the figure is readable on its own.
    ax.set_title(column)
# Hide grid cells that have no feature assigned (the 12th axis).
for unused_ax in all_axes[len(panels):]:
    unused_ax.set_visible(False)


In [129]:
# Initialise the PyCaret classification experiment on the training frame.
# Note that session_id is fixed to 42 here, independently of the SEED
# constant defined near the top of the notebook.
experiment = setup(
    train,
    target='Survived',
    session_id=42,
    experiment_name='kaggle-nlp1',
    # Preprocessing / feature-generation switches.
    normalize=True,
    feature_interaction=True,
    feature_ratio=True,
    polynomial_features=True,
    polynomial_degree=2,
)

In [114]:
# Display the table of estimators PyCaret offers for this experiment.
models()

In [95]:
# Collect the index labels of the models() table as a plain list.
# (models_list is not referenced by any later cell shown in this notebook.)
models_list = models().index.tolist()


In [96]:
import warnings
# Suppress all warnings from here on (no category or module filter is given),
# which keeps the model-comparison output readable but also hides
# deprecation notices.
warnings.filterwarnings("ignore")

In [130]:
# Rank the candidate estimators by accuracy and keep the three best.
# top3 is only referenced by the commented-out blend/stack alternatives below.
top3 = compare_models(sort="Accuracy", n_select=3)

In [131]:
# Train the base model; 'rf' is the PyCaret model ID passed to create_model.
my_model = create_model('rf')
# Alternatives: blending models (soft or hard voting) or stacking them.
#my_model = blend_models(estimator_list=top3[1:], method='soft')
#my_model = stack_models(estimator_list=top3[1:], meta_model=top3[0])

In [132]:
# Bagging and boosting both reduce variance and improve stability, but only boosting also tries to reduce bias.
# On the other hand, bagging may mitigate over-fitting, while boosting can increase it.
# Hyperparameter-tune the base model; the ensemble variants are kept below for reference.
tuned_model = tune_model(my_model)
#ensembled_model = ensemble_model(my_model, method='Bagging')
#tuned_model = tune_model(ensembled_model)
#tuned_model = ensemble_model(my_model, method='Boosting', n_estimators=100)

In [133]:
# ROC curve / AUC plot for the tuned model.
plot_model(tuned_model, plot='auc')
#plot_model(my_model, plot='auc')

In [134]:
# Precision-recall curve for the tuned model.
plot_model(tuned_model, plot='pr')
#plot_model(my_model, plot='pr')

In [135]:
# Feature-importance plot ('feature_all' variant) for the tuned model.
plot_model(tuned_model, plot='feature_all')
#plot_model(my_model, plot='feature')

In [136]:
# Confusion matrix for the tuned model.
plot_model(tuned_model, plot = 'confusion_matrix')
#plot_model(my_model, plot='confusion_matrix')

In [137]:
# Open PyCaret's evaluation view for the tuned model.
evaluate_model(tuned_model)
#evaluate_model(my_model)

In [138]:
# Produce the final estimator from the tuned model; it is the one used for
# all predictions below.
final_model = finalize_model(tuned_model)

In [139]:
# Score the finalized model without passing new data.
# NOTE(review): final_model comes from finalize_model(); if that refit included
# the hold-out rows, the metrics shown here are optimistic - confirm.
predict_model(final_model)
#predict_model(my_model)

In [140]:
# Predict on the test features (test_new is `test` without PassengerId)
# using the finalized model, then preview the returned frame.
#unseen_predictions = predict_model(my_model, data=test_new)
unseen_predictions = predict_model(final_model, data=test_new)
unseen_predictions.head()

In [141]:
# Turn the "Label" column into a 0/1 integer array: 1 where the value is
# at least 0.5, otherwise 0.
# NOTE(review): "Label" presumably already holds the predicted class rather
# than a probability, in which case the threshold is a no-op - confirm.
test_result = np.where(unseen_predictions["Label"] >= 0.5, 1, 0)

In [142]:
# Sum of the 0/1 predictions, i.e. how many test rows are predicted as 1.
test_result.sum()

In [143]:
# Attach the predictions to the original test frame (which still carries
# PassengerId) and write the two submission columns to disk.
test["Survived"] = test_result
submission_columns = ['PassengerId', 'Survived']
csv_data = test.loc[:, submission_columns]
csv_data.to_csv('final7.csv', index=False)