In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory lazily and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#####  This notebook is written for beginners, by a beginner. We predict passenger survival on the classic Titanic dataset, compare several classifiers, and finally use XGBoost to make the submission predictions.

## Import Libraries and Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import time
from statsmodels.stats.outliers_influence import variance_inflation_factor    
from joblib import Parallel, delayed
# Apply seaborn's default theme and make every figure 11.7 x 8.27 inches (roughly A4 landscape).
sns.set(rc={'figure.figsize':(11.7,8.27)})

# Data Reading

In [None]:
# Load the labelled training set and the unlabelled test set from the Kaggle input directory.
train = pd.read_csv('/kaggle/input/titanic/train.csv')
test = pd.read_csv('/kaggle/input/titanic/test.csv')

In [None]:
# Preview the first rows to see the available columns and their raw values.
train.head()

In [None]:
# Summary statistics (count, mean, spread, quartiles) for the numeric columns.
train.describe()

In [None]:
# Column dtypes and non-null counts; a quick way to spot columns with missing data.
train.info()

## Data Analysing


Find the number of duplicated rows and null values in our training and test sets.

In [None]:
# Count fully duplicated rows and per-column missing values in the training set.
train_duplicates = train.duplicated().sum()
train_nulls = train.isna().sum()
print(f'number of duplicate rows: {train_duplicates}\nnumber of null values:\n{train_nulls}')

In [None]:
# Count fully duplicated rows and per-column missing values in the test set.
test_duplicates = test.duplicated().sum()
test_nulls = test.isna().sum()
print(f'number of duplicate rows: {test_duplicates}\nnumber of null values:\n{test_nulls}')

To plot multiple pairwise bivariate distributions in a dataset, we can use the .pairplot() function. 

The diagonal plots are univariate distributions of each variable, while the off-diagonal plots show the pairwise relationship for every combination of two numeric variables in the DataFrame, arranged as a matrix of plots.

In [None]:
# Colour every panel by the target. seaborn applies `palette` only to the levels
# of a `hue` variable, so without `hue` the two colours below were never used.
sns.pairplot(train, hue='Survived', palette=["#8000ff", "#da8829"])

#### Heatmaps
The main intention of Seaborn heatmap is to visualize the correlation matrix of data for feature selection

In [None]:
# Correlation is only defined for numeric columns. Selecting them explicitly keeps
# this cell working on pandas >= 2.0, where DataFrame.corr() no longer silently
# skips string columns such as Name or Ticket and raises instead.
corr_matrix = train.select_dtypes(include='number').corr()

# The matrix is computed once and reused. The previous all-zeros mask hid nothing,
# so it is omitted; the rendered heatmap is unchanged.
sns.heatmap(corr_matrix, annot=True, fmt=".1f", cmap=sns.color_palette("coolwarm", 12))

#### Now we perform label encoding, i.e. converting categorical data into numerical data, since most ML algorithms work with numerical data.

For binary label encoding we can simply map each category to an integer.
Here we add two new encoded columns to our data; the original categorical columns are removed afterwards.

In [None]:
# Sex -> 1 for 'male', 0 for anything else (vectorised comparison instead of a row-wise lambda).
train['catSex'] = (train['Sex'] == 'male').astype('int')

# Embarked -> 0 for 'S', 1 for 'C', 2 for every other value. Missing values are
# not 'S' or 'C' either, so they also become 2, exactly as before. Using the
# integer 2 avoids the mixed str/int column that the old '2' literal produced.
train['catEmbark'] = train['Embarked'].map({'S': 0, 'C': 1}).fillna(2).astype('int')

In [None]:
# Sex -> 1 for 'male', 0 for anything else (vectorised comparison instead of a row-wise lambda).
test['catSex'] = (test['Sex'] == 'male').astype('int')

# Embarked -> 0 for 'S', 1 for 'C', 2 for every other value. Missing values are
# not 'S' or 'C' either, so they also become 2, exactly as before. Using the
# integer 2 avoids the mixed str/int column that the old '2' literal produced.
test['catEmbark'] = test['Embarked'].map({'S': 0, 'C': 1}).fillna(2).astype('int')

In [None]:
# Free-text / already-encoded columns that the models cannot consume directly.
non_numeric_columns = ['Name', 'Ticket', 'Cabin', 'Sex', 'Embarked']

# Training set: drop those columns, then drop every row that still has a missing value.
train = train.drop(columns=non_numeric_columns).dropna()

# Test set: rows cannot be dropped (each one needs a prediction), so the gaps in
# Age and Fare are filled with the most frequent value of the column instead.
test = test.drop(columns=non_numeric_columns)

# Assign the filled column back rather than calling fillna(inplace=True) on
# test['Age']: that chained in-place form warns on pandas 2.x and does not
# modify `test` at all once copy-on-write is enabled.
test['Age'] = test['Age'].fillna(test['Age'].mode()[0])
test['Fare'] = test['Fare'].fillna(test['Fare'].mode()[0])

In [None]:
# Re-check dtypes and non-null counts of the training set after encoding and cleaning.
train.info()

In [None]:
# Re-check dtypes and non-null counts of the test set after encoding and filling.
test.info()

Both the training and test sets are now completely numerical. Missing values were handled in the previous step (rows dropped from the training set, mode-filled in the test set), so below we verify that none remain.

In [None]:
# Re-count duplicated rows and per-column missing values in the cleaned training set.
train_duplicates = train.duplicated().sum()
train_nulls = train.isna().sum()
print(f'number of duplicate rows: {train_duplicates}\nnumber of null values:\n{train_nulls}')

In [None]:
# Re-count duplicated rows and per-column missing values in the cleaned test set.
test_duplicates = test.duplicated().sum()
test_nulls = test.isna().sum()
print(f'number of duplicate rows: {test_duplicates}\nnumber of null values:\n{test_nulls}')

In [None]:
def calculate_vif_(X, thresh=5.0):
    """Iteratively drop the column with the largest VIF until none exceeds ``thresh``.

    On every pass the variance inflation factor of each remaining column is
    computed against the other remaining columns (one joblib task per column).
    If the largest factor is greater than ``thresh`` that column is removed and
    the pass is repeated; otherwise the loop stops.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate feature columns. Their ``.values`` are handed straight to
        statsmodels, so they are presumably expected to be numeric and free of
        missing values -- confirm against the caller.
    thresh : float, default 5.0
        A column is dropped only when its VIF is strictly greater than this.

    Returns
    -------
    pandas.DataFrame
        ``X`` restricted to the surviving columns, in their original order.

    Notes
    -----
    Progress (the number of columns per pass, each dropped column, and the final
    list of columns) is printed to stdout.
    """
    variables = list(X.columns)

    dropped = True
    # VIF compares a column against the *other* columns, so there is nothing to
    # test with fewer than two. Stopping here also prevents max() from being
    # called on an empty list once every column has been dropped.
    while dropped and len(variables) > 1:
        dropped = False
        print(len(variables))

        # Build the design matrix once per pass instead of once per task.
        exog = X[variables].values
        vif = Parallel(n_jobs=-1, verbose=5)(
            delayed(variance_inflation_factor)(exog, ix)
            for ix in range(len(variables))
        )

        worst = max(vif)
        if worst > thresh:
            maxloc = vif.index(worst)
            print(f"{time.ctime()} dropping '{variables[maxloc]}' at index: {maxloc}")
            variables.pop(maxloc)
            dropped = True

    print('Remaining variables:')
    print([variables])
    return X[variables]


# Run the VIF filter on every column except the target, then summarise what survived.
predictors = train.drop(columns='Survived')
vif = calculate_vif_(predictors)
vif.info()

In [None]:
# Feature matrix: every column of `train` except the target. Target vector: 'Survived'.
X = train.drop(columns='Survived').to_numpy()
y = train['Survived'].to_numpy()

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_cv, y_train, y_cv = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
!pip install lazypredict

#### Lazy Predict
Lazy Predict is one of the best python libraries that helps you to semi-automate your Machine Learning Task. It builds a lot of basic models without much code and helps understand which models work better without any parameter tuning.

Suppose we have a problem statement and we really need to apply all the models on that particular dataset and we have to analyze that how our basic model is performing. Here basic model means “Model without parameters”. So we can do this task directly using Lazy Predict. After getting all accuracy we can choose the top 5 models and then apply hyperparameter tuning to them. It provides a Lazy Classifier to solve the classification problem and Lazy Regressor to solve the regression problem.

In [None]:
# Imported here rather than at the top because lazypredict is installed by the preceding cell.
from lazypredict.Supervised import LazyClassifier
# Fit lazypredict's LazyClassifier on the training split and score on the validation split.
clf = LazyClassifier(verbose=0,ignore_warnings=False, custom_metric=None)
# `models` holds the per-model results; it is sorted by its metric columns in the cells below.
models, predictions = clf.fit(X_train, X_cv, y_train, y_cv)
models

In [None]:
# Ten best rows of the LazyClassifier results by the 'Accuracy' column, highest first.
top_by_accuracy = models.sort_values('Accuracy', ascending=False).head(10)
ax = sns.barplot(x=top_by_accuracy['Accuracy'], y=top_by_accuracy.index)
ax.set_title('Top 10 models based on Accuracy')

#### AUC-ROC
AUC - ROC curve is a performance measurement for the classification problems at various threshold settings. ROC is a probability curve and AUC represents the degree or measure of separability. It tells how much the model is capable of distinguishing between classes. Higher the AUC, the better the model is at predicting 0 classes as 0 and 1 classes as 1. By analogy, the Higher the AUC, the better the model is at distinguishing between patients with the disease and no disease.

In [None]:
# Ten best rows of the LazyClassifier results by the 'ROC AUC' column, highest first.
top_by_auc = models.sort_values('ROC AUC', ascending=False).head(10)
ax = sns.barplot(x=top_by_auc['ROC AUC'], y=top_by_auc.index)
ax.set_title('Top 10 models based on AUC')

#### F1 Score
The F1-score combines the precision and recall of a classifier into a single metric by taking their harmonic mean. It is primarily used to compare the performance of two classifiers. Suppose that classifier A has a higher recall, and classifier B has higher precision.

In [None]:
# Ten best rows of the LazyClassifier results by the 'F1 Score' column, highest first.
top_by_f1 = models.sort_values('F1 Score', ascending=False).head(10)
ax = sns.barplot(x=top_by_f1['F1 Score'], y=top_by_f1.index)
ax.set_title('Top 10 models based on F1 Score')

Now we use XGBoost as our classifier for the final predictions.

In [None]:
# XGBoost with default hyperparameters, fitted on the full X, y (not only the X_train split).
model = XGBClassifier()
model.fit(X, y)

We can tune further by identifying the less relevant features and removing them from the data, using `model.feature_importances_`.

In [None]:
# Importance scores of the fitted model; paired with the non-target columns of `train` in the next cell.
xgb_ftr_imp = model.feature_importances_

In [None]:
# Pair each non-target column name with its importance score and rank from most
# to least important.
feature_names = train.columns[train.columns != 'Survived']
feature_importances_df = pd.DataFrame(
    {'feature': feature_names, 'importance': xgb_ftr_imp}
).sort_values('importance', ascending=False)

In [None]:
# Horizontal bar chart of the importance scores, one bar per feature.
ax = sns.barplot(
    x=feature_importances_df['importance'],
    y=feature_importances_df['feature'],
)
ax.set_title('Importance of features for predicting Survival using XGBoost')

And hence we can see that "Parch" and "Fare" are less relevant and contribute little to the prediction, so we remove them, which may also help to increase accuracy.

In [None]:
# Columns judged least useful in the feature-importance plot.
low_importance_columns = ['Parch', 'Fare']

# Remove them from both frames; the training frame is also cleared of any rows with missing values.
train = train.drop(columns=low_importance_columns).dropna()
test = test.drop(columns=low_importance_columns)

# Rebuild the feature matrix / target vector and redo the same 80/20 split.
X = train.drop(columns='Survived').to_numpy()
y = train['Survived'].to_numpy()
X_train, X_cv, y_train, y_cv = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Re-run the LazyClassifier comparison on the current train/validation split so it can be compared with the earlier run.
clf = LazyClassifier(verbose=0,ignore_warnings=False, custom_metric=None)
models, predictions = clf.fit(X_train, X_cv, y_train, y_cv)
models

In [None]:
# Refit a default XGBClassifier on the current X, y (all training rows); this is the model used for the submission.
model = XGBClassifier()
model.fit(X, y)

##### Finally we make predictions and store them in the submission.csv file. It is now ready to submit to get a score!

In [None]:
# Pick the model inputs by name, in the same order as the training features.
# Passing test.values depended on the two frames happening to share a column
# order, and broke when this cell was re-run because `test` had by then gained
# the 'Survived' column added below.
feature_columns = [col for col in train.columns if col != 'Survived']
test['Survived'] = model.predict(test[feature_columns].values)

# Kaggle expects exactly two columns: the passenger id and the predicted label.
submission = test[['PassengerId', 'Survived']]
submission.to_csv("submission.csv", index=False)
submission.head()

### Conclusions 
1. We made successful predictions using XGBoost as the classifier.
2. We also understood how important it is to analyse the data rather than fitting it directly into a model.
3. I learned the new technique of Lazy Predict in this notebook, which I picked up from amazing notebooks available on Kaggle.

In [None]:
nan