## Score With Vs Without Outliers (Dramatic Increase):

Hello kagglers .. 

In this notebook we compare cross-validated model scores obtained with and without outliers in the training data.

**Note:** You can skip the preprocessing steps (Data Cleaning, Data Engineering, ...) and focus on the Modeling part.

If you find this notebook helpful, please press the **UPVOTE** button up there. This helps me a lot ^-^.

In [None]:
# =================================================================================================
# Importing the Libraries:
# =================================================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from collections import Counter
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve
import warnings
warnings.filterwarnings("ignore")


sns.set(style='white', context='notebook', palette='deep')

In [None]:
# =================================================================================================
# Importing the Data:
# =================================================================================================

train = pd.read_csv("/kaggle/input/titanic/train.csv")
test = pd.read_csv("/kaggle/input/titanic/test.csv")
IDtest = test["PassengerId"]

# Stack train on top of test so categorical encoding yields the same columns
# for both; train_len records where the training rows end.
train_len = train.shape[0]
dataset = pd.concat([train, test], axis=0, ignore_index=True)

print("Data imported successfully!!")

In [None]:
# =================================================================================================
# Data Cleaning:
# =================================================================================================
# You can Skip this cell !!
# ---------------------------------------------

# Fill missing Fare values with the median fare
dataset["Fare"] = dataset["Fare"].fillna(dataset["Fare"].median())

# Log-transform Fare to reduce the skew of its distribution.
# Non-positive fares are mapped to 0 because log is undefined there.
dataset["Fare"] = dataset["Fare"].map(lambda i: np.log(i) if i > 0 else 0)

# Fill missing Embarked values with 'S', the most frequent value
dataset["Embarked"] = dataset["Embarked"].fillna("S")

# Encode Sex numerically: 0 for male, 1 for female
dataset["Sex"] = dataset["Sex"].map({"male": 0, "female": 1})

dataset = dataset.drop(columns=["Cabin", "Ticket"])

# Fill missing Age values with the median age of rows that share the same
# Pclass, Parch and SibSp; fall back to the overall median when no such row
# has a known age.
index_NaN_age = list(dataset.loc[dataset["Age"].isnull()].index)

for i in index_NaN_age:
    age_med = dataset["Age"].median()
    row = dataset.loc[i]
    similar_rows = (
        (dataset["SibSp"] == row["SibSp"])
        & (dataset["Parch"] == row["Parch"])
        & (dataset["Pclass"] == row["Pclass"])
    )
    age_pred = dataset.loc[similar_rows, "Age"].median()
    # Write through a single .loc call: the chained form
    # `dataset['Age'].iloc[i] = ...` assigns to a temporary object and can
    # leave the NaN in place (it does so under pandas Copy-on-Write).
    dataset.loc[i, "Age"] = age_med if np.isnan(age_pred) else age_pred

print("Data cleaned successfully!!")

In [None]:
# =================================================================================================
# Data Engineering:
# =================================================================================================
# You can Skip this cell !!
# ---------------------------------------------


# Title is the token between the comma and the first period of Name
dataset_title = [full_name.split(",")[1].split(".")[0].strip() for full_name in dataset["Name"]]
dataset["Title"] = pd.Series(dataset_title)

# Collapse uncommon titles into 'Rare', then encode every title as an integer
rare_titles = ['Lady', 'the Countess', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
               'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
title_codes = {"Master": 0, "Miss": 1, "Ms": 1, "Mme": 1, "Mlle": 1, "Mrs": 1, "Mr": 2, "Rare": 3}
dataset["Title"] = (
    dataset["Title"]
    .replace(rare_titles, 'Rare')
    .map(title_codes)
    .astype(int)
)

# Name is no longer needed once Title has been extracted
dataset = dataset.drop(columns=["Name"])

# Family size = siblings/spouses + parents/children + the passenger
dataset["Fsize"] = dataset["SibSp"] + dataset["Parch"] + 1

# One 0/1 flag per family-size bucket (dict order fixes the column order)
family_buckets = {
    'Single': lambda s: s == 1,
    'SmallF': lambda s: s == 2,
    'MedF': lambda s: 3 <= s <= 4,
    'LargeF': lambda s: s >= 5,
}
for flag_name, in_bucket in family_buckets.items():
    dataset[flag_name] = dataset['Fsize'].map(lambda s, in_bucket=in_bucket: 1 if in_bucket(s) else 0)

# One-hot encode Title and Embarked
dataset = pd.get_dummies(dataset, columns=["Title"])
dataset = pd.get_dummies(dataset, columns=["Embarked"], prefix="Em")

# Treat Pclass as categorical and one-hot encode it as well
dataset["Pclass"] = dataset["Pclass"].astype("category")
dataset = pd.get_dummies(dataset, columns=["Pclass"], prefix="Pc")

# PassengerId is an identifier, not a feature
dataset = dataset.drop(columns=["PassengerId"])

print("The operation is done successfully!!")

## Modeling:

#### Train With Outliers:

In [None]:
# ===========================================================================
# Separate train dataset and test dataset
# ===========================================================================

# Train: the first train_len rows of the combined frame.
# .copy() makes this an independent frame, so the label assignment below does
# not go through a slice of `dataset` (the SettingWithCopy pattern).
train_with_outliers = dataset[:train_len].copy()
train_with_outliers["Survived"] = train["Survived"].astype(int)

Y_train_with_outliers = train_with_outliers["Survived"]

X_train_with_outliers = train_with_outliers.drop(columns=["Survived"])

# Test: the remaining rows, without the (all-missing) label column.
# drop() returns a new frame, avoiding an in-place drop on a slice of `dataset`.
test = dataset[train_len:].drop(columns=["Survived"])


In [None]:
# Preview the first rows of the training feature matrix
X_train_with_outliers.head()

In [None]:
# (rows, columns) of the training feature matrix
X_train_with_outliers.shape

We will use these models:
- Random Forest
- SVC
- GradientBoosting 
- AdaBoost


#### Parameters tuning:

In [None]:
# Stratified 10-fold splitter; the grid-search helpers in this notebook pass it as `cv`
kfold = StratifiedKFold(n_splits=10)

In [None]:
# =====================================================================================
# SVC classifier
# =====================================================================================
def Best_SVM(X_train , Y_train):
    """Grid-search an RBF-kernel SVC and return ``(best_estimator, best_score)``.

    Candidates are ranked by accuracy, cross-validated with the notebook-level
    ``kfold`` splitter. ``best_score`` is the mean CV accuracy of the winner.
    """
    search_space = dict(
        kernel=['rbf'],
        gamma=[0.001, 0.01, 0.1, 1],
        C=[1, 10, 50, 100, 200, 300, 1000],
    )
    search = GridSearchCV(
        SVC(probability=True),
        param_grid=search_space,
        cv=kfold,
        scoring="accuracy",
        n_jobs=4,
        verbose=1,
    )
    search.fit(X_train, Y_train)
    return search.best_estimator_, search.best_score_
# =====================================================================================
# Random Forest parameter tuning
# =====================================================================================
def Best_Random_Forest(X_train , Y_train, random_state=7):
    """Grid-search a RandomForestClassifier and return ``(best_estimator, best_score)``.

    Parameters
    ----------
    X_train, Y_train : training features and labels passed to ``GridSearchCV.fit``.
    random_state : seed for the forest. Without a seed each run samples
        features differently, so score differences between two training sets
        could be run-to-run noise rather than an effect of the data.

    Returns
    -------
    The refitted best estimator and its mean cross-validated accuracy
    (scored with the notebook-level ``kfold`` splitter).
    """
    RFC = RandomForestClassifier(random_state=random_state)
    rf_param_grid = {"max_depth": [None],
                  "max_features": [1, 3, 10],
                  "min_samples_split": [2, 3, 10],
                  "min_samples_leaf": [1, 3, 10],
                  "bootstrap": [False],
                  "n_estimators" :[100,300],
                  "criterion": ["gini"]}
    gsRFC = GridSearchCV(RFC,param_grid = rf_param_grid, cv=kfold, scoring="accuracy", n_jobs= 4, verbose = 1)
    gsRFC.fit(X_train,Y_train)
    return  gsRFC.best_estimator_ , gsRFC.best_score_
# =====================================================================================
# Adaboost
# =====================================================================================
def Best_ADABoosting(X_train , Y_train):
    """Grid-search AdaBoost over decision trees and return ``(best_estimator, best_score)``.

    ``best_score`` is the mean accuracy of the best candidate, cross-validated
    with the notebook-level ``kfold`` splitter.
    """
    DTC = DecisionTreeClassifier()
    adaDTC = AdaBoostClassifier(DTC, random_state=7)
    # The parameter holding the wrapped tree is named `estimator` in newer
    # scikit-learn releases and `base_estimator` in older ones. Use whichever
    # this installation exposes so the nested grid keys stay valid.
    tree_param = "estimator" if "estimator" in adaDTC.get_params(deep=False) else "base_estimator"
    # NOTE(review): "SAMME.R" is deprecated in recent scikit-learn releases;
    # confirm the installed version still accepts it.
    ada_param_grid = {tree_param + "__criterion" : ["gini", "entropy"],
                  tree_param + "__splitter" :   ["best", "random"],
                  "algorithm" : ["SAMME","SAMME.R"],
                  "n_estimators" :[1,2],
                  "learning_rate":  [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3,1.5]}
    gsadaDTC = GridSearchCV(adaDTC,param_grid = ada_param_grid, cv=kfold, scoring="accuracy", n_jobs= 4, verbose = 1)
    gsadaDTC.fit(X_train,Y_train)
    return gsadaDTC.best_estimator_ , gsadaDTC.best_score_
# =====================================================================================
# Gradient boosting tuning
# =====================================================================================
def Best_Gradient_Boosting(X_train , Y_train, random_state=7):
    """Grid-search a GradientBoostingClassifier and return ``(best_estimator, best_score)``.

    Parameters
    ----------
    X_train, Y_train : training features and labels passed to ``GridSearchCV.fit``.
    random_state : seed for the booster. The grid uses ``max_features`` < 1,
        which subsamples features at random, so a seed is needed for
        repeatable scores.

    Returns
    -------
    The refitted best estimator and its mean cross-validated accuracy
    (scored with the notebook-level ``kfold`` splitter).
    """
    GBC = GradientBoostingClassifier(random_state=random_state)
    # 'loss' is deliberately left at its default. The grid used to pin it to
    # "deviance", a name newer scikit-learn releases renamed to "log_loss" and
    # no longer accept; the default selects that same loss under either name.
    gb_param_grid = {'n_estimators' : [100,200,300],
                  'learning_rate': [0.1, 0.05, 0.01],
                  'max_depth': [4, 8],
                  'min_samples_leaf': [100,150],
                  'max_features': [0.3, 0.1]
                  }
    gsGBC = GridSearchCV(GBC,param_grid = gb_param_grid, cv=kfold, scoring="accuracy", n_jobs= 4, verbose = 1)
    gsGBC.fit(X_train,Y_train)
    return gsGBC.best_estimator_ , gsGBC.best_score_

In [None]:
# Tune every model on the full training set (outliers included) and report
# the best cross-validated accuracy found for each.
section_rule = "-" * 100

print(section_rule, "\n", "SVM:")
SVMC_best_with_outlier, SVMC_score_with_outlier = Best_SVM(
    X_train_with_outliers, Y_train_with_outliers)
print("SVM Score: ", SVMC_score_with_outlier)

print(section_rule, "\n", "AdaBoost:")
ada_best_with_outlier, ada_score_with_outlier = Best_ADABoosting(
    X_train_with_outliers, Y_train_with_outliers)
print("ADABoost Score: ", ada_score_with_outlier)

print(section_rule, "\n", "Random Forest:")
RFC_best_with_outlier, RFC_score_with_outlier = Best_Random_Forest(
    X_train_with_outliers, Y_train_with_outliers)
print("Random Forest Score: ", RFC_score_with_outlier)

print(section_rule, "\n", "Gradient Boosting:")
GBC_best_with_outlier, GBC_score_with_outlier = Best_Gradient_Boosting(
    X_train_with_outliers, Y_train_with_outliers)
print("Gradient Boosting Score: ", GBC_score_with_outlier)

#### Train without Outliers:

In [None]:
# Outlier detection 

def detect_outliers(df,n,features):
    """
    Return the index labels of rows that are outliers in more than ``n`` of
    the given ``features``, according to the Tukey method.

    A value is an outlier for a column when it lies below Q1 - 1.5 * IQR or
    above Q3 + 1.5 * IQR of that column. Quartiles are computed ignoring
    missing values; a missing value is never counted as an outlier.

    Parameters
    ----------
    df : DataFrame holding the columns named in ``features``.
    n : a row is returned only if it is an outlier in strictly more than
        ``n`` columns.
    features : iterable of numeric column names to inspect.

    Returns
    -------
    list of index labels, in order of first detection.
    """
    outlier_counts = Counter()

    for col in features:
        values = df[col]
        # nanpercentile: with plain np.percentile a single NaN makes both
        # quartiles NaN, so every comparison below is False and the column's
        # outliers are silently missed.
        q1, q3 = np.nanpercentile(values, [25, 75])
        outlier_step = 1.5 * (q3 - q1)

        is_outlier = (values < q1 - outlier_step) | (values > q3 + outlier_step)

        # One count per column in which the row is an outlier
        outlier_counts.update(is_outlier[is_outlier].index)

    # Keep rows that are outliers in more than n columns
    multiple_outliers = [label for label, hits in outlier_counts.items() if hits > n]

    return multiple_outliers

# Flag training rows that are Tukey outliers in more than 2 of Age, SibSp, Parch and Fare
Outliers_to_drop = detect_outliers(X_train_with_outliers,2,["Age","SibSp","Parch","Fare"])

In [None]:
X_train_with_outliers.loc[Outliers_to_drop] # Show the rows flagged as outliers

In [None]:
# Remove the flagged rows and renumber the index, then split labels from features
train_without_outliers = (
    train_with_outliers
    .drop(index=Outliers_to_drop)
    .reset_index(drop=True)
)
X_train_without_outliers = train_without_outliers.drop(columns=["Survived"])
Y_train_without_outliers = train_without_outliers["Survived"]

In [None]:
# Tune every model on the training set with outliers removed and report
# the best cross-validated accuracy found for each.
section_rule = "-" * 100

print(section_rule, "\n", "SVM:")
SVMC_best_without_outlier, SVMC_score_without_outlier = Best_SVM(
    X_train_without_outliers, Y_train_without_outliers)
print("SVM Score: ", SVMC_score_without_outlier)

print(section_rule, "\n", "AdaBoost:")
ada_best_without_outlier, ada_score_without_outlier = Best_ADABoosting(
    X_train_without_outliers, Y_train_without_outliers)
print("ADABoost Score: ", ada_score_without_outlier)

print(section_rule, "\n", "Random Forest:")
RFC_best_without_outlier, RFC_score_without_outlier = Best_Random_Forest(
    X_train_without_outliers, Y_train_without_outliers)
print("Random Forest Score: ", RFC_score_without_outlier)

print(section_rule, "\n", "Gradient Boosting:")
GBC_best_without_outlier, GBC_score_without_outlier = Best_Gradient_Boosting(
    X_train_without_outliers, Y_train_without_outliers)
print("Gradient Boosting Score: ", GBC_score_without_outlier)

In [None]:
# Side-by-side best CV accuracy per model; "difference" is positive when
# removing outliers improved the score.
result = pd.DataFrame({"Algorithm":["SVM" , "AdaBoost" , "Random Forest" , "Gradient Boosting"],
           "With outliers" : [SVMC_score_with_outlier , ada_score_with_outlier , RFC_score_with_outlier , GBC_score_with_outlier], 
           "Without outliers":[SVMC_score_without_outlier , ada_score_without_outlier , RFC_score_without_outlier , GBC_score_without_outlier],
          })
result["difference"] = result["Without outliers"] - result["With outliers"]
result

In [None]:
nan

In [None]:
nan

In [None]:
nan

In [None]:
nan