<a href="https://www.kaggle.com/code/mennatullaheisawy/titanic-binary-classification-with-lgbm-xgboost?scriptVersionId=191745621" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB
from lightgbm import LGBMClassifier

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
sns.set(style='darkgrid')

class DataCleaningClass:
    """Stateless helpers for exploring and cleaning a pandas DataFrame.

    The methods fall into three groups:

    * summaries / splitting: ``data_info``, ``split_data``
    * matplotlib / seaborn plots: ``columns_histplot``, ``columns_boxplot``,
      ``columns_pie``, ``columns_countplot``, ``columns_barplot``,
      ``columns_scatterplot``, ``columns_lineplot``
    * cleaning: ``columns_fillna``, ``columns_outlier``,
      ``columns_transformation``, ``columns_lencoder``, ``columns_drop``

    No instance attributes are read or written by any method.
    """

    def data_info(self, data):
        """Return a per-column summary of ``data``.

        The result has one row per column with its dtype, number of nulls and
        number of distinct values. ``'Duplicated rows'`` is a property of the
        whole frame, so the same count is repeated on every row.
        """
        # The duplicate count does not depend on the column: compute it once
        # instead of rescanning the frame for every column.
        n_duplicates = data.duplicated().sum()

        cols, dtype, nulls, uniques = [], [], [], []
        for col in data.columns:
            series = data[col]
            cols.append(col)
            dtype.append(series.dtype)
            nulls.append(series.isnull().sum())
            uniques.append(series.nunique())

        return pd.DataFrame({
            'Column': cols,
            'DType': dtype,
            'no of Nulls': nulls,
            'no of Uniques': uniques,
            'Duplicated rows': [n_duplicates] * len(cols),
        })

    def split_data(self, data, categorical_threshold=10):
        """Split ``data`` by dtype and by cardinality.

        Returns the 4-tuple ``(numerical_data, object_data, cat_cols,
        cont_cols)``: two frames holding the numeric-dtype and object-dtype
        columns, followed by the labels of the columns whose number of
        distinct values is ``<= categorical_threshold`` and
        ``> categorical_threshold`` respectively. The cardinality split looks
        at every column regardless of dtype.
        """
        numerical_data = data.select_dtypes(include=['number'])
        object_data = data.select_dtypes(include=['object'])
        unique_data = data.nunique()
        cat_cols = unique_data[unique_data <= categorical_threshold].index
        cont_cols = unique_data[unique_data > categorical_threshold].index
        return numerical_data, object_data, cat_cols, cont_cols

    # ------------------------------------------------------------------
    # NUMERICAL PLOTTING
    # ------------------------------------------------------------------
    def columns_histplot(self, data):
        """Draw a histogram (with KDE) of every column of ``data`` in a 3-wide grid.

        ``data`` is expected to hold numerical columns only.
        """
        n_cols = 3
        n_rows = math.ceil(len(data.columns) / n_cols)
        plt.figure(figsize=(20, 5 * n_rows))
        for i, col in enumerate(data.columns):
            plt.subplot(n_rows, n_cols, i + 1)
            sns.histplot(data[col], bins=10, kde=True)
            plt.title(f'HistPlot of {col}', fontsize=14, color='darkblue')
            plt.xticks(rotation=45)
            plt.ylabel('Frequency')

        plt.tight_layout()
        plt.show()

    def columns_boxplot(self, data):
        """Draw one horizontal box plot per column of ``data``, stacked vertically.

        ``data`` is expected to hold numerical columns only.
        """
        n_plots = len(data.columns)
        plt.figure(figsize=(20, 30))
        for i, col in enumerate(data.columns):
            plt.subplot(n_plots, 1, i + 1)
            sns.boxplot(x=data[col])
            plt.title(f'BoxPlot of {col}', fontsize=22, color='darkblue')

        plt.tight_layout()
        plt.show()

    # ------------------------------------------------------------------
    # CATEGORICAL PLOTTING
    # ------------------------------------------------------------------
    def columns_pie(self, data):
        """Draw a pie chart of the category shares of every column of ``data``.

        ``data`` is expected to hold categorical columns (e.g. ``data[cat_cols]``).
        Missing values are not counted, so they get no wedge.
        """
        n_cols = 3
        n_rows = math.ceil(len(data.columns) / n_cols)
        plt.figure(figsize=(100, 100))
        for i, col in enumerate(data.columns):
            plt.subplot(n_rows, n_cols, i + 1)
            shares = data[col].value_counts(normalize=True)
            # Labels must come from the same Series as the wedge sizes:
            # value_counts() is ordered by frequency while unique() is ordered
            # by first appearance, so mixing the two mislabels the wedges.
            plt.pie(shares, autopct='%1.0f%%', labels=shares.index,
                    textprops={'fontsize': 48})
            plt.title(f'Pie Chart of {col}', fontsize=64)
        plt.tight_layout()
        plt.show()

    # ------------------------------------------------------------------
    # CATEGORICAL PLOTTING WITH RESPECT TO CATEGORICAL TARGET
    # ------------------------------------------------------------------
    def columns_countplot(self, data, target_index=0.1):
        """Draw a count plot of every column of ``data`` in a 3-wide grid.

        ``target_index`` is the positional index (within ``data.columns``) of a
        categorical target column to use as ``hue``. The default ``0.1`` is a
        sentinel meaning "no target"; ``None`` is accepted with the same
        meaning. When a target is given, its own grid slot is left empty.
        """
        no_target = target_index is None or target_index == 0.1
        n_cols = 3
        n_rows = math.ceil(len(data.columns) / n_cols)
        plt.figure(figsize=(20, 5 * n_rows))
        for i, col in enumerate(data.columns):
            if not no_target and i == target_index:
                # Plotting the target against itself is not informative; skip
                # before creating the axes so no blank frame is drawn.
                continue
            plt.subplot(n_rows, n_cols, i + 1)
            if no_target:
                sns.countplot(x=data[col])
                plt.title(f'CountPlot of {col}', fontsize=22, color='darkblue')
            else:
                target_col = data.columns[target_index]
                sns.countplot(x=data[col], hue=data[target_col])
                plt.title(f'CountPlot of {col} with {target_col} as a hue ',
                          fontsize=22,
                          color='darkblue')
            plt.xticks(rotation=45)

        plt.tight_layout()
        plt.show()

    # ------------------------------------------------------------------
    # CATEGORICAL PLOTTING WITH RESPECT TO CONTINUOUS TARGET
    # ------------------------------------------------------------------
    def columns_barplot(self, data, target_data):
        """Draw one bar plot per column of ``data`` against ``target_data``.

        ``data`` is expected to hold categorical columns and ``target_data`` to
        be a named Series (its ``.name`` is used in the titles).
        """
        n_plots = len(data.columns)
        plt.figure(figsize=(10, 30))
        for i, col in enumerate(data.columns):
            plt.subplot(n_plots, 1, i + 1)
            # Pass the column values themselves: a bare column name can only
            # be resolved by seaborn when a `data=` frame is supplied.
            sns.barplot(x=data[col], y=target_data)
            plt.title(f'BarPlot of {col} related to the {target_data.name}', fontsize=14, color='darkblue')

        plt.tight_layout()
        plt.show()

    # ------------------------------------------------------------------
    # CONTINUOUS PLOTTING WITH RESPECT TO CONTINUOUS TARGET
    # ------------------------------------------------------------------
    def columns_scatterplot(self, data, target_data):
        """Scatter every column of ``data`` against ``target_data`` in a 3-wide grid.

        ``target_data`` must be a named Series (its ``.name`` is used in the titles).
        """
        n_cols = 3
        n_rows = math.ceil(len(data.columns) / n_cols)
        plt.figure(figsize=(20, 5 * n_rows))
        for i, col in enumerate(data.columns):
            plt.subplot(n_rows, n_cols, i + 1)
            sns.scatterplot(x=data[col], y=target_data)
            plt.title(f'ScatterPlot of {col} with {target_data.name}', fontsize=22, color='darkblue')
            plt.xticks(rotation=45)

        plt.tight_layout()
        plt.show()

    def columns_lineplot(self, data, target_data):
        """Draw a line plot of every column of ``data`` against ``target_data``, two per row.

        ``target_data`` must be a named Series (its ``.name`` is used in the titles).
        """
        n_plots = len(data.columns)
        # ceil(n / 2) rows is exactly enough for a 2-wide grid.
        n_rows = math.ceil(n_plots / 2)
        plt.figure(figsize=(20, 30))
        for i, col in enumerate(data.columns):
            plt.subplot(n_rows, 2, i + 1)
            sns.lineplot(x=data[col], y=target_data)
            plt.title(f'Line Plot of {col} with {target_data.name}', fontsize=22, color='darkblue')

        plt.tight_layout()
        plt.show()

    # ------------------------------------------------------------------
    # DATA CLEANING
    # ------------------------------------------------------------------
    def columns_fillna(self, data):
        """Fill missing values: median for numeric columns, mode for object columns.

        ``data`` is modified in place and also returned. Columns of any other
        dtype are left untouched, as are object columns that contain no
        non-null value at all (they have no mode to fill with).
        """
        # dtypes do not change while filling, so classify the columns once.
        numeric_cols = set(data.select_dtypes(include=['number']).columns)
        object_cols = set(data.select_dtypes(include=['object']).columns)

        for col in data.columns:
            if col in numeric_cols:
                data[col] = data[col].fillna(data[col].median())
            elif col in object_cols:
                modes = data[col].mode()
                if not modes.empty:
                    data[col] = data[col].fillna(modes.iloc[0])

        return data

    def columns_outlier(self, data, drop_categorical_outliers=False):
        """Return ``data`` without the rows flagged as outliers.

        Numeric columns are filtered one after another with the 1.5 * IQR
        rule; each column's quartiles are computed on the rows that survived
        the previous columns. When ``drop_categorical_outliers`` is true,
        object columns additionally lose every row whose category occurs fewer
        than ``len(data.columns)`` times.

        The original index labels are kept (the index is not reset), and the
        frame passed in is not modified.
        """
        # NOTE(review): using the number of columns as the rarity threshold
        # looks arbitrary - confirm this is the intended cut-off.
        n_columns = len(data.columns)
        # Row filtering never changes dtypes, so classify the columns once.
        numeric_cols = set(data.select_dtypes(include=['number']).columns)
        object_cols = set(data.select_dtypes(include=['object']).columns)

        for col in data.columns:
            if col in numeric_cols:
                q1, q3 = data[col].quantile([0.25, 0.75])
                iqr = q3 - q1
                lower = q1 - 1.5 * iqr
                upper = q3 + 1.5 * iqr
                outlier = (data[col] < lower) | (data[col] > upper)
                data = data[~outlier]
            elif col in object_cols and drop_categorical_outliers:
                counts = data[col].value_counts()
                rare_values = counts[counts < n_columns].index
                data = data[~data[col].isin(rare_values)]

        return data

    def columns_transformation(self, data):
        """Reduce skewness column by column and plot before/after histograms.

        Columns with skewness ``> 0.6`` get ``np.log1p``; columns with
        skewness ``< -0.5`` get ``np.square``; the rest are left alone. For
        every transformed column a two-panel figure is shown and the skewness
        before and after is printed.

        ``data`` (expected to hold numerical columns) is modified in place and
        also returned.
        """
        # NOTE(review): log1p is undefined for values <= -1 - confirm that
        # right-skewed inputs are non-negative.
        skewness = data.skew(numeric_only=True)
        for col, skew_before in skewness.items():
            if skew_before > 0.6:      # right skewed --> log transform
                transformed = np.log1p(data[col])
                tr_type = 'Log'
            elif skew_before < -0.5:   # left skewed --> square transform
                transformed = np.square(data[col])
                tr_type = 'Square'
            else:
                continue
            # Applying the ufunc to the Series keeps the original index, so the
            # assignment below cannot misalign rows or introduce NaN when the
            # index is not 0..n-1 (e.g. after rows were dropped).
            skew_after = transformed.skew()

            plt.figure(figsize=(15, 6))
            plt.subplot(1, 2, 1)
            plt.title(f"Distribution of {col} before Transformation", fontsize=15)
            sns.histplot(data[col], kde=True, color="red")

            data[col] = transformed
            plt.subplot(1, 2, 2)
            plt.title(f"Distribution of {col} after Transformation", fontsize=15)
            sns.histplot(data[col], bins=20, kde=True, legend=False)
            plt.xlabel(col)
            plt.show()
            print(
                f"Skewness was {round(skew_before, 2)} before & now it is {round(skew_after, 2)} after {tr_type} transformation.")

        return data

    def columns_lencoder(self, data):
        """Label-encode every column of ``data`` in place and return it.

        ``data`` is expected to hold the object columns. The encoder is refit
        for each column and is not kept, so the learned category-to-integer
        mapping cannot be reused on another frame.
        """
        le = LabelEncoder()
        for col in data.columns:
            data[col] = le.fit_transform(data[col])
        return data

    def columns_drop(self, data, cols):
        """Return a copy of ``data`` without the columns listed in ``cols``.

        Remember to call ``split_data`` again afterwards if the split frames
        are still needed. The index is left as is.
        """
        return data.drop(columns=cols)


In [None]:
# %pip install dataprep xgboost lightgbm

In [None]:
from dataprep.datasets import load_dataset
from dataprep.eda import create_report, plot, plot_correlation, plot_diff, plot_missing

# EDA

In [None]:
# Load dataprep's 'titanic' dataset and preview its first rows.
data = load_dataset('titanic')
data.head()

Using the **DataPrep** `create_report` method, we get the following interpretations of this dataset.


*   data shape: (891, 12), i.e. 891 rows and 12 columns
*   3 numerical, 9 categorical
*   no duplicated rows(because of the ID and Name columns)
*   Age (19.9%), Cabin (77.1%), and Embarked (0.2%) contain NaNs
*   The target column is Survived ( Binary Classification )

From the initial plots:


*   The major label in the target column is 0, not survived (61%)
*   The major category is 3 for Pclass (55%) and 0 for Parch (76%) and SibSp (68%)
*   The major in sex is male (64%)
*   The major category in Embarked is 'S'
*   Fare is right skewed and has outliers
*   Age has outliers

Notes:

*   The strongest correlation is 0.41 between SibSp and Parch

In [None]:
# dataprep missing-value overview for the whole frame.
plot_missing(data)

In [None]:
# Univariate dataprep plot of the PassengerId column.
plot(data, 'PassengerId')

Initial Feature Selection:

*   I will drop PassengerId and Name, as their values are unique for every row and carry no predictive signal.
*   I will drop Cabin, as 77.1% of its values are NaN.
*   I will drop Ticket, as 76.4% of its values are unique and it carries little usable information.


In [None]:
# dataprep missing-value analysis focused on the Survived column.
plot_missing(data, 'Survived')

In [None]:
# Bivariate dataprep plot: Survived against Age.
plot(data, 'Survived', 'Age')

* In the box plot of Age against Survived, the median is the same in both classes, so I can replace the NaNs with the median of the whole feature.

In [None]:
# Univariate dataprep plot of the Fare column.
plot(data, 'Fare')

* This feature has many outliers that need to be removed.

In [None]:
# Bivariate dataprep plot: Survived against Pclass.
plot(data, 'Survived', 'Pclass')

* This plot shows that most of the people who didn't survive were in Pclass 3.

In [None]:
# Bivariate dataprep plot: Survived against Sex.
plot(data, 'Survived', 'Sex')

* From this plot, the large majority of the people who didn't survive were male.

In [None]:
# Bivariate dataprep plot: Survived against Embarked.
plot(data, 'Survived', 'Embarked')

* From this plot, most of the people who didn't survive embarked at 'S'.
  
  In other words, port 'S' accounts for the largest share of non-survivors. It is also the most common port, so this alone is not evidence of a causal effect.

# Data Preprocessing

In [None]:
# Single helper instance reused by every preprocessing cell below.
dc = DataCleaningClass()

#### Drop Features :

In [None]:
# Drop the identifier-like columns and the mostly-empty Cabin column.
# NOTE(review): the feature-selection notes above also list Ticket for removal,
# but it is not dropped here - confirm whether it should be added.
features = ['PassengerId', 'Name','Cabin']
data = dc.columns_drop(data, features)
data.head()

#### Fill NaN

In [None]:
# Impute missing values (median for numeric columns, mode for object columns),
# then re-plot the missing-value overview to check the result.
data = dc.columns_fillna(data)
plot_missing(data)

#### Check for Duplicates

In [None]:
# Number of fully duplicated rows now that the identifier columns are gone.
data.duplicated().sum()

In [None]:
# Remove duplicated rows and re-count them. The index is not reset, so it has
# gaps from here on.
data = data.drop_duplicates()
data.duplicated().sum()

#### Handle Outliers using IQR

In [None]:
# Row count and dtypes before outlier removal.
data.info()

In [None]:
# Apply the 1.5 * IQR filter to every numeric column, one after another.
# NOTE(review): 'Survived' is still part of `data` here, so the target is
# filtered as well; columns where most rows share one value can lose every
# other row - compare the row counts in the two info() outputs.
data = dc.columns_outlier(data)
data.info()

#### Handle Skewness
-- using log transformation for right skewed features

-- using square transformation for left skewed features

In [None]:
# split data to numerical and object
numerical_data, object_data, cat_cols, cont_cols = dc.split_data(data)
y = data['Survived']

In [None]:
# Skewness of every numeric column before transformation, largest first.
numerical_data.skew().sort_values(ascending=False)

In [None]:
# Transform the skewed numeric columns and write them back into `data`.
# columns_transformation also changes `numerical_data` itself.
data[numerical_data.columns] = dc.columns_transformation(numerical_data)

In [None]:
# Skewness again, this time on the numeric frame handed to the transformation above.
numerical_data.skew().sort_values(ascending=False)

#### Encoding
-- using label encoder

In [None]:
# refresh the sub data after applying some changes
numerical_data, object_data, cat_cols, cont_cols = dc.split_data(data)

In [None]:
# Label-encode every object column and write the integer codes back into `data`.
data[object_data.columns] = dc.columns_lencoder(object_data)
data.head()

In [None]:
# Check dtypes and non-null counts after encoding.
data.info()

#### Train - Test Split

In [None]:
# Features are every remaining column except the target; `y` was captured
# earlier from the same rows.
X = data.drop(['Survived'],axis=1)

# 80/20 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f'X_train shape: {X_train.shape}')
print(f'X_test shape: {X_test.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'y_test shape: {y_test.shape}')

#### Features Scaling
-- using robust scaler

In [None]:
# Fit the scaler on the training split only and reuse it for the test split.
# `rs` is used again later for the submission data.
rs = RobustScaler()
X_train = rs.fit_transform(X_train)
X_test = rs.transform(X_test)



### Models

#### Logistic Regression

In [None]:
# Prepare a dataframe to save the models' accuracies in it
results = pd.DataFrame(columns=['Model', 'Train_Accuracy', 'Test_Accuracy'])
results

In [None]:
# Logistic regression with default hyper-parameters. `model` is rebound by each
# later model cell.
model = LogisticRegression()
model.fit(X_train,y_train)

In [None]:
# Score the current `model`: accuracy on the training split and on the held-out split.
y_pred = model.predict(X_test)
tr_acc = model.score(X_train, y_train)
# accuracy_score is documented as (y_true, y_pred). The value is the same either
# way, but the documented order keeps this consistent with the other metrics.
te_acc = accuracy_score(y_test, y_pred)

print(tr_acc)
print(te_acc)

In [None]:
# Take the estimator's class name from its repr (the text before the first
# '(') and append its accuracies to `results`. `model_name` is also read by
# plot_conf_matrix.
model_name = str(model).split('(')[0]
new_row = pd.DataFrame([[model_name, tr_acc, te_acc]], columns=results.columns)
results = pd.concat([results, new_row], ignore_index=True)

In [None]:
def plot_conf_matrix(y_test, y_pred, title_name=None):
    """Plot the confusion matrix of ``y_pred`` against ``y_test`` as a heatmap.

    Parameters
    ----------
    y_test : array-like
        True labels (rows of the matrix).
    y_pred : array-like
        Predicted labels (columns of the matrix).
    title_name : str, optional
        Model name shown in the title. When omitted, the notebook-global
        ``model_name`` is used, which is what earlier calls relied on.
    """
    if title_name is None:
        # Backward-compatible fallback: read the global at call time.
        title_name = model_name
    class_labels = ['Didn\'t_Survive', 'Survived']

    plt.figure(figsize=(5,5))
    # Confusion-matrix cells are integer counts, so format them as integers.
    sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d', xticklabels=class_labels, yticklabels=class_labels, linewidths=0.2, cbar=False)
    plt.title(f'Test Data Confusion Matrix of {title_name}')
    plt.xlabel('Predicted Labels')
    plt.ylabel('Actual Labels')
    plt.tight_layout()
    plt.show()

In [None]:
# Confusion matrix of the most recent predictions on the held-out split.
plot_conf_matrix(y_test, y_pred)

#### XGBoost Classifier

In [None]:
# Gradient-boosted trees: 550 estimators with a small learning rate and a fixed seed.
model = XGBClassifier(n_estimators=550, learning_rate=0.001, random_state=42)
model.fit(X_train,y_train)

In [None]:
# Score the current `model`: accuracy on the training split and on the held-out split.
y_pred = model.predict(X_test)
tr_acc = model.score(X_train, y_train)
# accuracy_score is documented as (y_true, y_pred). The value is the same either
# way, but the documented order keeps this consistent with the other metrics.
te_acc = accuracy_score(y_test, y_pred)

print(tr_acc)
print(te_acc)

In [None]:
# Take the estimator's class name from its repr and append its accuracies to
# `results`. `model_name` is also read by plot_conf_matrix.
model_name = str(model).split('(')[0]
new_row = pd.DataFrame([[model_name, tr_acc, te_acc]], columns=results.columns)
results = pd.concat([results, new_row], ignore_index=True)

In [None]:
# Confusion matrix of the most recent predictions on the held-out split.
plot_conf_matrix(y_test, y_pred)

#### LightGBM Classifier

In [None]:
# LightGBM: 2000 estimators with a small learning rate and a fixed seed.
model = LGBMClassifier(n_estimators=2000, learning_rate=0.001, random_state=42)
model.fit(X_train,y_train)

In [None]:
# Score the current `model`: accuracy on the training split and on the held-out split.
y_pred = model.predict(X_test)
tr_acc = model.score(X_train, y_train)
# accuracy_score is documented as (y_true, y_pred). The value is the same either
# way, but the documented order keeps this consistent with the other metrics.
te_acc = accuracy_score(y_test, y_pred)

print(tr_acc)
print(te_acc)

In [None]:
# Take the estimator's class name from its repr and append its accuracies to
# `results`. `model_name` is also read by plot_conf_matrix.
model_name = str(model).split('(')[0]
new_row = pd.DataFrame([[model_name, tr_acc, te_acc]], columns=results.columns)
results = pd.concat([results, new_row], ignore_index=True)

In [None]:
# Confusion matrix of the most recent predictions on the held-out split.
plot_conf_matrix(y_test, y_pred)

#### Support Vector Classifier

In [None]:
# Support vector classifier with C=0.5 and a fixed seed. This is the last cell
# that rebinds `model`.
model = SVC(C=0.5, random_state=42)
model.fit(X_train,y_train)

In [None]:
# Score the current `model`: accuracy on the training split and on the held-out split.
y_pred = model.predict(X_test)
tr_acc = model.score(X_train, y_train)
# accuracy_score is documented as (y_true, y_pred). The value is the same either
# way, but the documented order keeps this consistent with the other metrics.
te_acc = accuracy_score(y_test, y_pred)

print(tr_acc)
print(te_acc)

In [None]:
# Take the estimator's class name from its repr and append its accuracies to
# `results`. `model_name` is also read by plot_conf_matrix.
model_name = str(model).split('(')[0]
new_row = pd.DataFrame([[model_name, tr_acc, te_acc]], columns=results.columns)
results = pd.concat([results, new_row], ignore_index=True)

In [None]:
# Confusion matrix of the most recent predictions on the held-out split.
plot_conf_matrix(y_test, y_pred)

In [None]:
# Side-by-side train/test accuracy of every model recorded above.
results

### Prepare the submission file

In [None]:
# Load the unlabeled data to predict on, keeping PassengerId as the index so it
# can be written to the submission file later.
# NOTE(review): this is an absolute path on the author's machine - it must be
# adjusted before the notebook can run anywhere else.
test = pd.read_csv('/Users/menna/Desktop/Amit/Data Science/Titanic ML.csv')
test = test.set_index('PassengerId')
# test.shape
test.head()

In [None]:

# Repeat the training-time preprocessing on the submission data.
# PassengerId is the index here, so only Name and Cabin need dropping.
features = ['Name','Cabin']
test = dc.columns_drop(test, features)
test = dc.columns_fillna(test)
# Row-removing steps stay disabled - presumably because every submission row
# needs a prediction.
# test = test.drop_duplicates()
# test = dc.columns_outlier(test)
numerical_data, object_data, cat_cols, cont_cols = dc.split_data(test)
# NOTE(review): the transformation is chosen from this frame's own skewness, so
# it can differ per column from what was applied to the training data.
test[numerical_data.columns] = dc.columns_transformation(numerical_data)
numerical_data, object_data, cat_cols, cont_cols = dc.split_data(test)
# NOTE(review): the label encoder is refit on this frame, so the integer codes
# match the training codes only if both frames hold the same set of categories.
test[object_data.columns] = dc.columns_lencoder(object_data)
# Reuse the scaler that was fitted on the training split.
test_scaled = rs.transform(test)

In [None]:
# Sanity check: rows to predict and number of feature columns.
test.shape

In [None]:
# Predict with LightGBM. `model` cannot be used directly here: it is rebound by
# every model cell and currently holds the estimator fitted last (the SVC), so
# refit LightGBM with the same hyper-parameters as the LightGBM cell above.
submission_model = LGBMClassifier(n_estimators=2000, learning_rate=0.001, random_state=42)
submission_model.fit(X_train, y_train)
y_pred = submission_model.predict(test_scaled)
y_pred

In [None]:
# Pair each PassengerId (the index of `test`) with its prediction and write the
# submission CSV without the frame's own index.
submission = pd.DataFrame({'PassengerId': test.index, 'Survived': y_pred})
submission.to_csv('submission.csv', index=False)