<a href="https://colab.research.google.com/github/omid-sakaki-ghazvini/Practices/blob/main/Titanic_EDA_%26_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Install Dependencies and Setup

<div style="direction:rtl">
<font color='green' size="5px">
 کتابخانه های مورد نیاز را نصب میکنیم
    </font>
</div>

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from imblearn.combine import SMOTEENN

import warnings

# Suppress every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides deprecation and chained-assignment
# notices from the libraries used below — consider narrowing the filter.
warnings.filterwarnings('ignore')

<div style="direction:rtl">
<font color='green' size="5px">
 از لینک زیر دیتاست را دانلود کرده و در پوشه هم مسیر همین ژوپیتر نوت بوک قرار دهید
    </font>
</div>

## https://www.kaggle.com/competitions/titanic/data

# 2. Load Data

<div style="direction:rtl">
<font color='green' size="5px">
توسط خط فرمان زیر، دیتا را فراخوانی میکنیم
    </font>
</div>

In [None]:
# Load the Titanic training set.
# The Kaggle-hosted path is tried first so the notebook keeps working unchanged
# on Kaggle; otherwise fall back to a `train.csv` placed next to the notebook,
# which is what the download instructions above ask the reader to do.
KAGGLE_TRAIN_CSV = '/kaggle/input/titanic/train.csv'
LOCAL_TRAIN_CSV = 'train.csv'

try:
    df = pd.read_csv(KAGGLE_TRAIN_CSV)
except FileNotFoundError:
    df = pd.read_csv(LOCAL_TRAIN_CSV)

df.head(10)

# 4. Data Analysis

In [None]:
# Per-column tally of missing entries (`isna` is the same check as `isnull`).
missing_values_count = df.isna().sum()
missing_values_count

In [None]:
# Overall share of missing cells, as a percentage of the whole table.
# The cell count is derived from `df.shape` directly: the notebook never imports
# numpy, and `np.product` no longer exists in NumPy 2.x.
n_rows, n_cols = df.shape
total_cells = n_rows * n_cols
total_missing = missing_values_count.sum()

percent_missing = (total_missing / total_cells) * 100
print(percent_missing)

In [None]:
# Pie chart of how the passengers split across the `Survived` values.
target_counts = df.groupby('Survived')['Survived'].count()
palette_color = sns.color_palette('pastel')
explode = [0.1] * df['Survived'].nunique()  # pull every wedge out slightly

fig, ax = plt.subplots(figsize=(5, 5))
target_counts.plot.pie(
    ax=ax,
    colors=palette_color,
    explode=explode,
    autopct="%1.1f%%",
    shadow=True,
    startangle=140,
    textprops={'fontsize': 14},
    wedgeprops={'edgecolor': 'black', 'linewidth': 1.5},
)

ax.set_title('Survived Distribution', fontsize=18, weight='bold')
ax.axis('equal')  # keep the pie circular
plt.show()

In [None]:
# Pie chart of how the passengers split across the ticket classes (`Pclass`).
target_counts = df.groupby('Pclass')['Pclass'].count()
palette_color = sns.color_palette('pastel')
explode = [0.1] * df['Pclass'].nunique()  # pull every wedge out slightly

fig, ax = plt.subplots(figsize=(5, 5))
target_counts.plot.pie(
    ax=ax,
    colors=palette_color,
    explode=explode,
    autopct="%1.1f%%",
    shadow=True,
    startangle=140,
    textprops={'fontsize': 14},
    wedgeprops={'edgecolor': 'black', 'linewidth': 1.5},
)

ax.set_title('Pclass Distribution', fontsize=18, weight='bold')
ax.axis('equal')  # keep the pie circular
plt.show()

In [None]:
# Ticket-class breakdown restricted to the passengers who survived.
Survived = df[df['Survived'].eq(1)]

target_counts = Survived.groupby('Pclass')['Pclass'].count()
palette_color = sns.color_palette('pastel')
explode = [0.1] * Survived['Pclass'].nunique()  # pull every wedge out slightly

fig, ax = plt.subplots(figsize=(5, 5))
target_counts.plot.pie(
    ax=ax,
    colors=palette_color,
    explode=explode,
    autopct="%1.1f%%",
    shadow=True,
    startangle=140,
    textprops={'fontsize': 14},
    wedgeprops={'edgecolor': 'black', 'linewidth': 1.5},
)

ax.set_title('Survived & Pclass Distribution', fontsize=18, weight='bold')
ax.axis('equal')  # keep the pie circular
plt.show()

In [None]:
# Ticket-class breakdown restricted to the passengers who did NOT survive.
# The variable and the title now say so; the original reused the
# "Survived" name and title from the survivors chart.
not_survived = df.loc[df['Survived'] == 0]

plt.figure(figsize=(5, 5))
palette_color = sns.color_palette('pastel')
explode = [0.1 for _ in range(not_survived['Pclass'].nunique())]

target_counts = not_survived.groupby('Pclass')['Pclass'].count()

target_counts.plot.pie(
    colors=palette_color,
    explode=explode,
    autopct="%1.1f%%",
    shadow=True,
    startangle=140,
    textprops={'fontsize': 14},
    wedgeprops={'edgecolor': 'black', 'linewidth': 1.5}
)

plt.title('Not Survived & Pclass Distribution', fontsize=18, weight='bold')
plt.axis('equal')  # keep the pie circular
plt.show()

In [None]:
# Age density, one curve per survival outcome.
# `fill=True` is the supported replacement for seaborn's deprecated `shade=True`.
ax = sns.kdeplot(data=df, x='Age', hue='Survived', fill=True)
ax.set_title('Age Distribution by Survived');

In [None]:
def bar_chart(feature):
    """Plot a stacked bar chart of `feature` counts for survivors vs. the dead.

    Reads the notebook-level `df`. Two bars are drawn, labelled 'Survived' and
    'Dead', each stacked by the distinct values found in column `feature`.
    """
    survived_mask = df['Survived'] == 1
    dead_mask = df['Survived'] == 0
    counts_by_outcome = pd.DataFrame([
        df.loc[survived_mask, feature].value_counts(),
        df.loc[dead_mask, feature].value_counts(),
    ])
    counts_by_outcome.index = ['Survived', 'Dead']
    counts_by_outcome.plot(kind='bar', stacked=True, figsize=(10, 5))

# Chart the sex split per outcome, then print the underlying counts.
bar_chart('Sex')
sex_counts_by_outcome = {
    outcome: df.loc[df['Survived'] == outcome, 'Sex'].value_counts()
    for outcome in (1, 0)
}
print("Survived :\n", sex_counts_by_outcome[1])
print("Dead:\n", sex_counts_by_outcome[0])

# 3. Feature engineering

In [None]:
# Pull the honorific out of each name: the run of letters that follows a space
# and is terminated by a period (e.g. "Braund, Mr. Owen" -> "Mr").
# The assignment is vectorised, so it runs once; the original wrapped it in
# `for data in df`, which iterates column labels and repeated the same work for
# every column. The pattern is a raw string so `\.` is not an invalid escape.
df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

df['Title'].value_counts()

In [None]:
# Integer codes for the categorical columns.
# Titles: the three common ones get their own code, everything else shares 3.
title_mapping = {"Mr": 0, "Miss": 1, "Mrs": 2,
                 "Master": 3, "Dr": 3, "Rev": 3, "Col": 3, "Major": 3, "Mlle": 3,"Countess": 3,
                 "Ms": 3, "Lady": 3, "Jonkheer": 3, "Don": 3, "Dona" : 3, "Mme": 3,"Capt": 3,"Sir": 3 }

sex_mapping = {"male": 0, "female": 1}

embarked_mapping = {'S':0,'C':1,'Q':2}

# Each `.map` call is vectorised over the whole column, so no loop is needed
# (the original looped over the column labels and redid all three per column).
# Values missing from a mapping — including NaN — come out as NaN.
df['title'] = df["Title"].map(title_mapping)
df['sex'] = df['Sex'].map(sex_mapping)
df['embarked'] = df['Embarked'].map(embarked_mapping)

In [None]:
# Remove the raw text columns now that their encoded versions exist, plus Cabin.
# `columns=` already selects the axis, so no separate `axis` argument is passed.
raw_columns = ['Name', 'Title', 'Sex', 'Cabin', 'Embarked']
df.drop(columns=raw_columns, inplace=True)

In [None]:
# Fill missing ages with the median age of passengers sharing the same title code.
# Assign the result back to the column: calling `fillna(..., inplace=True)` on
# `df["Age"]` acts on an intermediate object and leaves `df` untouched when
# pandas Copy-on-Write is active.
median_age_by_title = df.groupby("title")["Age"].transform("median")
df["Age"] = df["Age"].fillna(median_age_by_title)

In [None]:
# Bucket Age into ordinal bands, using right-closed intervals:
#   <=15 -> 0, (15, 30] -> 1, (30, 45] -> 2, (45, 60] -> 3, >60 -> 4
# Vectorised with `pd.cut` instead of a row-by-row `df['Age'][i] = ...` loop:
# that chained assignment is a no-op under pandas Copy-on-Write, and it relied
# on the index labels being exactly 0..len(df)-1.
age_bin_edges = [-float('inf'), 15, 30, 45, 60, float('inf')]
df['Age'] = (
    pd.cut(df['Age'], bins=age_bin_edges, labels=False)
    .fillna(4)      # the original `else` branch also sent missing ages to band 4
    .astype(float)  # keep the column's float dtype
)

In [None]:
# Bucket Fare into ordinal bands, using right-closed intervals:
#   <=25 -> 0, (25, 50] -> 1, (50, 65] -> 2, >65 -> 3
# Vectorised with `pd.cut` instead of a row-by-row `df['Fare'][i] = ...` loop:
# that chained assignment is a no-op under pandas Copy-on-Write, and it relied
# on the index labels being exactly 0..len(df)-1.
fare_bin_edges = [-float('inf'), 25, 50, 65, float('inf')]
df['Fare'] = (
    pd.cut(df['Fare'], bins=fare_bin_edges, labels=False)
    .fillna(3)      # the original `else` branch also sent missing fares to band 3
    .astype(float)  # keep the column's float dtype
)

In [None]:
# Discard the rows whose encoded embarkation port is missing.
df = df.dropna(subset=['embarked'])

In [None]:
# Re-check the per-column missing counts after cleaning.
missing_values_count = df.isna().sum()
missing_values_count

# 4. Data preparation

In [None]:
# Feature matrix: everything except the identifier, the target and the ticket string.
non_feature_columns = ['PassengerId', 'Survived', 'Ticket']
X = df.drop(columns=non_feature_columns)

# Target vector.
y = df['Survived']

In [None]:
# Rebalance the classes with SMOTE over-sampling followed by ENN cleaning.
# A fixed seed makes the synthetic samples (and every score below) reproducible.
# NOTE(review): the whole dataset is resampled before the train/test split, so
# the held-out rows are themselves resampled and can share synthetic neighbours
# with the training rows; scores are likely optimistic. Consider splitting
# first and resampling only the training portion.
smoteenn = SMOTEENN(random_state=42)
X_new, y_new = smoteenn.fit_resample(X, y)

In [None]:
# Hold out 20% of the resampled data for evaluation.
# `random_state` fixes the shuffle so results are repeatable between runs;
# `stratify` keeps the class proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X_new, y_new, test_size=0.2, random_state=42, stratify=y_new
)

print("The size of the input train data is: {}".format(X_train.shape))
print("The size of the output train data is: {}".format(y_train.shape))
print("The size of the input test data is: {}".format(X_test.shape))
print("The size of the output test data is: {}".format(y_test.shape))

# 5. Build ML Models

In [None]:
# Candidate classifiers, all with default hyper-parameters.
# Every estimator gets the same fixed seed so the comparison table is
# reproducible from one run to the next.
MODEL_SEED = 42

models = {
    'RF' : RandomForestClassifier(random_state=MODEL_SEED),
    'AdaBoost': AdaBoostClassifier(random_state=MODEL_SEED),
    'GradBoost': GradientBoostingClassifier(random_state=MODEL_SEED),
    'XGB' : XGBClassifier(random_state=MODEL_SEED),
    'LGBM' : LGBMClassifier(random_state=MODEL_SEED)
}

# name -> metrics measured on the held-out split
results = {}

for name, model in models.items():
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred, output_dict=True)

    # Precision/recall/F1 are reported for the positive class only; the report
    # dict is keyed by the string form of each label, hence '1'.
    positive_class = classification_rep['1']
    results[name] = {
        'Accuracy': accuracy,
        'Precision': positive_class['precision'],
        'Recall': positive_class['recall'],
        'F1': positive_class['f1-score']
    }

# One row per model, one column per metric.
results_df = pd.DataFrame(results).T

In [None]:
# Display the per-model metrics table (rows: models, columns: metrics).
results_df

In [None]:
# Standalone LightGBM run with hand-picked hyper-parameters, scored on the
# held-out split.
# NOTE(review): learning_rate=1.4 is far above commonly used values — confirm
# it is intentional and not a typo.
model = LGBMClassifier(
    learning_rate=1.4,
    max_depth=11,
    min_data_in_leaf=22,
)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("accuracy =", accuracy_score(y_test, y_pred))