# **Library Import**

In [374]:
# EDA
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# Linear
from sklearn.linear_model import LogisticRegression
# DT
from sklearn.tree import DecisionTreeClassifier
# Neighbour
from sklearn.neighbors import KNeighborsClassifier
# SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
# Train Test
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix


import warnings
# NOTE(review): with no category or module given, this filter silences every
# warning for the rest of the notebook, which would include any convergence or
# deprecation warnings raised by the models below. Consider narrowing it.
warnings.filterwarnings('ignore')

# **Exercise Q's**
1. Gunakan data titanic seaborn
2. Lakukan EDA untuk melihat anomali & tanggulangi
3. Tentukan kolom mana saja yang akan dipakai (x and y included)
4. Buatlah model menggunakan:
- logistic regression
- decision tree
- KNN
- Support Vector Classifier
5. Dengan menggunakan accuracy score sebagai metrik, tentukan model mana yang paling baik untuk menentukan korban selamat atau tidak.

# **Background**

The Titanic dataset, frequently employed in data analysis using Seaborn, offers insights into passenger information and survival outcomes. It serves as a valuable resource for studying survival patterns, class disparities, and cabin locations. Additionally, it is widely used for honing data visualization and predictive modeling skills, making it a popular choice among data analysis enthusiasts.

## **Ex's 1 (Data Load)**

In [375]:
# Load the Titanic passenger dataset through seaborn and preview the first
# five rows.
ttnc = sns.load_dataset("titanic")
ttnc.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


## **Ex's 2 (Data Cleaning)**

In [376]:
# Inspect dtypes and non-null counts to spot columns with missing values.
ttnc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [377]:
# Remove every column that is not used for modelling in one non-mutating call
# instead of nine separate in-place drops.
#   - deck is mostly missing (203 of 891 non-null in the info() output).
#   - alive mirrors the `survived` target in the preview rows, so keeping it
#     would presumably leak the label -- TODO confirm on the full frame.
#   - pclass appears to duplicate `class`, which is kept.
#   - the remaining columns are simply not selected as features here.
ttnc = ttnc.drop(
    columns=[
        'deck',
        'pclass',
        'alive',
        'sibsp',
        'parch',
        'adult_male',
        'who',
        'embarked',
        'embark_town',
    ]
)

In [378]:
# NOTE(review): the disabled line below would overwrite *every* age with the
# column median rather than fill only the missing ones; if imputation is wanted
# use ttnc['age'].fillna(ttnc['age'].median()) instead.
# ttnc['age'] = ttnc['age'].median()
# Drop every row that still contains a missing value. After the column drops
# above, `age` is the only remaining column with nulls (714 of 891 non-null),
# so this removes the passengers whose age is unknown.
ttnc.dropna(inplace=True)

In [379]:
# Store age as whole years. The float-to-int cast truncates the fractional
# part, so ages below one year become 0.
ttnc = ttnc.astype({'age': int})

In [380]:
# List fares from highest to lowest to look for extreme values.
ttnc['fare'].sort_values(ascending=False)

679    512.3292
258    512.3292
737    512.3292
438    263.0000
341    263.0000
         ...   
806      0.0000
822      0.0000
302      0.0000
263      0.0000
271      0.0000
Name: fare, Length: 714, dtype: float64

In [381]:
# Keep only passengers whose fare is strictly below 100; the sorted fares
# above show a small number of much larger values (up to 512.33).
fare_below_cap = ttnc['fare'] < 100
ttnc = ttnc[fare_below_cap]

In [382]:
# Summary statistics of the numeric columns after cleaning, to confirm the
# row count and the value ranges of age and fare.
ttnc.describe()

Unnamed: 0,survived,age,fare
count,666.0,666.0,666.0
mean,0.381381,29.516517,23.572823
std,0.486091,14.529841,21.266922
min,0.0,0.0,0.0
25%,0.0,20.0,8.05
50%,0.0,28.0,13.92915
75%,1.0,38.0,29.09375
max,1.0,80.0,93.5


In [383]:
# Display the cleaned frame; pandas truncates the rendering to the first and
# last rows.
# NOTE(review): ttnc.head() would give a shorter, equally informative preview.
ttnc

Unnamed: 0,survived,sex,age,fare,class,alone
0,0,male,22,7.2500,Third,False
1,1,female,38,71.2833,First,False
2,1,female,26,7.9250,Third,True
3,1,female,35,53.1000,First,False
4,0,male,35,8.0500,Third,True
...,...,...,...,...,...,...
885,0,female,39,29.1250,Third,False
886,0,male,27,13.0000,Second,True
887,1,female,19,30.0000,First,True
889,1,male,26,30.0000,First,True


## **Ex's 3 (Train Test)**

In [384]:
# Columns remaining after cleaning: the target (`survived`) plus the candidate
# features.
ttnc.columns

Index(['survived', 'sex', 'age', 'fare', 'class', 'alone'], dtype='object')

In [385]:
# One-hot encode the categorical features as 0/1 integer columns so the
# scikit-learn estimators can consume them. drop_first=True omits the first
# level of each encoded column.
categorical_columns = ['sex', 'class', 'alone']
ttnc = pd.get_dummies(
    data=ttnc,
    columns=categorical_columns,
    drop_first=True,
    dtype=int,
)

In [386]:
# Separate the target from the predictors, then hold out 20% of the rows for
# evaluation. The split is seeded for reproducibility and stratified on the
# target so both partitions keep the same class balance.

y = ttnc['survived']  # Target
x = ttnc.drop(columns='survived')  # Features

xtrain, xtest, ytrain, ytest = train_test_split(
    x, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

## **Ex's 4 (Machine Learning Modeling)**

### Logistic Regression

In [387]:
# Fit a logistic regression with scikit-learn's default settings on the
# training split, then report per-class metrics on the held-out split.
logreg = LogisticRegression().fit(xtrain, ytrain)
pred = logreg.predict(xtest)
print(classification_report(y_true=ytest, y_pred=pred))

              precision    recall  f1-score   support

           0       0.85      0.93      0.89        83
           1       0.86      0.73      0.79        51

    accuracy                           0.85       134
   macro avg       0.85      0.83      0.84       134
weighted avg       0.85      0.85      0.85       134



### Decision Tree

In [388]:
# Fit a decision tree with default hyperparameters and report per-class
# metrics on the held-out split.
# random_state is fixed (matching the seed used for the train/test split) so
# the fitted tree, and therefore the report, is identical on every
# Restart & Run All; scikit-learn permutes the candidate features at each
# split, so an unseeded tree can differ between runs.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(xtrain, ytrain)
pred_tree = dt.predict(xtest)
print(classification_report(ytest, pred_tree))

              precision    recall  f1-score   support

           0       0.81      0.73      0.77        83
           1       0.63      0.73      0.67        51

    accuracy                           0.73       134
   macro avg       0.72      0.73      0.72       134
weighted avg       0.74      0.73      0.73       134



### K-Nearest-Neighbour

In [389]:
# Fit a 3-nearest-neighbour classifier and report per-class metrics on the
# held-out split.
# NOTE(review): the features are passed unscaled here, whereas the SVC cell
# wraps its model in a StandardScaler pipeline. KNN is distance-based, so
# the wide-ranging age and fare columns likely dominate the distances;
# consider the same pipeline for this model.
neigh = KNeighborsClassifier(n_neighbors=3).fit(xtrain, ytrain)
pred_neigh = neigh.predict(xtest)
print(classification_report(y_true=ytest, y_pred=pred_neigh))

              precision    recall  f1-score   support

           0       0.68      0.75      0.71        83
           1       0.51      0.43      0.47        51

    accuracy                           0.63       134
   macro avg       0.60      0.59      0.59       134
weighted avg       0.62      0.63      0.62       134



### Support Vector Classifier

In [390]:
# Standardize the features and fit a support vector classifier in a single
# pipeline, so the scaler is learned from the training split only and then
# re-applied to the held-out split at prediction time.
scaled_svc = make_pipeline(StandardScaler(), SVC())
clf = scaled_svc.fit(xtrain, ytrain)
pred_clf = clf.predict(xtest)
print(classification_report(y_true=ytest, y_pred=pred_clf))

              precision    recall  f1-score   support

           0       0.80      0.96      0.87        83
           1       0.91      0.61      0.73        51

    accuracy                           0.83       134
   macro avg       0.86      0.79      0.80       134
weighted avg       0.84      0.83      0.82       134



## **Ex's 5 (Matrix and Conclusion)**

### Confusion Matrix

In [391]:
# Print one confusion matrix per model, in this order: logistic regression,
# decision tree, KNN, SVC. Rows are the true classes and columns the
# predicted classes.
for model_predictions in (pred, pred_tree, pred_neigh, pred_clf):
    print(confusion_matrix(ytest, model_predictions))

[[77  6]
 [14 37]]
[[61 22]
 [14 37]]
[[62 21]
 [29 22]]
[[80  3]
 [20 31]]


### Conclusion

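- Logistic Regression (pred) demonstrates the strongest overall performance, with the highest accuracy (0.85) and the highest macro- and weighted-average F1-scores (0.84 and 0.85) among the models evaluated. It does not lead on every per-class metric: SVC has higher precision for survivors (0.91 vs. 0.86) and higher recall for non-survivors (0.96 vs. 0.93).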

- SVC (pred_clf) also performs well (accuracy 0.83), exhibiting notably high precision for survivors (0.91), although its recall for survivors (0.61) is lower than that of Logistic Regression (0.73).

- DT (pred_tree), while having a respectable precision score, lags behind Logistic Regression and SVC in terms of accuracy and F1-score.

- KNN (pred_neigh) trails behind the others with the lowest accuracy (0.63) and the lowest macro-averaged precision, recall, and F1-score.

Considering these metrics, and accuracy in particular as the exercise requires, Logistic Regression (pred) emerges as the "top-performing model" for general use. However, it's important to keep in mind that the choice of the "better" model should align with specific objectives and the balance between precision and recall.