In [2]:
import pandas as pd
import numpy as np

# 1. Business Understanding
Source : https://www.kaggle.com/competitions/titanic

# 2. Data Understanding

In [3]:
df = pd.read_csv('titanic.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


__Variable and	Definition__
1. survival :	Survival	(0 = No, 1 = Yes)
2. pclass :	Ticket class	(1 = 1st, 2 = 2nd, 3 = 3rd)
3. sex :	Sex
4. Age :	Age in years
5. sibsp :	# of siblings / spouses aboard the Titanic
6. parch :	# of parents / children aboard the Titanic
7. ticket :	Ticket number
8. fare :	Passenger fare
9. cabin :	Cabin number
10. embarked :	Port of Embarkation	(C = Cherbourg, Q = Queenstown, S = Southampton)

# 3. Data Preparation

## 3.1. Handling Missing Value
Secara umum, ada 2 approach yang dipakai untuk handling missing value, diantaranya :
- drop kolom/baris yang ada missing value-nya
- impute missing value based on assumption, biasanya jika maximum missing value < 30%, selainnya di drop saja kolomnya.

In [4]:
100*df.isna().sum()/len(df)

PassengerId     0.000000
Survived        0.000000
Pclass          0.000000
Name            0.000000
Sex             0.000000
Age            19.865320
SibSp           0.000000
Parch           0.000000
Ticket          0.000000
Fare            0.000000
Cabin          77.104377
Embarked        0.224467
dtype: float64

Action to take :
- kolom `Cabin` di drop karena missing valuenya > 30%
- `Age` dan `Embarked` di impute

### Tips For Missing Value dengan Mean, Median, dan Modus

__Mean__
<br>Jika datanya numerical dan berdistribusi normal, dimana nilai-nilainya terkonsentrasi di sekitar mean, maka mensubtitusi missing value dengan mean adalah pilihan terbaik Namun dalam beberapa kondisi mean akan menjadi pilihan yang kurang baik bahkan tidak tepat. Misalnya ketika distribusinya tidak normal (nilai-nilainya terkonsentrasi di bagian tertentu) atau adanya outlier yang dapat mempengaruhi mean.

__Median__
<br>Jika datanya numerical dan apabila terdapat skew yang significant pada data ataupun memiliki outlier yang cukup berpengaruh pada distribusi data, maka menggunakan median pada kasus ini lebih tepat untuk meminimalisir magnitude dari skew dan outlier tersebut.

__Modus__
<br>Digunakan pada numerical dengan variansi yang kecil atau tipe data categorical, atau dapat diisi dengan "MISSING".



> Note :<br>Tujuan pemakaiannya semata-mata hanya untuk mempertahankan distribusi data, namun hasilnya tidak dapat divalidasi kebenarannya.

In [5]:
df = df.drop(columns = ['Cabin'])

In [6]:
median_age = np.nanmedian(df['Age'])
df['Age'].fillna(median_age,inplace=True)

In [7]:
modus_embarked = df['Embarked'].mode()[0]
df['Embarked'].fillna(modus_embarked,inplace=True)

In [8]:
df.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

## 3.2. Convert Categorical to Numerical Value

In [9]:
# OneHotEncoder

df = pd.get_dummies(df)

In [12]:
dropped_columns = ['PassengerId', 'Name', 'Ticket']

df = df.drop(columns = dropped_columns)

KeyError: "['Name', 'Ticket'] not found in axis"

## 3.3. Data Splitting

In [14]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,"Name_Abbing, Mr. Anthony","Name_Abbott, Mr. Rossmore Edward","Name_Abbott, Mrs. Stanton (Rosa Hunt)",...,Ticket_W./C. 14263,Ticket_W./C. 6607,Ticket_W./C. 6608,Ticket_W./C. 6609,Ticket_W.E.P. 5734,Ticket_W/C 14208,Ticket_WE/P 5735,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,22.0,1,0,7.25,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2,1,1,38.0,1,0,71.2833,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,1,3,26.0,0,0,7.925,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,4,1,1,35.0,1,0,53.1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,5,0,3,35.0,0,0,8.05,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [16]:
from sklearn.model_selection import train_test_split

In [17]:
X = df.drop(columns = ['Survived'])
y = df.Survived

In [18]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42, stratify=y)

In [19]:
y_train.value_counts(normalize=True)

0    0.616573
1    0.383427
Name: Survived, dtype: float64

In [20]:
y_test.value_counts(normalize=True)

0    0.614525
1    0.385475
Name: Survived, dtype: float64

# 4. Modeling

In [21]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [45]:
model_knn = KNeighborsClassifier(n_neighbors=10)
model_knn.fit(X_train,y_train)
model_knn.score(X_train,y_train)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


0.7078651685393258

In [46]:
model_knn.predict(X_train.iloc[0:2])

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


array([1, 0], dtype=int64)

# 5. Evaluation

In [28]:
from sklearn.metrics import classification_report

In [47]:
def model_performance(model) :
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    print(classification_report(y_train,y_train_pred))
    print(classification_report(y_test,y_test_pred))

In [48]:
model_performance(model_knn)

              precision    recall  f1-score   support

           0       0.69      0.94      0.80       439
           1       0.77      0.34      0.47       273

    accuracy                           0.71       712
   macro avg       0.73      0.64      0.63       712
weighted avg       0.72      0.71      0.67       712

              precision    recall  f1-score   support

           0       0.65      0.89      0.75       110
           1       0.59      0.25      0.35        69

    accuracy                           0.64       179
   macro avg       0.62      0.57      0.55       179
weighted avg       0.63      0.64      0.60       179



  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
