In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for file_path in (os.path.join(root, file_name) for file_name in files):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Import Data

## Read CSV

In [None]:
# Location of the Titanic training data inside the Kaggle input directory.
TRAIN_CSV = "/kaggle/input/data-set-titanic/train.csv"

# Read the CSV into a DataFrame and display it as the cell output.
df = pd.read_csv(TRAIN_CSV)
df

## Statistik Deskriptif

In [None]:
# Descriptive statistics for every column; include='all' also covers the non-numeric ones.
df.describe(include='all')

## Cek missing value

In [None]:
# Count the missing (null) values in each column.
df.isna().sum()

# Preprocessing

## Isi Data kosong pada kolom age


In [None]:
# Square brackets select a column. fillna() replaces the missing entries,
# here with the median (middle value) of the ages that are present.
# NOTE(review): the median is taken over all rows, i.e. before the train/test
# split done later in the notebook - confirm that is acceptable.
age_median = df['Age'].median()
df['Age'] = df['Age'].fillna(age_median)

## Isi Data kosong pada Kolom Embarked

In [None]:
# Embarked is the port a passenger boarded from. mode() returns the most
# frequent value(s); the first one is used to fill the gaps.
# (Per the original author's note this is 'S', Southampton.)
most_common_port = df['Embarked'].mode().iloc[0]
df['Embarked'] = df['Embarked'].fillna(most_common_port)

## Menghapus kolom yang tidak diperlukan

In [None]:
# Cabin and Ticket are not used in the rest of the analysis, so remove them.
# The result is rebound instead of using inplace=True, and errors='ignore'
# skips labels that are already absent, so re-running this cell does not
# raise KeyError once the columns are gone.
df = df.drop(columns=['Cabin', 'Ticket'], errors='ignore')

In [None]:
# Re-count the missing values per column after the preprocessing steps above.
df.isna().sum()

# EDA (Exploratory Data Analysis / Analisis Data Eksploratif)

## Import Library

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

## Analisa distribusi data kategori

In [None]:
# One count plot per categorical column, each shown as its own figure.
countplot_specs = [
    ('Survived', 'Distribusi Survived'),
    ('Pclass', 'Distribusi Kelas Penumpang'),
    ('Sex', 'Distribusi Gender Penumpang'),
]

for column, title in countplot_specs:
    sns.countplot(x=column, data=df)
    plt.title(title)
    plt.show()

## Analisa barplot antar Fitur

In [None]:
# Bar plot of Survived against each listed feature, one figure per feature.
barplot_specs = [
    ('Sex', 'Kelangsungan Hidup Berdasarkan Jenis Kelamin'),
    ('Pclass', 'Kelangsungan Hidup Berdasarkan Kelas Penumpang'),
    ('SibSp', 'Kelangsungan Hidup Berdasarkan Jumlah Saudara/Kerabat'),
    ('Parch', 'Kelangsungan Hidup Berdasarkan Jumlah Anak/Orang Tua'),
    ('Embarked', 'Kelangsungan Hidup Berdasarkan Embarked'),
]

for column, title in barplot_specs:
    sns.barplot(x=column, y='Survived', data=df)
    plt.title(title)
    plt.show()

## Analisa histogram

In [None]:
# Histogram of Age with a KDE overlay, coloured by the Survived column.
age_hist_ax = sns.histplot(data=df, x='Age', hue='Survived', kde=True)
age_hist_ax.set_title('Distribution of Age with Survived')
plt.show()


Anak-anak di bawah 10 tahun lebih berpeluang selamat, bisa jadi karena diprioritaskan untuk diselamatkan menggunakan sekoci

## Analisa violin

In [None]:
# Violin plot of the age distribution for each survival outcome.
# `split` is an option for hue-nested violins; no hue is set here, so it is
# left out to get plain, full violins.
# NOTE(review): seaborn versions differ in how they treat split=True without
# a hue (ignored vs. half violins) - confirm against the installed version.
sns.violinplot(x='Survived', y='Age', data=df)
plt.title('Distribusi Umur Berdasarkan Status Kelangsungan Hidup')

# Put y-axis ticks only at multiples of 10, reaching just past the maximum age.
age_tick_stop = int(df['Age'].max()) + 10
plt.yticks(range(0, age_tick_stop, 10))
plt.show()


Penumpang berusia 30 tahun ke bawah lebih berpeluang selamat

In [None]:
# Box plot of Age per survival outcome; also useful for spotting outliers.
age_box_ax = sns.boxplot(x='Survived', y='Age', data=df)
age_box_ax.set_title('Boxplot Umur Berdasarkan Status Kelangsungan Hidup')
plt.show()

Area box Survived = 1 (Selamat) lebih rendah dibandingkan yang tidak selamat

## Analisa korelasi antara fitur numerik

In [None]:
# Pairwise correlation of the numeric columns, drawn as an annotated heatmap.
corr_matrix = df.corr(numeric_only=True)

plt.figure(figsize=(10, 8))  # figure width and height
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

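- Pclass dan Fare memiliki korelasi negatif: semakin kecil angka Pclass (kelas semakin tinggi), semakin tinggi Fare-nya
- Di antara fitur numerik pada heatmap ini, Pclass memiliki korelasi paling kuat dengan Survived (Sex dan Embarked belum di-encode sehingga belum ikut dihitung)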

# Feature Engineering

### Mengubah Data Kategorik menjadi Numerik

In [None]:
from sklearn.preprocessing import LabelEncoder
#Label Encoder untuk mengubah data kategorik menjadi numerik

In [None]:
# Convert the Sex and Embarked columns from categories to integer codes.
# Each column gets its own LabelEncoder: refitting a single shared encoder
# would discard the fitted classes of the first column, so its codes could
# no longer be mapped back with inverse_transform.
# NOTE(review): integer codes impose an order on Embarked; consider one-hot
# encoding if the downstream models should not see one.
encoders = {}
for column in ['Sex', 'Embarked']:
    encoders[column] = LabelEncoder()
    df[column] = encoders[column].fit_transform(df[column])

# Keep `le` bound to the Embarked encoder, which is what it ended up holding
# when one shared encoder was fitted on Sex and then on Embarked.
le = encoders['Embarked']
df.head()

## Feature Selection

In [None]:
# Predictor columns used for modelling, and the target column.
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
x = df.loc[:, features]
y = df.loc[:, 'Survived']



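- Alternatif: hapus dulu semua kolom yang tidak digunakan, lalu ambil X dengan membuang kolom Survived (PassengerId ikut dihapus agar X sama dengan daftar `features` di atas)
- df.drop(['Cabin', 'Ticket', 'Name', 'PassengerId'], axis=1, inplace=True)
- X = df.drop('Survived', axis=1)
- y = df['Survived']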

In [None]:
# Preview the first rows of the feature matrix.
x.head()


In [None]:
# Preview the first values of the target column.
y.head()

## Train Test Split, Memisahkan data untuk dilatih dan diuji

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffled split repeatable across runs.
split_options = {'test_size': 0.2, 'random_state': 42}
x_train, x_test, y_train, y_test = train_test_split(x, y, **split_options)

- test_size=0.2 berarti 20% data dipakai sebagai data uji dan 80% sisanya sebagai data latih
- random_state=42 menetapkan seed pengacakan, sehingga pembagian data selalu sama setiap kali kode dijalankan ulang

### Data Train

In [None]:
# Display the training features.
x_train

In [None]:
# Display the training target values.
y_train

### Data test

In [None]:
# Display the test features.
x_test

In [None]:
# Display the test target values.
y_test

# Modelling

## Membangun classification model menggunakan Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees and a fixed seed, fitted on the training split.
forest_params = {'n_estimators': 100, 'random_state': 42}
model = RandomForestClassifier(**forest_params)
model.fit(x_train, y_train)

# Hasil Perbandingan Model Random Forest, Regresi Logistik, K-Nearest Neighbors, dan SVM

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Candidate classifiers, keyed by the label used in the printed report.
# Logistic regression, k-NN and the SVM are sensitive to the scale of the
# input features, so each is wrapped in a pipeline that standardises the
# features first. The scaler is fitted inside the pipeline, i.e. on the
# training split only. The random forest is used on the raw features.
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Logistic Regression': make_pipeline(
        StandardScaler(), LogisticRegression(random_state=42)
    ),
    'K-Nearest Neighbors': make_pipeline(
        StandardScaler(), KNeighborsClassifier()
    ),
    'Support Vector Machine': make_pipeline(
        StandardScaler(), SVC(random_state=42)
    ),
}

# Train every candidate on the training split and evaluate it on the test split.
# The loop variable is deliberately not named `model`, so the random forest
# fitted under that name in the previous section is not overwritten.
for name, estimator in models.items():
    estimator.fit(x_train, y_train)
    y_pred = estimator.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'{name} Accuracy: {accuracy:.4f}')
    print(classification_report(y_test, y_pred))
    print('------------------------------------------------')
