<a href="https://colab.research.google.com/github/martharegina/machinelearning/blob/main/titanic_predictions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
import pandas as pd
import math
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import RFE

In [8]:
# Load the training and test CSVs from the hard-coded /content paths.
train_data = pd.read_csv(filepath_or_buffer='/content/train.csv')
test_data = pd.read_csv(filepath_or_buffer='/content/test.csv')

In [9]:
# Add a few engineered features

def is_alone(row):
  """Return 1 when both `SibSp` and `Parch` of `row` are zero, otherwise 0.

  `row` is anything indexable by the keys 'SibSp' and 'Parch'
  (e.g. a DataFrame row passed by `df.apply(..., axis=1)`).
  """
  has_no_family = row['SibSp'] == 0 and row['Parch'] == 0
  return 1 if has_no_family else 0

def add_features(df):
  """Return a copy of `df` with `Title`, `FamilySize` and `IsAlone` added.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain the columns 'Name', 'SibSp' and 'Parch'.

  Returns
  -------
  pandas.DataFrame
      A new frame (the input is not modified) with three extra columns:
      - 'Title': the first run of letters in 'Name' that is preceded by a
        space and followed by a dot (NaN when nothing matches).
      - 'FamilySize': 'SibSp' + 'Parch'.
      - 'IsAlone': 1 where both 'SibSp' and 'Parch' are 0, else 0.
  """
  df = df.copy()
  df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
  df['FamilySize'] = df['SibSp'] + df['Parch']
  # Vectorised form of the per-row "SibSp == 0 and Parch == 0" check: same
  # values as a row-wise apply, but without a Python call per row and with
  # well-defined behaviour on an empty frame.
  df['IsAlone'] = ((df['SibSp'] == 0) & (df['Parch'] == 0)).astype('int64')
  return df

# Run the same feature engineering on both data sets.
train_data, test_data = add_features(train_data), add_features(test_data)

In [10]:
# Drop the columns that are not used as features.
# errors='ignore' keeps this cell re-runnable: on a second execution the
# columns are already gone and a plain drop() would raise KeyError.
unnecessary_columns = ['Cabin', 'Ticket']
train_data = train_data.drop(columns=unnecessary_columns, errors='ignore')
test_data = test_data.drop(columns=unnecessary_columns, errors='ignore')

In [11]:
# Separate the target column from the features, then hold out 20% of the
# rows as a validation split (fixed seed for a repeatable split).

X = train_data.drop(columns=['Survived'])
y = train_data['Survived']

X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Fill missing ages with the mean age of the passenger's title.
# The per-title means are computed from X_train only.

title_age_mean = X_train['Age'].groupby(X_train['Title']).mean()

def fill_age(row, age_by_title=None):
  """Return the row's age, imputing a missing value from its title.

  Parameters
  ----------
  row : mapping-like (e.g. a DataFrame row)
      Must provide 'Age' and 'Title'.
  age_by_title : mapping-like with `.get`, optional
      Lookup from title to replacement age. Defaults to the module-level
      `title_age_mean`.

  Returns
  -------
  The original 'Age' when it is present; otherwise the lookup value for the
  row's title. If the title is not in the lookup (e.g. a title that only
  occurs outside the training split, or a missing title) NaN is returned
  instead of raising KeyError.
  """
  if not pd.isna(row['Age']):
    return row['Age']
  lookup = title_age_mean if age_by_title is None else age_by_title
  # .get() avoids a KeyError for titles the lookup has never seen.
  return lookup.get(row['Title'], float('nan'))

# Impute missing ages in every split with fill_age.
# assign() builds a new frame rather than writing a column into the frames
# returned by train_test_split, which avoids pandas' SettingWithCopyWarning.
X_train = X_train.assign(Age=X_train.apply(fill_age, axis=1))
X_val = X_val.assign(Age=X_val.apply(fill_age, axis=1))
test_data = test_data.assign(Age=test_data.apply(fill_age, axis=1))

In [25]:
# Choose the best model: compare several classifiers by mean 5-fold
# cross-validated accuracy on the training split.

num_cols = ['Age', 'SibSp', 'Parch', 'Fare', 'FamilySize']
cat_cols = ['Pclass', 'Sex', 'Embarked', 'Title', 'IsAlone']

# Numeric features: fill gaps with the median, then standardise.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode.
# NOTE(review): with drop='first' together with handle_unknown='ignore', a
# category not seen during fit is encoded as all zeros, i.e. the same as the
# dropped first category. Confirm this is acceptable for rare titles.
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False))
])

# Only the columns listed in num_cols / cat_cols are passed to the model.
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])

# NOTE(review): rfe is created here but is not part of any pipeline in this
# cell; kept so that the name stays defined.
rfe = RFE(estimator=RandomForestClassifier(n_estimators=100, random_state=42), n_features_to_select=10)

# The tree ensembles are seeded so that the comparison is reproducible from
# one run to the next.
models = {
    'Logistic Regression': LogisticRegression(),
    'SVM': SVC(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier()
}

results = {}

for name, model in models.items():
  # Preprocessing lives inside the pipeline, so it is re-fitted on each
  # cross-validation training fold.
  my_pipeline = Pipeline(steps=[
      ('preprocessor', preprocessor),
      ('model', model)
  ])

  scores = cross_val_score(my_pipeline, X_train, y_train, cv=5, scoring='accuracy')
  results[name] = scores.mean()

for model_name, accuracy in results.items():
  print(f'{model_name}: {accuracy}')



Logistic Regression: 0.8272234807446075
SVM: 0.8314094356347876
Random Forest: 0.7977740569289866
Gradient Boosting: 0.8215798286220821
K-Nearest Neighbors: 0.7920713089727174


In [24]:
# SVM pipeline: tune the SVC hyper-parameters with a 5-fold grid search
# scored by accuracy.

# Only predict() is called on the fitted model in this notebook, so
# probability=True is not enabled: it adds an internal 5-fold calibration to
# every fit and does not change what predict() returns.
svm_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', SVC())
])

# gamma is ignored by the linear kernel, so it is searched for rbf only.
# This removes 12 redundant candidates (60 fits) from the search.
param_grid = [
    {
        'model__kernel': ['rbf'],
        'model__C': [0.1, 1, 10, 100],
        'model__gamma': ['scale', 0.01, 0.1, 1],
    },
    {
        'model__kernel': ['linear'],
        'model__C': [0.1, 1, 10, 100],
    },
]

grid = GridSearchCV(svm_pipeline, param_grid, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)

print('Best parameters: ', grid.best_params_)
print('Best score: ', grid.best_score_)



Best parameters:  {'model__C': 100, 'model__gamma': 0.01, 'model__kernel': 'rbf'}
Best score:  0.8314192849404117


In [27]:
# Score the best pipeline found by the grid search on the validation split.

best_svm = grid.best_estimator_
preds = best_svm.predict(X_val)
val_accuracy = accuracy_score(y_true=y_val, y_pred=preds)
print('Validation accuracy: ', val_accuracy)

Validation accuracy:  0.8212290502793296




In [28]:
# Predict on test_data and write the PassengerId/Survived pairs to a CSV.

preds_test = best_svm.predict(test_data)

output = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': preds_test,
})
output.to_csv('submission_titanic.csv', index=False)

