In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from data import load_data,save_result

In [None]:
X,y,df_test,passenger_ids,_=load_data()

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Models

In [6]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
models={
    "logistic":LogisticRegression(max_iter=1000),
    "SVC_linear":SVC(kernel='linear', probability=True, random_state=42),
    "SVC_rbf":SVC(kernel='rbf', probability=True, random_state=42),
    "NB":GaussianNB(),
    "RF":RandomForestClassifier(n_estimators=100, random_state=42) 
}

for name in models:
    model=models[name]
    print("Model: "+name)
    cv_scores = cross_val_score(model, X, y, cv=skf, scoring='accuracy')
    print("Cross-Validation Scores:", cv_scores)
    print("Mean Accuracy:", cv_scores.mean())

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)
    print("\nAccuracy:", accuracy)
    print("Classification Report:\n", class_report)

    # save results
    predictions  = model.predict(df_test)
    save_result(name, predictions,passenger_ids)

Model: logistic
Cross-Validation Scores: [0.81564246 0.82022472 0.82022472 0.84269663 0.85955056]
Mean Accuracy: 0.8316678174628084

Accuracy: 0.8100558659217877
Classification Report:
               precision    recall  f1-score   support

         0.0       0.84      0.83      0.84       105
         1.0       0.76      0.78      0.77        74

    accuracy                           0.81       179
   macro avg       0.80      0.81      0.80       179
weighted avg       0.81      0.81      0.81       179

Predictions saved to ./results/result_logistic.csv
Model: SVC_linear
Cross-Validation Scores: [0.79888268 0.81460674 0.8258427  0.81460674 0.85393258]
Mean Accuracy: 0.8215742891218379

Accuracy: 0.8212290502793296
Classification Report:
               precision    recall  f1-score   support

         0.0       0.84      0.86      0.85       105
         1.0       0.79      0.77      0.78        74

    accuracy                           0.82       179
   macro avg       0.82      0

## XGBoost

In [7]:
xgb_model = XGBClassifier(random_state=42, eval_metric='logloss')

param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.05, 0.1, 0.2]
}

grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    verbose=2,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_score = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Cross-Validation Accuracy:", best_score)

best_xgb_model = grid_search.best_estimator_
y_pred = best_xgb_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
print("\nAccuracy:", accuracy)
print("Classification Report:\n", class_report)

# Make predictions on the test dataset
predictions  = best_xgb_model.predict(df_test)
save_result("XG", predictions,passenger_ids)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best Parameters: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100}
Best Cross-Validation Accuracy: 0.8398601398601399

Accuracy: 0.8044692737430168
Classification Report:
               precision    recall  f1-score   support

         0.0       0.82      0.85      0.84       105
         1.0       0.77      0.74      0.76        74

    accuracy                           0.80       179
   macro avg       0.80      0.80      0.80       179
weighted avg       0.80      0.80      0.80       179

Predictions saved to ./results/result_XG.csv
