In [1]:
from tools import *

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Initialize Gaussian Naïve Bayes
gnb = GaussianNB()

# K-Fold Cross Validation (Stratified)
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

cv_scores = []
cv_log_losses = []
for train_idx, val_idx in kf.split(X_train, y_train):
    X_train_fold, X_val_fold = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]

    # Train Naïve Bayes
    model = GaussianNB()
    model.fit(X_train_fold, y_train_fold)

    # Validate model
    y_val_pred = model.predict(X_val_fold)
    acc = accuracy_score(y_val_fold, y_val_pred)
    cv_scores.append(acc)

print(f"Cross-validation accuracy: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")

final_model = GaussianNB()
final_model.fit(X_train, y_train)

# Predict on test set
y_test_pred = final_model.predict(X_test)

# Convert predictions back to original labels
y_test_pred_labels = label_encoder.inverse_transform(y_test_pred)

submission = pd.DataFrame({"id": test_raw.index, "y": y_test_pred_labels})
submission.to_csv("submission_gnb.csv", index=False)
print("Submission file saved as submission_gnb.csv")

Cross-validation accuracy: 0.6500 ± 0.0087
Submission file saved as submission_gnb.csv 🚀


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest, mutual_info_classif

# Feature Selection using Mutual Information
selector = SelectKBest(mutual_info_classif, k=8)  # Selecting top 8 features
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

# Get selected feature names
selected_features = np.array(features)[selector.get_support()]
print("Selected Features:", selected_features)

# Initialize Gaussian Naïve Bayes
gnb = GaussianNB()

# K-Fold Cross Validation (Stratified)
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []

for train_idx, val_idx in kf.split(X_train_selected, y_train):
    X_train_fold, X_val_fold = X_train_selected[train_idx], X_train_selected[val_idx]
    y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]

    # Train model
    model = GaussianNB()
    model.fit(X_train_fold, y_train_fold)

    # Validate model
    y_val_pred = model.predict(X_val_fold)
    acc = accuracy_score(y_val_fold, y_val_pred)
    cv_scores.append(acc)

print(f"Cross-validation accuracy after feature selection: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")

# Train final model on full training data
final_model = GaussianNB()
final_model.fit(X_train_selected, y_train)

# Predict on test set
y_test_pred = final_model.predict(X_test_selected)

# Convert predictions back to original labels
y_test_pred_labels = label_encoder.inverse_transform(y_test_pred)

# Save submission file
submission = pd.DataFrame({"id": test_raw.index, "y": y_test_pred_labels})
submission.to_csv("submission_gnb_selected.csv", index=False)
print("Submission file saved as submission_gnb_selected.csv")


Selected Features: ['x2' 'x3' 'x4' 'x6' 'x8' 'x9' 'x10' 'x11']
Cross-validation accuracy after feature selection: 0.6572 ± 0.0074
Submission file saved as submission_gnb_selected.csv
