# 🚂 Shinkansen Travel Experience - Passenger Satisfaction Prediction
# 📚 Prepared for Google Colab

In [ ]:
# ---------------------------------------------
# 1. Setup
# ---------------------------------------------

!pip install lightgbm xgboost catboost openpyxl

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
import warnings
# Suppress every warning category for the remainder of the session.
warnings.filterwarnings('ignore')

# Read the five input CSVs. The tuple order below must match the order of the
# names on the left-hand side of the unpacking assignment.
csv_paths = (
    '/content/Traveldata_train.csv',
    '/content/Surveydata_train.csv',
    '/content/Traveldata_test.csv',
    '/content/Surveydata_test.csv',
    '/content/Sample_Submission.csv',
)
travel_train, survey_train, travel_test, survey_test, sample_submission = (
    pd.read_csv(csv_path) for csv_path in csv_paths
)

In [ ]:
# ---------------------------------------------
# 2. Merge Datasets
# ---------------------------------------------

# Join the travel records with the survey answers on the shared 'ID' key
# (pandas' default inner join), once for the training pair and once for the
# test pair.
train = travel_train.merge(survey_train, on='ID')
test = travel_test.merge(survey_test, on='ID')

# Report the merged shapes as a quick sanity check.
for message in (f"Train shape: {train.shape}", f"Test shape: {test.shape}"):
    print(message)

In [ ]:
# ---------------------------------------------
# 3. Data Preprocessing
# ---------------------------------------------

# Percentage of missing values per training column (only columns that have any),
# largest first.
missing = train.isnull().mean() * 100
missing = missing[missing > 0].sort_values(ascending=False)
print("Missing values:\n", missing)

# --- Imputation -------------------------------------------------------------
# Numeric columns are filled with the training mean, all other columns with the
# training mode. The same training-derived value is applied to the test frame
# so that both frames are imputed consistently and nothing is estimated from
# test data.
#
# 'ID' is skipped: it is an identifier that is dropped before modelling.
# Columns that exist only in `train` are filled in `train` alone; indexing
# `test` with such a column would raise KeyError. (The target
# 'Overall_Experience' appears to be one of these -- TODO confirm.)
#
# Values are assigned back to the frame instead of calling
# `df[col].fillna(..., inplace=True)`: that chained in-place form is deprecated
# in pandas 2.x and does not modify the frame under Copy-on-Write.
for col in train.columns:
    if col == 'ID':
        continue
    if pd.api.types.is_numeric_dtype(train[col]):
        fill_value = train[col].mean()
    else:
        fill_value = train[col].mode()[0]
    train[col] = train[col].fillna(fill_value)
    if col in test.columns:
        test[col] = test[col].fillna(fill_value)

# --- Label encoding ---------------------------------------------------------
# Every non-numeric column except 'ID' is mapped to integer codes.
cat_features = pd.Index(
    [c for c in train.columns
     if c != 'ID' and not pd.api.types.is_numeric_dtype(train[c])]
)
for col in cat_features:
    # A fresh encoder per column keeps the fitted classes of one column from
    # being silently overwritten by the next.
    le = LabelEncoder()
    if col in test.columns:
        # Fit on the categories of both frames so that a category that only
        # occurs in the test data does not make `transform` raise ValueError.
        le.fit(pd.concat([train[col], test[col]], ignore_index=True))
        train[col] = le.transform(train[col])
        test[col] = le.transform(test[col])
    else:
        train[col] = le.fit_transform(train[col])

In [ ]:
# ---------------------------------------------
# 4. Feature Engineering
# ---------------------------------------------

# Separate the target from the predictors; 'ID' is removed from both frames.
y = train['Overall_Experience']
train_features = train.drop(columns=['Overall_Experience', 'ID'])
test_features = test.drop(columns='ID')

# Standardise using statistics learned from the training predictors only, then
# apply the same transformation to both sets. After this step X and X_test are
# NumPy arrays rather than DataFrames.
scaler = StandardScaler()
scaler.fit(train_features)
X = scaler.transform(train_features)
X_test = scaler.transform(test_features)

In [ ]:
# ---------------------------------------------
# 5. Model Building
# ---------------------------------------------

# Stratified 5-fold cross-validation with a fixed seed for reproducible splits.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

lgb_params = {
    'objective': 'binary',
    # LightGBM has no metric called 'accuracy'. 'binary_error' is the
    # misclassification rate (1 - accuracy), which gives early stopping a real
    # metric to monitor.
    'metric': 'binary_error',
    'boosting_type': 'gbdt',
    'learning_rate': 0.01,
    'num_leaves': 31,
    'max_depth': -1,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'random_state': 42,
    'verbose': -1
}

# Out-of-fold probabilities for every training row, and the test probabilities
# averaged over the folds.
oof_preds = np.zeros(X.shape[0])
test_preds = np.zeros(X_test.shape[0])

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    print(f"Training fold {fold + 1}")
    # X is a NumPy array (positional indexing); y is a pandas Series (.iloc).
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    train_data = lgb.Dataset(X_train, label=y_train)
    # `reference` makes the validation set reuse the training set's feature
    # binning, as LightGBM expects for validation data.
    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

    # The `early_stopping_rounds` / `verbose_eval` keyword arguments were
    # removed from `lgb.train` in LightGBM 4.0; the callbacks below are the
    # supported equivalent (stop after 100 rounds without improvement on the
    # validation set, log every 500 rounds).
    model = lgb.train(
        lgb_params,
        train_data,
        num_boost_round=5000,
        valid_sets=[train_data, val_data],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=500),
        ],
    )

    oof_preds[val_idx] = model.predict(X_val, num_iteration=model.best_iteration)
    test_preds += model.predict(X_test, num_iteration=model.best_iteration) / kf.n_splits

# Turn probabilities into hard 0/1 labels with a 0.5 threshold.
final_oof = (oof_preds >= 0.5).astype(int)
final_test = (test_preds >= 0.5).astype(int)

print("\nCross-validation Accuracy:", accuracy_score(y, final_oof))
print("Classification Report:\n", classification_report(y, final_oof))

In [ ]:
# ---------------------------------------------
# 6. Feature Importance
# ---------------------------------------------

# Gain-based importance of the top 20 features. `model` is whatever the
# training loop assigned last, i.e. the booster of the final fold only.
ax = lgb.plot_importance(model, max_num_features=20, importance_type='gain')

# The model was trained on NumPy arrays, so LightGBM auto-named the features
# "Column_<i>". Map those placeholders back to the real column names, using the
# same column selection that produced X. The guard leaves the labels untouched
# if the model already carries real feature names.
feature_names = list(train.drop(['Overall_Experience', 'ID'], axis=1).columns)
auto_names = [f'Column_{i}' for i in range(len(feature_names))]
if model.feature_name() == auto_names:
    readable = dict(zip(auto_names, feature_names))
    ax.set_yticklabels(
        [readable.get(tick.get_text(), tick.get_text()) for tick in ax.get_yticklabels()]
    )

plt.title("Feature Importance")
plt.show()

In [ ]:
# ---------------------------------------------
# 7. Submission
# ---------------------------------------------

# Build the submission from the sample file's layout, replacing (or adding) the
# 'Overall_Experience' column with the thresholded test predictions. `assign`
# returns a new frame, so `sample_submission` itself is left unmodified.
submission = sample_submission.assign(Overall_Experience=final_test)

output_path = 'final_submission.csv'
submission.to_csv(output_path, index=False)

print("\n✅ Submission file created: final_submission.csv")