In [19]:
## Importing important Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [12]:
# Loading the Excel file
train = pd.read_csv("train.csv")
test = pd.read_csv("test_8gqdJqH.csv")
transactions = pd.read_csv("transactions.csv")

In [13]:

train['doj'] = pd.to_datetime(train['doj'])
transactions['doj'] = pd.to_datetime(transactions['doj'])

train_full = transactions[transactions['dbd'].between(0, 30)].copy()

# Merge final seatcount target
train_full = train_full.merge(train, on=['doj', 'srcid', 'destid'], how='inner')


In [15]:
# 2. Feature Engineering
# ---------------------------
train_full['doj_day'] = train_full['doj'].dt.day
train_full['doj_weekday'] = train_full['doj'].dt.weekday
train_full['doj_month'] = train_full['doj'].dt.month
train_full['doj_is_weekend'] = train_full['doj_weekday'].isin([5, 6]).astype(int)

# Log transformation for stability
train_full['log_cumsum_seatcount'] = np.log1p(train_full['cumsum_seatcount'])
train_full['log_cumsum_searchcount'] = np.log1p(train_full['cumsum_searchcount'])
train_full['booking_intensity'] = train_full['cumsum_seatcount'] / (train_full['cumsum_searchcount'] + 1)


In [16]:
features = [
    'srcid', 'destid', 'doj_day', 'doj_weekday', 'doj_month',
    'doj_is_weekend', 'log_cumsum_seatcount', 'log_cumsum_searchcount',
    'booking_intensity'  # Add other engineered features
]


X = train_full[features]
y = train_full['final_seatcount']

In [17]:
# Optional scaling (for Ridge)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# ---------------------------
# 2. Stage 1: LightGBM Base Model
# ---------------------------
lgb_model = LGBMRegressor(
    learning_rate=0.05,
    n_estimators=500,
    max_depth=10,
    num_leaves=40,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
lgb_model.fit(X_train, np.log1p(y_train))

lgb_train_preds = np.expm1(lgb_model.predict(X_train))
lgb_val_preds = np.expm1(lgb_model.predict(X_val))

# ---------------------------
# 3. Stage 2: Residual Model (Ridge on residuals)
# ---------------------------
residuals_train = y_train - lgb_train_preds
ridge = Ridge(alpha=1.0)
ridge.fit(X_train, residuals_train)

residuals_val_preds = ridge.predict(X_val)

# ---------------------------

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008162 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 912
[LightGBM] [Info] Number of data points in the train set: 1666560, number of used features: 9
[LightGBM] [Info] Start training from score 7.434467


In [35]:
# ---------------------------
# 4. Combine predictions
# ---------------------------
final_val_preds = lgb_val_preds + residuals_val_preds

# ---------------------------
# 5. Evaluate final result
# ---------------------------
rmse = np.sqrt(mean_squared_error(y_val, final_val_preds))
r2 = r2_score(y_val, final_val_preds)

print(f" Enhanced Residual Modeling RMSE: {rmse:.2f}")
print(f" Enhanced Residual Modeling R²: {r2:.4f}")

 Enhanced Residual Modeling RMSE: 560.83
 Enhanced Residual Modeling R²: 0.7805


In [36]:
# ---------------------------
# 5. Predict on test dbd = 15
# ---------------------------
test_15 = transactions[transactions['dbd'] == 15].copy()
test_15['doj'] = pd.to_datetime(test_15['doj'])

# Apply same features
test_15['doj_day'] = test_15['doj'].dt.day
test_15['doj_weekday'] = test_15['doj'].dt.weekday
test_15['doj_month'] = test_15['doj'].dt.month
test_15['doj_is_weekend'] = test_15['doj_weekday'].isin([5, 6]).astype(int)
test_15['log_cumsum_seatcount'] = np.log1p(test_15['cumsum_seatcount'])
test_15['log_cumsum_searchcount'] = np.log1p(test_15['cumsum_searchcount'])
test_15['booking_intensity'] = test_15['cumsum_seatcount'] / (test_15['cumsum_searchcount'] + 1)
# Re-creating route_key if missing
test_15['route_key'] = test_15['doj'].dt.strftime('%Y-%m-%d') + '_' + test_15['srcid'].astype(str) + '_' + test_15['destid'].astype(str)


In [37]:
# 6. Predict on test data
# ---------------------------
X_test = test_15[features].copy()
X_test_scaled = scaler.transform(X_test)

lgb_test_preds = np.expm1(lgb_model.predict(X_test_scaled))
ridge_test_residuals = ridge.predict(X_test_scaled)
final_test_preds = lgb_test_preds + ridge_test_residuals


In [38]:
# Post-process predictions
final_test_preds = np.clip(np.round(final_test_preds), 0, None)
test_15['final_seatcount'] = final_test_preds.astype(int)



sample = pd.read_csv('sample_submission.csv') 
# Ensuring yur prediction DataFrame known as 'submission' and 'sample' has the same column names
submission = sample.copy()

preds = test_15.groupby('route_key', as_index=False)['final_seatcount'].mean()

# Merging cleanly into the sample
submission = sample.merge(preds, on='route_key', how='left')





In [39]:
submission.head()

Unnamed: 0,route_key,final_seatcount_x,final_seatcount_y
0,2025-02-11_46_45,0,3474.0
1,2025-01-20_17_23,0,1312.0
2,2025-01-08_02_14,0,
3,2025-01-08_08_47,0,
4,2025-01-08_09_46,0,


In [40]:
submission['final_seatcount'] = submission['final_seatcount_y']
submission['final_seatcount'] = submission['final_seatcount'].fillna(0)
submission['final_seatcount'] = submission['final_seatcount'].round().astype(int)
# Dropping extra columns
submission = submission[['route_key', 'final_seatcount']]


submission.to_csv('submission 5.csv', index=False)
print(" Submission file 5 saved.")

 Submission file 5 saved.
