# Imports and Installs

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Data Loading

## Train-Test Split

In [2]:
# Load the raw train and test tables from CSV files (paths are relative to the working directory).
df_train, df_test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [3]:
# Preview the first five rows of the training frame.
df_train.head(n=5)

Unnamed: 0,id,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes
0,0,Mystery Matters,Episode 98,,True Crime,74.81,Thursday,Night,,0.0,Positive,31.41998
1,1,Joke Junction,Episode 26,119.8,Comedy,66.95,Saturday,Afternoon,75.95,2.0,Negative,88.01241
2,2,Study Sessions,Episode 16,73.9,Education,69.97,Tuesday,Evening,8.97,0.0,Negative,44.92531
3,3,Digital Digest,Episode 45,67.17,Technology,57.22,Monday,Morning,78.7,2.0,Positive,46.27824
4,4,Mind & Body,Episode 86,110.51,Health,80.07,Monday,Afternoon,58.68,3.0,Neutral,75.61031


## (Optional) Data Enhancement

# (Optional) EDA (Exploratory Data Analysis)

# Data Preprocessing

## Data Cleaning

### Fill NaN Values

In [4]:
# Stack train and test so that imputation values and category codes are computed once
# over all rows, then split the result back into the two original partitions.
num_cols = ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Number_of_Ads']
cat_cols = ['Podcast_Name', 'Episode_Title', 'Genre', 'Publication_Day', 'Publication_Time', 'Episode_Sentiment']

combined = pd.concat([df_train, df_test], sort=False).reset_index(drop=True)

# Numerical columns: replace NaN with the column median of the stacked frame.
median_filled = {name: combined[name].fillna(combined[name].median()) for name in num_cols}
combined = combined.assign(**median_filled)

# Categorical columns: swap every value for its integer category code.
# Missing values come out as -1 (the code pandas assigns to NaN) instead of being
# imputed with the mode, which would bias the column towards its most frequent value.
category_codes = {name: combined[name].astype('category').cat.codes for name in cat_cols}
combined = combined.assign(**category_codes)

# The first len(df_train) rows are the training partition; the remainder is the test partition.
n_train = len(df_train)
df_train_processed = combined.iloc[:n_train].copy()
df_test_processed = combined.iloc[n_train:].copy()

## Encoding

## (Optional) Feature Engineering

## (Optional) Feature Selection

## Normalisation

In [5]:
# Separate the target from the features; 'id' is dropped so it is not used as a feature.
target_col = 'Listening_Time_minutes'
non_feature_cols = ['id', target_col]

y = df_train_processed[target_col]
X = df_train_processed.drop(columns=non_feature_cols)
# errors='ignore' tolerates a test frame that lacks one of the dropped columns.
X_test = df_test_processed.drop(columns=non_feature_cols, errors='ignore')

# Hold out 20% of the training rows for validation (fixed seed for a repeatable split).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Model Handling

## Model Selection

## Model Evaluation

In [6]:
# Model training
# Model predictions
# Performance summary

### Recommended: complete the model training and model prediction during cross-validation

## Model Tuning

### Hyperparameter Optimisation

### (Optional) Parameter Summary

# (Optional) Ensemble Learning

## Training & Evaluation

### Model 1 - LightGBM

In [13]:
# Wrap both splits as LightGBM datasets; the validation set references the training set.
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_val = lgb.Dataset(X_val, label=y_val, reference=lgb_train)

lgb_params = dict(objective='regression', metric='rmse', verbosity=-1, seed=100)

# Stop after 50 rounds without improvement and print the metrics every 100 rounds.
training_callbacks = [lgb.early_stopping(50), lgb.log_evaluation(100)]

lgb_model = lgb.train(
    params=lgb_params,
    train_set=lgb_train,
    num_boost_round=2000,
    valid_sets=[lgb_train, lgb_val],
    valid_names=['train', 'valid'],
    callbacks=training_callbacks,
)

# Score the hold-out split using only the trees up to the best iteration.
best_round = lgb_model.best_iteration
lgb_pred_val = lgb_model.predict(X_val, num_iteration=best_round)
lgb_rmse = np.sqrt(mean_squared_error(y_val, lgb_pred_val))
print("LightGBM Validation RMSE:", lgb_rmse)

Training until validation scores don't improve for 50 rounds
[100]	train's rmse: 13.03	valid's rmse: 13.0609
[200]	train's rmse: 12.9199	valid's rmse: 13.0283
[300]	train's rmse: 12.8297	valid's rmse: 13.0069
[400]	train's rmse: 12.7531	valid's rmse: 12.9914
[500]	train's rmse: 12.6811	valid's rmse: 12.9789
[600]	train's rmse: 12.6157	valid's rmse: 12.9718
[700]	train's rmse: 12.5475	valid's rmse: 12.9614
[800]	train's rmse: 12.4848	valid's rmse: 12.9519
[900]	train's rmse: 12.4291	valid's rmse: 12.9445
[1000]	train's rmse: 12.3753	valid's rmse: 12.9364
[1100]	train's rmse: 12.3251	valid's rmse: 12.9324
[1200]	train's rmse: 12.2759	valid's rmse: 12.926
[1300]	train's rmse: 12.2276	valid's rmse: 12.9208
[1400]	train's rmse: 12.1815	valid's rmse: 12.915
[1500]	train's rmse: 12.138	valid's rmse: 12.912
[1600]	train's rmse: 12.0904	valid's rmse: 12.9077
[1700]	train's rmse: 12.0483	valid's rmse: 12.9057
[1800]	train's rmse: 12.0084	valid's rmse: 12.9018
[1900]	train's rmse: 11.9661	valid's

### Model 2 - Linear Regression (With Normalization)

In [8]:
# Standardise the features, then fit a linear regression on the training split.
lr_steps = [('scaler', StandardScaler()), ('lr', LinearRegression())]
lr_pipeline = Pipeline(steps=lr_steps)

# fit() returns the fitted pipeline, so prediction on the hold-out split can be chained.
lr_pred_val = lr_pipeline.fit(X_train, y_train).predict(X_val)
lr_rmse = np.sqrt(mean_squared_error(y_val, lr_pred_val))
print("Linear Regression Validation RMSE:", lr_rmse)

Linear Regression Validation RMSE: 13.351701011642964


### Model 3 - Support Vector Machine (With Normalization)

In [9]:
# Disabled: fitting the SVR pipeline takes too long on this dataset; code kept for reference.
# svm_pipeline = Pipeline([
#     ('scaler', StandardScaler()),
#     ('svm', SVR())
# ])
# svm_pipeline.fit(X_train, y_train)
# svm_pred_val = svm_pipeline.predict(X_val)
# svm_rmse = np.sqrt(mean_squared_error(y_val, svm_pred_val))
# print("SVM Validation RMSE:", svm_rmse)

# Submission

### Model 1 - Final Training And Submission

In [10]:
# Retrain LightGBM on the full training data, using the number of boosting rounds
# that early stopping selected on the validation split.
# If no best iteration was recorded (best_iteration == 0), fall back to the total
# number of rounds the validation model trained, so num_boost_round is never 0.
final_num_rounds = lgb_model.best_iteration or lgb_model.current_iteration()

final_lgb_train = lgb.Dataset(X, label=y)
# No valid_sets are passed here, so there is nothing to evaluate, log, or early-stop on;
# the evaluation-logging callback is therefore omitted.
final_lgb_model = lgb.train(
    lgb_params,
    final_lgb_train,
    num_boost_round=final_num_rounds,
)

# The final booster was trained without early stopping and has no best iteration of
# its own, so predict with every tree it contains (the default behaviour).
lgb_test_preds = final_lgb_model.predict(X_test)

# Prepare submission using df_test original IDs
submission_lgb = pd.DataFrame({'id': df_test['id'], 'Listening_Time_minutes': lgb_test_preds})
submission_lgb.to_csv('submission_lgb.csv', index=False)
print("LightGBM submission saved to submission_lgb.csv")

LightGBM submission saved to submission_lgb.csv


### Model 2 - Final Training And Submission

In [11]:
# Refit a fresh scaler + linear regression pipeline on all training rows, then score the test set.
final_lr_pipeline = Pipeline(steps=[('scaler', StandardScaler()), ('lr', LinearRegression())])
lr_test_preds = final_lr_pipeline.fit(X, y).predict(X_test)

# Pair each prediction with its id from the original test frame and write the submission file.
submission_lr = df_test[['id']].assign(Listening_Time_minutes=lr_test_preds)
submission_lr.to_csv('submission_lr.csv', index=False)
print("Linear Regression submission saved to submission_lr.csv")

Linear Regression submission saved to submission_lr.csv


### Model 3 - Final Training And Submission

In [12]:
# # Final retraining of the SVM pipeline on full training data
# final_svm_pipeline = Pipeline([
#     ('scaler', StandardScaler()),
#     ('svm', SVR())
# ])
# final_svm_pipeline.fit(X, y)

# # Predict on the test set
# svm_test_preds = final_svm_pipeline.predict(X_test)

# # Prepare submission using df_test original IDs
# submission_svm = pd.DataFrame({'id': df_test['id'], 'Listening_Time_minutes': svm_test_preds})
# submission_svm.to_csv('submission_svm.csv', index=False)
# print("SVM submission saved to submission_svm.csv")