### Podcast Predicting
In this project I need to predict the time of time (continues) Regression.
1. Metrics I am gonna use is LightGBM and XGBoost.
2. Metric: mean_squared_error & rmse
3. Additional models: KFolds, RandomForest, ExtraTreesRegressor
4. Encoding: Label Encoding  

1. Importing the Libraries
2. Checking the dataset and cleaning it
3. Fill the empty lists with ffill
4. Encoding
5. Feature Importance
6. Check for combination with heatmap and combine with PCA

# Importing the libraries

In [1]:
import pandas as pd
import numpy as no
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [2]:
import os
os.listdir('/kaggle/input')

['train-csv', 'playground-series-s5e4', 'test-csv']

In [3]:
df_train = pd.read_csv("/kaggle/input/playground-series-s5e4/train.csv")
df_test = pd.read_csv("/kaggle/input/playground-series-s5e4/test.csv")

In [4]:
df_train.head()

Unnamed: 0,id,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes
0,0,Mystery Matters,Episode 98,,True Crime,74.81,Thursday,Night,,0.0,Positive,31.41998
1,1,Joke Junction,Episode 26,119.8,Comedy,66.95,Saturday,Afternoon,75.95,2.0,Negative,88.01241
2,2,Study Sessions,Episode 16,73.9,Education,69.97,Tuesday,Evening,8.97,0.0,Negative,44.92531
3,3,Digital Digest,Episode 45,67.17,Technology,57.22,Monday,Morning,78.7,2.0,Positive,46.27824
4,4,Mind & Body,Episode 86,110.51,Health,80.07,Monday,Afternoon,58.68,3.0,Neutral,75.61031


In [5]:
df_train['Episode_Length_minutes'].fillna(df_train["Episode_Length_minutes"].median(), inplace=True)
df_train['Guest_Popularity_percentage'].fillna(df_train["Guest_Popularity_percentage"].median(), inplace=True)
df_train['Number_of_Ads'].fillna(df_train["Number_of_Ads"].median(), inplace=True)

df_test['Episode_Length_minutes'].fillna(df_test["Episode_Length_minutes"].median(), inplace=True)
df_test['Guest_Popularity_percentage'].fillna(df_test["Guest_Popularity_percentage"].median(), inplace=True)

In [6]:
categorical_features = df_train.select_dtypes(exclude=['number']).columns.tolist()
categorical_features

['Podcast_Name',
 'Episode_Title',
 'Genre',
 'Publication_Day',
 'Publication_Time',
 'Episode_Sentiment']

In [7]:
cat_col = ['Genre', 'Publication_Day', 'Publication_Time', 'Episode_Sentiment']
dfs = pd.concat([df_train, df_test], sort=False).reset_index(drop=True)
dfs = pd.get_dummies(dfs, columns=cat_col)

In [8]:
cat_col = ['Genre', 'Publication_Day', 'Publication_Time', 'Episode_Sentiment']
dfs = pd.concat([df_train, df_test], sort=False).reset_index(drop=True)

week_map = {
    'Sunday': 1,
    'Thursday': 2,
    'Friday': 3,
    'Saturday': 4,
    'Wednesday': 5,
    'Monday': 6,
    'Tuesday': 7
}
dfs['Publication_Day'] = dfs['Publication_Day'].map(week_map)

day_map = {
    'Evening': 1,
    'Morning': 2,
    'Afternoon': 3,
    'Night': 4,
}
dfs['Publication_Time'] = dfs['Publication_Time'].map(day_map)

pos_map = {
    'Negative': 1,
    'Neutral': 2,
    'Positive': 3
}
dfs['Episode_Sentiment'] = dfs['Episode_Sentiment'].map(pos_map)

dfs = pd.get_dummies(dfs, columns=cat_col)

In [9]:
dfs["Episode_Number"] = dfs["Episode_Title"].str.replace("Episode ", "", regex=True).astype(int)
dfs = dfs.drop(columns=["Episode_Title"])

In [10]:
agg_stats = df_train.groupby('Podcast_Name')['Listening_Time_minutes'].agg(
    Podcast_Median_ListenTime='median',
    Podcast_Q25_ListenTime=lambda x: x.quantile(0.25),
    Podcast_Q75_ListenTime=lambda x: x.quantile(0.75),
).reset_index()

dfs = dfs.merge(agg_stats, on='Podcast_Name', how='left')

In [11]:
from sklearn.decomposition import PCA
from gensim.models import Word2Vec

In [12]:
import numpy as np

In [13]:
categorical_features = ['Podcast_Name']
dfs['target_cat'] = dfs[categorical_features].astype(str).agg(' '.join, axis=1)

sentences = [text.split() for text in dfs['target_cat']]
word2vec_model = Word2Vec(sentences, vector_size=50, window=10, sg=1, min_count=1, workers=1)

def get_avg_vector(sentence, model):
    vectors = [model.wv[word] for word in sentence if word in model.wv]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

word2vec_vectors = [get_avg_vector(sentence, word2vec_model) for sentence in sentences]


pca = PCA()
word2vec_pca = pca.fit_transform(word2vec_vectors)
cumulative_explained_variance = np.cumsum(pca.explained_variance_ratio_)
n_components = np.argmax(cumulative_explained_variance >= 0.95) + 1
pca = PCA(n_components=n_components)
word2vec_pca_reduced = pca.fit_transform(word2vec_vectors)

word2vec_df = pd.DataFrame(word2vec_pca_reduced, columns=[f'word2vec_pca_{i}' for i in range(word2vec_pca_reduced.shape[1])])
word2vec_df = pd.DataFrame(word2vec_vectors, columns=[f'word2vec_{i}' for i in range(word2vec_model.vector_size)])

dfs = pd.concat([dfs.reset_index(drop=True), word2vec_df], axis=1)

dfs = dfs.drop(columns=['target_cat', 'Podcast_Name'])

In [14]:
dfs['Host_Guest_Avg_Popularity'] = (dfs['Host_Popularity_percentage'] + dfs['Guest_Popularity_percentage']) / 2

dfs['Has_Ads'] = dfs['Number_of_Ads'].apply(lambda x: 1 if x > 0 else 0)

dfs['Is_High_Host_Popularity'] = (dfs['Host_Popularity_percentage'] > 70).astype(int)
dfs['Is_High_Guest_Popularity'] = (dfs['Guest_Popularity_percentage'] > 70).astype(int)
dfs['Host_Guest_Popularity_Gap'] = dfs['Host_Popularity_percentage'] - dfs['Guest_Popularity_percentage']
dfs['Ad_Density'] = dfs['Number_of_Ads'] / dfs['Episode_Length_minutes']
dfs['Ad_Density'].replace([np.inf, -np.inf], np.nan, inplace=True)
dfs['Is_Long_Episode'] = (dfs['Episode_Length_minutes'] > 60).astype(int)

In [15]:
df_train = dfs[~dfs["Listening_Time_minutes"].isnull()]
df_test = dfs[dfs["Listening_Time_minutes"].isnull()]

In [16]:
X = df_train.drop(columns=["id","Listening_Time_minutes"])
y = df_train["Listening_Time_minutes"]

In [17]:
import optuna
from sklearn.model_selection import train_test_split

In [18]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=911, shuffle = True)

In [19]:
def objective(trial):
    param = {
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        "learning_rate": trial.suggest_float("learning_rate", 0.05, 0.3, step=0.01),
        "max_depth": trial.suggest_int("max_depth", 5, 30),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0, step=0.05),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0, step=0.05),
        "max_bin": trial.suggest_int("max_bin", 256, 1024, step=128),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "lambda": trial.suggest_float("lambda", 1e-3, 10.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-3, 10.0, log=True),
        "random_state": 911
    }

    # The following lines were incorrectly indented and are now moved inside the function
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)

    model = xgb.train(
        params=param,
        dtrain=dtrain,
        num_boost_round=100,
        evals=[(dtrain, "train"), (dval, "valid")],
        early_stopping_rounds=50,
        verbose_eval=0
    )

    y_pred = model.predict(dval, iteration_range=(0, model.best_iteration))
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    return rmse

In [20]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=5)

print("Best hyperparameters:", study.best_params)
print("Best RMSE score:", study.best_value)

[I 2025-04-17 01:04:14,571] A new study created in memory with name: no-name-414c09ca-2857-4fb3-b37d-c1551dcba6a4
[I 2025-04-17 01:04:44,878] Trial 0 finished with value: 13.044506703381575 and parameters: {'learning_rate': 0.12000000000000001, 'max_depth': 9, 'subsample': 0.8, 'colsample_bytree': 0.75, 'max_bin': 896, 'min_child_weight': 6, 'gamma': 1.8412661015735239, 'lambda': 0.0011115420299260812, 'alpha': 0.03148869877520974}. Best is trial 0 with value: 13.044506703381575.
[I 2025-04-17 01:06:57,580] Trial 1 finished with value: 13.9107595196823 and parameters: {'learning_rate': 0.3, 'max_depth': 29, 'subsample': 0.6, 'colsample_bytree': 0.9, 'max_bin': 640, 'min_child_weight': 10, 'gamma': 0.5235922381104846, 'lambda': 0.04360687596450015, 'alpha': 0.9142077474487205}. Best is trial 0 with value: 13.044506703381575.
[I 2025-04-17 01:07:21,726] Trial 2 finished with value: 13.122772285934685 and parameters: {'learning_rate': 0.29, 'max_depth': 10, 'subsample': 0.75, 'colsample_b

Best hyperparameters: {'learning_rate': 0.12000000000000001, 'max_depth': 9, 'subsample': 0.8, 'colsample_bytree': 0.75, 'max_bin': 896, 'min_child_weight': 6, 'gamma': 1.8412661015735239, 'lambda': 0.0011115420299260812, 'alpha': 0.03148869877520974}
Best RMSE score: 13.044506703381575


In [21]:
best_params = study.best_params
dtrain_full = xgb.DMatrix(X, label=y) 

final_model = xgb.train(
    params=best_params,
    dtrain=dtrain_full,
    num_boost_round=100,
    verbose_eval=0
)

dtest = xgb.DMatrix(df_test.drop(columns=["id", "Listening_Time_minutes"])) 
predictions = final_model.predict(dtest)

submission_df = pd.DataFrame({'id': df_test['id'], 'Listening_Time_minutes': predictions})
submission_df.to_csv('submission.csv', index=False)

In [22]:
submission_df

Unnamed: 0,id,Listening_Time_minutes
750000,750000,54.237244
750001,750001,18.416592
750002,750002,47.154533
750003,750003,80.660782
750004,750004,48.795441
...,...,...
999995,999995,12.164994
999996,999996,58.593014
999997,999997,6.314389
999998,999998,73.660294
