<a href="https://colab.research.google.com/github/nihalB05/podcast-listening-time-prediction-/blob/main/pplt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install category_encoders

In [None]:

# 1. Imports and Setup
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from category_encoders import TargetEncoder

sns.set_theme(style="whitegrid")

# 2. Load Data
# Absolute paths under /content; adjust if the CSVs live elsewhere.
df_train = pd.read_csv("/content/train.csv")
df_test = pd.read_csv("/content/test.csv")

# 3. Initial Exploration (optional quick checks)
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
for frame in (df_train, df_test):
    frame.info()
    print(frame.isnull().sum())

# 4. Data Cleaning & Feature Engineering
# 4.1 Drop rows with a missing Number_of_Ads (training set only)
# Number_of_Ads has one null in train, so dropping the row is cheap.
df_train.dropna(subset=["Number_of_Ads"], inplace=True)

# 4.2 Round Number_of_Ads to integer
# Plain column assignment is used so the integer dtype actually replaces the
# column; assigning through .loc[:, col] may write into the existing float
# column and leave the dtype unchanged.
# NOTE(review): astype(int) raises on NaN and only df_train had its nulls
# dropped above -- this assumes df_test has no missing Number_of_Ads.
for df in (df_train, df_test):
    df['Number_of_Ads'] = df['Number_of_Ads'].round().astype(int)

# 4.3 Remove extreme outliers in Number_of_Ads (>= 50), training set only
# Deliberately outside the loop above: it must run once, after both frames
# have been rounded.
df_train.drop(df_train[df_train['Number_of_Ads'] >= 50].index, inplace=True)

# 4.4 Fill missing Episode_Length_minutes by Genre median
def _fill_with_group_median(values):
    """Replace the NaNs of one genre group with that group's median."""
    return values.fillna(values.median())


# Each frame is imputed from its own per-genre medians.
for frame in (df_train, df_test):
    length_by_genre = frame.groupby('Genre')['Episode_Length_minutes']
    frame['Episode_Length_minutes'] = length_by_genre.transform(_fill_with_group_median)

# 4.5 Guest_Popularity: fill missing with -1 + missing flag
for df in (df_train, df_test):
    # Record missingness before imputing, so the flag reflects real NaNs
    # rather than any value that happens to equal the -1 sentinel.
    guest_missing = df['Guest_Popularity_percentage'].isna()
    # Assign the filled column back instead of calling
    # df[col].fillna(..., inplace=True): that chained in-place form acts on an
    # intermediate object, triggers a FutureWarning, and stops updating the
    # frame in pandas 3.0.
    df['Guest_Popularity_percentage'] = df['Guest_Popularity_percentage'].fillna(-1)
    df['Guest_Popularity_missing'] = guest_missing.astype(int)

# 4.6 Extract Episode_Number from Episode_Title
# Compiled once so the pattern is not looked up twice per title.
_EPISODE_NUMBER_RE = re.compile(r"Episode\s*(\d+)", re.IGNORECASE)


def extract_episode(t):
    """Return the episode number embedded in a title, or 0 if none is found.

    The value is coerced with ``str()`` first, so non-string inputs such as
    None or NaN simply yield 0. Matching is case-insensitive and uses the
    first "Episode <digits>" occurrence in the text.
    """
    match = _EPISODE_NUMBER_RE.search(str(t))
    return int(match.group(1)) if match else 0
# Store the parsed episode number as a new column on both frames.
for frame in (df_train, df_test):
    titles = frame['Episode_Title']
    frame['Episode_Number'] = titles.map(extract_episode)

# 4.7 Interaction features (identical definitions for train and test)
for frame in (df_train, df_test):
    host_pop = frame['Host_Popularity_percentage']
    guest_pop = frame['Guest_Popularity_percentage']
    # NOTE(review): rows whose guest popularity was filled with -1 get a
    # negative product here -- confirm that is intended.
    frame['host_guest_popularity'] = host_pop * guest_pop
    # The +1 keeps the denominator non-zero when Number_of_Ads is 0.
    ads_plus_one = frame['Number_of_Ads'] + 1
    frame['length_ads_ratio'] = frame['Episode_Length_minutes'] / ads_plus_one
# Donut chart of how many training rows fall in each genre.
genre_counts = (
    df_train["Genre"]
    .value_counts()
    .rename_axis('Genre')
    .reset_index(name='Count')
)

genre_pie = go.Pie(
    labels=genre_counts['Genre'],
    values=genre_counts['Count'],
    hole=0.4,  # 0 gives a plain pie; a positive value opens the donut hole
    title='Podcast Count by Genre',
    hoverinfo='label+percent',
    textinfo='percent',
    marker={'colors': px.colors.sequential.Plasma},  # sequential colour scale
)

fig = go.Figure(data=[genre_pie])
fig.show()
# Exploration only: distribution of Number_of_Ads.
# Bins are left-closed (right=False): [0, 1), [1, 2), ..., [5, 13), [13, inf).
bin_edges = [0, 1, 2, 3, 4, 5, 13, float('inf')]

# Labels name the values each left-closed bin actually holds; Number_of_Ads is
# integer-valued after rounding, so [5, 13) is 5-12 and the first bin is 0.
bin_labels = ['0', '1', '2', '3', '4', '5-12', '13+']

# Keep the binned values in a standalone Series instead of a df_train column:
# every column left on df_train (except the target) is later used as a model
# feature, and this plot-only categorical has no counterpart in df_test.
ads_bins = pd.cut(
    df_train['Number_of_Ads'],
    bins=bin_edges,
    labels=bin_labels,
    right=False,
).rename('Ads_Bins')

# Count plot of the binned values
sns.countplot(x=ads_bins)
plt.show()

# Box plot of the raw values; shown explicitly so the figure is rendered and
# closed like the count plot above.
plt.boxplot(df_train["Number_of_Ads"])
plt.xlabel("distribution of Number of Ads ")
plt.ylabel("Number of ads ")
plt.show()


# 4.8 Binning Number_of_Ads (optional)
# The continuous Number_of_Ads column is the intended model input; the binned version above is meant for exploration only.
# 4.9 Target encode Podcast_Name
# The encoder is fitted on the training names against Listening_Time_minutes,
# and the fitted encoder is then applied to the test names.
# NOTE(review): fitting happens on all of df_train, before the
# train/validation split further down, so validation rows take part in
# computing their own encoding -- the validation RMSE may be optimistic.
# Consider fitting on the training fold only; TODO confirm.
te = TargetEncoder(cols=['Podcast_Name'])
df_train['Podcast_Name_encoded'] = te.fit_transform(df_train['Podcast_Name'], df_train['Listening_Time_minutes'])
df_test['Podcast_Name_encoded'] = te.transform(df_test['Podcast_Name'])

# 4.10 Drop unused columns
drop_cols = ['id', 'Episode_Title', 'Podcast_Name']
# Keep the test ids for the submission file before 'id' is removed.
df_test_ids = df_test['id'].copy()
for frame in (df_train, df_test):
    # errors='ignore' skips any listed column the frame does not have.
    frame.drop(columns=drop_cols, errors='ignore', inplace=True)

# 5. One-Hot Encode Categorical Features manually
cats = ['Genre','Publication_Day','Episode_Sentiment', 'Publication_Time']
# The default prefix is the source column name, e.g. 'Genre_<value>'.
df_train = pd.get_dummies(df_train, columns=cats)
df_test = pd.get_dummies(df_test, columns=cats)

# Ensure train/test have same columns
# Align the test frame to the training *feature* columns: categories missing
# from test are added as all-zero columns, categories seen only in test are
# dropped, and the column order matches train. The target is excluded so no
# placeholder 'Listening_Time_minutes' column is added to the test data.
feature_cols = [c for c in df_train.columns if c != 'Listening_Time_minutes']
df_test = df_test.reindex(columns=feature_cols, fill_value=0)

# 6. Prepare Features & Target
# Every column except the target is used as a model input.
X = df_train.drop(columns=['Listening_Time_minutes'])
y = df_train['Listening_Time_minutes']

# 7. Train/Validation Split (80/20 with a fixed seed)
x_train, x_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=69,
)

# 8. Preprocessing Pipeline for numeric scaling only
numeric_features = list(
    x_train.select_dtypes(include=['int64', 'float64']).columns
)
# Columns with uint8/bool dtype; kept for reference only -- they are not
# passed to the transformer and reach the model via the passthrough remainder.
dummy_cols = list(x_train.select_dtypes(include=['uint8', 'bool']).columns)

preprocessor = ColumnTransformer(
    transformers=[('num', StandardScaler(), numeric_features)],
    remainder='passthrough',  # non-numeric columns are forwarded unscaled
)

# 9. Preprocess Train/Val Sets
# The scaler is fitted on the training fold only, then reused as-is.
x_train_pre = preprocessor.fit_transform(x_train)
x_val_pre = preprocessor.transform(x_val)

df_test_pre = preprocessor.transform(df_test)

# 10. Train Best XGBoost Model
# Hyper-parameters gathered in one mapping; training stops early once the
# validation RMSE has not improved for 30 consecutive rounds.
xgb_params = {
    'n_estimators': 500,
    'max_depth': 6,
    'learning_rate': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'random_state': 69,
    'early_stopping_rounds': 30,
}
xgb_model = XGBRegressor(**xgb_params)

validation_set = [(x_val_pre, y_val)]
# verbose=20 logs the evaluation metric every 20 boosting rounds.
xgb_model.fit(x_train_pre, y_train, eval_set=validation_set, verbose=20)

# 11. Validation Evaluation
y_pred_val = xgb_model.predict(x_val_pre)
mse_val = mean_squared_error(y_val, y_pred_val)
rmse_val = np.sqrt(mse_val)
print(f"Validation RMSE: {rmse_val:.4f}")

# 12. Final Model Training on Full Train Data
# Refit the preprocessor on the full training features, then transform the
# test set with that same fit, so the final model sees test features scaled
# exactly like the data it was trained on (the earlier test transform used
# the scaler fitted on the training split only).
full_pre = preprocessor.fit_transform(X)
df_test_pre = preprocessor.transform(df_test)

# best_iteration is the 0-based index of the best boosting round, so the
# number of trees to train is best_iteration + 1.
# early_stopping_rounds / eval_metric are omitted: no validation set is used
# for this final fit.
xgb_final = XGBRegressor(
    n_estimators=xgb_model.best_iteration + 1,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='reg:squarederror',
    random_state=69
)
xgb_final.fit(full_pre, y)

# 13. Predict on Test and Create Submission
final_preds = xgb_final.predict(df_test_pre)
# Pair each prediction with the test id saved before 'id' was dropped.
submission = pd.DataFrame({'id': df_test_ids, 'Listening_Time_minutes': final_preds})
submission.to_csv('submission.csv', index=False)
print("Submission.csv saved.")



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 12 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   id                           750000 non-null  int64  
 1   Podcast_Name                 750000 non-null  object 
 2   Episode_Title                750000 non-null  object 
 3   Episode_Length_minutes       662907 non-null  float64
 4   Genre                        750000 non-null  object 
 5   Host_Popularity_percentage   750000 non-null  float64
 6   Publication_Day              750000 non-null  object 
 7   Publication_Time             750000 non-null  object 
 8   Guest_Popularity_percentage  603970 non-null  float64
 9   Number_of_Ads                749999 non-null  float64
 10  Episode_Sentiment            750000 non-null  object 
 11  Listening_Time_minutes       750000 non-null  float64
dtypes: float64(5), int64(1), object(6)
memory usage: 68.7+ MB


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Guest_Popularity_percentage'].fillna(-1, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Guest_Popularity_percentage'].fillna(-1, inplace=True)


[0]	validation_0-rmse:25.09733
[20]	validation_0-rmse:13.46522
[40]	validation_0-rmse:13.13505
[60]	validation_0-rmse:13.10053
[80]	validation_0-rmse:13.08252
[100]	validation_0-rmse:13.07095
[120]	validation_0-rmse:13.06099
[140]	validation_0-rmse:13.05559
[160]	validation_0-rmse:13.04650
[180]	validation_0-rmse:13.03921
[200]	validation_0-rmse:13.03331
[220]	validation_0-rmse:13.02539
[240]	validation_0-rmse:13.02198
[260]	validation_0-rmse:13.01546
[280]	validation_0-rmse:13.01115
[300]	validation_0-rmse:13.00634
[320]	validation_0-rmse:13.00257
[340]	validation_0-rmse:12.99901
[360]	validation_0-rmse:12.99440
[380]	validation_0-rmse:12.99325
[400]	validation_0-rmse:12.99018
[420]	validation_0-rmse:12.98740
[440]	validation_0-rmse:12.98340
[460]	validation_0-rmse:12.98111
[480]	validation_0-rmse:12.97983
[499]	validation_0-rmse:12.97623
Validation RMSE: 12.9762
Submission.csv saved.


In [None]:
# Sanity check of the submission's (rows, columns); the frame is built with
# two columns, 'id' and 'Listening_Time_minutes'.
submission.shape


(250000, 2)