In [1]:
# =====================
# Core Libraries
# =====================
import numpy as np
import pandas as pd

# =====================
# Visualization
# =====================
import seaborn as sns
import matplotlib.pyplot as plt

# =====================
# Preprocessing
# =====================
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer, StandardScaler, PolynomialFeatures
from sklearn.impute import KNNImputer

# =====================
# Model Selection & Tuning
# =====================
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

# =====================
# Regression Models
# =====================
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neural_network import MLPRegressor

# =====================
# Classification Models
# =====================
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC

# =====================
# Pipelines
# =====================
from sklearn.pipeline import Pipeline

# =====================
# Metrics
# =====================
from sklearn.metrics import (confusion_matrix, roc_curve, precision_recall_curve,
                             roc_auc_score, precision_score,
                             recall_score, f1_score)

# =====================
# Other Useful Tools
# =====================
from sklearn.datasets import make_regression
from numpy import log1p


In [54]:
# Load the training set from train.csv in the current working directory.
data = pd.read_csv("train.csv")

In [30]:
# Summarize dtypes and non-null counts per column.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 12 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   id                           750000 non-null  int64  
 1   Podcast_Name                 750000 non-null  object 
 2   Episode_Title                750000 non-null  object 
 3   Episode_Length_minutes       662907 non-null  float64
 4   Genre                        750000 non-null  object 
 5   Host_Popularity_percentage   750000 non-null  float64
 6   Publication_Day              750000 non-null  object 
 7   Publication_Time             750000 non-null  object 
 8   Guest_Popularity_percentage  603970 non-null  float64
 9   Number_of_Ads                749999 non-null  float64
 10  Episode_Sentiment            750000 non-null  object 
 11  Listening_Time_minutes       750000 non-null  float64
dtypes: float64(5), int64(1), object(6)
memory usage: 68.7+ MB


In [None]:
# Count missing values in each column.
data.isna().sum()

In [None]:
# Show the column names of the training frame.
data.columns


In [None]:
# Mean listening time for each genre, ordered from highest to lowest.
genre_means = (
    data.groupby('Genre', as_index=False)['Listening_Time_minutes']
    .mean()
    .sort_values(by='Listening_Time_minutes', ascending=False)
)

# Draw the bars on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=genre_means, x='Genre', y='Listening_Time_minutes', palette='viridis', ax=ax)

# Title, axis labels and slanted tick labels.
ax.set_title('Average Listening  per Genre')
ax.set_ylabel('Mean Listening (minutes)')
ax.set_xlabel('Genre')
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Mean listening time for each sentiment label, ordered from highest to lowest.
sentiment_means = (
    data.groupby('Episode_Sentiment', as_index=False)['Listening_Time_minutes']
    .mean()
    .sort_values(by='Listening_Time_minutes', ascending=False)
)

# Draw the bars on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=sentiment_means, x='Episode_Sentiment', y='Listening_Time_minutes', palette='viridis', ax=ax)

# Title, axis labels and slanted tick labels.
ax.set_title('Average Listening  per Sentiment')
ax.set_ylabel('Mean Listening (minutes)')
ax.set_xlabel('Sentiment')
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Mean listening time for each podcast, ordered from highest to lowest.
podcast_means = (
    data.groupby('Podcast_Name', as_index=False)['Listening_Time_minutes']
    .mean()
    .sort_values(by='Listening_Time_minutes', ascending=False)
)

# Draw the bars on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=podcast_means, x='Podcast_Name', y='Listening_Time_minutes', palette='viridis', ax=ax)

# Title, axis labels and slanted tick labels.
ax.set_title('Average Listening  per Each podcast')
ax.set_ylabel('Mean Listening (minutes)')
ax.set_xlabel('Podcast name')
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Bucket Guest_Popularity_percentage into 10-point bins spanning 0 to 120.
# Bins are left-closed: "0-10%" contains 0 but not 10.
bin_edges = list(range(0, 130, 10))
bin_labels = [f"{lower}-{lower + 10}%" for lower in bin_edges[:-1]]
data['Popularity_Bin'] = pd.cut(
    data['Guest_Popularity_percentage'],
    bins=bin_edges,
    right=False,
    labels=bin_labels,
)

# Mean listening time inside each popularity bucket.
grouped = data.groupby('Popularity_Bin')['Listening_Time_minutes'].mean().reset_index()

fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(data=grouped, x='Popularity_Bin', y='Listening_Time_minutes', palette='Blues_d', ax=ax)

ax.set_title("Mean Listening Time by Guest Popularity percentage")
ax.set_xlabel("Guest Popularity (%)")
ax.set_ylabel("Mean Listening Time (minutes)")
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()


In [None]:

# Bucket Host_Popularity_percentage into 10-point bins spanning 0 to 120.
# Bins are left-closed: "0-10%" contains 0 but not 10.
# This overwrites the Popularity_Bin helper column on `data`.
data['Popularity_Bin'] = pd.cut(data['Host_Popularity_percentage'],
                                bins=range(0, 130, 10),
                                right=False,
                                labels=[f"{i}-{i+10}%" for i in range(0, 120, 10)])

# Average the target within each bin. Averaging Host_Popularity_percentage itself
# would only plot the bin midpoints and contradict the chart's title and y-label.
grouped = data.groupby('Popularity_Bin')['Listening_Time_minutes'].mean().reset_index()
plt.figure(figsize=(10, 5))
sns.barplot(data=grouped, x='Popularity_Bin', y='Listening_Time_minutes', palette='Blues_d')

plt.title("Mean Listening Time by host Popularity percentage")
plt.xlabel("host Popularity")
plt.ylabel("Mean Listening Time (minutes)")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [62]:
# Convert "Episode 98"-style titles into the integer episode number.
# Casting to str and extracting the digit run keeps the cell idempotent:
# re-running it on the already-converted integer column gives the same result
# instead of failing on the `.str` accessor.
data["Episode_Title"] = (
    data["Episode_Title"].astype(str).str.extract(r"(\d+)", expand=False).astype(int)
)


In [None]:
# Mean Listening_Time_minutes for every episode number.
episode_means = data.groupby('Episode_Title')['Listening_Time_minutes'].mean().reset_index()

# Rank episodes from highest to lowest mean listening time.
episode_means = episode_means.sort_values(by='Listening_Time_minutes', ascending=False)

plt.figure(figsize=(12, 6))
# When Episode_Title is numeric, seaborn lays the categories out in ascending
# value order unless `order` is given, which would silently discard the ranking
# computed above. Passing the ranked values keeps the bars in descending order.
sns.barplot(
    data=episode_means,
    x='Episode_Title',
    y='Listening_Time_minutes',
    order=episode_means['Episode_Title'].tolist(),
    palette='viridis',
)

# Annotate
plt.title('Average Listening Time per Episode')
plt.ylabel('Mean Listening Time (minutes)')
plt.xlabel('Episode Title')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


Feature engineering:

In [55]:
# Remove the row id and podcast name from the training frame.
# errors="ignore" makes the cell safe to re-run: the original in-place drop
# raised KeyError the second time because the columns were already gone.
data = data.drop(columns=["id", "Podcast_Name"], errors="ignore")


In [56]:
# String-valued feature columns; later passed to CatBoost as cat_features and to pd.get_dummies.
categorical_cols = ['Genre', 'Publication_Day', 'Publication_Time', 'Episode_Sentiment']

In [None]:
# Same column list as the previous cell, repeated so this cell can run on its own.
categorical_cols = ['Genre', 'Publication_Day', 'Publication_Time', 'Episode_Sentiment']

# One-hot encode the categorical columns into a separate frame; `data` itself is left unchanged.
# NOTE(review): data_ohe is built before the imputation and log-feature cells below,
# so it does not contain those later changes — confirm whether it is still needed.
data_ohe = pd.get_dummies(data, columns=categorical_cols, prefix=categorical_cols)

In [7]:
# IterativeImputer is experimental: the enabling import has to come first.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Numeric columns whose gaps get filled.
cols_for_impute = ['Episode_Length_minutes' , 'Guest_Popularity_percentage' , 'Number_of_Ads']

# Fit on the three columns jointly and write the completed values back into `data`.
imputer = IterativeImputer(random_state=42)
imputed_values = imputer.fit_transform(data[cols_for_impute])
data[cols_for_impute] = imputed_values


In [18]:
# Display the current state of the training frame.
data

Unnamed: 0,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes
0,98,66.620916,True Crime,74.81,Thursday,Night,51.991335,0.0,Positive,31.41998
1,26,119.800000,Comedy,66.95,Saturday,Afternoon,75.950000,2.0,Negative,88.01241
2,16,73.900000,Education,69.97,Tuesday,Evening,8.970000,0.0,Negative,44.92531
3,45,67.170000,Technology,57.22,Monday,Morning,78.700000,2.0,Positive,46.27824
4,86,110.510000,Health,80.07,Monday,Afternoon,58.680000,3.0,Neutral,75.61031
...,...,...,...,...,...,...,...,...,...,...
749995,25,75.660000,Education,69.36,Saturday,Morning,51.907535,0.0,Negative,56.87058
749996,21,75.750000,Business,35.21,Saturday,Night,52.249184,2.0,Neutral,45.46242
749997,51,30.980000,Lifestyle,78.58,Thursday,Morning,84.890000,0.0,Negative,15.26000
749998,47,108.980000,Lifestyle,45.39,Thursday,Morning,93.270000,0.0,Negative,100.72939


In [None]:
# Popularity_Bin is only a plotting helper, so keep it out of the encoded frame.
# errors="ignore" avoids a KeyError when the binning cells were skipped or when
# this cell is run a second time.
data_ohe = data_ohe.drop(columns="Popularity_Bin", errors="ignore")

In [57]:
# Add a log(1 + x) copy of the episode length as an extra feature.
data['log_Episode_Length_minutes'] = np.log1p(data['Episode_Length_minutes'])

In [58]:
# Add a log(1 + x) copy of the ad count as an extra feature.
data['log_Number_of_Ads'] = np.log1p(data['Number_of_Ads'])

In [59]:
# Add a log(1 + x) copy of the host popularity as an extra feature.
data['log_Host_Popularity_percentage'] = np.log1p(data['Host_Popularity_percentage'])

In [None]:
# Display the training frame including the new log features.
data

In [63]:
# Separate features from the target and hold out 20% of the rows for validation.
# Naming note: `train` is the feature frame and `test` is the target series; the
# names are kept because later cells refer to them (`test` is rebound to the
# test.csv frame further down).
#
# Popularity_Bin is a helper column added by the plotting cells. It does not exist
# in test.csv and is a `category` column that is not listed in categorical_cols,
# so it is removed from the features whenever it is present.
train = (
    data.drop(columns="Listening_Time_minutes")
    .drop(columns="Popularity_Bin", errors="ignore")
)
test = data["Listening_Time_minutes"]
x_train, x_test, y_train, y_test = train_test_split(train, test, random_state=42, test_size=.2)

In [64]:
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_squared_error

# Wrap both splits in Pools so the categorical columns are declared with the data.
train_pool = Pool(data=x_train, label=y_train, cat_features=categorical_cols)
val_pool = Pool(data=x_test, label=y_test, cat_features=categorical_cols)

# Hyperparameters gathered in one place; progress is logged every 100 iterations.
catboost_params = dict(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    eval_metric='RMSE',
    early_stopping_rounds=50,
    random_seed=42,
    verbose=100,
)
model = CatBoostRegressor(**catboost_params)

# The held-out split doubles as the early-stopping monitor.
model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=50)

# Predictions for the held-out rows.
y_pred = model.predict(x_test)

0:	learn: 25.2101347	test: 25.1946312	best: 25.1946312 (0)	total: 375ms	remaining: 6m 14s
100:	learn: 13.1616494	test: 13.1266071	best: 13.1266071 (100)	total: 36.1s	remaining: 5m 21s
200:	learn: 13.1079361	test: 13.0890173	best: 13.0890173 (200)	total: 1m 12s	remaining: 4m 46s
300:	learn: 13.0740402	test: 13.0683035	best: 13.0682433 (298)	total: 1m 48s	remaining: 4m 12s
400:	learn: 13.0459903	test: 13.0559146	best: 13.0559146 (400)	total: 2m 26s	remaining: 3m 39s
500:	learn: 13.0205241	test: 13.0447641	best: 13.0447641 (500)	total: 3m 3s	remaining: 3m 2s
600:	learn: 12.9981681	test: 13.0348259	best: 13.0348259 (600)	total: 3m 39s	remaining: 2m 25s
700:	learn: 12.9771339	test: 13.0261646	best: 13.0261646 (700)	total: 4m 16s	remaining: 1m 49s
800:	learn: 12.9568405	test: 13.0205020	best: 13.0205020 (800)	total: 4m 52s	remaining: 1m 12s
900:	learn: 12.9381846	test: 13.0153870	best: 13.0153529 (898)	total: 5m 29s	remaining: 36.2s
999:	learn: 12.9234520	test: 13.0113652	best: 13.0113652 (9

In [66]:
# Print the fitted CatBoost model's feature importances as a table.
print(model.get_feature_importance(prettified=True))

                        Feature Id  Importances
0       log_Episode_Length_minutes    51.711103
1           Episode_Length_minutes    38.788281
2                    Number_of_Ads     1.771771
3      Guest_Popularity_percentage     1.504174
4                log_Number_of_Ads     1.431211
5                    Episode_Title     1.331329
6   log_Host_Popularity_percentage     1.266472
7       Host_Popularity_percentage     1.251279
8                            Genre     0.342562
9                  Publication_Day     0.275798
10                Publication_Time     0.180037
11               Episode_Sentiment     0.145982


In [None]:
# Display the feature frame used for the train/validation split.
train

In [None]:
# Grid-search CatBoost hyperparameters inside a scikit-learn pipeline.
# Every name the cell needs is imported here so it does not depend on another
# cell having been run first.
from catboost import CatBoostRegressor
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error

# Identify column types.
# NOTE: this rebinds the notebook-level `categorical_cols`.
numerical_cols = x_train.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = x_train.select_dtypes(include=['object', 'category']).columns.tolist()

# Step 1: standardize the numeric columns; everything else is passed through untouched.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
    ],
    remainder='passthrough'  # categorical columns go to CatBoost as-is
)

# Step 2: positions of the categorical columns in the transformed array.
# Transformed numeric columns come first and the passthrough columns follow.
# Assumes every non-numeric column is object/category — TODO confirm.
cat_indices = list(range(len(numerical_cols), len(numerical_cols) + len(categorical_cols)))

# Step 3: model and pipeline.
cat_model = CatBoostRegressor(verbose=0)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', cat_model)
])

# Step 4: 2 x 2 x 2 grid evaluated with 3-fold cross-validation on RMSE.
param_grid = {
    'regressor__depth': [6, 8],
    'regressor__learning_rate': [0.05, 0.1],
    'regressor__iterations': [300, 500]
}

grid = GridSearchCV(pipeline, param_grid, cv=3, scoring='neg_root_mean_squared_error', n_jobs=-1)

# Step 5: the transformed data has no column names, so cat features are given by index.
grid.fit(x_train, y_train, regressor__cat_features=cat_indices)

# Step 6: evaluate on the held-out split.
# RMSE is taken as sqrt(MSE), as in the other model cells; the `squared=False`
# keyword of mean_squared_error is deprecated and rejected by newer scikit-learn.
y_pred = grid.predict(x_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("Best Parameters:", grid.best_params_)
print("Test RMSE:", rmse)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Baseline: ordinary least squares.
# x_train still contains string-valued columns, which StandardScaler cannot
# convert to float, so preprocessing is split by dtype: numeric columns are
# median-imputed (LinearRegression rejects NaN) and standardized, all other
# columns are one-hot encoded.
linreg_numeric_cols = x_train.select_dtypes(include=[np.number]).columns.tolist()
linreg_categorical_cols = x_train.select_dtypes(exclude=[np.number]).columns.tolist()

linreg_preprocessor = ColumnTransformer(transformers=[
    ('num',
     Pipeline([('impute', SimpleImputer(strategy='median')), ('scale', StandardScaler())]),
     linreg_numeric_cols),
    # Categories that appear only in the validation rows are encoded as all zeros.
    ('cat', OneHotEncoder(handle_unknown='ignore'), linreg_categorical_cols),
])

pipeline_default = Pipeline([
    ('preprocessor', linreg_preprocessor),
    ('model', LinearRegression())
])

pipeline_default.fit(x_train, y_train)

# Score on the held-out split.
y_pred = pipeline_default.predict(x_test)
rmse_default = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"Default Linear Regression RMSE: {rmse_default:.3f}")


In [None]:

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor

# XGBoost baseline with default hyperparameters.
# The string/category columns are one-hot encoded first, because neither
# StandardScaler nor XGBRegressor accepts raw string columns. The validation
# frame is aligned to the training columns so both matrices have the same layout.
x_train_encoded = pd.get_dummies(x_train)
x_test_encoded = pd.get_dummies(x_test).reindex(columns=x_train_encoded.columns, fill_value=0)

scaler=StandardScaler()
model_x=XGBRegressor()
x_train_scaled=scaler.fit_transform(x_train_encoded)
x_test_scaled=scaler.transform(x_test_encoded)

# Fit and predict on the scaled matrices; the original computed them but then
# trained on the unscaled frame.
model_x.fit(x_train_scaled,y_train)
y_pred=model_x.predict(x_test_scaled)

# Metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Output results
print("Mean Absolute Error (MAE):", round(mae, 2))
print("Mean Squared Error (MSE):", round(mse, 2))
print("Root Mean Squared Error (RMSE):", round(rmse, 2))
print("R² Score:", round(r2, 4))

In [None]:
"""from sklearn.metrics import mean_squared_error, make_scorer


pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures()),
    ('model', LinearRegression())
])

param_grid = {
    'poly__degree': [1, 2, 3],                      # Linear, quadratic, cubic
    'poly__interaction_only': [False, True],        # Whether to use only interaction terms
    'poly__include_bias': [False]                   # Usually False to avoid multicollinearity
}

rmse = make_scorer(mean_squared_error, greater_is_better=False, squared=False)


grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring=rmse,    # our custom RMSE scorer
    cv=5,            # 5-fold cross-validation
    verbose=1,
    n_jobs=1
)

grid_search.fit(x_train, y_train)

best_rmse = -grid_search.best_score_  
best_params = grid_search.best_params_

print(f"✅ Best RMSE: {best_rmse:.4f}")
print(f"🏆 Best Hyperparameters: {best_params}")"""

In [67]:
# Load the competition test set from test.csv.
# This rebinds `test`, which held the training target series until now.
test = pd.read_csv("test.csv")
test

Unnamed: 0,id,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment
0,750000,Educational Nuggets,Episode 73,78.96,Education,38.11,Saturday,Evening,53.33,1.0,Neutral
1,750001,Sound Waves,Episode 23,27.87,Music,71.29,Sunday,Morning,,0.0,Neutral
2,750002,Joke Junction,Episode 11,69.10,Comedy,67.89,Friday,Evening,97.51,0.0,Positive
3,750003,Comedy Corner,Episode 73,115.39,Comedy,23.40,Sunday,Morning,51.75,2.0,Positive
4,750004,Life Lessons,Episode 50,72.32,Lifestyle,58.10,Wednesday,Morning,11.30,2.0,Neutral
...,...,...,...,...,...,...,...,...,...,...,...
249995,999995,Mind & Body,Episode 100,21.05,Health,65.77,Saturday,Evening,96.40,3.0,Negative
249996,999996,Joke Junction,Episode 85,85.50,Comedy,41.47,Saturday,Night,30.52,2.0,Negative
249997,999997,Joke Junction,Episode 63,12.11,Comedy,25.92,Thursday,Evening,73.69,1.0,Neutral
249998,999998,Market Masters,Episode 46,113.46,Business,43.47,Friday,Night,93.59,3.0,Positive


In [68]:
# Feature frame for prediction: the test set without id and podcast name
# (`test` itself keeps the id column for the submission file).
test_prune = test.drop(["id" , "Podcast_Name"] , axis=1 )

In [69]:
# Convert "Episode 73"-style titles into the integer episode number.
# Casting to str and extracting the digit run keeps the cell idempotent:
# re-running it on the already-converted integer column gives the same result
# instead of failing on the `.str` accessor.
test_prune["Episode_Title"] = (
    test_prune["Episode_Title"].astype(str).str.extract(r"(\d+)", expand=False).astype(int)
)


In [70]:
# Add log(1 + x) copies of three numeric columns, in this order.
for source_col in ('Episode_Length_minutes', 'Number_of_Ads', 'Host_Popularity_percentage'):
    test_prune[f'log_{source_col}'] = np.log1p(test_prune[source_col])


In [None]:
# One-hot encode the categorical columns of the prepared test frame.
# The training counterpart (data_ohe) was built after id and Podcast_Name were
# removed, so the encoding starts from test_prune rather than the raw `test`
# frame to keep the two layouts comparable.
test_ohe = pd.get_dummies(test_prune, columns=categorical_cols, prefix=categorical_cols)

In [None]:
# Fill the gaps with the imputer that was already fitted on the training data.
# Refitting a new imputer on the test set would fill the same columns with a
# different model than the one used for training.
test_prune[cols_for_impute] = imputer.transform(test_prune[cols_for_impute])

# The log features were derived before imputation, so they still hold NaN where
# the source value was missing. Recompute the ones whose source was just imputed.
for imputed_col in ('Episode_Length_minutes', 'Number_of_Ads'):
    log_col = f'log_{imputed_col}'
    if log_col in test_prune.columns:
        test_prune[log_col] = np.log1p(test_prune[imputed_col])

In [71]:
# Predict listening time for the prepared test features with the fitted CatBoost `model`.
# NOTE(review): test_prune must carry the same columns, in the same order, as the training features — confirm.
y_pred = model.predict(test_prune)


In [72]:
# Pair every test id with its prediction and write the two-column submission file.
submission = test[['id']].assign(Listening_Time_minutes=y_pred)
submission.to_csv("my_submission.csv", index=False)