In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import mean_squared_error



# 1. Load data

In [2]:
# Read both competition splits from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

# Report (rows, columns) for each split.
for label, frame in (("train:", train), ("test:", test)):
    print(label, frame.shape)

train: (750000, 12)
test: (250000, 11)


 # 2. EDA: Basic Exploration

In [3]:
# Number of missing values in every column of the training frame.
print(train.isna().sum())

# Summary statistics of the numeric columns.
display(train.describe())

# Text-valued columns whose cardinality is inspected below.
categorical_features = [
    'Podcast_Name',
    'Episode_Title',
    'Genre',
    'Publication_Day',
    'Publication_Time',
    'Episode_Sentiment',
]

# One line per categorical column: how many distinct values it takes.
for col, n_unique in train[categorical_features].nunique().items():
    print(f"\n{col} — unique:", n_unique)


id                                  0
Podcast_Name                        0
Episode_Title                       0
Episode_Length_minutes          87093
Genre                               0
Host_Popularity_percentage          0
Publication_Day                     0
Publication_Time                    0
Guest_Popularity_percentage    146030
Number_of_Ads                       1
Episode_Sentiment                   0
Listening_Time_minutes              0
dtype: int64


Unnamed: 0,id,Episode_Length_minutes,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads,Listening_Time_minutes
count,750000.0,662907.0,750000.0,603970.0,749999.0,750000.0
mean,374999.5,64.504738,59.859901,52.236449,1.348855,45.437406
std,216506.495284,32.969603,22.873098,28.451241,1.15113,27.138306
min,0.0,0.0,1.3,0.0,0.0,0.0
25%,187499.75,35.73,39.41,28.38,0.0,23.17835
50%,374999.5,63.84,60.05,53.58,1.0,43.37946
75%,562499.25,94.07,79.53,76.6,2.0,64.81158
max,749999.0,325.24,119.46,119.91,103.91,119.97



Podcast_Name — unique: 48

Episode_Title — unique: 100

Genre — unique: 10

Publication_Day — unique: 7

Publication_Time — unique: 4

Episode_Sentiment — unique: 3


# 3. Preprocess

In [7]:
# Numeric columns whose gaps are filled with the mean of the TRAINING column.
# The same training mean is reused for the test set so that no statistic is
# learned from test data.
missing_values_columns = ["Episode_Length_minutes", "Number_of_Ads"]

for col in missing_values_columns:
    mean = train[col].mean()

    # Fill NaNs in BOTH train and test sets with the training mean.
    train[col] = train[col].fillna(mean)
    test[col] = test[col].fillna(mean)

for frame in (train, test):
    # Has_Guest: 1 when a guest popularity value is present, 0 when it is missing.
    # It has to be derived BEFORE the NaNs are overwritten just below. The guard
    # keeps a re-run of this cell from recomputing the flag on the already
    # zero-filled column, which would silently turn every row into 1.
    if 'Has_Guest' not in frame.columns:
        frame['Has_Guest'] = frame['Guest_Popularity_percentage'].notna().astype('int64')

    # A missing guest popularity is encoded as 0.0.
    frame['Guest_Popularity_percentage'] = frame['Guest_Popularity_percentage'].fillna(0.0)

# Episode_Title is not used as a feature. errors='ignore' makes the drop a
# no-op (instead of a KeyError) when the cell is executed a second time.
train = train.drop(columns=['Episode_Title'], errors='ignore')
test = test.drop(columns=['Episode_Title'], errors='ignore')

train.head()




Unnamed: 0,id,Podcast_Name,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes,Has_Guest
1,1,Joke Junction,119.8,Comedy,66.95,Saturday,Afternoon,75.95,2.0,Negative,88.01241,1
2,2,Study Sessions,73.9,Education,69.97,Tuesday,Evening,8.97,0.0,Negative,44.92531,1
3,3,Digital Digest,67.17,Technology,57.22,Monday,Morning,78.7,2.0,Positive,46.27824,1
4,4,Mind & Body,110.51,Health,80.07,Monday,Afternoon,58.68,3.0,Neutral,75.61031,1
5,5,Fitness First,26.54,Health,48.96,Saturday,Afternoon,0.0,3.0,Positive,22.77047,1


In [9]:
# Small constant added to the denominator so a zero episode length does not
# divide by zero.
epsilon = 1e-6

# Add the same two derived columns to both frames.
for frame in (train, test):
    # Host popularity plus guest popularity.
    frame['Popularity_Combined'] = frame['Host_Popularity_percentage'].add(
        frame['Guest_Popularity_percentage']
    )

    # Number of ads relative to the episode length in minutes.
    frame['Ads_per_Minute'] = frame['Number_of_Ads'].div(
        frame['Episode_Length_minutes'] + epsilon
    )

train.head()

Unnamed: 0,id,Podcast_Name,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes,Has_Guest,Popularity_Combined,Ads_per_Minute
1,1,Joke Junction,119.8,Comedy,66.95,Saturday,Afternoon,75.95,2.0,Negative,88.01241,1,142.9,0.016694
2,2,Study Sessions,73.9,Education,69.97,Tuesday,Evening,8.97,0.0,Negative,44.92531,1,78.94,0.0
3,3,Digital Digest,67.17,Technology,57.22,Monday,Morning,78.7,2.0,Positive,46.27824,1,135.92,0.029775
4,4,Mind & Body,110.51,Health,80.07,Monday,Afternoon,58.68,3.0,Neutral,75.61031,1,138.75,0.027147
5,5,Fitness First,26.54,Health,48.96,Saturday,Afternoon,0.0,3.0,Positive,22.77047,1,48.96,0.113037


# 4. Prepare features for training

In [41]:
# Set aside the test ids (needed for the submission file) and the target.
test_ids = test['id']
y_train = train['Listening_Time_minutes']

# Feature frames: 'id' is only an identifier, and the target must not be
# among the inputs.
X_train = train.drop(['id', 'Listening_Time_minutes'], axis=1)
X_test = test.drop(['id'], axis=1)

# Stack train on top of test with a fresh 0..n-1 index so that one-hot
# encoding yields the same columns for both.
combined_df = pd.concat((X_train, X_test), axis=0, ignore_index=True)

print(f"Combined shape before encoding: {combined_df.shape}")

Original y_train sample: 1    88.01241
2    44.92531
3    46.27824
4    75.61031
5    22.77047
Name: Listening_Time_minutes, dtype: float64
Log-transformed y_train sample: 1    4.488776
2    3.827016
3    3.856050
4    4.338732
5    3.168444
Name: Listening_Time_minutes, dtype: float64
Combined shape before encoding: (912906, 12)


In [42]:
# Categorical columns to be expanded into indicator columns.
categorical_features = ['Podcast_Name', 'Genre', 'Publication_Day', 'Publication_Time', 'Episode_Sentiment']

# One-hot encode; drop_first=True omits the first level of every column, which
# is redundant given the remaining indicators.
combined_df_encoded = pd.get_dummies(
    combined_df,
    drop_first=True,
    columns=categorical_features,
)

print(f"Shape after encoding: {combined_df_encoded.shape}")

Shape after encoding: (912906, 74)


In [43]:
# The first len(X_train) rows of the combined frame are the training rows;
# everything after them belongs to the test set.
n_train_rows = len(X_train)
X_train_processed = combined_df_encoded.iloc[:n_train_rows].copy()
X_test_processed = combined_df_encoded.iloc[n_train_rows:].copy()

print(f"Processed Train shape: {X_train_processed.shape}")
print(f"Processed Test shape: {X_test_processed.shape}")
print(f"Target (y_train) shape: {y_train.shape}")

Processed Train shape: (662906, 74)
Processed Test shape: (250000, 74)
Target (y_train) shape: (662906,)


In [44]:
from sklearn.preprocessing import StandardScaler

# Numeric columns that are standardised before modelling.
numerical_cols = (
    # columns read from the CSV files
    [
        'Episode_Length_minutes',
        'Host_Popularity_percentage',
        'Guest_Popularity_percentage',
        'Number_of_Ads',
    ]
    # columns created during preprocessing (Has_Guest is a 0/1 flag)
    + ['Has_Guest', 'Popularity_Combined', 'Ads_per_Minute']
)

In [45]:
# Standardise the numeric columns. The scaler statistics are learned from the
# training rows only and then applied unchanged to the test rows.
scaler = StandardScaler()
scaler.fit(X_train_processed[numerical_cols])

X_train_processed[numerical_cols] = scaler.transform(X_train_processed[numerical_cols])
X_test_processed[numerical_cols] = scaler.transform(X_test_processed[numerical_cols])

print("Scaling complete.")

# Show the first rows of each scaled frame.
for heading, frame in (
    ("\nTraining data head after scaling:", X_train_processed),
    ("\nTest data head after scaling:", X_test_processed),
):
    print(heading)
    display(frame.head())

Scaling complete.

Training data head after scaling:


Unnamed: 0,Episode_Length_minutes,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads,Has_Guest,Popularity_Combined,Ads_per_Minute,Podcast_Name_Brain Boost,Podcast_Name_Business Briefs,Podcast_Name_Business Insights,...,Publication_Day_Saturday,Publication_Day_Sunday,Publication_Day_Thursday,Publication_Day_Tuesday,Publication_Day_Wednesday,Publication_Time_Evening,Publication_Time_Morning,Publication_Time_Night,Episode_Sentiment_Neutral,Episode_Sentiment_Positive
0,1.67725,0.309402,1.020754,0.560668,0.0,1.010454,-0.336258,False,False,False,...,True,False,False,False,False,False,False,False,False,False
1,0.284993,0.441463,-1.021645,-1.169355,0.0,-0.583315,-0.643922,False,False,False,...,False,False,False,True,False,True,False,False,False,False
2,0.080856,-0.116082,1.104609,0.560668,0.0,0.836525,-0.095193,False,False,False,...,False,False,False,False,False,False,True,False,False,True
3,1.395462,0.883127,0.494145,1.42568,0.0,0.907043,-0.143631,False,False,False,...,False,False,False,False,False,False,False,False,True,False
4,-1.15155,-0.477284,-1.295165,1.42568,0.0,-1.330364,1.439242,False,False,False,...,True,False,False,False,False,False,False,False,False,True



Test data head after scaling:


Unnamed: 0,Episode_Length_minutes,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads,Has_Guest,Popularity_Combined,Ads_per_Minute,Podcast_Name_Brain Boost,Podcast_Name_Business Briefs,Podcast_Name_Business Insights,...,Publication_Day_Saturday,Publication_Day_Sunday,Publication_Day_Thursday,Publication_Day_Tuesday,Publication_Day_Wednesday,Publication_Time_Evening,Publication_Time_Morning,Publication_Time_Night,Episode_Sentiment_Neutral,Episode_Sentiment_Positive
662906,0.0,-0.951744,0.33101,0.0,0.0,-0.271838,-0.257698,False,False,False,...,True,False,False,False,False,True,False,False,True,False
662907,1.67725,0.499186,-1.295165,0.560668,-1.0,-0.77394,-0.336258,False,False,False,...,False,True,False,False,False,False,True,False,True,False
662908,0.284993,0.350507,1.678176,-1.169355,0.0,1.571114,-0.643922,False,False,False,...,False,False,False,False,False,True,False,False,False,True
662909,0.080856,-1.594998,0.282831,0.560668,0.0,-0.677755,-0.095193,False,False,False,...,False,True,False,False,False,False,True,False,False,True
662910,1.395462,-0.0776,-0.950597,1.42568,0.0,-0.821035,-0.143631,False,False,False,...,False,False,False,False,True,False,True,False,True,False


# 5. Train, validate, and predict

In [46]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

# Hold out 20% of the processed training rows for validation; the fixed
# random_state keeps the split reproducible.
split_parts = train_test_split(
    X_train_processed,
    y_train,
    test_size=0.2,
    random_state=42,
)
X_train_split, X_val_split, y_train_split, y_val_split = split_parts

print(f"Training features shape: {X_train_split.shape}")
print(f"Validation features shape: {X_val_split.shape}")

Training features shape: (530324, 74)
Validation features shape: (132582, 74)


In [47]:
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Uses the hold-out split created earlier:
# X_train_split, X_val_split, y_train_split, y_val_split

# Model configuration.
cb_params = dict(
    iterations=1000,          # upper bound on the number of boosting rounds
    learning_rate=0.05,
    loss_function='RMSE',     # objective optimised during training
    eval_metric='RMSE',       # metric reported on the eval set
    random_seed=42,
    logging_level='Silent',   # suppress per-iteration training logs
)
cb = CatBoostRegressor(**cb_params)

print("CatBoost Regressor initialized.")

# Fit with the validation split as eval set; training stops early once the
# eval metric has not improved for 100 consecutive rounds.
print("Starting model training...")
cb.fit(
    X_train_split,
    y_train_split,
    eval_set=(X_val_split, y_val_split),
    early_stopping_rounds=100,
)
print("Model training complete.")

CatBoost Regressor initialized.
Starting model training...
Model training complete.


In [48]:
# Score the fitted model on the hold-out rows.
val_predictions = cb.predict(X_val_split)

# RMSE is the square root of the mean squared error.
val_mse = mean_squared_error(y_val_split, val_predictions)
val_rmse = np.sqrt(val_mse)

print(f"\nValidation RMSE: {val_rmse:.4f}")


New Validation RMSE (after log transform): 10.4890


In [49]:
# Unconstrained model output for every processed test row; the listening-time
# constraint is applied separately.
print("Making predictions on the test set...")

test_predictions_raw = cb.predict(X_test_processed)

print("Predictions complete.")

Making predictions on the test set...
Predictions complete.


In [51]:
# Constrain the raw predictions to a plausible range:
#   * never below 0 minutes (a listening time cannot be negative), and
#   * never above the length of the episode itself.
#
# X_test holds the unscaled (but imputed) episode lengths in the same row
# order as X_test_processed, so the two arrays line up position by position.
# NOTE(review): for rows whose length was missing, the value here is the
# training-mean imputation, so the cap for those rows is that mean rather
# than a real episode length - confirm this is intended.
original_test_length = X_test['Episode_Length_minutes'].to_numpy()

assert len(original_test_length) == len(test_predictions_raw), (
    "episode lengths and predictions must cover the same test rows"
)

# np.clip bounds each prediction element-wise by [0, episode length].
test_predictions_final = np.clip(test_predictions_raw, 0.0, original_test_length)

print("Listening time constraint applied.")

Listening time constraint applied.


In [34]:
# Pair every test id with its constrained prediction.
submission_columns = {
    'id': test_ids,
    'Listening_Time_minutes': test_predictions_final,
}
submission = pd.DataFrame(submission_columns)

# Write the submission file without the DataFrame index.
submission.to_csv('submission.csv', index=False)

print("\nSubmission file 'submission.csv' created successfully!")
print("Here's a preview:")
display(submission.head())


Submission file 'submission.csv' created successfully!
Here's a preview:


Unnamed: 0,id,Listening_Time_minutes
0,750000,44.404811
1,750001,87.025726
2,750002,54.106034
3,750003,46.544672
4,750004,69.699269
