In [112]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score

In [113]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [114]:
train_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Data/train.csv')

In [115]:
train_data.shape

(750000, 12)

In [116]:
train_data.describe()

Unnamed: 0,id,Episode_Length_minutes,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads,Listening_Time_minutes
count,750000.0,662907.0,750000.0,603970.0,749999.0,750000.0
mean,374999.5,64.504738,59.859901,52.236449,1.348855,45.437406
std,216506.495284,32.969603,22.873098,28.451241,1.15113,27.138306
min,0.0,0.0,1.3,0.0,0.0,0.0
25%,187499.75,35.73,39.41,28.38,0.0,23.17835
50%,374999.5,63.84,60.05,53.58,1.0,43.37946
75%,562499.25,94.07,79.53,76.6,2.0,64.81158
max,749999.0,325.24,119.46,119.91,103.91,119.97


In [117]:
#count nulls in each column
train_data.isnull().sum()


Unnamed: 0,0
id,0
Podcast_Name,0
Episode_Title,0
Episode_Length_minutes,87093
Genre,0
Host_Popularity_percentage,0
Publication_Day,0
Publication_Time,0
Guest_Popularity_percentage,146030
Number_of_Ads,1


In [118]:
# Replace null values with Median
train_data['Episode_Length_minutes'].fillna(train_data['Episode_Length_minutes'].median(), inplace=True)
train_data['Guest_Popularity_percentage'].fillna(train_data['Guest_Popularity_percentage'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['Episode_Length_minutes'].fillna(train_data['Episode_Length_minutes'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['Guest_Popularity_percentage'].fillna(train_data['Guest_Popularity_percentage'].median(), inplace=True)


In [119]:
# remove id column
train_data = train_data.drop('id', axis=1)
train_data = train_data.drop('Podcast_Name', axis=1)

In [120]:
# Remove the word 'Episode' from the Episode_Title column
train_data['Episode_Title'] = train_data['Episode_Title'].str.replace('Episode', '', regex=False)

In [121]:
train_data.tail()

Unnamed: 0,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes
749995,25,75.66,Education,69.36,Saturday,Morning,53.58,0.0,Negative,56.87058
749996,21,75.75,Business,35.21,Saturday,Night,53.58,2.0,Neutral,45.46242
749997,51,30.98,Lifestyle,78.58,Thursday,Morning,84.89,0.0,Negative,15.26
749998,47,108.98,Lifestyle,45.39,Thursday,Morning,93.27,0.0,Negative,100.72939
749999,99,24.1,Sports,22.45,Saturday,Night,36.72,0.0,Neutral,11.94439


In [122]:
train_data.dtypes

Unnamed: 0,0
Episode_Title,object
Episode_Length_minutes,float64
Genre,object
Host_Popularity_percentage,float64
Publication_Day,object
Publication_Time,object
Guest_Popularity_percentage,float64
Number_of_Ads,float64
Episode_Sentiment,object
Listening_Time_minutes,float64


In [123]:
# one hot encode Genre, puplication day ,publication time ,episode sentiment
train_data = pd.get_dummies(train_data, columns=['Genre', 'Publication_Day', 'Publication_Time', 'Episode_Sentiment'], drop_first=True)

In [124]:
X = train_data.drop(columns=['Listening_Time_minutes'])
y = train_data['Listening_Time_minutes']

# Split the data: 80% training, 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")

Training set size: 600000 samples
Testing set size: 150000 samples


In [125]:
train_data.columns

Index(['Episode_Title', 'Episode_Length_minutes', 'Host_Popularity_percentage',
       'Guest_Popularity_percentage', 'Number_of_Ads',
       'Listening_Time_minutes', 'Genre_Comedy', 'Genre_Education',
       'Genre_Health', 'Genre_Lifestyle', 'Genre_Music', 'Genre_News',
       'Genre_Sports', 'Genre_Technology', 'Genre_True Crime',
       'Publication_Day_Monday', 'Publication_Day_Saturday',
       'Publication_Day_Sunday', 'Publication_Day_Thursday',
       'Publication_Day_Tuesday', 'Publication_Day_Wednesday',
       'Publication_Time_Evening', 'Publication_Time_Morning',
       'Publication_Time_Night', 'Episode_Sentiment_Neutral',
       'Episode_Sentiment_Positive'],
      dtype='object')

In [126]:
# Ensure we only select columns that exist in the DataFrame
numerical_cols = [col for col in ['Episode_Number', 'Episode_Length_minutes',
                   'Host_Popularity_percentage', 'Guest_Popularity_percentage',
                   'Number_of_Ads'] if col in X.columns]
categorical_cols = [col for col in X.columns if col not in numerical_cols]

# Scale numerical features
scaler = StandardScaler()
X_train_numerical = scaler.fit_transform(X_train[numerical_cols])
X_test_numerical = scaler.transform(X_test[numerical_cols])

# Create DataFrames maintaining original indices
X_train_num_df = pd.DataFrame(X_train_numerical,
                             columns=numerical_cols,
                             index=X_train.index)
X_test_num_df = pd.DataFrame(X_test_numerical,
                            columns=numerical_cols,
                            index=X_test.index)

# Combine features without resetting index
X_train_processed = pd.concat([X_train_num_df, X_train[categorical_cols]], axis=1)
X_test_processed = pd.concat([X_test_num_df, X_test[categorical_cols]], axis=1)

# Convert to float - now safer with explicit handling
try:
    X_train_processed = X_train_processed.astype(float)
    X_test_processed = X_test_processed.astype(float)
except ValueError as e:
    print(f"Error converting to float: {e}")
    # Handle non-float columns here

In [89]:
import lightgbm as lgb
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Define the model with specified hyperparameters
model = XGBRegressor(
    learning_rate=0.1,
    max_depth=7,
    n_estimators=200,
    subsample=0.7,
    colsample_bytree=0.7
)

# Fit the model
model.fit(X_train_processed, y_train)

# Make predictions
y_pred = model.predict(X_test_processed)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")


Mean Squared Error: 168.72895445029815
R-squared: 0.7706949086718065


In [92]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# --- Load and Preprocess Test Data ---

# Load test data
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Data/test.csv')

# Drop irrelevant columns
df = df.drop(["id", "Podcast_Name"], axis=1)

# Optional: Clean "Episode_Title" (if needed)
df["Episode_Title"] = df["Episode_Title"].str.replace("Episode ", "", regex=False)

# Handle missing values
numerical_cols = [
    "Episode_Length_minutes",
    "Host_Popularity_percentage",
    "Guest_Popularity_percentage",
    "Number_of_Ads"
]
for col in numerical_cols:
    median_val = df[col].median()
    df[col] = df[col].fillna(median_val)

# One-hot encoding for categorical columns
categorical_cols = ["Genre", "Publication_Day", "Publication_Time", "Episode_Sentiment"]
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

numerical_cols = ["Episode_Title"] + numerical_cols  # Corrected to Episode_Number
scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# --- Prediction Function for XGBRegressor ---

def predict_listening_time_xgb(test_df, trained_model):
    """
    Predict Listening_Time_minutes using a trained XGBRegressor model.

    Args:
        test_df (pd.DataFrame): Test features (already preprocessed).
        trained_model (XGBRegressor): Already trained model.

    Returns:
        pd.DataFrame: Predicted results with ID and Listening_Time_minutes.
    """
    # Convert DataFrame to NumPy array
    X_test = test_df.to_numpy()

    # Predict
    y_pred = trained_model.predict(X_test)

    # Create IDs starting from 750000
    ids = np.arange(750000, 750000 + len(y_pred))

    # Create output DataFrame
    results_df = pd.DataFrame({
        'id': ids,
        'Listening_Time_minutes': y_pred
    })

    # Save predictions
    results_df.to_csv("predictions_xgb.csv", index=False)

    return results_df

# --- Usage after model training ---

# model is already trained in your earlier block:
model.fit(X_train_processed, y_train)

# Now just run prediction
predictions_df = predict_listening_time_xgb(df, model)
print(predictions_df.head())


       id  Listening_Time_minutes
0  750000               65.257416
1  750001               19.698502
2  750002               13.013015
3  750003               65.067017
4  750004               45.398121
