In [28]:
import polars as pl
import altair as alt
import numpy as np

# Read in data

In [6]:
df_episodes = pl.read_csv("../data/labeling-app/podcast_episodes.csv")
df_banger = pl.read_csv("../data/labeling-app/episode_types.csv")
df_all = df_banger.join(df_episodes, left_on="episode", right_on="title", how="left")

# Get initial summary

In [7]:
df_all.glimpse()

Rows: 553
Columns: 8
$              <i64> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
$ episode      <str> 'Dorian Finney-Smith Traded to the Lakers', '2024 Year in Review; The Kings Fire Mike Brown', 'Friday Daily Duncs (12/27/24)', 'The Worst Contracts in the NBA; Fox and Butler Drama and the Rest of the News', 'Thursday Daily Duncs (12/26/24)', 'The Christmas Games: LAL @ GSW; SAS @ NYK; MIN @ DAL; PHI @ BOS; DEN @ PHX', 'Tuesday Daily Dunc (12/24/24)', 'Monday Daily Duncs (12/23/24)', '15 in 60 (Eastern Conference 12.23.24)', 'Friday Daily Duncs (12/20/24)'
$ episode_type <str> 'big_picture', 'big_picture', 'daily_duncs', 'big_picture', 'daily_duncs', 'gamer', 'daily_duncs', 'daily_duncs', 'big_picture', 'daily_duncs'
$ banger       <str> 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no', 'no', 'no'
$ date         <str> '2024-12-30 05:37:20', '2024-12-29 07:50:33', '2024-12-28 03:58:33', '2024-12-27 06:58:03', '2024-12-26 18:57:24', '2024-12-26 08:59:56', '2024-12-24 18:07:47', '2024-12-23 23:16:4

# Cast to correct types for modeling

In [45]:
df_types = df_all.select(
    pl.col("episode"),
    pl.col("episode_type", "banger").cast(pl.Categorical),
    pl.col("date").str.strptime(pl.Datetime, format="%Y-%m-%d %H:%M:%S"),
    pl.col("description"),
    pl.col("duration").alias("duration_secs")
)
print(df_types)

shape: (553, 6)
┌─────────────────────┬──────────────┬────────┬──────────────┬─────────────────────┬───────────────┐
│ episode             ┆ episode_type ┆ banger ┆ date         ┆ description         ┆ duration_secs │
│ ---                 ┆ ---          ┆ ---    ┆ ---          ┆ ---                 ┆ ---           │
│ str                 ┆ cat          ┆ cat    ┆ datetime[μs] ┆ str                 ┆ i64           │
╞═════════════════════╪══════════════╪════════╪══════════════╪═════════════════════╪═══════════════╡
│ Dorian Finney-Smith ┆ big_picture  ┆ no     ┆ 2024-12-30   ┆ The Lakers avoid    ┆ 1643          │
│ Traded to …         ┆              ┆        ┆ 05:37:20     ┆ giving up a f…      ┆               │
│ 2024 Year in        ┆ big_picture  ┆ yes    ┆ 2024-12-29   ┆ In an annual        ┆ 5023          │
│ Review; The Kings…  ┆              ┆        ┆ 07:50:33     ┆ tradition, Nate a…  ┆               │
│ Friday Daily Duncs  ┆ daily_duncs  ┆ no     ┆ 2024-12-28   ┆ Full Daily D

# Some ideas for features to predict bangers status
- Rule-based step where if Daily Dunc is in the title the answer is no
- If the title includes Game 1-7 anywhere in it (since playoff recaps tend to be good)
- Decomposing datetime into seasonal trends so if there's any trends over time we capture them
- Say if the episode is over 30 min, because the shorter pods tend not to be bangers
- Can do some exploration of word usage breakdown once we split to train and validation in descriptions for any words that seem to indicate banger episodes vs. not

In [126]:
df_features = df_types.select(
    pl.col("episode").str.contains(r"Game [1-7]").alias("about_playoff_game").cast(pl.Int8),
    pl.col("episode").str.contains(r"H&D").alias("is_hollinger_duncan").cast(pl.Int8),
    pl.col("episode").str.contains(r"Daily Duncs").alias("is_daily_dunc").cast(pl.Int8),
    pl.col("episode").str.contains(r"Mock").alias("is_mock_episode").cast(pl.Int8),
    pl.col("episode").str.contains(r"Awards").alias("is_awards_episode").cast(pl.Int8),
    pl.col("date").dt.year().alias("year"),
    pl.col("date").dt.month().alias("month"),
    pl.col("date").dt.weekday().alias("weekday"),
    pl.col("date").dt.hour().alias("hour"),
    (pl.col("duration_secs") > 1800).alias("longer_thirty_min").cast(pl.Int8),
    pl.col("duration_secs"),
    pl.col("episode").str.contains(r"Celtics").alias("description_contains_celtics").cast(pl.Int8),
).drop_nulls()

In [127]:
df_features.describe()

statistic,about_playoff_game,is_hollinger_duncan,is_daily_dunc,is_mock_episode,is_awards_episode,year,month,weekday,hour,longer_thirty_min,duration_secs,description_contains_celtics
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""",553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0
"""null_count""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",0.074141,0.050633,0.399638,0.009042,0.009042,2023.954792,6.453888,3.399638,11.56962,0.566004,2524.669078,0.005425
"""std""",0.262238,0.219445,0.490267,0.094742,0.094742,0.207948,3.615354,1.664136,8.581617,0.496073,2020.403906,0.073521
"""min""",0.0,0.0,0.0,0.0,0.0,2023.0,1.0,1.0,0.0,0.0,139.0,0.0
"""25%""",0.0,0.0,0.0,0.0,0.0,2024.0,3.0,2.0,4.0,0.0,443.0,0.0
"""50%""",0.0,0.0,0.0,0.0,0.0,2024.0,6.0,3.0,7.0,1.0,2875.0,0.0
"""75%""",0.0,0.0,1.0,0.0,0.0,2024.0,10.0,5.0,20.0,1.0,3981.0,0.0
"""max""",1.0,1.0,1.0,1.0,1.0,2024.0,12.0,7.0,23.0,1.0,8785.0,1.0


In [121]:
df_outcome = df_types.select(pl.col("banger").cast(pl.Categorical))

# Split into train and validation

In [65]:
from sklearn.model_selection import train_test_split

In [130]:
X_train, X_test, y_train, y_test = train_test_split(df_features, df_outcome, 
                                                    test_size=0.25, random_state= 33,
                                                    stratify=df_outcome)

In [131]:
X_train.describe()

statistic,about_playoff_game,is_hollinger_duncan,is_daily_dunc,is_mock_episode,is_awards_episode,year,month,weekday,hour,longer_thirty_min,duration_secs,description_contains_celtics
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""",414.0,414.0,414.0,414.0,414.0,414.0,414.0,414.0,414.0,414.0,414.0,414.0
"""null_count""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",0.077295,0.050725,0.400966,0.007246,0.002415,2023.956522,6.405797,3.439614,11.2343,0.557971,2481.862319,0.007246
"""std""",0.267382,0.2197,0.490687,0.084919,0.049147,0.204178,3.562594,1.664317,8.580099,0.497229,2008.761364,0.084919
"""min""",0.0,0.0,0.0,0.0,0.0,2023.0,1.0,1.0,0.0,0.0,139.0,0.0
"""25%""",0.0,0.0,0.0,0.0,0.0,2024.0,4.0,2.0,3.0,0.0,445.0,0.0
"""50%""",0.0,0.0,0.0,0.0,0.0,2024.0,6.0,3.0,7.0,1.0,2792.0,0.0
"""75%""",0.0,0.0,1.0,0.0,0.0,2024.0,10.0,5.0,20.0,1.0,3985.0,0.0
"""max""",1.0,1.0,1.0,1.0,1.0,2024.0,12.0,7.0,23.0,1.0,8785.0,1.0


# The baseline model is just guessing the majority class

In [118]:
y_train["banger"].value_counts()

banger,count
cat,u32
"""no""",254
"""yes""",160


In [100]:
from sklearn.metrics import accuracy_score

In [77]:
y_pred_baseline = ["no"] * len(y_train)
accuracy = accuracy_score(y_true = y_train, y_pred=y_pred_baseline)

print(f"Baseline accuracy: {round(accuracy, 2)}")

Baseline accuracy: 0.61


# What if we train a simple logistic regression (with elastic net penalty)

In [132]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report

In [129]:
pipeline = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("classifier", LogisticRegression(penalty='elasticnet', solver='saga', 
                                          random_state=33, max_iter=1000))
    ]
)

kfold = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 33)

# Define hyperparameter grid
param_grid = {
    'classifier__l1_ratio': [0.1, 0.5, 0.7, 0.9],  # Elastic Net mixing ratio
    'classifier__C': [0.01, 0.1, 1, 10]           # Regularization strength
}

# GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=kfold, scoring='accuracy', n_jobs=-1)

# Fit the grid search
grid_search.fit(X_train, y_train)

# Display the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

Best Parameters: {'classifier__C': 0.1, 'classifier__l1_ratio': 0.9}
Best Cross-Validation Score: 0.7947693211871878


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [135]:
# Retrieve the best model
best_model = grid_search.best_estimator_
print(best_model)

# Predict on the test set
y_pred = best_model.predict(X_test)

# Evaluate the model
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)
print("\nClassification Report on Test Set:\n")
print(classification_report(y_test, y_pred))

Pipeline(steps=[('scaler', StandardScaler()),
                ('classifier',
                 LogisticRegression(C=0.1, l1_ratio=0.9, max_iter=1000,
                                    penalty='elasticnet', random_state=33,
                                    solver='saga'))])
Best Parameters: {'classifier__C': 0.1, 'classifier__l1_ratio': 0.9}
Best Cross-Validation Score: 0.7947693211871878

Classification Report on Test Set:

              precision    recall  f1-score   support

          no       0.98      0.68      0.81        85
         yes       0.66      0.98      0.79        54

    accuracy                           0.80       139
   macro avg       0.82      0.83      0.80       139
weighted avg       0.86      0.80      0.80       139



# Convert to onnx for faster inference

In [143]:
import skl2onnx
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

# Get the number of features from training data
n_features = X_train.shape[1]

# Define the initial types for the features
initial_types = [('float_input', FloatTensorType([None, n_features]))]

# Convert the model to ONNX
onnx_model = convert_sklearn(
    best_model,
    initial_types=initial_types,
    target_opset=15  # Can adjust the opset version if needed
)

# Save the ONNX model
onnx_model_path = "episode_banger_model.onnx"
with open(onnx_model_path, "wb") as f:
    f.write(onnx_model.SerializeToString())

Original model predictions: ['yes' 'no' 'yes' 'no' 'no']
ONNX model predictions: ['yes' 'no' 'yes' 'no' 'no']


In [150]:
# Optional: Verify the model can be loaded
import onnxruntime as rt
sess = rt.InferenceSession(onnx_model_path)

# Optional: Test inference with the ONNX model
# Make sure X_test is numpy array
X_test_numpy = X_test.to_numpy()
input_name = sess.get_inputs()[0].name
print(f"Input name: {input_name}")
pred_onnx = sess.run(None, {input_name: X_test_numpy.astype(np.float32)})[0]
pred_onnx_probs = sess.run(None, {input_name: X_test_numpy.astype(np.float32)})[1]

print(pred_onnx_probs[0]["yes"])

# Compare predictions with original model
print("Original model predictions:", y_pred[:5])
print("ONNX model predictions:", pred_onnx[:5])
print("ONNX model probabilities:", pred_onnx_probs[:5])

Input name: float_input
0.6986801028251648
Original model predictions: ['yes' 'no' 'yes' 'no' 'no']
ONNX model predictions: ['yes' 'no' 'yes' 'no' 'no']
ONNX model probabilities: [{'no': 0.3013198971748352, 'yes': 0.6986801028251648}, {'no': 0.970174252986908, 'yes': 0.02982574701309204}, {'no': 0.44074806571006775, 'yes': 0.5592519044876099}, {'no': 0.9732498526573181, 'yes': 0.026750147342681885}, {'no': 0.9692041277885437, 'yes': 0.0307958722114563}]
