In [1]:
# -*- coding: utf-8 -*-
"""
Toy Example: Basic End-to-End ML Workflow (Classification)

Demonstrates the key steps for a timed ML technical interview scenario:
1. Load & Inspect Data
1b. Univariate Visualization (NEW)
2. Basic EDA & Preprocessing Strategy
3. Train/Test Split
4. Preprocessing (Imputation, Encoding, Scaling) using Pipelines
5. Train Baseline Model with Cross-Validation (UPDATED)
6. Evaluate Model on Test Set
"""

'\nToy Example: Basic End-to-End ML Workflow (Classification)\n\nDemonstrates the key steps for a timed ML technical interview scenario:\n1. Load & Inspect Data\n1b. Univariate Visualization (NEW)\n2. Basic EDA & Preprocessing Strategy\n3. Train/Test Split\n4. Preprocessing (Imputation, Encoding, Scaling) using Pipelines\n5. Train Baseline Model with Cross-Validation (UPDATED)\n6. Evaluate Model on Test Set\n'

In [27]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Set plot style
sns.set_style("whitegrid")

In [4]:
# --- 1. Create & Load Toy Dataset ---
df = pd.read_csv("../data/kaggle_toy.csv")

In [6]:
print("--- Initial Data ---")
print(df.head())
print("\n--- Data Info ---")
print(df.info())
print("\n--- Basic Statistics (Numerical) ---")
print(df.describe())
print("\n--- Basic Statistics (Categorical) ---")
print(df.describe(include='object'))
print("\n--- Missing Values ---")
print(df.isnull().sum())
print("Duplicated:")
print(df.duplicated().sum())



--- Initial Data ---
   id     Podcast_Name Episode_Title  Episode_Length_minutes       Genre  \
0   0  Mystery Matters    Episode 98                     NaN  True Crime   
1   1    Joke Junction    Episode 26                  119.80      Comedy   
2   2   Study Sessions    Episode 16                   73.90   Education   
3   3   Digital Digest    Episode 45                   67.17  Technology   
4   4      Mind & Body    Episode 86                  110.51      Health   

   Host_Popularity_percentage Publication_Day Publication_Time  \
0                       74.81        Thursday            Night   
1                       66.95        Saturday        Afternoon   
2                       69.97         Tuesday          Evening   
3                       57.22          Monday          Morning   
4                       80.07          Monday        Afternoon   

   Guest_Popularity_percentage  Number_of_Ads Episode_Sentiment  \
0                          NaN            0.0          Pos

In [9]:

# Identify feature types for ColumnTransformer
numerical_features = ['Guest_Popularity_percentage', 'Host_Popularity_percentage', "Number_of_Ads", "Episode_Length_minutes"]
nominal_features = ['Genre', 'Publication_Day', 'Publication_Time']
ordinal_features = ['Episode_Sentiment']
experience_order = ['Negative', 'Neutral', 'Positive']
remove_features = ["id","Podcast_Name","Episode_Title"]
target = "Listening_Time_minutes"

# --- 2. Define Features and Target (for modeling) ---
X = df.drop([target] + remove_features, axis=1)
y = df[target]

In [None]:
# --- 1b. Univariate Visualization (NEW SECTION) ---
print("\n--- Generating Univariate Visualizations ---")


# Plot numerical features
print("Plotting numerical feature distributions...")
for col in numerical_features:
    sns.histplot(df[col], kde=True)
    plt.show()

# Plot categorical features
print("Plotting categorical feature distributions...")
for col in nominal_features + ordinal_features:
    sns.countplot(data=df, x=col, order=df[col].value_counts().index) # Order bars by frequency
    plt.show()

In [12]:
# --- 3. Train/Test Split ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)
print(f"\n--- Data Split ---")
print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")


--- Data Split ---
Training set shape: (8000, 8)
Test set shape: (2000, 8)


In [29]:
# --- 4. Preprocessing Pipelines ---
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),

])

nominal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse_output= False, drop = "first", handle_unknown='ignore'))
])

ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(categories=[experience_order]))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('nom', nominal_transformer, nominal_features),
        ('ord', ordinal_transformer, ordinal_features)
    ],
    remainder='passthrough'
).set_output(transform="pandas")

X_train_tr = preprocessor.fit_transform(X_train)
X_train_tr

Unnamed: 0,num__Guest_Popularity_percentage,num__Host_Popularity_percentage,num__Number_of_Ads,num__Episode_Length_minutes,nom__Genre_Comedy,nom__Genre_Education,nom__Genre_Health,nom__Genre_Lifestyle,nom__Genre_Music,nom__Genre_News,...,nom__Publication_Day_Monday,nom__Publication_Day_Saturday,nom__Publication_Day_Sunday,nom__Publication_Day_Thursday,nom__Publication_Day_Tuesday,nom__Publication_Day_Wednesday,nom__Publication_Time_Evening,nom__Publication_Time_Morning,nom__Publication_Time_Night,ord__Episode_Sentiment
9254,-0.014637,-0.147502,1.495380,-1.362223,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0
1561,0.420455,-1.246593,-1.227205,-0.925938,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1670,0.527362,-0.837750,1.495380,-0.018595,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0
6087,-1.548269,1.032923,-0.319677,1.295088,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0
6669,-0.210370,1.349559,-1.227205,-0.834496,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5734,-0.667079,-0.112707,-0.319677,1.101255,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
5191,0.030562,-0.182732,-0.319677,-0.018595,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
5390,0.030562,1.068588,-1.227205,1.576499,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
860,-1.210257,1.579206,1.495380,-0.798756,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [26]:
X_train_tr.columns

Index(['num__Guest_Popularity_percentage', 'num__Host_Popularity_percentage',
       'num__Number_of_Ads', 'num__Episode_Length_minutes',
       'nom__Genre_Comedy', 'nom__Genre_Education', 'nom__Genre_Health',
       'nom__Genre_Lifestyle', 'nom__Genre_Music', 'nom__Genre_News',
       'nom__Genre_Sports', 'nom__Genre_Technology', 'nom__Genre_True Crime',
       'nom__Publication_Day_Monday', 'nom__Publication_Day_Saturday',
       'nom__Publication_Day_Sunday', 'nom__Publication_Day_Thursday',
       'nom__Publication_Day_Tuesday', 'nom__Publication_Day_Wednesday',
       'nom__Publication_Time_Evening', 'nom__Publication_Time_Morning',
       'nom__Publication_Time_Night', 'ord__Episode_Sentiment'],
      dtype='object')

In [42]:
from sklearn.linear_model import LinearRegression, HuberRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.dummy import DummyRegressor

from sklearn.metrics import mean_squared_error


# --- 5. Create Full Pipeline & Train with Cross-Validation (UPDATED SECTION) ---
selector_estimator = DecisionTreeRegressor(random_state=42, max_depth=3)


models = {
    "dummy" : DummyRegressor(),
    "lr" : LinearRegression(),
    "hub" : HuberRegressor(),
    "rf" : RandomForestRegressor(random_state=42),
    "SVR" : SVR(),
}
scoring = "neg_root_mean_squared_error"

for name, model in models.items():
    full_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('selector', SelectFromModel(estimator=selector_estimator)),
        ('classifier', model)
    ])
    print("="*20, name, "="*20)
    print("\nStarting CV")
    cv_scores = cross_val_score(full_pipeline, X_train, y_train, cv=3, scoring=scoring)

    print(f"CV Scores: {cv_scores}")
    print(f"Mean CV: {np.mean(cv_scores):.4f}")
    print(f"Std CV Accuracy: {np.std(cv_scores):.4f}")


    print(f"\n Fit Model on Entire Training Set ---")
    full_pipeline.fit(X_train, y_train)

    print("\nEvaluation")
    y_pred = full_pipeline.predict(X_test)
    print(mean_squared_error(y_true=y_test, y_pred=y_pred))
# y_pred_proba = full_pipeline.predict_proba(X_test)[:, 1]

# accuracy = accuracy_score(y_test, y_pred)
# print(f"Accuracy on Test Set: {accuracy:.4f}")

# print("\nClassification Report:")
# print(classification_report(y_test, y_pred, zero_division=0))

# print("\nConfusion Matrix:")
# print(confusion_matrix(y_test, y_pred))




Starting CV
CV Scores: [-27.32602897 -27.58588296 -27.08921934]
Mean CV: -27.3337
Std CV Accuracy: 0.2028

 Fit Model on Entire Training Set ---

Evaluation
732.1642938348867

Starting CV
CV Scores: [-14.59870962 -13.36672894 -13.87347052]
Mean CV: -13.9463
Std CV Accuracy: 0.5056

 Fit Model on Entire Training Set ---

Evaluation
194.1974516566448

Starting CV
CV Scores: [-14.59629896 -13.37134884 -13.87357118]
Mean CV: -13.9471
Std CV Accuracy: 0.5028

 Fit Model on Entire Training Set ---

Evaluation
194.19128398133472

Starting CV
CV Scores: [-16.02932427 -14.69789028 -15.2005344 ]
Mean CV: -15.3092
Std CV Accuracy: 0.5490

 Fit Model on Entire Training Set ---

Evaluation
244.23229236016797

Starting CV
CV Scores: [-14.64162535 -13.50041162 -13.9180414 ]
Mean CV: -14.0200
Std CV Accuracy: 0.4714

 Fit Model on Entire Training Set ---

Evaluation
195.76628306456533


In [36]:
# --- 7. Interpretation & Next Steps (Simulated) ---
print("\n--- Summary & Next Steps ---")
print(f"Successfully trained and evaluated a baseline Logistic Regression model.")
print(f"Cross-validation on the training set yielded a mean accuracy of ___.")
print(f"Achieved an accuracy of ____ on the unseen test data.")
print("Next steps if more time allowed:")
print("- More detailed EDA (bivariate analysis, correlations).")
print("- Experiment with different imputation strategies.")
print("- Feature engineering.")
print("- Try more complex models & hyperparameter tuning (guided by CV results).")
print("- Deeper error analysis.")



--- Summary & Next Steps ---
Successfully trained and evaluated a baseline Logistic Regression model.
Cross-validation on the training set yielded a mean accuracy of ___.
Achieved an accuracy of ____ on the unseen test data.
Next steps if more time allowed:
- More detailed EDA (bivariate analysis, correlations).
- Experiment with different imputation strategies.
- Feature engineering.
- Try more complex models & hyperparameter tuning (guided by CV results).
- Deeper error analysis.
