In [1]:
# -*- coding: utf-8 -*-
"""
Toy Example: Basic End-to-End ML Workflow (Classification)

Demonstrates the key steps for a timed ML technical interview scenario:
1. Load & Inspect Data
2. Basic EDA & Preprocessing Strategy
3. Train/Test Split
4. Preprocessing (Imputation, Encoding, Scaling) using Pipelines
5. Train Baseline Model
6. Evaluate Model
"""

'\nToy Example: Basic End-to-End ML Workflow (Classification)\n\nDemonstrates the key steps for a timed ML technical interview scenario:\n1. Load & Inspect Data\n2. Basic EDA & Preprocessing Strategy\n3. Train/Test Split\n4. Preprocessing (Imputation, Encoding, Scaling) using Pipelines\n5. Train Baseline Model\n6. Evaluate Model\n'

In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, mean_squared_error, r2_score, adjusted_rand_score

In [4]:
# --- 1. Create & Load Toy Dataset ---
# In a real scenario, you'd use pd.read_csv() or similar

df = pd.read_csv("../data/kaggle_toy.csv")


In [6]:
print("--- Initial Data ---")
print(df.head())
print("\n--- Data Info ---")
print(df.info())
print("\n--- Basic Statistics (Numerical) ---")
print(df.describe())
print("\n--- Basic Statistics (Categorical) ---")
print(df.describe(include='object'))
print("\n--- Missing Values ---")
print(df.isnull().sum())

--- Initial Data ---
   id     Podcast_Name Episode_Title  Episode_Length_minutes       Genre  \
0   0  Mystery Matters    Episode 98                     NaN  True Crime   
1   1    Joke Junction    Episode 26                  119.80      Comedy   
2   2   Study Sessions    Episode 16                   73.90   Education   
3   3   Digital Digest    Episode 45                   67.17  Technology   
4   4      Mind & Body    Episode 86                  110.51      Health   

   Host_Popularity_percentage Publication_Day Publication_Time  \
0                       74.81        Thursday            Night   
1                       66.95        Saturday        Afternoon   
2                       69.97         Tuesday          Evening   
3                       57.22          Monday          Morning   
4                       80.07          Monday        Afternoon   

   Guest_Popularity_percentage  Number_of_Ads Episode_Sentiment  \
0                          NaN            0.0          Pos

In [8]:
df["Episode_Sentiment"].value_counts()

Episode_Sentiment
Neutral     3426
Negative    3345
Positive    3229
Name: count, dtype: int64

In [17]:
# Identify feature types (crucial for ColumnTransformer)
# Note: In a real scenario with many columns, you might do this programmatically
numerical_features = ['Guest_Popularity_percentage', 'Host_Popularity_percentage', "Number_of_Ads", "Episode_Length_minutes"]
# One-Hot Encode 'Department' as it has no inherent order
nominal_features = ['Genre', 'Publication_Day', 'Publication_Time']
# Ordinal Encode 'ExperienceLevel' as it has a clear order
ordinal_features = ['Episode_Sentiment']
# Define the order for ordinal features
experience_order = ['Negative', 'Neutral', 'Positive']
removal_features = ["id","Podcast_Name","Episode_Title"]


target = "Listening_Time_minutes"
# --- 2. Define Features and Target ---
X = df.drop(target, axis=1)
X = df.drop(removal_features, axis = 1)
y = df[target]


In [18]:
# --- 3. Train/Test Split ---
# Split *before* applying preprocessing that learns from data (like scaling or imputation)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # Stratify for classification is good practice
print(f"\n--- Data Split ---")
print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")



--- Data Split ---
Training set shape: (8000, 9)
Test set shape: (2000, 9)


In [19]:

# --- 4. Preprocessing Pipelines ---
# Create pipeline for numerical features: Impute missing values with median, then scale
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Create pipeline for nominal categorical features: Impute missing with most frequent, then one-hot encode
nominal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse_output = False, drop = "first", handle_unknown='ignore')) # handle_unknown='ignore' is safer for unseen values in test set
])

# Create pipeline for ordinal categorical features: Impute missing with most frequent, then ordinal encode
ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(categories=[experience_order])) # Pass the defined order
])



# Use ColumnTransformer to apply different transformers to different columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('nom', nominal_transformer, nominal_features),
        ('ord', ordinal_transformer, ordinal_features)
    ],
    remainder='passthrough' # Keep other columns (if any) - 'drop' is also common
).set_output(transform="pandas")


In [20]:
X_train_tr = preprocessor.fit_transform(X_train)
X_train_tr

Unnamed: 0,num__Guest_Popularity_percentage,num__Host_Popularity_percentage,num__Number_of_Ads,num__Episode_Length_minutes,nom__Genre_Comedy,nom__Genre_Education,nom__Genre_Health,nom__Genre_Lifestyle,nom__Genre_Music,nom__Genre_News,...,nom__Publication_Day_Saturday,nom__Publication_Day_Sunday,nom__Publication_Day_Thursday,nom__Publication_Day_Tuesday,nom__Publication_Day_Wednesday,nom__Publication_Time_Evening,nom__Publication_Time_Morning,nom__Publication_Time_Night,ord__Episode_Sentiment,remainder__Listening_Time_minutes
9254,-0.014637,-0.147502,1.495380,-1.362223,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,7.02668
1561,0.420455,-1.246593,-1.227205,-0.925938,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,25.01009
1670,0.527362,-0.837750,1.495380,-0.018595,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,23.97471
6087,-1.548269,1.032923,-0.319677,1.295088,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,82.81752
6669,-0.210370,1.349559,-1.227205,-0.834496,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,33.35078
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5734,-0.667079,-0.112707,-0.319677,1.101255,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,65.31980
5191,0.030562,-0.182732,-0.319677,-0.018595,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,95.50000
5390,0.030562,1.068588,-1.227205,1.576499,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,69.64095
860,-1.210257,1.579206,1.495380,-0.798756,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,27.47442


In [30]:
# --- 5. Create Full Pipeline with Model ---
# Choose a simple baseline model
models = {
    "lr" : LinearRegression(),
    "rf" : RandomForestRegressor(random_state=42),
    "SVR" : SVR(),
}



for name, model in models.items():
# Chain the preprocessor and the model into a single pipeline
    full_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])

    print(f"\n--- Starting {name} Model Training ---")
    full_pipeline.fit(X_train, y_train)
    print(f"--- {name} Model Training Complete ---")
    print("Evaluating")
    y_pred = full_pipeline.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)

    print(mse)



--- Starting lr Model Training ---
--- lr Model Training Complete ---
Evaluating
3.7066658918417053e-28

--- Starting rf Model Training ---
--- rf Model Training Complete ---
Evaluating
0.00031213054222578736

--- Starting SVR Model Training ---
--- SVR Model Training Complete ---
Evaluating
0.17093510220317884


In [None]:

# Predict on the test data
y_pred = full_pipeline.predict(X_test)
y_pred_proba = full_pipeline.predict_proba(X_test)[:, 1] # Get probabilities for AUC if needed

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on Test Set: {accuracy:.4f}")

print("\nClassification Report:")
# Note: With a tiny dataset like this, the report might look sparse or have warnings
print(classification_report(y_test, y_pred, zero_division=0))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


In [32]:

# --- 8. Interpretation & Next Steps (Simulated) ---
print("\n--- Summary & Next Steps ---")
print(f"Successfully trained a baseline Logistic Regression model.")
print(f"Achieved an accuracy of {mse:.4f} on the unseen test data.")
print("Next steps if more time allowed:")
print("- More detailed EDA (visualizations, outlier detection/handling).")
print("- Experiment with different imputation strategies (e.g., KNNImputer).")
print("- Feature engineering (e.g., interaction terms like Age*Salary).")
print("- Try more complex models (e.g., RandomForest, GradientBoosting).")
print("- Hyperparameter tuning using GridSearchCV or RandomizedSearchCV with Cross-Validation.")
print("- Deeper error analysis (examining misclassified examples).")


--- Summary & Next Steps ---
Successfully trained a baseline Logistic Regression model.
Achieved an accuracy of 0.1709 on the unseen test data.
Next steps if more time allowed:
- More detailed EDA (visualizations, outlier detection/handling).
- Experiment with different imputation strategies (e.g., KNNImputer).
- Feature engineering (e.g., interaction terms like Age*Salary).
- Try more complex models (e.g., RandomForest, GradientBoosting).
- Hyperparameter tuning using GridSearchCV or RandomizedSearchCV with Cross-Validation.
- Deeper error analysis (examining misclassified examples).
