# Download Libraries 
To ensure you have the right libraries, it is recommended that you clone the "capstone_env" conda environment, as differences in library versions can cause issues when running the Streamlit application.

For a deeper understanding of these models please see the 'calvin_dev' notebook in the dev repository. 

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import (classification_report, accuracy_score, 
                             confusion_matrix, mean_absolute_error, 
                             mean_squared_error, r2_score)
import joblib

It is important that you have scikit-learn version 1.5.2 installed. You can check the installed version below.

In [2]:
!pip show scikit-learn

Name: scikit-learn
Version: 1.5.2
Summary: A set of python modules for machine learning and data mining
Home-page: https://scikit-learn.org
Author: 
Author-email: 
License: BSD 3-Clause License
        
        Copyright (c) 2007-2024 The scikit-learn developers.
        All rights reserved.
        
        Redistribution and use in source and binary forms, with or without
        modification, are permitted provided that the following conditions are met:
        
        * Redistributions of source code must retain the above copyright notice, this
          list of conditions and the following disclaimer.
        
        * Redistributions in binary form must reproduce the above copyright notice,
          this list of conditions and the following disclaimer in the documentation
          and/or other materials provided with the distribution.
        
        * Neither the name of the copyright holder nor the names of its
          contributors may be used to endorse or promote produ

# Load the Datasets

In [3]:
# Input CSV locations (relative paths).
# NOTE(review): the campaign file is spelled 'anan_' while the device file is
# spelled 'anon_' — confirm the filename on disk before renaming anything.
device_csv_path = 'data/anon_processed_unique_device_v3.csv'
campaign_csv_path = 'data/anan_campaign_modeling_data_v3.csv'

# Per-device records and campaign-level modeling data, respectively.
individual_device_data = pd.read_csv(device_csv_path)
campaign_level_data = pd.read_csv(campaign_csv_path)

# Build Important Functions and Variables

In [4]:
def normalize_genre_columns(data, genre_columns):
    """Convert each row's genre values into fractions of that row's genre total.

    The frame is modified in place (only the ``genre_columns`` are rewritten)
    and the same object is returned, so pass a copy to keep the original.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column named in ``genre_columns``.
    genre_columns : list of str
        Columns to normalize.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself, with each genre column divided by the row-wise sum of
        all genre columns. Results that come out as NaN (for example a row
        whose genre values are all zero) are replaced with 0.
    """
    # Keep the row totals in a standalone Series instead of a temporary
    # 'genre_sum' column, so an existing column with that name is not
    # overwritten and then dropped.
    row_totals = data[genre_columns].sum(axis=1)
    data[genre_columns] = data[genre_columns].div(row_totals, axis=0).fillna(0)
    return data

def save_model(model, scaler, model_path, scaler_path):
    """Persist a trained model, and optionally its scaler, with joblib.

    Parameters
    ----------
    model : object
        Fitted estimator to write to ``model_path``.
    scaler : object or None
        Fitted scaler to write to ``scaler_path``; pass ``None`` to skip it.
    model_path : str
        Destination file for the model.
    scaler_path : str or None
        Destination file for the scaler; only used when ``scaler`` is given.
    """
    joblib.dump(model, model_path)
    print(f"Model saved as '{model_path}'.")

    # Compare against None explicitly: truth-testing an arbitrary object may
    # call its __len__/__bool__, so a supplied scaler could be silently skipped.
    if scaler is not None:
        joblib.dump(scaler, scaler_path)
        print(f"Scaler saved as '{scaler_path}'.")

In [5]:
# Genre column names used as model features (in addition to impressions and
# clicks). The final entry is the literal string 'None', not Python's None.
genre_columns = [
    'Classics',
    'Comedy',
    'Other',
    'Reality',
    'News and Information',
    'Drama',
    'Action & Adventure',
    'Thriller',
    'Sci-Fi & Fantasy',
    'Horror',
    'Western',
    'Documentaries',
    'Sports',
    'Instructional & Educational',
    'Home & Lifestyle',
    'Romance',
    'Anime',
    'Musical',
    'Independent',
    'Entertainment',
    'Paranormal',
    'Music',
    'Gay & Lesbian',
    'Crime',
    'Food & Cooking',
    'Faith & Spirituality',
    'Game Show',
    'Dance',
    'Children & Family',
    'Telenovela',
    'Talk Show',
    'Variety Show',
    'War',
    'Young Adult',
    'None',
]

# Nearest Neighbors Model 
This model is built so that the Streamlit application can offer a feature that lets the user identify the most similar existing campaign.

In [6]:
# Similarity features: impressions, clicks, and the normalized genre columns.
nn_feature_columns = ['impressions', 'clicks'] + genre_columns

# Work on a copy so the loaded campaign frame is left untouched.
data = normalize_genre_columns(campaign_level_data.copy(), genre_columns)
X = data[nn_feature_columns]

# Fit the scaler on the campaign features; it is saved alongside the model.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Single-neighbor Euclidean search fitted on the scaled campaign features.
nn_model = NearestNeighbors(metric='euclidean', n_neighbors=1)
nn_model.fit(X_scaled)

save_model(
    nn_model,
    scaler,
    'models/nearest_neighbors_model.pkl',
    'models/nearest_neighbors_scaler.pkl',
)

Model saved as 'models/nearest_neighbors_model.pkl'.
Scaler saved as 'models/nearest_neighbors_scaler.pkl'.


# Random Forest Classifier
This model is built so that the Streamlit application can predict the score (Poor, Fair, Good, or Excellent) of a campaign the user has created.

In [7]:
# Integer codes for the campaign score labels used as the classification target.
score_to_code = {'Poor': 0, 'Fair': 1, 'Good': 2, 'Excellent': 3}
classifier_feature_columns = ['impressions', 'clicks'] + genre_columns

# Work on a copy so the loaded campaign frame is left untouched.
data = normalize_genre_columns(campaign_level_data.copy(), genre_columns)
data['score_encoded'] = data['score'].map(score_to_code)

X = data[classifier_feature_columns]
y = data['score_encoded']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

## Evaluate Random Forest Classifier

In [8]:
# Class names listed in the order of their integer codes (Poor=0 ... Excellent=3).
score_names = ['Poor', 'Fair', 'Good', 'Excellent']
score_codes = list(range(len(score_names)))

y_pred = rf_classifier.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")

# Pass `labels` explicitly: with a test split this small a class can be absent,
# and classification_report raises when target_names and the observed classes
# differ in length. It also keeps the confusion matrix a fixed 4x4.
print("Classification Report:")
print(classification_report(y_test, y_pred, labels=score_codes, target_names=score_names))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred, labels=score_codes))

save_model(rf_classifier, None, 'models/rf_classifier.pkl', None)

Accuracy: 0.56
Classification Report:
              precision    recall  f1-score   support

        Poor       0.25      0.33      0.29         3
        Fair       0.33      0.50      0.40         2
        Good       1.00      0.83      0.91         6
   Excellent       0.50      0.40      0.44         5

    accuracy                           0.56        16
   macro avg       0.52      0.52      0.51        16
weighted avg       0.62      0.56      0.58        16

Confusion Matrix:
[[1 1 0 1]
 [1 1 0 0]
 [0 0 5 1]
 [2 1 0 2]]
Model saved as 'models/rf_classifier.pkl'.


# Random Forest Regressor 
The random forest regressor is built so that the user can create a theoretical campaign and have the model predict its average watch time per device.

In [9]:
# Same feature set as the other models; the target is average watch time per device.
regressor_feature_columns = ['impressions', 'clicks'] + genre_columns

# Work on a copy so the loaded campaign frame is left untouched.
data = normalize_genre_columns(campaign_level_data.copy(), genre_columns)
X = data[regressor_feature_columns]
y = data['avg_time_watched_per_device']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
rf_regressor.fit(X_train, y_train)

## Evaluate the Regressor

In [10]:
y_pred = rf_regressor.predict(X_test)

# Compute each metric once on the held-out split, then report them.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
# Note: r2_score can be negative when the model fits worse than always
# predicting the mean of y_test.
r2 = r2_score(y_test, y_pred)

print("Regression Model Performance:")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R^2 Score: {r2:.2f}")

save_model(rf_regressor, None, 'models/regression_model.pkl', None)

Regression Model Performance:
Mean Absolute Error (MAE): 24.16
Mean Squared Error (MSE): 1467.87
R^2 Score: -0.19
Model saved as 'models/regression_model.pkl'.
