## Processing User Data with the New Fields ##

In [None]:
# import the packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, date, time, timedelta

In [None]:
# import_ipynb is imported before the notebook module below; it is what lets a
# notebook be imported with a plain `import` statement.
import import_ipynb
import User_Data_Generation
# Module-level `users` object from the User_Data_Generation notebook
# (presumably a pandas DataFrame: later cells call .copy() on it and index columns — TODO confirm).
userdata = User_Data_Generation.users

In [None]:
# Process data: convert fields into the right data types.
# `assign` returns a new frame, so the imported `userdata` is left untouched.
user_data = userdata.assign(
    timestamp=lambda frame: pd.to_datetime(frame["timestamp"])
)

# Add a calendar-date column: the parsed timestamp with its time part reset to midnight.
user_data = user_data.assign(
    Date=lambda frame: frame["timestamp"].dt.normalize()
)

In [None]:
# Preview the first rows (DataFrame.head default) to check the converted timestamp and the new Date column.
user_data.head()

In [None]:
# Tackling "Action" field: extracting user actions to define attribution 

# Function to extract interactions
def extract_interactions(history, position):
    """Return the entry located `position` places from the end of `history`.

    Args:
        history: Action history as a single string whose entries are separated
            by ", " (comma followed by a space), e.g. "view, click, purchase".
        position: 1-based offset counted from the end; 1 is the last entry.

    Returns:
        The requested entry as a string, or None when `history` is not a
        string (e.g. None / NaN), when `position` is smaller than 1, or when
        the history holds fewer than `position` entries.
    """
    # Missing values have no .split(); report them as "no interaction"
    # instead of raising AttributeError in the middle of a Series.apply.
    if not isinstance(history, str):
        return None

    # Split the history string into a list
    entries = history.split(", ")

    # The lower bound matters: entries[-0] is entries[0], so position 0 would
    # silently return the FIRST entry, and negative positions index from the front.
    if 1 <= position <= len(entries):
        return entries[-position]
    return None  # Return None if the position doesn't exist

# Column label -> offset counted from the END of the action history.
# NOTE(review): "Third"/"Second"/"First" match chronological order only when a
# history has exactly four entries; shorter histories yield None for the larger offsets.
interaction_offsets = {
    "Last Interaction": 1,
    "Third Interaction": 2,
    "Second Interaction": 3,
    "First Interaction": 4,
}

# Create one column per label by pulling the entry at that offset.
for column_label, offset in interaction_offsets.items():
    user_data[column_label] = user_data["action"].apply(
        extract_interactions, args=(offset,)
    )

In [None]:
# Create a new field "Sales_Channel" for attribution & further insights on the MMM

# Probability of each digital channel being drawn as the Sales_Channel.
# The draw happens for every last interaction other than "view" / "click"
# (the original intent: when the last interaction is "purchase").
sales_channel_probs = {
    "Search": 0.4,
    "Social": 0.25,
    "Video": 0.15,
    "Display": 0.10,
    "Digital Audio": 0.10
}

# Function to determine Sales_Channel
def determine_sales_channel(last_interaction, rng=None):
    """Assign a sales channel based on a user's last interaction.

    Args:
        last_interaction: The last entry of the user's action history.
        rng: Optional random source exposing ``choice(a, p=...)``, e.g.
            ``np.random.default_rng(seed)``. Pass a seeded generator to make
            the assignment reproducible. When omitted, the global ``np.random``
            state is used, exactly as before.

    Returns:
        "Offline" when `last_interaction` is "view" or "click"; otherwise one
        key of `sales_channel_probs`, drawn with the configured probabilities.
    """
    if last_interaction in ("view", "click"):
        return "Offline"

    sampler = np.random if rng is None else rng
    return sampler.choice(
        list(sales_channel_probs.keys()),
        p=list(sales_channel_probs.values())
    )

# Derive Sales_Channel element-wise from each row's last interaction.
last_interactions = user_data["Last Interaction"]
user_data["Sales_Channel"] = last_interactions.map(determine_sales_channel)

In [None]:
# Work on a copy so the view/click columns can be tried out without modifying user_data.
user_data2 = user_data.copy()

# Getting view counts and click counts out of the action section

# Per-channel probabilities; positions line up with `categories` below.
click_probs = [0.35, 0.25, 0.15, 0.15, 0.1]  # Search > Social > Video = Display > Audio
view_probs = [0.1, 0.2, 0.25, 0.3, 0.15]  # Display > Video > Social > Audio > Search
categories = ["Search", "Social", "Video", "Display", "Audio"]

# Function to distribute actions
def distribute_actions(actions, rng=None):
    """Spread a user's view and click totals across the marketing channels.

    Totals are derived by counting occurrences in `actions`:
    every purchase also counts as one click, and every click (including the
    ones implied by purchases) also counts as one view.

    Args:
        actions: Action history; `.count("purchase")`, `.count("click")` and
            `.count("view")` are called on it (a string in this notebook).
        rng: Optional random source exposing ``multinomial(n, pvals)``, e.g.
            ``np.random.default_rng(seed)``. Pass a seeded generator for
            reproducible splits. When omitted, the global ``np.random`` state
            is used, exactly as before.

    Returns:
        dict with keys "<category>_Views" for every category followed by
        "<category>_Clicks" for every category; the view values sum to the
        view total and the click values sum to the click total.
    """
    purchases = actions.count("purchase")
    clicks = actions.count("click") + purchases  # Purchases include 1 click each
    views = actions.count("view") + clicks  # Clicks (and purchases) include 1 view each

    sampler = np.random if rng is None else rng

    # Draw order (views first, then clicks) is kept so seeded runs stay stable.
    view_distribution = sampler.multinomial(views, view_probs)
    click_distribution = sampler.multinomial(clicks, click_probs)

    # Create output dictionary: all *_Views keys first, then all *_Clicks keys.
    output = {
        f"{category}_Views": count
        for category, count in zip(categories, view_distribution)
    }
    output.update({
        f"{category}_Clicks": count
        for category, count in zip(categories, click_distribution)
    })
    return output

# Apply the function: one dict of per-channel counts per row.
distributions = user_data2["action"].apply(distribute_actions)

# Expand the dicts into columns. The index is copied from user_data2 on purpose:
# without it the new frame gets a fresh 0..n-1 index, and the column-wise concat
# below would align on labels and produce NaN-padded rows whenever user_data2
# does not carry a default RangeIndex (e.g. after filtering or sorting).
distributions_user_data2 = pd.DataFrame(list(distributions), index=user_data2.index)

# Add the new columns next to the original ones.
user_data3 = pd.concat([user_data2, distributions_user_data2], axis=1)

In [None]:
# Feature Engineering

# Import scikit-learn for OHE
from sklearn.preprocessing import OneHotEncoder

# Check the distinct values of the two columns that get one-hot encoded.
# They are printed explicitly: a bare expression in the middle of a cell is
# evaluated and thrown away, so it would show nothing.
for column in ("Sales_Channel", "category"):
    distinct_values = user_data3[column].unique()
    print(f"{column}: {len(distinct_values)} unique values -> {list(distinct_values)}")

user_data4 = user_data3.copy()

# Initialize the OneHotEncoder.
# drop=None keeps every level; sparse_output=False returns a dense array
# (the `sparse_output` keyword requires scikit-learn >= 1.2).
encoder = OneHotEncoder(drop=None, sparse_output=False)

# OneHotEncode "category"
category_encoded = encoder.fit_transform(user_data4[["category"]])
category_col_encoded = encoder.get_feature_names_out(["category"])
user_data4[category_col_encoded] = category_encoded  # Add encoded columns without removing the original

# OneHotEncode "Sales_Channel" (the same encoder object is re-fitted, so after
# this point `encoder` only remembers the Sales_Channel levels).
saleschannel_encoded = encoder.fit_transform(user_data4[["Sales_Channel"]])
sales_cha_encoded = encoder.get_feature_names_out(["Sales_Channel"])
user_data4[sales_cha_encoded] = saleschannel_encoded  # Add encoded columns without removing the original

# Display updated columns
print(user_data4.columns)


In [None]:
# `ud` is a second name for user_data4 (plain assignment, no copy is made).
ud = user_data4
# Only the last expression of a cell is rendered, so this preview is computed but not displayed.
ud.head()
# Rendered output of the cell: the column labels available for the daily roll-up.
ud.columns

## Joining User Data with the Macro & Spend Data ##

In [None]:
# Import the macro & spend notebook so its table can be joined with the user data.

import import_ipynb
import MMM_macro_data 
# Module-level `daily` object from the MMM_macro_data notebook
# (presumably one row per Date, since it is later merged on "Date" — TODO confirm).
daily = MMM_macro_data.daily

In [None]:
# Bringing the sum of revenue and sales on a daily rollup 

# Engineered columns that are summed per day and keep their own name.
feature_engineered_sum = [
    # per-channel view counts
    'Search_Views', 'Social_Views', 'Video_Views', 'Display_Views', 'Audio_Views',
    # per-channel click counts
    'Search_Clicks', 'Social_Clicks', 'Video_Clicks', 'Display_Clicks', 'Audio_Clicks',
    # one-hot product categories
    'category_Books', 'category_Clothing', 'category_Electronics', 'category_Home',
    # one-hot sales channels
    'Sales_Channel_Digital Audio', 'Sales_Channel_Display', 'Sales_Channel_Offline',
    'Sales_Channel_Search', 'Sales_Channel_Social', 'Sales_Channel_Video',
]

# Named aggregations: output column -> (source column, reducer).
agg_dictionary = dict(
    revenue=("price", "sum"),
    salescount=("user_id", "count"),
    searchclicks=("Search_Clicks", "sum"),
    search_impr=("Search_Views", "sum"),
)

# Append one "sum" aggregation per engineered column, in list order.
for field in feature_engineered_sum:
    agg_dictionary[field] = (field, "sum")

# One output row per Date (Date becomes the index of the result).
daily_groups = ud.groupby("Date")
ud_join = daily_groups.agg(**agg_dictionary)

# Check the values
ud_join.head()

In [None]:
# Left join: keep every row of the macro table and attach the daily user roll-up by Date.
# "Date" is the index of ud_join; merge accepts an index level name as the key.
mmm_1 = pd.merge(daily, ud_join, how="left", on="Date")

In [None]:
# Joined table exploration: preview the first rows of the macro data merged with the daily user roll-up.

mmm_1.head()


## MMM Joined Table EDA

In [None]:
# Column dtypes and non-null counts; the left join can leave nulls on dates without user rows.
mmm_1.info()

In [None]:
# Checking correlation & distribution

# Every column whose name contains 'Spend', followed by the two outcome columns.
spendc = list(filter(lambda col: "Spend" in col, mmm_1.columns))
spendc.extend(["revenue", "salescount"])

# Lower-triangle pairplot with KDE curves on the diagonal.
sns.pairplot(mmm_1[spendc], diag_kind="kde", corner=True, height=2.5)

# Title sits slightly above the grid (y > 1) so it does not overlap the top panel.
plt.suptitle("Pairplot of Spend Fields", y=1.02)
plt.show()

In [None]:
# Daily sales count over time, drawn through the Axes object that seaborn returns.
ax = sns.lineplot(x="Date", y="salescount", data=mmm_1)
ax.set_title("Sales Count Over Time")
ax.set_xlabel("Date")
ax.set_ylabel("Sales Count")
plt.xticks(rotation=45)  # tilt the date labels so they do not overlap
plt.show()

# Final Table

In [None]:
# Remove every column whose name contains "Factor", plus 'searchclicks' and
# 'search_impr', to prevent data leakage.
mmm_2_columns_to_drop = [col for col in mmm_1.columns if "Factor" in col]
mmm_2_columns_to_drop += ['searchclicks', 'search_impr']

# drop() returns a new frame, so mmm_1 keeps all of its columns.
mmm_2 = mmm_1.drop(columns=mmm_2_columns_to_drop)

# Separate copy that the monthly roll-up modifies.
mmm_3 = mmm_2.copy()

mmm_3.head()

In [None]:
# Monthly Data

import pandas as pd

# Build a first-of-month date from Year + Month to use as the grouping key.
mmm_3["Month_Year"] = pd.to_datetime(mmm_3[["Year", "Month"]].assign(Day=1))
# Weekday becomes a string so it is not picked up as a numeric column below.
mmm_3["Weekday"] = mmm_3["Weekday"].astype(str)

# Columns that are averaged over the month rather than summed.
grouped_columns = {
    "Promo_Exists": "mean",
    "Consumer_Index": "mean",
    "Inflation_Rate": "mean",
}

# Calendar keys are constant inside a month group. Summing them (as every other
# numeric column is) would yield values like 2023 * 31 for Year, so they take
# the group's first value instead.
# NOTE(review): only Year and Month are handled; check for other calendar-like
# numeric columns (e.g. a day or week number) that should not be summed either.
calendar_keys = {"Year", "Month"}

# Add all other numeric columns: calendar keys keep their value, the rest are summed.
for col in mmm_3.select_dtypes(include="number").columns:
    if col in grouped_columns:
        continue
    grouped_columns[col] = "first" if col in calendar_keys else "sum"

# Group and aggregate; reset_index turns Month_Year back into a regular column.
mmm_monthly = mmm_3.groupby("Month_Year").agg(grouped_columns).reset_index()

# Output the resulting monthly table
mmm_monthly

In [None]:
# Export the monthly table to CSV so it can be used in other models such as LightweightMMM & PyMC.
# The file is written to the current working directory; index=False leaves the row index out of the file.

mmm_monthly.to_csv("monthly_data.csv", index=False)

# Marketing Mix Modelling: Baseline Model

#### Simple Baseline Model

Below is a simple baseline model based on linear regression.

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression

# Prepare the data
# `mmm_monthly` is the grouped monthly DataFrame from the previous step

# Define predictors (X) and target variable (y)
predictors = [
    "Promo_Exists", "Consumer_Index", "Inflation_Rate", "Search_Spend",
    "Display_Spend", "Video_Spend", "Social_Spend", "Digital_Audio_Spend",
    "TV_Spend", "Radio_Spend", "OOH_Spend", "Gross_Rating_Point",
    "Sales_Channel_Digital Audio", "Sales_Channel_Display",
    "Sales_Channel_Offline", "Sales_Channel_Search", "Sales_Channel_Social",
    "Sales_Channel_Video", "Search_Views", "Social_Views", "Video_Views",
    "Display_Views", "Audio_Views", "Search_Clicks", "Social_Clicks",
    "Video_Clicks", "Display_Clicks", "Audio_Clicks", "category_Books",
    "category_Clothing", "category_Electronics", "category_Home"
]

target = "revenue"

# Only the engineered count columns are min-max scaled; spend and macro columns stay raw.
scaler_columns = [col for col in predictors if 
                  "_Views" in col or "_Clicks" in col or "category_" in col or "Sales_Channel_" in col]

# Choose the train/test rows BEFORE scaling so the scaler can be fitted on the
# training rows only. Fitting it on every row would leak the test set's min/max
# into training. The split depends only on the number of rows and random_state,
# so these positions are the rows that splitting X and y directly would select.
# NOTE(review): rows are months; a random split mixes past and future periods —
# consider a chronological hold-out.
train_pos, test_pos = train_test_split(
    np.arange(len(mmm_monthly)), test_size=0.2, random_state=42
)

scaler = MinMaxScaler()
scaler.fit(mmm_monthly.iloc[train_pos][scaler_columns])
# The scaled values overwrite the raw columns in mmm_monthly (as before), so
# later cells that read these columns see the scaled version.
mmm_monthly[scaler_columns] = scaler.transform(mmm_monthly[scaler_columns])

X = mmm_monthly[predictors]
y = mmm_monthly[target]

# Split the data into training and testing sets
X_train, X_test = X.iloc[train_pos], X.iloc[test_pos]
y_train, y_test = y.iloc[train_pos], y.iloc[test_pos]

# Train a simple linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mape = mean_absolute_percentage_error(y_test, y_pred)

# Print evaluation metrics
print("Linear Regression Model Evaluation:")
print(f"R-squared (R²): {r2:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"Mean Absolute Percentage Error (MAPE): {mape:.2%}")

# Display the coefficients, largest first
coefficients = pd.DataFrame({
    "Feature": predictors,
    "Coefficient": model.coef_
}).sort_values(by="Coefficient", ascending=False)

print("\nFeature Coefficients:")
print(coefficients)

# Add predictions for every month (training and test rows alike) for further analysis
mmm_monthly["predicted_revenue"] = model.predict(X)

# Display a sample of the updated DataFrame
print("\nUpdated DataFrame with Predictions:")
print(mmm_monthly[["Month", "Year", "revenue", "predicted_revenue"]].head())

In [None]:
# Actual vs. predicted revenue on the held-out rows, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=y_test, y=y_pred, alpha=0.7, ax=ax)

# Dashed diagonal: where points would fall if every prediction equalled the actual value.
diagonal = [y_test.min(), y_test.max()]
ax.plot(diagonal, diagonal, 'r--', label="Perfect Fit")

ax.set_title("Actual vs. Predicted Revenue")
ax.set_xlabel("Actual Revenue")
ax.set_ylabel("Predicted Revenue")
ax.legend()
fig.tight_layout()
plt.show()

### Baseline Model: Lasso

In [None]:
# Lasso Model

from sklearn.linear_model import Lasso

# Apply Lasso Regularization.
# Uses X_train / X_test / y_train / y_test and X from the baseline-model cell.
# NOTE(review): the L1 penalty acts on coefficient size, and only the
# *_Views / *_Clicks / category_* / Sales_Channel_* predictors are min-max scaled,
# so the unscaled spend and macro columns are penalised on a different footing.
alpha = 100  # Adjust for optimal performance
lasso_model = Lasso(alpha=alpha, random_state=42)
lasso_model.fit(X_train, y_train)

# Predictions and evaluation
y_pred_lasso = lasso_model.predict(X_test)

# Calculate metrics
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
r2_lasso = r2_score(y_test, y_pred_lasso)
rmse_lasso = np.sqrt(mse_lasso)
mape_lasso = mean_absolute_percentage_error(y_test, y_pred_lasso)


# Labels and number formats match the linear-regression and XGBoost reports:
# the RMSE label no longer carries a stray ")", and MAPE (a fraction) is shown
# as a percentage instead of a bare two-decimal number.
print("Lasso Regression Evaluation:")
print(f"Mean Squared Error (MSE): {mse_lasso:.2f}")
print(f"R-squared (R²): {r2_lasso:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse_lasso:.2f}")
print(f"Mean Absolute Percentage Error (MAPE): {mape_lasso:.2%}")

# Display coefficients, largest first (this rebinds `coefficients` from the linear model)
coefficients = pd.DataFrame({
    "Feature": predictors,
    "Coefficient": lasso_model.coef_
}).sort_values(by="Coefficient", ascending=False)

print("\nFeature Coefficients (Lasso):")
print(coefficients)

# Add predictions for every month to the DataFrame
mmm_monthly["predicted_revenue_lasso"] = lasso_model.predict(X)

## XGBoost-Based MMM

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
import matplotlib.pyplot as plt
import seaborn as sns

# This cell reuses X, X_train, X_test, y_train, y_test and mmm_monthly from the
# baseline-model cell. On a fresh kernel it fails with
# "NameError: name 'X_train' is not defined" unless that cell has been run first.

# Carve a validation set out of the TRAINING data for early stopping.
# Stopping on the test set would pick the number of boosting rounds that best
# fits the very rows used for the final report, making the metrics optimistic.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Initialize and train the XGBoost model
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',  # Regression objective
    n_estimators=1000,             # Upper bound on boosting rounds
    learning_rate=0.05,            # Learning rate
    max_depth=5,                   # Maximum tree depth
    subsample=0.8,                 # Row subsample ratio
    colsample_bytree=0.8,          # Column sampling per tree
    random_state=42,
    early_stopping_rounds=10       # Stop after 10 rounds without validation improvement
)

xgb_model.fit(X_fit, y_fit, eval_set=[(X_val, y_val)], verbose=False)

# Make predictions on the untouched test set
y_pred_xgb = xgb_model.predict(X_test)

# Evaluate the model
r2_xgb = r2_score(y_test, y_pred_xgb)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
rmse_xgb = np.sqrt(mse_xgb)
mape_xgb = mean_absolute_percentage_error(y_test, y_pred_xgb)

print("XGBoost Model Evaluation:")
print(f"R-squared (R²): {r2_xgb:.2f}")
print(f"Mean Squared Error (MSE): {mse_xgb:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse_xgb:.2f}")
print(f"Mean Absolute Percentage Error (MAPE): {mape_xgb:.2%}")

# Feature importance plot.
# The Axes is passed in explicitly: without `ax`, plot_importance opens its own
# figure, leaving an empty figure behind and ignoring the requested size.
fig, ax = plt.subplots(figsize=(10, 6))
xgb.plot_importance(xgb_model, ax=ax, max_num_features=10, importance_type="weight")
ax.set_title("Top 10 Feature Importance")
plt.show()

# Add predictions for every month to the original DataFrame
mmm_monthly["predicted_revenue_xgb"] = xgb_model.predict(X)

# Plot actual vs predicted revenue
plt.figure(figsize=(12, 6))
sns.lineplot(x=mmm_monthly['Month_Year'], y=mmm_monthly['revenue'], label='Actual Revenue', color='blue')
sns.lineplot(x=mmm_monthly['Month_Year'], y=mmm_monthly['predicted_revenue_xgb'], label='Predicted Revenue', color='red')
plt.title('Actual vs Predicted Revenue Over Time')
plt.xlabel('Date')
plt.ylabel('Revenue')
plt.legend()
plt.grid()
plt.show()

NameError: name 'X_train' is not defined

# Random Forest Algorithm with Grid Search 

In addition to Linear Regression, Lasso, and XGBoost, which are all well suited to regression problems, a Random Forest whose best estimator is selected through Grid Search can help identify which marketing channels perform best in the context of an MMM.