In [1]:
# import relevant libraries

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [2]:
# load csvs from data-analysis folder
# The folder is named once in DATA_DIR so that running the notebook from another
# machine or checkout means editing a single line instead of every read_csv call.
DATA_DIR = r"/Users/adityamxr/Desktop/finding-marketing-insights/data-analysis"

spend_revenue = pd.read_csv(f"{DATA_DIR}/spend_revenue.csv")
sales_full_merged = pd.read_csv(f"{DATA_DIR}/sales_full_merged.csv")

In [3]:
# verify import: show the column / dtype / non-null summary of each loaded frame

dfs = [spend_revenue, sales_full_merged]

# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() adds a stray "None" line after each summary.
for df in dfs:
    df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 6 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   YearMonth                   12 non-null     object 
 1   Total_Revenue               12 non-null     float64
 2   Offline_Spend               12 non-null     int64  
 3   Online_Spend                12 non-null     float64
 4   Total_Spend                 12 non-null     float64
 5   Marketing_Spend_Percentage  12 non-null     float64
dtypes: float64(4), int64(1), object(1)
memory usage: 708.0+ bytes
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52924 entries, 0 to 52923
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   CustomerID            52924 non-null  int64  
 1   Transaction_ID        52924 non-null  int64  
 2   Transaction_Date      52924 non-null  object 
 3   P

In [5]:
# Regression inputs, all taken from 'spend_revenue':
#   response   -> the Total_Revenue column
#   predictors -> the Offline_Spend and Online_Spend columns
y_revenue = spend_revenue['Total_Revenue']
X_offline_online = spend_revenue[
    ['Offline_Spend', 'Online_Spend']
]

In [6]:
# Combined model: a LinearRegression trained with both spend channels as
# predictors of revenue. The estimator is created first, then fitted; the
# fit() call is kept as the cell's final expression.
model = LinearRegression()
model.fit(X=X_offline_online, y=y_revenue)

In [7]:
# Score the combined model on the same rows it was fitted on: predict revenue
# from the two spend columns, then compare with the observed revenue via R².
y_pred = model.predict(X=X_offline_online)
r2_total = r2_score(y_true=y_revenue, y_pred=y_pred)

In [8]:
# linear regression for offline and online spend separately


def fit_and_score(features, target):
    """Fit a LinearRegression on ``features`` and score it on the same rows.

    Parameters
    ----------
    features
        Two-dimensional predictors, passed to ``LinearRegression.fit`` and
        ``LinearRegression.predict`` unchanged.
    target
        Observed values to regress on.

    Returns
    -------
    tuple
        ``(fitted_model, predictions, r2)``: the fitted estimator, its
        predictions for ``features``, and ``r2_score(target, predictions)``.
        The score is computed on the fitting rows, not on held-out data.
    """
    fitted_model = LinearRegression()
    fitted_model.fit(features, target)
    predictions = fitted_model.predict(features)
    return fitted_model, predictions, r2_score(target, predictions)


# offline Spend
X_offline = spend_revenue[['Offline_Spend']]
model_offline, y_pred_offline, r2_offline = fit_and_score(X_offline, y_revenue)

# online Spend
X_online = spend_revenue[['Online_Spend']]
model_online, y_pred_online, r2_online = fit_and_score(X_online, y_revenue)

# print out the R² scores (r2_total is the score of the two-feature model)
print(f"R² for Offline and Online Spend together: {r2_total:.4f}")
print(f"R² for Offline Spend: {r2_offline:.4f}")
print(f"R² for Online Spend: {r2_online:.4f}")

R² for Offline and Online Spend together: 0.4098
R² for Offline Spend: 0.3565
R² for Online Spend: 0.4033


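**Online marketing spend explains slightly more of the variation in revenue than offline marketing spend, with an R² of 0.4033 compared to 0.3565. Because R² measures how much of the variation a model accounts for rather than the size of a causal effect, and both values are computed on the same 12 monthly rows used to fit the models, this gap should be read as suggestive rather than conclusive. When considering both offline and online spend together, they explain about 41% of the revenue variation, barely more than online spend alone, which suggests that there are additional factors (such as product quality or product pricing) that also influence revenue.**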