# Marketing Mix Model

In [None]:
# import packages
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt #for ploting things
import seaborn as sns #more plots
import os #simple OS / directory functionality

In [None]:
# Load the MMM_adv_sales CSV (path is relative to the notebook's working
# directory) and preview the first rows.
df = pd.read_csv(filepath_or_buffer="../2022-MMA831-Python/data/MMM_adv_sales.csv")
df.head(5)

In [None]:
# Drop every column whose name starts with 'Unnamed'
# (presumably a row index written out when the CSV was saved -- confirm against the file).
# .copy() makes df_clean an independent frame, so adding columns to it in later
# cells does not raise SettingWithCopyWarning.
df_clean = df.loc[:, ~df.columns.str.contains('^Unnamed')].copy()
df_clean.head()

In [None]:
# Data description: summary statistics for the columns of the cleaned frame
df_clean.describe()

In [None]:
# Pairwise (Pearson) correlation between all variables, drawn as a heatmap
corr = df_clean.corr(method="pearson")
sns.heatmap(data=corr)

In [None]:
# Separate the predictor columns (features) from the target column (labels = sales)
features = df_clean.drop(columns=['sales'])
labels = df_clean['sales']

In [None]:
# Create a scatter plot showing sales vs TV.
# plt.plot takes (x, y): the feature goes on the x-axis and sales on the
# y-axis, so the plotted data matches the axis labels set below.
plt.plot(features["TV"], labels, 'ro')
plt.title("Sales vs TV")
plt.xlabel("TV")
plt.ylabel("sales")
plt.show()


In [None]:
# Use a for loop so we don't have to copy-paste the plotting code to create
# the same scatter plot for each of the other features.
for X in features:
    # Same code as the single-feature TV plot, with "TV" replaced by X.
    # plt.plot takes (x, y): feature on the x-axis, sales on the y-axis,
    # so the plotted data matches the axis labels set below.
    plt.plot(features[X], labels, 'ro')
    plt.title("Sales vs " + X)
    plt.xlabel(X)
    plt.ylabel("sales")
    plt.show()

In [None]:
# Data distribution of a single feature: histogram of TV using 50 bins
plt.hist(x=features["TV"], bins=50)
plt.title(label="TV")
plt.show()


In [None]:
# Data distribution for every feature: one 25-bin histogram per column
for X in features.columns:
    plt.hist(x=features[X], bins=25)
    plt.title(label=X)
    plt.show()

In [None]:
# Create a new column holding the natural log of newspaper.
# .assign() builds a new frame instead of writing into df_clean in place, so
# this cell does not trigger SettingWithCopyWarning (df_clean was sliced out of
# df) and gives the same result when re-run.
# NOTE(review): np.log returns -inf for 0 and NaN for negative values --
# confirm newspaper is strictly positive, otherwise consider np.log1p.
df_clean = df_clean.assign(newspaper_log=np.log(df_clean['newspaper']))


In [None]:
# Preview the first rows to check the new column, rather than dumping the
# whole frame (consistent with the earlier .head() previews).
df_clean.head()

In [None]:
# Histogram of the log-transformed newspaper column, using 25 bins
plt.hist(x=df_clean['newspaper_log'], bins=25)
plt.title(label="Newspaper after log transformation")
plt.show()

In [None]:
# Now let's fit a simple regression -- let's start with one that uses all variables

In [None]:
# Fit an OLS regression of sales on TV, radio and newspaper_log through the
# statsmodels formula interface, then print the regression summary.
import statsmodels.formula.api as sm

model1 = sm.ols("sales~TV+radio+newspaper_log", df_clean).fit()
print(model1.summary())


In [None]:
# What do we notice? What is the relationship b/w newspaper_log and sales?
# It looks like it's not contributing to the model. Let's try plotting newspaper vs. sales and also building another model without it.

In [None]:
# Plot sales vs. newspaper (log-transformed) to see whether it confirms what
# we saw in the regression (i.e. no strong relationship with sales).
# plt.plot takes (x, y): newspaper_log on the x-axis, sales on the y-axis,
# so the plotted data matches the axis labels set below.
plt.plot(df_clean['newspaper_log'], df_clean['sales'], 'ro')
plt.title("Scatter plot b/w sales and newspaper")
plt.xlabel("Newspaper (log)")
plt.ylabel("Sales")
plt.show()

In [None]:
# Refit the regression with newspaper left out: sales on TV and radio only
model2 = sm.ols("sales~TV+radio", df_clean).fit()
print(model2.summary())

In [None]:
# What do we notice -- did R-squared change when we removed newspaper?

In [None]:
# Actual sales next to model2's predictions (predict() called with no
# arguments), shown for the first 10 rows
y_pred = model2.predict()
df1 = pd.DataFrame(dict(Actual=labels, Predicted=y_pred))
df1.head(n=10)