In [None]:
import pandas as pd
from sklearn import linear_model
import statsmodels.api as sm
import matplotlib.pyplot as plt
import numpy as np


In [None]:
!pip install pyreadstat


In [None]:
# Load the campaign data. 'YourFile' is a placeholder: replace it with the path to the CSV.
df = pd.read_csv('YourFile')

# Column names, dtypes and non-null counts.
df.info()

# Preview the first rows with the rich DataFrame display instead of printing the whole frame.
df.head()

# Descriptive Statistics

Before running the models, we first want to review the descriptive statistics of our variables. The following bar charts are created:


*   Average CTR per Campaign Element
*   Average CTR per Concept
*   Total Link Clicks for Campaign Element Variables
*   Total Link Clicks per Concept
*   Total Link Clicks and Impressions per Concept






In [None]:
# Average CTR per Campaign Element
# One bar per campaign-element dummy: the mean CTR over the rows where that dummy equals 1.
X = df[['Emotion_Fear', 'Emotion_Love', 'Topic_Sprotection', 'Topic_Affiliation', 'Topic_Kincare',
        'Appeal_Exp', 'Appeal_Testi', 'Appeal_Infor', 'Appeal_Pers', 'LStyle_Fperson', 'LStyle_Tperson']]
Y = df['CTR_link_clickthrough_rate']
# NOTE(review): Y is not used by this plot; the bars are computed from the
# 'CTR (link click-through rate)' column instead — confirm both columns exist in the data.

ctr_column = 'CTR (link click-through rate)'

plt.figure(figsize=(12, 6))
bar_width = 0.6
colors = ['#717f9d'] * len(X.columns)

# Tick positions and tick labels are collected together, so the labels always
# match the bars that were actually drawn, even when a variable is skipped.
x_values_all = []
tick_labels = []
for i, column in enumerate(X.columns):
    true_values_df = df[df[column] == 1]

    if true_values_df.empty:
        print(f"No true values for {column}, skipping.")
        continue

    # Each variable keeps its own slot i, so a skipped variable leaves a gap.
    x_position = i * 1.5 * bar_width
    mean_ctr = true_values_df[ctr_column].mean()

    plt.bar(x_position, mean_ctr, width=bar_width, label=column, color=colors[i])
    # Value label slightly above the bar.
    plt.text(x_position, mean_ctr + 0.05, f"{mean_ctr:.2f}", ha='center', va='bottom', fontsize=10)

    x_values_all.append(x_position)
    tick_labels.append(column)

# Upper y-limit is the overall mean CTR plus 1.
plt.ylim(0, df[ctr_column].mean() + 1)
plt.title('Average Link Click-Through-Rate (CTR) for Campaign Element Variables')
plt.xlabel('Campaign Element Variables')
plt.ylabel('CTR (%)')
plt.xticks(x_values_all, tick_labels, rotation=45, ha='right')
plt.show()






In [None]:
# Average CTR per concept
X = df['concept']
Y = df['CTR (link click-through rate)']

# Keep rows that have a CTR value and whose concept is not 13,
# then average the CTR per concept, highest average first.
filtered_data = df[Y.notnull() & (X != 13)]
means = (
    filtered_data
    .groupby('concept')['CTR (link click-through rate)']
    .mean()
    .sort_values(ascending=False)
)

plt.figure(figsize=(12, 6))
bars = plt.bar(means.index.astype(str), means, color='#717f9d')

# Write each bar's height just above it, centred on the bar.
for rect in bars:
    height = rect.get_height()
    centre = rect.get_x() + rect.get_width() / 2
    plt.text(centre, height + 0.05, f"{height:.2f}", ha='center')

# Upper y-limit is the overall mean CTR (all rows of df) plus 2.
plt.ylim(0, Y.mean() + 2)
plt.title('Average Link Click-Through-Rate (CTR) per Concept')
plt.xlabel('Concept')
plt.ylabel('CTR (%)')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()

plt.show()



In [None]:
# Total Link Clicks for Campaign Element Variables
# One bar per campaign-element dummy: the sum of link clicks over the rows where that dummy equals 1.
X = df[['Emotion_Fear', 'Emotion_Love', 'Topic_Sprotection', 'Topic_Affiliation', 'Topic_Kincare',
        'Appeal_Exp', 'Appeal_Testi', 'Appeal_Infor', 'Appeal_Pers', 'LStyle_Fperson', 'LStyle_Tperson']]
Y = df['Link clicks']

clicks_column = 'Link clicks'

plt.figure(figsize=(12, 6))
bar_width = 0.6
colors = ['#717f9d'] * len(X.columns)

# Tick positions and tick labels are collected together, so the labels always
# match the bars that were actually drawn, even when a variable is skipped.
x_values_all = []
tick_labels = []
for i, column in enumerate(X.columns):
    true_values_df = df[df[column] == 1]

    if true_values_df.empty:
        print(f"No true values for {column}, skipping.")
        continue

    # Each variable keeps its own slot i, so a skipped variable leaves a gap.
    x_position = i * 1.5 * bar_width
    total_clicks = true_values_df[clicks_column].sum()

    plt.bar(x_position, total_clicks, width=bar_width, label=column, color=colors[i])
    # Value label slightly above the bar.
    plt.text(x_position, total_clicks + 0.05, f"{total_clicks:.0f}", ha='center', va='bottom', fontsize=10)

    x_values_all.append(x_position)
    tick_labels.append(column)

plt.title('Total Link Clicks for Campaign Element Variables')
plt.xlabel('Campaign Element Variables')
plt.ylabel('Link Clicks')

plt.xticks(x_values_all, tick_labels, rotation=45, ha='right')

plt.show()




In [None]:
# Amount of Link Clicks per concept
X = df['concept']
Y = df['Link clicks']

# Keep rows that have a link-click value and whose concept is not 13.
filtered_data = df[Y.notnull() & (X != 13)]

# Despite its name, `means` holds the per-concept totals, largest first.
means = (
    filtered_data
    .groupby('concept')['Link clicks']
    .sum()
    .sort_values(ascending=False)
)

plt.figure(figsize=(12, 6))
bars = plt.bar(means.index.astype(str), means, color='#717f9d')

# Write each bar's total just above it, centred on the bar.
for rect in bars:
    height = rect.get_height()
    centre = rect.get_x() + rect.get_width() / 2
    plt.text(centre, height + 0.05, f"{height:.0f}", ha='center')

plt.title('Total Link Clicks per Concept')
plt.xlabel('Concept')
plt.ylabel('Link Clicks')

plt.xticks(rotation=45, ha='right')

plt.tight_layout()

plt.show()

In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches

# Amount of Link Clicks and Impressions per concept
# Grouped bars on two y-axes: link clicks on the left axis, impressions on the right axis.
X = df['concept']
Y_link_clicks = df['Link clicks']
Y_impressions = df['Impressions']

# Keep rows that have both values and whose concept is not 13, then total per concept.
filtered_data = df[(df['Link clicks'].notnull()) & (df['Impressions'].notnull()) & (df['concept'] != 13)]
sums = filtered_data.groupby('concept')[['Link clicks', 'Impressions']].sum()

# Concepts are ordered by total link clicks, largest first.
sums = sums.sort_values(by='Link clicks', ascending=False)
fig, ax1 = plt.subplots(figsize=(12, 6))

bar_width = 0.35
bar_positions = range(len(sums))

# Link clicks: left half of each concept's slot, left y-axis.
bars_link_clicks = ax1.bar([pos - bar_width/2 for pos in bar_positions], sums['Link clicks'], width=bar_width, label='Link clicks', color='#717f9d')

ax1.set_xlabel('Concept')
ax1.set_ylabel('Link Clicks')
ax1.set_title('Total Link Clicks and Impressions per Concept')

ax1.set_xticks(bar_positions)
ax1.set_xticklabels(sums.index.astype(str), rotation=45, ha='right')

# Impressions: right half of each concept's slot, on a second y-axis sharing the same x-axis.
ax2 = ax1.twinx()
bars_impressions = ax2.bar([pos + bar_width/2 for pos in bar_positions], sums['Impressions'], width=bar_width, label='Impressions', color='#dadfe9')

ax2.set_ylabel('Impressions', fontsize=11)

# One legend for both series; attached to ax2 because that axes is the one created last.
ax2.legend(handles=[bars_link_clicks, bars_impressions], loc='upper right')

plt.tight_layout()

plt.show()

# Linear Regression

For the linear regression to be meaningful, we first filter the data so that only observations with a CTR (link click-through rate) greater than 0 are taken into consideration.

In [None]:
# Keep only the rows whose CTR is strictly positive.
positive_ctr = df['CTR_link_clickthrough_rate'] > 0
filtered_df = df[positive_ctr]

# Number of rows that remain after filtering.
row_count = filtered_df.shape[0]

# Column overview of the filtered frame.
filtered_df.info()

# Model 1: Main Communication Variables


In [None]:
# Model 1 inputs: campaign-element dummies as predictors, CTR as the outcome.
# Both come from filtered_df (rows with CTR > 0), as described in the
# "Linear Regression" section, rather than from the unfiltered df.
# One dummy per campaign-element group is left out of the predictor list —
# presumably as the reference category; confirm.
X = filtered_df[['Emotion_Fear', 'Topic_Affiliation', 'Topic_Kincare', 'Appeal_Testi', 'Appeal_Pers', 'LStyle_Tperson']]
Y = filtered_df['CTR_link_clickthrough_rate']

In [None]:
# Fit Model 1 by ordinary least squares and print the regression table.
# add_constant prepends the intercept column to the predictors.
X = sm.add_constant(X)
model = sm.OLS(endog=Y, exog=X).fit()

# In-sample predictions for the rows the model was fitted on.
predictions = model.predict(X)

print_model = model.summary()
print(print_model)

# Model 2: Supportive Model with Control Variables
Control variables to include:


*   Platform: Platform_Insta
*   Device: Device_desktop, Device_ipad, Device_iphone, Device_ipod, Device_other

In [None]:
# Model 2 inputs: platform and device control dummies as predictors, CTR as the outcome.
# Both come from filtered_df (rows with CTR > 0), as described in the
# "Linear Regression" section, rather than from the unfiltered df.
X = filtered_df[['Platform_Insta', 'Device_desktop', 'Device_ipad', 'Device_iphone', 'Device_ipod', 'Device_other']]
Y = filtered_df['CTR_link_clickthrough_rate']

In [None]:
# Fit Model 2 (control variables only) by ordinary least squares and print the regression table.
# add_constant prepends the intercept column to the predictors.
X = sm.add_constant(X)
model = sm.OLS(endog=Y, exog=X).fit()

# In-sample predictions for the rows the model was fitted on.
predictions = model.predict(X)

print_model = model.summary()
print(print_model)

# Model 3: Campaign dummies and control variables
This section contains our models that measure the effect the content-related dummies have on CTR while also including the control variables.

The campaign dummies:


*  Emotion (Love vs. Fear)
*  Topic (Self-protection vs. Affiliation vs. Kincare)
*  Appeal 1 (Expert vs. Testimonial)
*  Appeal 2 (Informative vs. Persuasive)
*  Linguistic style (First-person vs. Third-person)

Control variables that are included:

*   Platform: Platform_Insta
*   Device: Device_desktop, Device_ipad, Device_iphone, Device_ipod, Device_other


In [None]:
# Model 3 inputs: campaign-element dummies plus platform/device controls as predictors, CTR as the outcome.
# Both come from filtered_df (rows with CTR > 0), as described in the
# "Linear Regression" section, rather than from the unfiltered df.
campaign_dummies = ['Emotion_Fear', 'Topic_Affiliation', 'Topic_Kincare', 'Appeal_Testi', 'Appeal_Pers', 'LStyle_Tperson']
control_variables = ['Platform_Insta', 'Device_desktop', 'Device_ipad', 'Device_iphone', 'Device_ipod', 'Device_other']
X = filtered_df[campaign_dummies + control_variables]
Y = filtered_df['CTR_link_clickthrough_rate']

In [None]:
# Fit Model 3 (campaign dummies plus controls) by ordinary least squares and print the regression table.
# add_constant prepends the intercept column to the predictors.
X = sm.add_constant(X)
model = sm.OLS(endog=Y, exog=X).fit()

# In-sample predictions for the rows the model was fitted on.
predictions = model.predict(X)

print_model = model.summary()
print(print_model)