In [None]:
import pandas as pd
from sklearn import linear_model
import statsmodels.api as sm
import matplotlib.pyplot as plt
import numpy as np


In [None]:
!pip install pyreadstat


In [None]:
# Load the dataset; replace 'YourFile' with the path to the CSV file.
df = pd.read_csv('YourFile')

# Column names, dtypes and non-null counts.
df.info()

# Preview the first rows with the rich table display instead of printing
# the plain-text repr of the whole frame.
df.head()

# Descriptive Statistics
Before running the models, we first want to review the descriptive statistics of our variables. The following bar charts are created:


*   Average Engagement Time per Campaign Element
*   Average Engagement Time per Concept


In [None]:
# Average Engagement Time per Campaign Element
#
# For every campaign-element dummy, draw one bar with the mean
# avg_engagement_time_sec of the rows where that dummy equals 1.
# Dummies without any such row are reported and left out of the chart.
X = df[['Emotion_Fear', 'Emotion_Love', 'Topic_Sprotection', 'Topic_Affiliation', 'Topic_Kincare',
        'Appeal_Exp', 'Appeal_Testi', 'Appeal_Infor', 'Appeal_Pers', 'LStyle_Fperson', 'LStyle_Tperson']]
Y = df['avg_engagement_time_sec']

plt.figure(figsize=(12, 6))

bar_width = 0.6
colors = ['#d83555'] * len(X.columns)

# Tick positions and tick labels are collected side by side so they stay
# aligned (and equally long) even when a column is skipped.
x_values_all = []
x_labels_all = []
for i, column in enumerate(X.columns):
    is_active = df[column] == 1

    if not is_active.any():
        print(f"No true values for {column}, skipping.")
        continue

    # One bar per dummy; the position is derived from the column's index so a
    # skipped column leaves a gap instead of shifting the remaining bars.
    x_value = i * 1.5 * bar_width
    mean_time = df.loc[is_active, 'avg_engagement_time_sec'].mean()

    plt.bar(x_value, mean_time, width=bar_width, label=column, color=colors[i])
    # Rounded mean printed just above the bar.
    plt.text(x_value, mean_time + 0.05, f"{mean_time:.0f}", ha='center', va='bottom', fontsize=10)

    x_values_all.append(x_value)
    x_labels_all.append(column)

plt.title('Average Engagement Time for Campaign Element Variables')
plt.xlabel('Campaign Element Variables')
plt.ylabel('Average Engagement Time (Seconds)')

plt.xticks(x_values_all, x_labels_all, rotation=45, ha='right')

plt.show()






In [None]:
# Average Engagement Time per Concept
X = df['concept']
Y = df['avg_engagement_time_sec']

# Keep rows that have an engagement time and whose concept is not 13.
has_time = df['avg_engagement_time_sec'].notnull()
not_concept_13 = df['concept'] != 13
filtered_data = df[has_time & not_concept_13]

# Mean engagement time per concept, largest first.
means = (
    filtered_data
    .groupby('concept')['avg_engagement_time_sec']
    .mean()
    .sort_values(ascending=False)
)

plt.figure(figsize=(12, 6))
bars = plt.bar(means.index.astype(str), means, color='#d83555')

# Write the rounded mean just above each bar, centred horizontally.
for rect in bars:
    height = rect.get_height()
    center_x = rect.get_x() + rect.get_width() / 2
    plt.text(center_x, height + 0.05, f"{height:.0f}", ha='center')

plt.title('Average Engagement Time per Concept')
plt.xlabel('Concept')
plt.ylabel('Average Engagement Time (Sec)')
plt.xticks(rotation=45, ha='right')

plt.tight_layout()
plt.show()



For the linear regression models to make sense, we filter the data so that only rows with an avg_engagement_time_sec greater than 0 are taken into consideration.

In [None]:
# Keep only the rows with a strictly positive average engagement time.
positive_time = df['avg_engagement_time_sec'] > 0
filtered_df = df[positive_time]

# Number of rows that survive the filter.
row_count = filtered_df.shape[0]
filtered_df.info()

# Model 1: Communication Concepts main effects
This section contains our models that measure the effect the content-related dummies have on avg_engagement_time_sec.

The communication variables:


*  Emotion (Love vs. Fear)
*  Topic (Self protection vs. Affiliation vs. Kin-Care)
*  Appeal 1 (Expert vs. Testimonial)
*  Appeal 2 (Informative vs. Persuasive)
*  Linguistic style (First-person vs. Third-person)

In [None]:
# Model 1 inputs: the communication dummies as predictors and the average
# engagement time as outcome. Both are taken from filtered_df (rows with
# avg_engagement_time_sec > 0) so the regression uses the filtered data
# rather than the full, unfiltered frame.
X = filtered_df[['Emotion_Fear', 'Topic_Affiliation', 'Topic_Kincare', 'Appeal_Testi', 'Appeal_Pers', 'LStyle_Tperson']]
Y = filtered_df['avg_engagement_time_sec']

In [None]:
# Model 1: add an intercept column to the predictors, fit Y on X with
# ordinary least squares, and print the regression summary.
X = sm.add_constant(X)
model = sm.OLS(endog=Y, exog=X).fit()

# Predictions for the same rows the model was fitted on.
predictions = model.predict(exog=X)

print_model = model.summary()
print(print_model)

# Model 2: Supportive Model with Control Variables
Control variables to include:


*   Part of week (PW): PW_weekend_dummy
*   Part of day (PD): PD_afternoon_dummy, PD_evening_dummy, PD_night_dummy
*   Working hours (WH): WH_workhours_dummy
*   Device Category (DC): DC_desktop_dummy, DC_tablet_dummy

In [None]:
# Model 2 inputs: the control dummies (part of week, part of day, working
# hours, device category) as predictors and the average engagement time as
# outcome. Both are taken from filtered_df (rows with
# avg_engagement_time_sec > 0) so the regression uses the filtered data
# rather than the full, unfiltered frame.
X = filtered_df[['PW_weekend_dummy', 'PD_afternoon_dummy', 'PD_evening_dummy', 'PD_night_dummy',
                 'WH_workhours_dummy', 'DC_desktop_dummy', 'DC_tablet_dummy']]
Y = filtered_df['avg_engagement_time_sec']

In [None]:
# Model 2: add an intercept column to the predictors, fit Y on X with
# ordinary least squares, and print the regression summary.
X = sm.add_constant(X)
model = sm.OLS(endog=Y, exog=X).fit()

# Predictions for the same rows the model was fitted on.
predictions = model.predict(exog=X)

print_model = model.summary()
print(print_model)

# Model 3: Campaign dummies and control variables
This section contains our models that measure the effect the content-related dummies have on avg_engagement_time_sec while also including the control variables.

The communication variables:


*  Emotion (Love vs. Fear)
*  Topic (Self-protection vs. Affiliation vs. Kincare)
*  Appeal 1 (Expert vs. Testimonial)
*  Appeal 2 (Informative vs. Persuasive)
*  Linguistic style (First-person vs. Third-person)

Control variables that are included:


*   Part of week (PW): PW_weekend_dummy
*   Part of day (PD): PD_afternoon_dummy, PD_evening_dummy, PD_night_dummy
*   Working hours (WH): WH_workhours_dummy
*   Device Category (DC): DC_desktop_dummy, DC_tablet_dummy

In [None]:
# Model 3 inputs: communication dummies plus control dummies as predictors and
# the average engagement time as outcome. Both are taken from filtered_df
# (rows with avg_engagement_time_sec > 0) so the regression uses the filtered
# data rather than the full, unfiltered frame.
X = filtered_df[['Emotion_Fear', 'Topic_Affiliation', 'Topic_Kincare', 'Appeal_Testi', 'Appeal_Pers', 'LStyle_Tperson',
                 'PW_weekend_dummy', 'PD_afternoon_dummy', 'PD_evening_dummy', 'PD_night_dummy',
                 'WH_workhours_dummy', 'DC_desktop_dummy', 'DC_tablet_dummy']]
Y = filtered_df['avg_engagement_time_sec']

In [None]:
# Model 3: add an intercept column to the predictors, fit Y on X with
# ordinary least squares, and print the regression summary.
X = sm.add_constant(X)
model = sm.OLS(endog=Y, exog=X).fit()

# Predictions for the same rows the model was fitted on.
predictions = model.predict(exog=X)

print_model = model.summary()
print(print_model)

# Model 4: Interaction effects
This section contains our models that measure the effect of different interactions between our campaign's predictor variables on avg_engagement_time_sec.

**The interactions:**

Emotion and Topic:
*   Emotion_Fear x Topic_Affiliation
*   Emotion_Fear x Topic_Kincare



In [None]:

# Emotion and Topic: add one interaction column per topic dummy, each being
# the element-wise product of Emotion_Fear and that topic dummy.
for topic in ('Topic_Affiliation', 'Topic_Kincare'):
    df[f'Emotion_Fear x {topic}'] = df['Emotion_Fear'] * df[topic]


In [None]:
# Model 4 inputs: communication dummies, control dummies and the two
# Emotion x Topic interaction terms as predictors, average engagement time as
# outcome. Only rows with avg_engagement_time_sec > 0 are used. The filter is
# re-applied to df here because filtered_df was created before the interaction
# columns were added to df and therefore does not contain them.
model4_df = df[df['avg_engagement_time_sec'] > 0]
X = model4_df[['Emotion_Fear', 'Topic_Affiliation', 'Topic_Kincare', 'Appeal_Testi', 'Appeal_Pers', 'LStyle_Tperson',
               'PW_weekend_dummy', 'PD_afternoon_dummy', 'PD_evening_dummy', 'PD_night_dummy',
               'WH_workhours_dummy', 'DC_desktop_dummy', 'DC_tablet_dummy',
               'Emotion_Fear x Topic_Affiliation', 'Emotion_Fear x Topic_Kincare']]
Y = model4_df['avg_engagement_time_sec']

In [None]:
# Model 4: add an intercept column to the predictors, fit Y on X with
# ordinary least squares, and print the regression summary.
X = sm.add_constant(X)
model = sm.OLS(endog=Y, exog=X).fit()

# Predictions for the same rows the model was fitted on.
predictions = model.predict(exog=X)

print_model = model.summary()
print(print_model)