In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_absolute_error
import numpy as np

In [2]:
# Load the dataset
data = pd.read_csv('ha3_data.csv', header=0)

# Data preprocessing
data = pd.get_dummies(data, columns=['platform', 'weekday', 'time'], drop_first=False)

In [3]:
# Check for non-numeric values in 'website_id' column
non_numeric_values = data['website_id'].str.contains('siteid_').any()

if non_numeric_values:
    # If non-numeric values are found, extract numeric identifiers from 'website_id'
    data['website_id'] = data['website_id'].str.extract(r'(\d+)').astype(float)

In [4]:
# Splitting the data into train and test sets
X = data.drop(columns=['stopgo'])
y = data['stopgo']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# Model building
model = LogisticRegression(solver="liblinear")
model.fit(X_train, y_train)

# Model evaluation
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("MAE:", mae)

Accuracy: 0.8929158624633641
Precision: 0.8929158624633641
Recall: 1.0
F1 Score: 0.943429003021148
MAE: 0.10708413753663593


In [None]:
# Coefficients interpretation
coefficients = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': model.coef_[0].astype(float)
})
coefficients.sort_values(by="Coefficient", ascending=False, inplace=True)
print(coefficients)

# Answering the questions
# 1. Device Influence
print("\nDevice Influence:")
print(coefficients[coefficients['Feature'].str.startswith('platform')])

# 2. Time Influence
print("\nTime Influence:")
print(coefficients[coefficients['Feature'].str.startswith('weekday_') | coefficients['Feature'].str.startswith('time_')])

# 3. Topic Influence
print("\nTopic Influence:")
print(coefficients[coefficients['Feature'].str.startswith('post25_include_')])

# 4. Emotional Language Influence
print("\nEmotional Language Influence:")
print(coefficients[['Feature', 'Coefficient']].loc[coefficients['Feature'].isin(['posemo', 'anger', 'fear', 'sadness'])])

                    Feature  Coefficient
31         platform_Desktop     0.491900
43  time_hour_early_morning     0.341231
44        time_hour_evening     0.331692
42      time_hour_afternoon     0.316290
32          platform_Mobile     0.311001
10         post25_include_5     0.244229
33          platform_Tablet     0.236173
26        post25_include_21     0.227263
38                weekday_4     0.221124
39                weekday_5     0.209381
40                weekday_6     0.172903
35                weekday_1     0.146928
36                weekday_2     0.110767
9          post25_include_4     0.101898
37                weekday_3     0.101836
41                weekday_7     0.093315
5                   sadness     0.071639
7          post25_include_2     0.067327
45        time_hour_morning     0.067043
20        post25_include_15     0.063507
25        post25_include_20     0.060253
14         post25_include_9     0.049336
30        post25_include_25     0.046972
17        post25

In [None]:
# Coefficients interpretation
coefficients = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': model.coef_[0].astype(float)
})
coefficients.sort_values(by="Coefficient", ascending=False, inplace=True)
print(coefficients)

# Answering the questions
# 1. Device Influence
print("\nDevice Influence:")
print(coefficients[coefficients['Feature'].str.startswith('platform')])

# 2. Time Influence
print("\nTime Influence:")
print(coefficients[coefficients['Feature'].str.startswith('weekday_') | coefficients['Feature'].str.startswith('time_')])

# 3. Topic Influence
print("\nTopic Influence:")
print(coefficients[coefficients['Feature'].str.startswith('post25_include_')])

# 4. Emotional Language Influence
print("\nEmotional Language Influence:")
print(coefficients[['Feature', 'Coefficient']].loc[coefficients['Feature'].isin(['posemo', 'anger', 'fear', 'sadness'])])

                    Feature  Coefficient
31         platform_Desktop     0.491900
43  time_hour_early_morning     0.341231
44        time_hour_evening     0.331692
42      time_hour_afternoon     0.316290
32          platform_Mobile     0.311001
10         post25_include_5     0.244229
33          platform_Tablet     0.236173
26        post25_include_21     0.227263
38                weekday_4     0.221124
39                weekday_5     0.209381
40                weekday_6     0.172903
35                weekday_1     0.146928
36                weekday_2     0.110767
9          post25_include_4     0.101898
37                weekday_3     0.101836
41                weekday_7     0.093315
5                   sadness     0.071639
7          post25_include_2     0.067327
45        time_hour_morning     0.067043
20        post25_include_15     0.063507
25        post25_include_20     0.060253
14         post25_include_9     0.049336
30        post25_include_25     0.046972
17        post25

In [None]:
# Coefficients interpretation
coefficients = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': model.coef_[0].astype(float)
})
coefficients.sort_values(by="Coefficient", ascending=False, inplace=True)
print(coefficients)

# Answering the questions
# 1. Device Influence
print("\nDevice Influence:")
print(coefficients[coefficients['Feature'].str.startswith('platform')])

# 2. Time Influence
print("\nTime Influence:")
print(coefficients[coefficients['Feature'].str.startswith('weekday_') | coefficients['Feature'].str.startswith('time_')])

# 3. Topic Influence
print("\nTopic Influence:")
print(coefficients[coefficients['Feature'].str.startswith('post25_include_')])

# 4. Emotional Language Influence
print("\nEmotional Language Influence:")
print(coefficients[['Feature', 'Coefficient']].loc[coefficients['Feature'].isin(['posemo', 'anger', 'fear', 'sadness'])])

                    Feature  Coefficient
31         platform_Desktop     0.491900
43  time_hour_early_morning     0.341231
44        time_hour_evening     0.331692
42      time_hour_afternoon     0.316290
32          platform_Mobile     0.311001
10         post25_include_5     0.244229
33          platform_Tablet     0.236173
26        post25_include_21     0.227263
38                weekday_4     0.221124
39                weekday_5     0.209381
40                weekday_6     0.172903
35                weekday_1     0.146928
36                weekday_2     0.110767
9          post25_include_4     0.101898
37                weekday_3     0.101836
41                weekday_7     0.093315
5                   sadness     0.071639
7          post25_include_2     0.067327
45        time_hour_morning     0.067043
20        post25_include_15     0.063507
25        post25_include_20     0.060253
14         post25_include_9     0.049336
30        post25_include_25     0.046972
17        post25

In [6]:
# Coefficients interpretation
coefficients = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': model.coef_[0].astype(float)
})
coefficients.sort_values(by="Coefficient", ascending=False, inplace=True)
print(coefficients)

# Answering the questions
# 1. Device Influence
print("\nDevice Influence:")
print(coefficients[coefficients['Feature'].str.startswith('platform')])

# 2. Time Influence
print("\nTime Influence:")
print(coefficients[coefficients['Feature'].str.startswith('weekday_') | coefficients['Feature'].str.startswith('time_')])

# 3. Topic Influence
print("\nTopic Influence:")
print(coefficients[coefficients['Feature'].str.startswith('post25_include_')])

# 4. Emotional Language Influence
print("\nEmotional Language Influence:")
print(coefficients[['Feature', 'Coefficient']].loc[coefficients['Feature'].isin(['posemo', 'anger', 'fear', 'sadness'])])

                    Feature  Coefficient
31         platform_Desktop     0.491900
43  time_hour_early_morning     0.341231
44        time_hour_evening     0.331692
42      time_hour_afternoon     0.316290
32          platform_Mobile     0.311001
10         post25_include_5     0.244229
33          platform_Tablet     0.236173
26        post25_include_21     0.227263
38                weekday_4     0.221124
39                weekday_5     0.209381
40                weekday_6     0.172903
35                weekday_1     0.146928
36                weekday_2     0.110767
9          post25_include_4     0.101898
37                weekday_3     0.101836
41                weekday_7     0.093315
5                   sadness     0.071639
7          post25_include_2     0.067327
45        time_hour_morning     0.067043
20        post25_include_15     0.063507
25        post25_include_20     0.060253
14         post25_include_9     0.049336
30        post25_include_25     0.046972
17        post25