In [7]:
import pandas as pd
import numpy as np

# Build a synthetic interaction log in which every fictional user interacts
# with a uniformly random selection of headlines from the clickbait dataset,
# then save it to 'user_interactions_random.csv'.
df = pd.read_csv('clickbait_data.csv')

# Relabel the target column: 1 -> 'yes', any other value -> 'no'
df['clickbait'] = np.where(df['clickbait'] == 1, 'yes', 'no')

# 100 fictional users and 50 fictional interactions per user (chosen randomly)
n_users = 100
n_interactions_per_user = 50

# Dedicated, seeded random state: re-running the cell regenerates exactly the
# same CSV instead of a different draw every time. The same object is passed
# to every sample() call so that each user still gets a different draw.
random_seed = 42
rng = np.random.RandomState(random_seed)

interaction_data = []

for user_id in range(1, n_users + 1):
    # sample() draws without replacement by default, so a user never gets the
    # same headline twice.
    sampled = df.sample(n_interactions_per_user, random_state=rng)
    # Read the headline and label straight from the sampled rows instead of
    # doing two df.loc lookups per interaction.
    for article_id, headline, label in zip(
        sampled.index, sampled['headline'], sampled['clickbait']
    ):
        interaction_data.append({
            'userId': user_id,
            'headlineId': article_id,  # row label of the headline in df
            'headlineName': headline,
            'clickbait': label,
            'interaction': 1,  # constant flag: every generated row is an interaction
        })

interaction_df = pd.DataFrame(interaction_data)
interaction_df.to_csv('user_interactions_random.csv', index=False)

In [1]:
import pandas as pd
import numpy as np

# Same generation as the random variant, except that each user's interactions
# are split evenly: 50% clickbait and 50% non-clickbait headlines.
# The result is saved to 'user_interactions_50_50.csv'.

df = pd.read_csv('clickbait_data.csv')

# Relabel the target column: 1 -> 'yes', any other value -> 'no'
df['clickbait'] = np.where(df['clickbait'] == 1, 'yes', 'no')

n_users = 100
n_interactions_per_user = 50

# The per-user quotas are identical for every user, so compute them once.
# The subtraction gives any odd remainder to the non-clickbait side.
n_clickbait = n_interactions_per_user // 2
n_non_clickbait = n_interactions_per_user - n_clickbait

# Split the headline pool once instead of re-filtering df for every user.
clickbait_pool = df[df['clickbait'] == 'yes']
non_clickbait_pool = df[df['clickbait'] == 'no']

# Dedicated, seeded random state: re-running the cell regenerates exactly the
# same CSV. The same object drives both the sampling and the shuffling.
random_seed = 42
rng = np.random.RandomState(random_seed)

interaction_data = []

for user_id in range(1, n_users + 1):
    clickbait_articles = clickbait_pool.sample(n_clickbait, random_state=rng).index
    non_clickbait_articles = non_clickbait_pool.sample(n_non_clickbait, random_state=rng).index

    # Shuffle so the two classes are interleaved rather than stored as one
    # clickbait run followed by one non-clickbait run.
    articles_interacted = list(clickbait_articles) + list(non_clickbait_articles)
    rng.shuffle(articles_interacted)

    # One label-based lookup for the whole user (order of the list is kept)
    # instead of two df.loc lookups per interaction.
    chosen = df.loc[articles_interacted]
    for article_id, headline, label in zip(
        chosen.index, chosen['headline'], chosen['clickbait']
    ):
        interaction_data.append({
            'userId': user_id,
            'headlineId': article_id,  # row label of the headline in df
            'headlineName': headline,
            'clickbait': label,
            'interaction': 1,  # constant flag: every generated row is an interaction
        })

interaction_df = pd.DataFrame(interaction_data)
interaction_df.to_csv('user_interactions_50_50.csv', index=False)


In [2]:
import pandas as pd
import numpy as np

# Same generation as the 50/50 variant, but with a slight bias towards
# clickbait: the target is 55% clickbait and 45% non-clickbait per user.
# The result is saved to 'user_interactions_bias_55_45.csv'.

df = pd.read_csv('clickbait_data.csv')

# Relabel the target column: 1 -> 'yes', any other value -> 'no'
df['clickbait'] = np.where(df['clickbait'] == 1, 'yes', 'no')

n_users = 100
n_interactions_per_user = 50
bias_ratio = 0.55

# The per-user quotas are identical for every user, so compute them once.
# NOTE: int() truncates, and 50 * 0.55 = 27.5 is not a whole number, so each
# user actually gets 27 clickbait and 23 non-clickbait headlines (54% / 46%).
n_clickbait = int(n_interactions_per_user * bias_ratio)
n_non_clickbait = n_interactions_per_user - n_clickbait

# Split the headline pool once instead of re-filtering df for every user.
clickbait_pool = df[df['clickbait'] == 'yes']
non_clickbait_pool = df[df['clickbait'] == 'no']

# Dedicated, seeded random state: re-running the cell regenerates exactly the
# same CSV. The same object drives both the sampling and the shuffling.
random_seed = 42
rng = np.random.RandomState(random_seed)

interaction_data = []

for user_id in range(1, n_users + 1):
    clickbait_articles = clickbait_pool.sample(n_clickbait, random_state=rng).index
    non_clickbait_articles = non_clickbait_pool.sample(n_non_clickbait, random_state=rng).index

    # Shuffle so the two classes are interleaved rather than stored as one
    # clickbait run followed by one non-clickbait run.
    articles_interacted = list(clickbait_articles) + list(non_clickbait_articles)
    rng.shuffle(articles_interacted)

    # One label-based lookup for the whole user (order of the list is kept)
    # instead of two df.loc lookups per interaction.
    chosen = df.loc[articles_interacted]
    for article_id, headline, label in zip(
        chosen.index, chosen['headline'], chosen['clickbait']
    ):
        interaction_data.append({
            'userId': user_id,
            'headlineId': article_id,  # row label of the headline in df
            'headlineName': headline,
            'clickbait': label,
            'interaction': 1,  # constant flag: every generated row is an interaction
        })

interaction_df = pd.DataFrame(interaction_data)
interaction_df.to_csv('user_interactions_bias_55_45.csv', index=False)


In [3]:
import pandas as pd

# Per-user interaction counts, split by clickbait label, for the random
# interaction file. The table is saved to 'user_random_clickbait_stats.csv'.

interaction_df = pd.read_csv('user_interactions_random.csv')

# Count interactions per (user, label) and pivot the labels into columns.
# The reindex pins the label columns to exactly ['no', 'yes'] in that order,
# creating a zero-filled column when a label never occurs. Without it, the
# positional column assignment below fails whenever a label is missing.
label_counts = (
    interaction_df
    .groupby(['userId', 'clickbait'])
    .size()
    .unstack(fill_value=0)
    .reindex(columns=['no', 'yes'], fill_value=0)
)

user_stats = label_counts.reset_index()
# Safe positional rename: column order is guaranteed by the reindex above.
user_stats.columns = ['userId', 'non_clickbait_interactions', 'clickbait_interactions']

user_stats.to_csv('user_random_clickbait_stats.csv', index=False)

print("Statistics on interactions by user:")
print(user_stats.head())


Statistics on interactions by user:
   userId  non_clickbait_interactions  clickbait_interactions
0       1                          24                      26
1       2                          26                      24
2       3                          18                      32
3       4                          25                      25
4       5                          21                      29


In [4]:
import pandas as pd

# Per-user interaction counts, split by clickbait label, for the 50/50
# interaction file. The table is saved to 'user_50_50_clickbait_stats.csv'.

interaction_df = pd.read_csv('user_interactions_50_50.csv')

# Count interactions per (user, label) and pivot the labels into columns.
# The reindex pins the label columns to exactly ['no', 'yes'] in that order,
# creating a zero-filled column when a label never occurs. Without it, the
# positional column assignment below fails whenever a label is missing.
label_counts = (
    interaction_df
    .groupby(['userId', 'clickbait'])
    .size()
    .unstack(fill_value=0)
    .reindex(columns=['no', 'yes'], fill_value=0)
)

user_stats = label_counts.reset_index()
# Safe positional rename: column order is guaranteed by the reindex above.
user_stats.columns = ['userId', 'non_clickbait_interactions', 'clickbait_interactions']

user_stats.to_csv('user_50_50_clickbait_stats.csv', index=False)

print("Statistics on interactions by user:")
print(user_stats.head())


Statistics on interactions by user:
   userId  non_clickbait_interactions  clickbait_interactions
0       1                          25                      25
1       2                          25                      25
2       3                          25                      25
3       4                          25                      25
4       5                          25                      25


In [5]:
import pandas as pd

# Per-user interaction counts, split by clickbait label, for the biased
# (55/45) interaction file. The table is saved to 'user_55_45_clickbait_stats.csv'.

interaction_df = pd.read_csv('user_interactions_bias_55_45.csv')

# Count interactions per (user, label) and pivot the labels into columns.
# The reindex pins the label columns to exactly ['no', 'yes'] in that order,
# creating a zero-filled column when a label never occurs. Without it, the
# positional column assignment below fails whenever a label is missing.
label_counts = (
    interaction_df
    .groupby(['userId', 'clickbait'])
    .size()
    .unstack(fill_value=0)
    .reindex(columns=['no', 'yes'], fill_value=0)
)

user_stats = label_counts.reset_index()
# Safe positional rename: column order is guaranteed by the reindex above.
user_stats.columns = ['userId', 'non_clickbait_interactions', 'clickbait_interactions']

user_stats.to_csv('user_55_45_clickbait_stats.csv', index=False)

print("Statistics on interactions by user:")
print(user_stats.head())

Statistics on interactions by user:
   userId  non_clickbait_interactions  clickbait_interactions
0       1                          23                      27
1       2                          23                      27
2       3                          23                      27
3       4                          23                      27
4       5                          23                      27
