# The purpose of this notebook is to load data from multiple sources and transform them into a common structure (`title`, `clickbait`).

In [1]:
import pandas as pd

 ## #1 Data source: kaggle
 https://www.kaggle.com/datasets/amananandrai/clickbait-dataset

In [7]:
# Load Kaggle dataset #1 and expose its headline column under the shared 'title' name.
df_1 = pd.read_csv("../data/clickbait_data.csv").rename(columns={'headline': 'title'})

In [8]:
# Preview dataset #1 after its 'headline' column has been renamed to 'title'.
df_1

Unnamed: 0,title,clickbait
0,Should I Get Bings,1
1,Which TV Female Friend Group Do You Belong In,1
2,"The New ""Star Wars: The Force Awakens"" Trailer...",1
3,"This Vine Of New York On ""Celebrity Big Brothe...",1
4,A Couple Did A Stunning Photo Shoot With Their...,1
...,...,...
31995,"To Make Female Hearts Flutter in Iraq, Throw a...",0
31996,"British Liberal Democrat Patsy Calton, 56, die...",0
31997,Drone smartphone app to help heart attack vict...,0
31998,"Netanyahu Urges Pope Benedict, in Israel, to D...",0


 ## #2 Data source: hugging-face
https://huggingface.co/datasets/ErfanMoosaviMonazzah/fake-news-detection-dataset-English

In [14]:
# All three splits live under the same Hugging Face dataset root and are tab-separated.
hf_root = "hf://datasets/ErfanMoosaviMonazzah/fake-news-detection-dataset-English/"
df_train_2, df_test_2, df_validation_2 = (
    pd.read_csv(hf_root + split_file, sep="\t")
    for split_file in ('train.tsv', 'test.tsv', 'validation.tsv')
)

In [17]:
# Stack the three splits, keep only the headline and label columns, then publish the
# label under the shared 'clickbait' name with its 0/1 values swapped (1 - label).
df_2 = (
    pd.concat([df_test_2, df_train_2, df_validation_2])[["title", "label"]]
    .rename(columns={'label': 'clickbait'})
    .assign(clickbait=lambda frame: 1 - frame['clickbait'])
)

In [18]:
# Preview the combined Hugging Face frame (columns: title, clickbait).
df_2

Unnamed: 0,title,clickbait
0,Conservatives Will HATE What Donald Trump Just...,1
1,Trump victory may create new tension between U...,0
2,WATCH: Hundreds of ILLEGAL ALIENS Storm Senate...,1
3,"Democratic Senator Franken to resign: CNN, cit...",0
4,GANG OF DOMESTIC TERRORISTS Violently Attack L...,1
...,...,...
5995,Trump's Jerusalem plan revives tensions in EU ...,0
5996,Donald Trump Rings In The New Year With A Vici...,1
5997,Russian parliament speaker says hopes for bett...,0
5998,Trump tax plan will sharply slash corporate ta...,0


 ## #3 Data source: kaggle 
https://www.kaggle.com/datasets/vikassingh1996/news-clickbait-dataset?select=train2.csv

In [19]:
# Load Kaggle dataset #3, turn its textual label into 1 ('clickbait') / 0 (anything else),
# and rename that column to the shared 'clickbait' name.
df_news_3 = pd.read_csv("../data/train2.csv")
df_news_3 = df_news_3.assign(
    label=df_news_3['label'].apply(lambda value: 1 if value == 'clickbait' else 0)
).rename(columns={'label': 'clickbait'})

# Only the two common columns take part in the merge further down.
df_3 = df_news_3[['title', 'clickbait']]

 ## #4, #5 Data source: clickbait challenge 
https://webis.de/events/clickbait-challenge/shared-task.html

In [20]:
def _load_cc_titles(folder):
    """
    Load one Clickbait Challenge folder as a two-column (title, clickbait) frame.

    Reads ``../data/{folder}/instances.jsonl`` and ``../data/{folder}/truth.jsonl``
    (JSON Lines), joins them on ``id`` and keeps only the title and the label.

    folder (str): name of the corpus folder under ``../data``

    Returns a DataFrame with columns 'title' (from 'targetTitle') and
    'clickbait' (1 when 'truthClass' is 'clickbait', otherwise 0).
    """
    with open(f'../data/{folder}/instances.jsonl', 'rb') as file:
        instances = pd.read_json(file, lines=True)[["id", "targetTitle"]]

    with open(f'../data/{folder}/truth.jsonl', 'rb') as file:
        truth = pd.read_json(file, lines=True)[["id", "truthClass"]]

    # Left join keeps every instance; an instance without a truth row gets NaN,
    # which the comparison below maps to 0.
    merged = pd.merge(instances, truth, on='id', how='left')
    merged['truthClass'] = merged['truthClass'].apply(lambda x: 1 if x == 'clickbait' else 0)
    merged = merged.rename(columns={'targetTitle': 'title', 'truthClass': 'clickbait'})
    return merged[["title", "clickbait"]]


# a)
df_4 = _load_cc_titles('clickbait17-train-170331')

# b)
df_5 = _load_cc_titles('clickbait17-validation-170630')

## Concatenate datasets

In [21]:
# The five normalised (title, clickbait) frames, in data-source order #1..#5.
dfs = [df_1, df_2, df_3, df_4, df_5]

In [22]:
# Row/column counts of every source frame, one per line.
for frame in dfs:
    print(frame.shape)

(32000, 2)
(44267, 2)
(21029, 2)
(2459, 2)
(19538, 2)


In [23]:
# Class balance of every source frame, with a 1-based banner per dataset.
for position, frame in enumerate(dfs, start=1):
    print(f"=========  {position}- dataset =============")
    print(frame["clickbait"].value_counts())

clickbait
0    16001
1    15999
Name: count, dtype: int64
clickbait
1    22851
0    21416
Name: count, dtype: int64
clickbait
0    16738
1     4291
Name: count, dtype: int64
clickbait
0    1697
1     762
Name: count, dtype: int64
clickbait
0    14777
1     4761
Name: count, dtype: int64


In [24]:
# Stack all source frames (the `dfs` list holds df_1..df_5 in order); row indices are kept as-is.
df_merged = pd.concat(dfs)

In [13]:
# Persist the full, unbalanced merged frame; index=False leaves the row index out of the CSV.
df_merged.to_csv("../data/merged_all_datasets.csv", index=False)

In [8]:
# Class distribution of the merged frame (drives the per-class sample sizes used below).
df_merged["clickbait"].value_counts()

clickbait
0    70629
1    48664
Name: count, dtype: int64

In [11]:
# Draw 40,000 rows per class with a fixed seed, then stack class 0 on top of class 1.
df_0_huge, df_1_huge = (
    df_merged[df_merged['clickbait'] == target].sample(n=40000, random_state=42)
    for target in (0, 1)
)
df_balanced_huge = pd.concat([df_0_huge, df_1_huge])

df_balanced_huge.to_csv("../data/merged_datasetes_balanced_huge.csv", index=False)

In [14]:
# Draw 10,000 rows per class with a fixed seed.
# The samples get *_small names (mirroring df_0_huge / df_1_huge) so they do not
# overwrite df_1, the raw dataset #1 frame that feeds `dfs` and `df_merged`;
# rebinding it here made any later re-run of the merge cell use a 10k sample.
df_0_small = df_merged[df_merged['clickbait'] == 0].sample(n=10000, random_state=42)
df_1_small = df_merged[df_merged['clickbait'] == 1].sample(n=10000, random_state=42)

df_balanced = pd.concat([df_0_small, df_1_small])

In [15]:
# Persist the balanced 10k-per-class sample without the row index.
df_balanced.to_csv("../data/merged_datasetes_balanced.csv", index=False)

## + Dataset with body: clickbait challenge
https://webis.de/events/clickbait-challenge/shared-task.html

In [56]:
def load_data_cc(name, columns=None):
    """
    Load one folder of the Clickbait Challenge (Webis) corpus as a labelled frame.

    Reads ``../data/{name}/instances.jsonl`` and ``../data/{name}/truth.jsonl``
    (JSON Lines), joins them on ``id`` and returns the result without ``id``.

    name (str): name of the folder under ``../data``
    columns (list | tuple | None): names of extra instance columns to keep apart
        from 'id' and 'targetTitle'; None (the default) keeps no extra columns

    Returns a DataFrame with 'title' (from 'targetTitle'), the requested extra
    columns, and 'clickbait' (1 when 'truthClass' is 'clickbait', otherwise 0).
    Raises KeyError if a requested column is missing from the instances file.
    """
    # None instead of a shared mutable [] default; list() also accepts tuples.
    extra_columns = [] if columns is None else list(columns)

    data_file_path = f'../data/{name}/instances.jsonl'
    with open(data_file_path, 'rb') as file:
        data_df = pd.read_json(file, lines=True)

    target_file_path = f'../data/{name}/truth.jsonl'
    with open(target_file_path, 'rb') as file:
        labels = pd.read_json(file, lines=True)

    data_df = data_df[["id", "targetTitle"] + extra_columns]
    labels = labels[["id", "truthClass"]]

    # Left join keeps every instance; one without a truth row gets NaN, which the
    # comparison below maps to 0.
    df = pd.merge(data_df, labels, on='id', how='left')
    df['truthClass'] = df['truthClass'].apply(lambda x: 1 if x == 'clickbait' else 0)
    return (
        df.drop(columns=['id'])
        .rename(columns={'targetTitle': 'title', 'truthClass': 'clickbait'})
    )

In [60]:
files = ["clickbait17-train-170331", "clickbait17-test-170720", "clickbait17-validation-170630"]

# For each corpus folder: load titles, labels and paragraphs, flatten the paragraph
# list into one space-joined 'body' string, and keep only the three output columns.
body_dfs = [
    load_data_cc(filename, columns=["targetParagraphs"])
    .assign(body=lambda part: part['targetParagraphs'].apply(lambda paragraphs: " ".join(paragraphs)))
    [['title', 'body', 'clickbait']]
    for filename in files
]
df_body = pd.concat(body_dfs).reset_index(drop=True)

In [99]:
# Class distribution of the title+body frame.
df_body['clickbait'].value_counts()

clickbait
0    30938
1    10038
Name: count, dtype: int64

In [88]:
# Persist the full title/body/clickbait frame without the row index.
df_body.to_csv("../data/cc_dataset.csv", index=False)

In [89]:
# Draw 10,000 rows per class with a fixed seed, then stack class 0 on top of class 1.
df_00, df_01 = (
    df_body[df_body['clickbait'] == target].sample(n=10000, random_state=37)
    for target in (0, 1)
)

df_body_balanced = pd.concat([df_00, df_01])

In [101]:
# Persist the balanced 10k-per-class title/body sample without the row index.
df_body_balanced.to_csv("../data/cc_dataset_balanced.csv", index=False)

## Clickbait spoiling dataset

https://pan.webis.de/semeval23/pan23-web/clickbait-challenge.html

In [None]:
"""
Vocab:
task 1: spoiler type classification
task 2: spoiler generation / extraction

Columns:
targetTitle: The title of the linked web page.
targetParagraphs: The main content of the linked web page. Consists of the paragraphs of manually extracted main content.
humanSpoiler: The human-generated (abstractive) spoiler for the clickbait post from the linked web page. Beware: many values are missing.
spoiler: The human extracted spoiler for the clickbait post from the linked web page.
spoilerPositions: The position of the human extracted spoiler for the clickbait post from the linked web page.
tags: The spoiler type (might be "phrase", "passage", or "multi") that is to be classified in task 1 (spoiler type classification).

How to use spoilerPositions:
if `data['spoilerPositions'][0]` is `[[[3, 151], [3, 186]]]`,
then `data['targetParagraphs'][0][3][151:186]` equals `data['spoiler']`
"""

In [4]:
names = ['train', 'validation']
spoil_dfs = []
for name in names:
    # NOTE(review): this reads from ../../data while every other cell uses ../data — confirm the path.
    data_file_path = f'../../data/spoiling-clickbait-22/{name}.jsonl'
    with open(data_file_path, 'rb') as file:
        part_df = pd.read_json(file, lines=True)

    # Lift 'humanSpoiler' out of the nested 'provenance' record, then keep only
    # the columns described in the notes cell.
    part_df = part_df.assign(
        humanSpoiler=part_df['provenance'].apply(lambda provenance: provenance['humanSpoiler'])
    )[["targetTitle", "targetParagraphs", "humanSpoiler", "spoiler", "spoilerPositions", "tags"]]
    spoil_dfs.append(part_df)

spoil_df = pd.concat(spoil_dfs).reset_index(drop=True)

In [37]:
# Persist the combined train+validation spoiling frame without the row index.
# NOTE(review): the input was read from ../../data but this writes to ../data — confirm intended.
# NOTE(review): targetParagraphs / spoilerPositions look like nested lists; presumably they are
# written as their string form in CSV and need parsing when read back — verify.
spoil_df.to_csv("../data/spoiling_data.csv", index=False)