# The purpose of this notebook is to load data from multiple sources and transform them into a common structure (`title`, `clickbait`).

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import string
import re
from sklearn.model_selection import train_test_split

 ## #1 Data source: Kaggle
 https://www.kaggle.com/datasets/amananandrai/clickbait-dataset

In [2]:
# Dataset #1: read the local Kaggle CSV and expose its headline text under
# the shared `title` column name (chained rename rather than in-place).
df_1 = pd.read_csv("data/clickbait_data.csv").rename(columns={'headline': 'title'})

In [3]:
# Display dataset #1 to check the result of renaming `headline` to `title`.
df_1

Unnamed: 0,title,clickbait
0,Should I Get Bings,1
1,Which TV Female Friend Group Do You Belong In,1
2,"The New ""Star Wars: The Force Awakens"" Trailer...",1
3,"This Vine Of New York On ""Celebrity Big Brothe...",1
4,A Couple Did A Stunning Photo Shoot With Their...,1
...,...,...
31995,"To Make Female Hearts Flutter in Iraq, Throw a...",0
31996,"British Liberal Democrat Patsy Calton, 56, die...",0
31997,Drone smartphone app to help heart attack vict...,0
31998,"Netanyahu Urges Pope Benedict, in Israel, to D...",0


 ## #2 Data source: Hugging Face
https://huggingface.co/datasets/ErfanMoosaviMonazzah/fake-news-detection-dataset-English

In [4]:
# Dataset #2: the three tab-separated splits live under one Hugging Face
# dataset root, so build each URL from a shared prefix.
hf_dataset_root = "hf://datasets/ErfanMoosaviMonazzah/fake-news-detection-dataset-English/"
df_train_2, df_test_2, df_validation_2 = (
    pd.read_csv(hf_dataset_root + split_file, sep="\t")
    for split_file in ('train.tsv', 'test.tsv', 'validation.tsv')
)

In [5]:
# Stack the three Hugging Face splits and map them onto the common
# (`title`, `clickbait`) schema.
#
# The steps are chained and return new frames: renaming in place and then
# assigning a column on a frame obtained by column selection is the usual
# trigger for pandas' SettingWithCopyWarning.
#
# NOTE(review): the `1 - label` inversion assumes the source encodes
# real news as 1 and fake news as 0, so that fake news ends up as
# clickbait = 1 -- confirm against the dataset card.
df_2 = (
    pd.concat([df_test_2, df_train_2, df_validation_2])
    .loc[:, ["title", "label"]]
    .rename(columns={'label': 'clickbait'})
    .assign(clickbait=lambda frame: 1 - frame['clickbait'])
)

In [6]:
# Display dataset #2 after it has been reduced to `title` / `clickbait`.
df_2

Unnamed: 0,title,clickbait
0,Conservatives Will HATE What Donald Trump Just...,1
1,Trump victory may create new tension between U...,0
2,WATCH: Hundreds of ILLEGAL ALIENS Storm Senate...,1
3,"Democratic Senator Franken to resign: CNN, cit...",0
4,GANG OF DOMESTIC TERRORISTS Violently Attack L...,1
...,...,...
5995,Trump's Jerusalem plan revives tensions in EU ...,0
5996,Donald Trump Rings In The New Year With A Vici...,1
5997,Russian parliament speaker says hopes for bett...,0
5998,Trump tax plan will sharply slash corporate ta...,0


 ## #3 Data source: Kaggle
https://www.kaggle.com/datasets/vikassingh1996/news-clickbait-dataset?select=train2.csv

In [7]:
# Dataset #3: read the local CSV, encode the string label as an integer
# flag (1 when the label equals 'clickbait', otherwise 0) and keep only
# the two shared columns.
df_news_3 = pd.read_csv("data/train2.csv")
is_clickbait_3 = df_news_3['label'].eq('clickbait')
df_news_3['label'] = is_clickbait_3.astype('int64')
df_news_3 = df_news_3.rename(columns={'label': 'clickbait'})

df_3 = df_news_3[['title', 'clickbait']]

 ## #4, #5 Data source: Clickbait Challenge 2017 (training and validation sets)
https://webis.de/events/clickbait-challenge/shared-task.html

In [8]:
def load_clickbait17(dataset_dir):
    """Load one Clickbait Challenge split as a (`title`, `clickbait`) frame.

    Parameters
    ----------
    dataset_dir : str
        Directory that contains the JSON-lines files ``instances.jsonl``
        and ``truth.jsonl``.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``title`` (taken from ``targetTitle``) and
        ``clickbait`` (1 where ``truthClass`` equals ``'clickbait'``,
        otherwise 0).
    """
    instances = pd.read_json(f"{dataset_dir}/instances.jsonl", lines=True)
    truth = pd.read_json(f"{dataset_dir}/truth.jsonl", lines=True)

    # Left join on `id` keeps every instance, even one without a truth row.
    merged = pd.merge(
        instances[["id", "targetTitle"]],
        truth[["id", "truthClass"]],
        on="id",
        how="left",
    )

    # Anything that is not exactly 'clickbait' -- including the NaN left
    # behind by an unmatched `id` -- is encoded as 0.
    merged["truthClass"] = merged["truthClass"].eq("clickbait").astype("int64")

    renamed = merged.rename(columns={'targetTitle': 'title', 'truthClass': 'clickbait'})
    return renamed[["title", "clickbait"]]


# a) training set
df_4 = load_clickbait17('data/clickbait17-train-170331')

# b) validation set
df_5 = load_clickbait17('data/clickbait17-validation-170630')

In [9]:
# All per-source frames, in source order (#1 .. #5), for the summary cells.
dfs = [df_1, df_2, df_3, df_4, df_5]

In [10]:
# Print the (rows, columns) shape of every source frame, one per line.
for n_rows, n_cols in (frame.shape for frame in dfs):
    print((n_rows, n_cols))

(32000, 2)
(44267, 2)
(21029, 2)
(2459, 2)
(19538, 2)


In [11]:
# Print the class balance of each source, numbered from 1.
for position, frame in enumerate(dfs, start=1):
    print(f"=========  {position}- dataset =============")
    label_counts = frame["clickbait"].value_counts()
    print(label_counts)

clickbait
0    16001
1    15999
Name: count, dtype: int64
clickbait
1    22851
0    21416
Name: count, dtype: int64
clickbait
0    16738
1     4291
Name: count, dtype: int64
clickbait
0    1697
1     762
Name: count, dtype: int64
clickbait
0    14777
1     4761
Name: count, dtype: int64


In [12]:
# Stack all five sources into one frame.
# `ignore_index=True` gives the result a fresh 0..n-1 index; without it
# every source keeps its own row labels, so the merged frame would contain
# duplicate index labels and label-based lookups would be ambiguous.
df_merged = pd.concat([df_1, df_2, df_3, df_4, df_5], ignore_index=True)

In [13]:
# Persist the full (unbalanced) merged data; the row index is not written.
merged_csv_path = "data/merged_all_datasets.csv"
df_merged.to_csv(merged_csv_path, index=False)

In [14]:
# Build a class-balanced subset: the same number of rows from each class,
# drawn with a fixed seed so the sample is reproducible.
N_PER_CLASS = 10000
SAMPLE_SEED = 42

# The samples get their own names on purpose: storing the clickbait sample
# in `df_1` would overwrite the dataset #1 frame, and re-running the
# earlier merge cells would then silently use the wrong data.
df_sample_not_clickbait = df_merged[df_merged['clickbait'] == 0].sample(
    n=N_PER_CLASS, random_state=SAMPLE_SEED
)
df_sample_clickbait = df_merged[df_merged['clickbait'] == 1].sample(
    n=N_PER_CLASS, random_state=SAMPLE_SEED
)

df_balanced = pd.concat([df_sample_not_clickbait, df_sample_clickbait])

In [15]:
# Persist the balanced subset; the row index is not written.
# The file name is kept exactly as spelled here.
balanced_csv_path = "data/merged_datasetes_balanced.csv"
df_balanced.to_csv(balanced_csv_path, index=False)