<a href="https://colab.research.google.com/github/magicaltrap/clickbait_fastai/blob/master/preprocessed_to_csv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Mount Google Drive inside the Colab runtime so the dataset files are reachable
# through the regular file system.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

# Root of the mounted Drive, and the project data folder derived from it so the
# two paths cannot drift apart if the Drive root ever changes.
root_dir = "/content/gdrive/My Drive/"
path_clickbait = root_dir + "data/clickbait/"

Mounted at /content/gdrive


In [0]:
import json
import pandas as pd

# Open labeled clickbait dataset 

## Save "id" and "postText" in a dictionary

In [0]:
# Build {id: postText} for the labeled dataset from the JSON-lines instances file.
id_features = {}
# Each line is one JSON object. Read the file explicitly as UTF-8 (the same
# encoding used later when the CSV is written) rather than the platform default,
# because the posts contain non-ASCII characters such as typographic quotes.
with open(path_clickbait + 'data/instances.jsonl', 'r', encoding='utf-8') as reader:
    for line in reader:
        if not line.strip():
            continue  # skip blank lines, which json.loads would reject

        item = json.loads(line)

        item_id = item["id"]  # ID of example
        postText = item["postText"][0]  # first element of the postText list = Twitter post

        id_features[item_id] = postText  # save as {id: postText}

In [0]:
# Show a single (id, postText) pair as a quick sanity check; prints nothing
# when the dictionary is empty.
first_feature_entry = next(iter(id_features.items()), None)
if first_feature_entry is not None:
  print(first_feature_entry)

('858462320779026433', 'UK’s response to modern slavery leaving victims destitute while abusers go free')


In [0]:
# Number of {id: postText} entries collected from instances.jsonl.
len(id_features)

19538

## Save "id" and "truthClass" (label) in a dictionary

In [0]:
# Build {id: truthClass} for the labeled dataset from the JSON-lines truth file.
id_labels = {}
# Each line is one JSON object. Read the file explicitly as UTF-8 rather than
# relying on the platform default encoding.
with open(path_clickbait + 'data/truth.jsonl', 'r', encoding='utf-8') as reader:
    for line in reader:
        if not line.strip():
            continue  # skip blank lines, which json.loads would reject

        item = json.loads(line)

        item_id = item["id"]  # ID of example
        truthClass = item["truthClass"]  # label string, e.g. 'clickbait' / 'no-clickbait'

        id_labels[item_id] = truthClass  # save as {id: truthClass}

In [0]:
# Show a single (id, truthClass) pair as a quick sanity check; prints nothing
# when the dictionary is empty.
first_label_entry = next(iter(id_labels.items()), None)
if first_label_entry is not None:
  print(first_label_entry)

('858464162594172928', 'clickbait')


In [0]:
# Number of {id: truthClass} entries collected from truth.jsonl.
len(id_labels)

19538

## Combine the entries of both dictionaries into one list so it can be saved later as a .csv file

In [0]:
# Start with the CSV header row, then add one [label, postText] row per id,
# iterating in the order of id_labels and looking the text up by the same id.
labels_text = [["label", "postText"]]
labels_text.extend(
    [id_labels[example_id], id_features[example_id]] for example_id in id_labels
)

In [0]:
# Row count of the list, including the ["label", "postText"] header row.
len(labels_text)

19539

In [0]:
# Preview the header row plus the first four [label, postText] rows.
labels_text[:5]

[['label', 'postText'],
 ['clickbait', "Facts that will be truly upsetting to '90s girls"],
 ['no-clickbait',
  'UK’s response to modern slavery leaving victims destitute while abusers go free'],
 ['no-clickbait', "Inside North Korea's secret prisons"],
 ['no-clickbait',
  "Trump has flip-flopped. But his supporters aren't upset — or haven't noticed."]]

In [0]:
import csv

# Write the header and all [label, postText] rows to the labeled CSV file.
# newline='' leaves line-ending handling to the csv module; output is UTF-8.
labeled_csv_path = path_clickbait + 'data/labels_text.csv'
with open(labeled_csv_path, 'w', newline='', encoding='utf-8') as csv_file:
    csv.writer(csv_file).writerows(labels_text)

## Load it as DataFrame and check if there are empty postText entries (isnull())

In [0]:
# Read the labeled CSV back in as a DataFrame and preview its first five rows.
labeled_csv = path_clickbait + 'data/labels_text.csv'
df = pd.read_csv(labeled_csv)
df.head()

Unnamed: 0,label,postText
0,clickbait,Facts that will be truly upsetting to '90s girls
1,no-clickbait,UK’s response to modern slavery leaving victim...
2,no-clickbait,Inside North Korea's secret prisons
3,no-clickbait,Trump has flip-flopped. But his supporters are...
4,no-clickbait,Trump now agrees with the majority of American...


In [0]:
# Single flag: is there at least one null cell anywhere in df?
df.isnull().any().any()

True

In [0]:
# Per-column flag: does the column contain at least one null value?
df.isnull().any()

label       False
postText     True
dtype: bool

In [0]:
# Pick out the rows whose postText is missing, report how many there are,
# and preview the first five of them.
missing_post_text = df["postText"].isnull()
null_rows = df.loc[missing_post_text]
print(len(null_rows))
null_rows.head()

54


Unnamed: 0,label,postText
880,clickbait,
1574,clickbait,
2384,no-clickbait,
2480,no-clickbait,
2483,clickbait,


**Remove the rows with a NaN postText entry**

In [0]:
# Index labels of the rows with a missing postText, as a NumPy array.
null_rows_indices = null_rows.index.values
null_rows_indices

array([ 880, 1574, 2384, 2480, 2483, 2509, 2529, 2546, 2567, 2646, 2681,
       2742, 2749, 2773, 2795, 2828, 2896, 2907, 2911, 2956, 2958, 3071,
       3082, 3146, 3177, 3179, 3188, 3257, 3266, 3289, 3315, 3332, 3340,
       3395, 3428, 3434, 3441, 3464, 3523, 3529, 3560, 3572, 3577, 3579,
       3591, 3599, 3602, 3662, 3701, 3736, 3772, 3798, 4557, 8875])

In [0]:
# Drop every row whose postText is missing. Filtering on the column itself
# (instead of dropping a separately computed list of index labels) keeps this
# cell correct even if the intermediate index array is stale because cells were
# re-run out of order.
modified_df = df.dropna(subset=["postText"])
len(modified_df)

19484

In [0]:
# Confirm that no null cells remain after the rows were dropped.
modified_df.isnull().any().any()

False

In [0]:
# Save the cleaned labeled data; index=False keeps the row index out of the CSV.
modified_df.to_csv(path_clickbait + "data/modified_labels_text.csv", index=False)

# Open unlabeled clickbait dataset

## Save "id" and "postText" in a dictionary

In [0]:
# Build {id: postText} for the unlabeled dataset.
# NOTE: this rebinds id_features, replacing the dictionary built earlier from
# the labeled instances file.
id_features = {}
# Each line is one JSON object. Read the file explicitly as UTF-8 rather than
# relying on the platform default encoding.
with open(path_clickbait + 'data/instances_unlabeled.jsonl', 'r', encoding='utf-8') as reader:
    for line in reader:
        if not line.strip():
            continue  # skip blank lines, which json.loads would reject

        item = json.loads(line)

        item_id = item["id"]  # ID of example
        postText = item["postText"][0]  # first element of the postText list = Twitter post

        id_features[item_id] = postText  # save as {id: postText}

In [0]:
# Show a single (id, postText) pair from the unlabeled data as a sanity check;
# prints nothing when the dictionary is empty.
first_unlabeled_entry = next(iter(id_features.items()), None)
if first_unlabeled_entry is not None:
  print(first_unlabeled_entry)

('799360299060989952', 'Are high-end, big-budget films the future of fashion advertising?')


**The unlabeled dataset has about 80,000 examples (roughly 4x as many as the labeled one)**

In [0]:
# Number of {id: postText} entries collected from instances_unlabeled.jsonl.
len(id_features)

80013

In [0]:
# Header row followed by one ["unlabeled", postText] row per post, in the
# dictionary's iteration order. The constant "unlabeled" stands in for a label.
labels_text_unlabeled = [["label", "postText"]]
labels_text_unlabeled += [["unlabeled", post] for post in id_features.values()]

In [0]:
# Preview the header row plus the first four ["unlabeled", postText] rows.
labels_text_unlabeled[:5]

[['label', 'postText'],
 ['unlabeled',
  'Are high-end, big-budget films the future of fashion advertising?'],
 ['unlabeled', 'India has more than its cash crisis to worry about'],
 ['unlabeled',
  "12 years after her murder, DNA from teen victim's fingernails leads to suspect"],
 ['unlabeled', 'Hey, @ArvindKejriwal, you did it - New #Delhi is number 1.']]

In [0]:
# Row count of the list, including the ["label", "postText"] header row.
len(labels_text_unlabeled)

80014

## Save it as .csv file

In [0]:
# Write the header and all ["unlabeled", postText] rows to the unlabeled CSV.
# newline='' leaves line-ending handling to the csv module; output is UTF-8.
unlabeled_csv_path = path_clickbait + 'data/unlabeled_text.csv'
with open(unlabeled_csv_path, 'w', newline='', encoding='utf-8') as csv_file:
    csv.writer(csv_file).writerows(labels_text_unlabeled)

## Check whether the unlabeled data also has empty postText entries

In [0]:
# Read the unlabeled CSV back in as a DataFrame and preview its first five rows.
unlabeled_csv = path_clickbait + 'data/unlabeled_text.csv'
df_unlab = pd.read_csv(unlabeled_csv)
df_unlab.head()

Unnamed: 0,label,postText
0,unlabeled,"Are high-end, big-budget films the future of f..."
1,unlabeled,India has more than its cash crisis to worry a...
2,unlabeled,"12 years after her murder, DNA from teen victi..."
3,unlabeled,"Hey, @ArvindKejriwal, you did it - New #Delhi ..."
4,unlabeled,"""You heard it here first"": Man predicted 2016 ..."


In [0]:
# Single flag: is there at least one null cell anywhere in df_unlab?
df_unlab.isnull().any().any()

False

# Combine both labeled and unlabeled datasets into one file (to train our Language Model with more data later)

In [0]:
# Load the cleaned labeled data (rows with missing postText already removed)
# and preview its first five rows.
cleaned_labeled_csv = path_clickbait + "data/modified_labels_text.csv"
df_lab = pd.read_csv(cleaned_labeled_csv)
df_lab.head()

Unnamed: 0,label,postText
0,clickbait,Facts that will be truly upsetting to '90s girls
1,no-clickbait,UK’s response to modern slavery leaving victim...
2,no-clickbait,Inside North Korea's secret prisons
3,no-clickbait,Trump has flip-flopped. But his supporters are...
4,no-clickbait,Trump now agrees with the majority of American...


**We use this combined file later to fine-tune a pretrained "Language Model" and we just need the "postText" column for that. So the label here is not important (clickbait, no-clickbait or unlabeled)**

In [0]:
# Stack the labeled rows on top of the unlabeled rows; ignore_index=True gives
# the combined frame a fresh 0..n-1 index instead of keeping the two originals.
frames_to_combine = [df_lab, df_unlab]
lab_unlab = pd.concat(frames_to_combine, ignore_index=True)
lab_unlab

Unnamed: 0,label,postText
0,clickbait,Facts that will be truly upsetting to '90s girls
1,no-clickbait,UK’s response to modern slavery leaving victim...
2,no-clickbait,Inside North Korea's secret prisons
3,no-clickbait,Trump has flip-flopped. But his supporters are...
4,no-clickbait,Trump now agrees with the majority of American...
5,no-clickbait,We spoke with the guy who recreated hip-hop al...
6,no-clickbait,Almost 20% of Pres. Trump's first 100 days wer...
7,no-clickbait,Turkey has blocked access to Wikipedia over wh...
8,clickbait,The dictionary left no uncertainty as to whom ...
9,clickbait,In the latest airline episode captured on vide...


In [0]:
# Total number of rows in the combined (labeled + unlabeled) frame.
len(lab_unlab)

99497

In [0]:
# Confirm that the combined frame contains no null cells.
lab_unlab.isnull().any().any()

False

In [0]:
# Save the combined data; index=False keeps the row index out of the CSV.
lab_unlab.to_csv(path_clickbait + "data/lab_unlab_text.csv", index=False)