Here we download the Cornell Newsroom summarization dataset (https://summari.es/) and build a dataset for fake news detection. To generate fake texts we use the GPT-2 model: a real sentence is passed to the model as a prompt, and the output text is treated as fake.

In [0]:
# Install the transformers package; GPT2Tokenizer and GPT2LMHeadModel are imported from it below.
# NOTE(review): the version is not pinned, so a re-run may pick up a different release.
!pip install transformers
# !pip install -U -q PyDrive

In [2]:
import json
import re

from transformers import GPT2Tokenizer, GPT2LMHeadModel
from tqdm.notebook import tqdm as tqdm_notebook
from tqdm  import tqdm
import torch
import numpy as np
import pandas as pd

import nltk
from nltk import tokenize
nltk.download('punkt')

import warnings
warnings.filterwarnings('ignore')

# from pydrive.auth import GoogleAuth
# from pydrive.drive import GoogleDrive
# from google.colab import auth
# from oauth2client.client import GoogleCredentials
# auth.authenticate_user()
# gauth = GoogleAuth()
# gauth.credentials = GoogleCredentials.get_application_default()
# drive = GoogleDrive(gauth)

# idd = '1h_kUrWbKvZR0iRms8zKMUJO5zq9wbIXW' # (Cornell Newsroom Summarization Dataset)
# downloaded_ = drive.CreateFile({'id':idd}) 
# downloaded_.GetContentFile('train.jsonl')

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device  # bare expression so the notebook displays the selected device

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


device(type='cuda')

# Initial Dataset

In [0]:
# Mount Google Drive at /content/drive; later cells read from and write to
# paths under '/content/drive/My Drive'.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
def clean_data(text):
    """
    Strips markup and escape-sequence residue from a raw article text.

    The substitutions are applied in this order:
      1. anything shaped like a tag (an opening angle bracket up to the next
         closing one) is removed;
      2. a literal backslash followed by 'n' (two characters, not a real
         newline) becomes a space;
      3. entity-like tokens ('&', up to seven lowercase letters, ';') become
         a space;
      4. every run of two or more whitespace characters collapses to one space;
      5. a literal backslash followed by an apostrophe becomes an apostrophe;
      6. a literal backslash, 'x' and one to four decimal digits is removed.

    text: str, raw text
    return: str, cleaned text
    """
    text = re.sub(r'\<[^>]*\>', '', text)
    text = re.sub(r'\\n', ' ', text)
    text = re.sub(r'&[a-z]{0,7};', ' ', text)
    # Unbounded quantifier: a single pass collapses a run of any length.
    # Two passes of a {2,10} quantifier leave runs longer than 100 characters
    # only partially collapsed.
    text = re.sub(r'\s{2,}', ' ', text)
    text = re.sub(r"\\'", r"'", text)
    text = re.sub(r'\\x\d{1,4}', '', text)
    return text

def get_sentences(data):
    """
    Builds the GPT-2 prompt list and the matching real-text list.

    Every text is split into sentences; texts with fewer than two sentences
    are dropped.
    return: (prompts, real_texts)
            prompts    - first sentence of each kept text (passed to the gpt model)
            real_texts - first two sentences of each kept text joined by a
                         single space (passed to the classification model as real texts)
    """
    prompts, real_pairs = [], []

    for document in data:
        sentences = tokenize.sent_tokenize(document)
        if len(sentences) < 2:
            # Not enough material for both a prompt and a real continuation.
            continue
        first, second = sentences[0], sentences[1]
        prompts.append(first)
        real_pairs.append(' '.join((first, second)))

    assert len(real_pairs) == len(prompts)
    return prompts, real_pairs

In [0]:
# Stream the JSON-lines file one record at a time instead of first copying every
# raw line into a list, so only the cleaned texts are kept in memory.
data = []
with open('/content/drive/My Drive/train-stats.jsonl', 'r', encoding='utf-8') as json_file:
    for json_str in json_file:
        if not json_str.strip():
            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            continue
        result = json.loads(json_str)
        data.append(clean_data(result['text']))

# Prompts for GPT-2 (texts_gpt2) and the matching real texts (texts_r).
texts_gpt2, texts_r = get_sentences(data)

In [7]:
# Number of texts that had at least two sentences (one prompt is kept per such text).
print('Total amount of texts: {}'.format(len(texts_gpt2)))

Total amount of texts: 992803


In [8]:
# Show the first three prompts next to their real counterparts.
texts_gpt2[:3], texts_r[:3]

(['HAMBURG, Germany, June 3 \x97 As he left the soccer field after a club match in the eastern German city of Halle on March 25, the Nigerian forward Adebowale Ogungbure was spit upon, jeered with racial remarks and mocked with monkey noises.',
  'WASHINGTON, Dec. 23 - The National Security Agency has traced and analyzed large volumes of telephone and Internet communications flowing into and out of the United States as part of the eavesdropping program that President Bush approved after the Sept. 11, 2001, attacks to hunt for evidence of terrorist activity, according to current and former government officials.',
  "IF outsized executive pay has indeed become a source of outrage to American shareholders, then the contest this week between Pfizer Inc.'s investors and its board could prove the most compelling of the year."],
 ['HAMBURG, Germany, June 3 \x97 As he left the soccer field after a club match in the eastern German city of Halle on March 25, the Nigerian forward Adebowale Ogungb

In [9]:
# Show the last three prompts next to their real counterparts.
texts_gpt2[-3:], texts_r[-3:]

(['Elizabeth Taylor has White Diamonds.',
  'BALTIMORE, May 18 -- A disease believed to be equine herpes virus has swept through the barn area at Churchill Downs, site of the Kentucky Derby, leading to the death of two horses and the placement of a quarantine on three barns.',
  'Columnist Michelle Singletary was online to field questions about everything from retirement planning to protecting your credit rating.'],
 ['Elizabeth Taylor has White Diamonds. Coco Chanel had Chanel No.',
  'BALTIMORE, May 18 -- A disease believed to be equine herpes virus has swept through the barn area at Churchill Downs, site of the Kentucky Derby, leading to the death of two horses and the placement of a quarantine on three barns. The outbreak of the rare neurological virus, which can cause symptoms ranging from mild fever and upper respiratory infection to paralysis, has led to the scratching of three horses scheduled to run this weekend at Pimlico in major stakes races.',
  'Columnist Michelle Singlet

# Generation 

In [10]:
# Default number of tokens generate_sentence allows on top of the prompt length.
MAX_LENGTH = 50
# NOTE(review): '<pad>' is registered as the pad token here while the model is loaded
# unchanged; presumably its id lies outside the pretrained embedding table — confirm
# that pad_token_id is never actually fed to the model as an input token.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', pad_token='<pad>')
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.to(device);  # trailing ';' suppresses the cell output

HBox(children=(IntProgress(value=0, description='Downloading', max=1042301, style=ProgressStyle(description_wi…




HBox(children=(IntProgress(value=0, description='Downloading', max=456318, style=ProgressStyle(description_wid…




HBox(children=(IntProgress(value=0, description='Downloading', max=224, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=548118077, style=ProgressStyle(description_…




In [0]:
def generate_fake(texts, sampling_type, tokenizer=tokenizer, model=model):
    """
    Produces one generated text per input text using a single sampling strategy.

    texts: sequence of prompt strings
    sampling_type: tuple (sampling name, value), forwarded to generate_sentence
    return: (fake texts with newlines replaced by spaces,
             list repeating the sampling name once per fake text)
    """
    progress = tqdm_notebook(texts, total=len(texts))
    generated = []
    labels = []

    for prompt in progress:
        raw = generate_sentence(prompt, model, tokenizer, sampling_type)
        # Keep every generated text on a single line.
        generated.append(re.sub(r'\n', ' ', raw))
        labels.append(sampling_type[0])
        progress.set_postfix(sampling=sampling_type)

    return generated, labels

def generate_sentence(sentence, model, tokenizer, sampling_type, max_length=MAX_LENGTH):
    """
    Generates text from `sentence` using the decoding strategy named in sampling_type.

    sentence: str, prompt; it is encoded with `tokenizer` and passed to `model.generate`
    model: object exposing generate(...)
    tokenizer: object exposing encode, decode and pad_token_id
    sampling_type: tuple (name, value); name selects the strategy:
        'beam_search' - value is num_beams (sampling stays on, repetition_penalty=2.3)
        'temperature' - value is the sampling temperature
        'top_k'       - value is top_k
        'nucleus'     - value is top_p
        anything else - greedy (argmax) decoding, value is ignored
    max_length: int, added to the prompt's token count to form the max_length
                handed to model.generate
    return: str, the first returned sequence decoded with special tokens skipped
    """
    context = torch.tensor([tokenizer.encode(sentence)]).to(device)
    total_length = max_length + context.size()[-1]

    # sampling_type[1] is read only inside the branch that needs it, so the
    # greedy fallback keeps working for tuples without a value.
    strategy = sampling_type[0]
    if strategy == 'beam_search':
        decoding = dict(do_sample=True, num_beams=sampling_type[1],
                        repetition_penalty=2.3)
    elif strategy == 'temperature':
        decoding = dict(do_sample=True, temperature=sampling_type[1])
    elif strategy == 'top_k':
        # The value is the k of top-k sampling; it must not be passed as a temperature.
        decoding = dict(do_sample=True, top_k=sampling_type[1])
    elif strategy == 'nucleus':
        decoding = dict(do_sample=True, top_p=sampling_type[1])
    else:  # Argmax otherwise
        decoding = dict(do_sample=False)

    # NOTE(review): generate() may apply its own defaults (e.g. a default top_k) to
    # arguments not passed here — confirm against the installed transformers version.
    outputs = model.generate(input_ids=context, max_length=total_length,
                             pad_token_id=tokenizer.pad_token_id, **decoding)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def df_checkpoint(texts_fake, texts_real, samplings,
                  local_path='dataset.csv',
                  drive_path='/content/drive/My Drive/dpl_dataset.csv'):
    """
    creates pandas DataFrame with texts (both real and fake) and sampling type
    saves df to google drive and working directory

    texts_fake: generated texts, labelled 'fake'
    texts_real: real texts, labelled 'real' with sampling 'No sampling'
    samplings: sampling name for every fake text (same length as texts_fake)
    local_path: CSV path in the working directory; None skips this copy
    drive_path: CSV path on the mounted drive; None skips this copy
    return: pandas DataFrame with columns 'text', 'label', 'sampling'
    raises: ValueError if samplings and texts_fake differ in length
    """
    # Accept any sequence type; list concatenation below needs real lists.
    texts_fake = list(texts_fake)
    texts_real = list(texts_real)
    samplings = list(samplings)
    if len(samplings) != len(texts_fake):
        raise ValueError(
            'samplings has {} entries but texts_fake has {}'.format(
                len(samplings), len(texts_fake)))

    df = pd.DataFrame({
        'text': texts_fake + texts_real,
        'label': ['fake'] * len(texts_fake) + ['real'] * len(texts_real),
        'sampling': samplings + ['No sampling'] * len(texts_real),
    })

    for path in (local_path, drive_path):
        if path is not None:
            df.to_csv(path, index=False)
    return df

In [0]:
n = 70000  # Number of texts to generate for every sampling type
texts_fake, texts_real, samplings = [], [], []
sampling_types = [
    ('temperature', 0.9), ('temperature', 0.8),
    ('top_k', 20), ('top_k', 100),
    ('nucleus', 0.9), ('nucleus', 0.8),
    ('argmax', 23),
    ('beam_search', 3), ('beam_search', 5),
]

# Every sampling type works on its own consecutive, non-overlapping window of n texts.
for ind, sampling_type in enumerate(sampling_types):
    window = slice(ind * n, ind * n + n)
    fake, sampling = generate_fake(texts_gpt2[window], sampling_type,
                                   tokenizer=tokenizer, model=model)
    texts_fake += fake
    samplings += sampling
    texts_real += texts_r[window]

    # Re-save everything collected so far after each sampling type finishes.
    df = df_checkpoint(texts_fake, texts_real, samplings)

In [94]:
# Preview the first rows of the checkpointed dataset.
df.head()

Unnamed: 0,text,label,sampling
0,"HAMBURG, Germany, June 3  As he left the socc...",fake,beam_search
1,"WASHINGTON, Dec. 23 - The National Security Ag...",fake,beam_search
2,IF outsized executive pay has indeed become a ...,fake,beam_search
3,"BY A.J. I don't know what's wrong with me, but...",fake,beam_search
4,Spinach has terrorized generations of veggie-p...,fake,temperature


In [95]:
# Preview the last rows of the checkpointed dataset.
df.tail()

Unnamed: 0,text,label,sampling
31,By JAMES RUTENBERG and CORKY SIEMASZKO Wednesd...,real,No sampling
32,Published: 2:27PM GMT 26 Nov 2009 Information ...,real,No sampling
33,"Thursday, June 22th 2006, 7:34AM The Goldstein...",real,No sampling
34,"John Edwards' former aide said he is ""skeptica...",real,No sampling
35,"Wednesday, March 8th 1995, 3:63AM OUR GAME By ...",real,No sampling


In [98]:
# Sanity check: the fake and real lists must have the same length.
assert len(texts_fake) == len(texts_real)
# Show the first five fake texts and the first five real texts.
texts_fake[:5], texts_real[:5]

(["HAMBURG, Germany, June 3 \x97 As he left the soccer field after a club match in the eastern German city of Halle on March 25, the Nigerian forward Adebowale Ogungbure was spit upon, jeered with racial remarks and mocked with monkey noises. The incident took place at about 8:30 p.m., according to police reports reported by local media outlets (see below). At least three people were injured when an unidentified man threw smoke bombs from his car into crowds outside Stuttgart's",
  "WASHINGTON, Dec. 23 - The National Security Agency has traced and analyzed large volumes of telephone and Internet communications flowing into and out of the United States as part of the eavesdropping program that President Bush approved after the Sept. 11, 2001, attacks to hunt for evidence of terrorist activity, according to current and former government officials.    (BEGIN VIDEOTAPE) SENATOR RUSSIA: Well I think this is a very important issue right now because if you look at what we're seeing in some pl

In [0]:
# Dump the fake and real texts to plain-text files, one text per line.
# Both file names are plain constants, so this cell depends only on the two text lists.
for filename, texts in (('fake.txt', texts_fake), ('real.txt', texts_real)):
    with open(filename, 'w', encoding='utf-8') as f:
        for item in texts:
            f.write('{}\n'.format(item))