In [0]:
# !pip install transformers
# !pip install -U -q PyDrive

In [2]:
import json
import re

from transformers import GPT2Tokenizer, GPT2LMHeadModel
from tqdm import tqdm_notebook
import torch
import numpy as np

import nltk
from nltk import tokenize
nltk.download('punkt')

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate the Colab user and hand the application-default credentials to PyDrive.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Fetch the Drive file with this id and store its content locally as 'train.jsonl'.
idd = '1h_kUrWbKvZR0iRms8zKMUJO5zq9wbIXW' # (Cornell Newsroom Summarization Dataset)
downloaded_ = drive.CreateFile({'id':idd}) 
downloaded_.GetContentFile('train.jsonl')

# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [0]:
def clean_data(text):
    r"""Strip markup and escape-sequence residue from a raw article string.

    The substitutions are applied in this order:

    1. remove anything shaped like a tag (``<...>``);
    2. replace a literal backslash followed by ``n`` with a space;
    3. replace short lowercase entities such as ``&nbsp;`` with a space;
    4. collapse every run of two or more whitespace characters into one space;
    5. turn a backslash-escaped apostrophe into a plain ``'``;
    6. drop a literal ``\x`` followed by one to four decimal digits.

    Args:
        text: the raw text to clean.

    Returns:
        The cleaned string.
    """
    text = re.sub(r'\<[^>]*\>', '', text)
    text = re.sub(r'\\n', ' ', text)
    text = re.sub(r'&[a-z]{0,7};', ' ', text)
    # One unbounded pass: the previous bounded pattern (\s{2,10}, run twice)
    # still left double spaces behind for very long whitespace runs.
    text = re.sub(r'\s{2,}', ' ', text)
    text = re.sub(r"\\'", r"'", text)
    text = re.sub(r'\\x\d{1,4}', '', text)
    return text

def sent_first(text): # to gpt2
    """Return the first sentence of ``text`` (used as the GPT-2 prompt).

    Returns an empty string when ``text`` is empty or the sentence tokenizer
    yields no sentences, instead of raising ``IndexError``. This matches
    ``sent_splitter``, which also produces ``''`` for such input.
    """
    sentences = tokenize.sent_tokenize(text) if text else []
    return sentences[0] if sentences else ''

def sent_splitter(text): # to classification model
    """Return the first two sentences of ``text`` joined by a single space.

    If the tokenizer finds fewer than two sentences, whatever it found is
    joined (so the result may be one sentence or an empty string).
    """
    leading_sentences = tokenize.sent_tokenize(text)[:2]
    return ' '.join(leading_sentences)

In [0]:
# Load the JSON-lines file and build the text lists.
# The file is streamed line by line (rather than materialising every raw line
# first) and decoded explicitly as UTF-8, so the result does not depend on the
# platform's default encoding.
data = []
with open('train.jsonl', 'r', encoding='utf-8') as json_file:
    for json_str in json_file:
        if not json_str.strip():
            continue  # tolerate blank lines, e.g. a trailing newline at EOF
        result = json.loads(json_str)
        data.append(clean_data(result['text']))

# First sentence of every cleaned text: the prompt passed to GPT-2.
text_gpt2 = [sent_first(text) for text in data]
# First two sentences of every cleaned text: the "real" counterpart.
text_real = [sent_splitter(text) for text in data]

In [5]:
# Preview the first three GPT-2 prompts next to the first three real samples.
text_gpt2[:3], text_real[:3]

(['HAMBURG, Germany, June 3 \x97 As he left the soccer field after a club match in the eastern German city of Halle on March 25, the Nigerian forward Adebowale Ogungbure was spit upon, jeered with racial remarks and mocked with monkey noises.',
  'WASHINGTON, Dec. 23 - The National Security Agency has traced and analyzed large volumes of telephone and Internet communications flowing into and out of the United States as part of the eavesdropping program that President Bush approved after the Sept. 11, 2001, attacks to hunt for evidence of terrorist activity, according to current and former government officials.',
  "IF outsized executive pay has indeed become a source of outrage to American shareholders, then the contest this week between Pfizer Inc.'s investors and its board could prove the most compelling of the year."],
 ['HAMBURG, Germany, June 3 \x97 As he left the soccer field after a club match in the eastern German city of Halle on March 25, the Nigerian forward Adebowale Ogungb

In [6]:
# Default generation budget; generate_sentence adds the prompt's token count to it.
MAX_LENGTH = 50
# Load the pretrained "gpt2" tokenizer and language-model head.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained('gpt2')
# Move the model to the selected device; the trailing ';' suppresses the cell's output.
model.to(device);

HBox(children=(IntProgress(value=0, description='Downloading', max=1042301, style=ProgressStyle(description_wi…




HBox(children=(IntProgress(value=0, description='Downloading', max=456318, style=ProgressStyle(description_wid…




HBox(children=(IntProgress(value=0, description='Downloading', max=224, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=548118077, style=ProgressStyle(description_…




In [0]:
def generate_fake(texts, sampling_type, fake=None, tokenizer=tokenizer, model=model):
    """
    Generate one fake text per prompt with the given sampling strategy.

    texts: sized sequence of prompt strings (len() is used for the progress bar).
    sampling_type: tuple (strategy name, value), forwarded to generate_sentence.
    fake: optional list to append the results to; a fresh list is created when
        omitted (a mutable default would be shared between calls).
    tokenizer, model: forwarded to generate_sentence.

    Returns the list of generated texts, with newlines replaced by spaces.
    """
    if fake is None:
        fake = []

    data_iter = tqdm_notebook(texts, total=len(texts))
    # The postfix is the same for every item, so it is set once up front.
    data_iter.set_postfix(sampling=sampling_type)
    for el in data_iter:
        sent = generate_sentence(el, model, tokenizer, sampling_type)
        # Keep every generated text on a single line.
        fake.append(sent.replace('\n', ' '))

    return fake

def generate_sentence(sentence, model, tokenizer, sampling_type, max_length=MAX_LENGTH):
    """
    Generate a continuation of `sentence` using the requested sampling strategy.

    sentence: prompt text; it is encoded with `tokenizer` and moved to `device`.
    sampling_type: tuple (name, value). Recognised names and the generate()
        arguments they set:
            'beam_search' -> do_sample=True, num_beams=value
            'temperature' -> do_sample=True, temperature=value
            'top_k'       -> do_sample=True, top_k=value
            'nucleus'     -> do_sample=True, top_p=value
        Any other name falls back to do_sample=False (argmax); the value is
        ignored in that case.
    max_length: token budget added on top of the prompt's token count.

    Returns the first generated sequence decoded to text, special tokens skipped.
    """
    context = torch.tensor([tokenizer.encode(sentence)]).to(device)
    total_length = max_length + context.size()[-1]

    strategy = sampling_type[0]
    if strategy == 'beam_search':
        # NOTE(review): do_sample=True is kept from the original code; combined
        # with num_beams this presumably samples within the beams rather than
        # running deterministic beam search — confirm this is intended.
        decoding = {'do_sample': True, 'num_beams': sampling_type[1]}
    elif strategy == 'temperature':
        decoding = {'do_sample': True, 'temperature': sampling_type[1]}
    elif strategy == 'top_k':
        # Fixed: the value used to be passed as `temperature`, so top-k
        # sampling was never actually applied.
        decoding = {'do_sample': True, 'top_k': sampling_type[1]}
    elif strategy == 'nucleus':
        decoding = {'do_sample': True, 'top_p': sampling_type[1]}
    else:  # otherwise argmax
        decoding = {'do_sample': False}

    outputs = model.generate(input_ids=context, max_length=total_length, **decoding)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [0]:
# Number of texts generated for each sampling type.
n = 100000
sampling_types = [('beam_search', 3), ('beam_search',5), ('temperature', 0.9), ('temperature', 0.8), ('top_k', 20), ('top_k', 100), ('nucleus', 0.9), ('nucleus', 0.8), ('argmax', 23)]
texts_fake = []
texts_real = []

# Every sampling type gets its own consecutive slice of n prompts; the matching
# slice of real texts is collected alongside so both lists stay aligned.
for ind, sampling_type in enumerate(sampling_types):
    start, stop = ind * n, (ind + 1) * n
    texts_fake += generate_fake(text_gpt2[start:stop], sampling_type, fake=[],
                                tokenizer=tokenizer, model=model)
    texts_real += text_real[start:stop]

In [17]:
# Sanity check: there must be exactly one real text for every generated text.
assert len(texts_fake) == len(texts_real)
# Preview the first five generated texts next to the first five real ones.
texts_fake[:5], texts_real[:5]

(["HAMBURG, Germany, June 3 \x97 As he left the soccer field after a club match in the eastern German city of Halle on March 25, the Nigerian forward Adebowale Ogungbure was spit upon, jeered with racial remarks and mocked with monkey noises.  An international player on FIFA's FIFA U20 Under 17 Championship, Ogungbure represented Nigeria in the Under 17 World Cup qualifying period.  In a statement provided to Eurogamer, Ogungbure addressed the incident and said that",
  'WASHINGTON, Dec. 23 - The National Security Agency has traced and analyzed large volumes of telephone and Internet communications flowing into and out of the United States as part of the eavesdropping program that President Bush approved after the Sept. 11, 2001, attacks to hunt for evidence of terrorist activity, according to current and former government officials.  The program, which officials called "the largest program of its kind of the world," is aimed at gathering information on the activities of other governme

In [0]:
# Write the generated and the real texts to UTF-8 files, one text per line.
# The file names are plain literals: the previous `.format(sampling_type[0])`
# call on 'real.txt' had no placeholder and only tied this cell to a loop
# variable left over from the generation cell.
for path, items in (('fake.txt', texts_fake), ('real.txt', texts_real)):
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines('{}\n'.format(item) for item in items)