<a href="https://colab.research.google.com/github/ovbystrova/dpl/blob/master/notebooks/Fake_texts_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install transformers

In [10]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from tqdm import tqdm
import re
import torch
import numpy as np
import json

import nltk
from nltk import tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
from google.colab import drive
drive.mount('/content/drive')

In [0]:
def clean_data(text):
    text = re.sub(r'\<[^>]*\>', '', text)
    text = re.sub(r'\\n', ' ', text)
    text = re.sub(r'&[a-z]{0,7};', ' ', text)
    text = re.sub(r'\s{2,10}', ' ', text)
    text = re.sub(r'\s{2,10}', ' ', text)
    text = re.sub(r"\\'", r"'", text)
    text = re.sub(r'\\x\d{1,4}', '', text)
    return text

def get_sentences(data):
    """
    splits texts into sentences
    return: list of sentences to pass to gpt model 
            list of sentences to pass to classification model as real texts
    """
    texts_gpt2 = []
    texts_real = []

    for text in data:
        tokenized = tokenize.sent_tokenize(text)
        if len(tokenized) >= 2 and len(tokenized[0].split(' ')) < 50:
            texts_gpt2.append(tokenized[0])
            texts_real.append(' '.join(tokenized[:2]))
    assert len(texts_real) == len(texts_gpt2)
    return texts_gpt2, texts_real

In [0]:
with open('/content/drive/My Drive/train-stats.jsonl', 'r') as json_file:
        json_list = list(json_file)[100:]
        
data = []
for json_str in json_list:
        result = json.loads(json_str)
        data.append(clean_data(result['text']))
del json_list

texts_gpt2, texts_r = get_sentences(data)

In [0]:
MAX_LENGTH = 50
tokenizer = GPT2Tokenizer.from_pretrained("gpt2", pad_token='<pad>')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Generation stuff

In [0]:
def generate_fake(texts, sampling_type, tokenizer=tokenizer, model=model):
    """
    Generates texts depending on sampling_type. 
    sampling type: tuple (sampling_type : value)
    return: list of fake texts
    """
    fake = []
    samplings = []

    for el in tqdm(texts):    
        sent = generate_sentence(el, model, tokenizer, sampling_type)            
        sent = re.sub(r'\n', ' ', sent)
        fake.append(sent)
        samplings.append(sampling_type[0])        
    return fake, samplings


def generate_sentence(sentence, model, tokenizer, sampling_type, max_length=MAX_LENGTH):    
    """
    Generates sentence depending on sampling_type
    return: str
    """
    eos = tokenizer.encode('.?!...! ?')
    context = torch.tensor([tokenizer.encode(sentence)][:500])
    max_length += context.size()[-1]
    
    if sampling_type[0] == 'beam_search':
        outputs = model.generate(input_ids=context, max_length=max_length,
                                 do_sample=True, num_beams=sampling_type[1],
                                 pad_token_id=tokenizer.pad_token_id,
                                 repetition_penalty=2.3)
    elif sampling_type[0] == 'temperature':
        outputs = model.generate(input_ids=context, max_length=max_length,
                                 do_sample=True, temperature=sampling_type[1],
                                 pad_token_id=tokenizer.pad_token_id)
    elif sampling_type[0] == 'top_k':
        outputs = model.generate(input_ids=context, max_length=max_length,
                                 do_sample=True, temperature=sampling_type[1],
                                 pad_token_id=tokenizer.pad_token_id)
    elif sampling_type[0] == 'nucleus':
        outputs = model.generate(input_ids=context, max_length=max_length,
                                 do_sample=True, top_p=sampling_type[1],
                                 pad_token_id=tokenizer.pad_token_id)
    else:  # Argmax otherwise
        outputs = model.generate(input_ids=context, max_length=max_length, do_sample=False,
                                 pad_token_id=tokenizer.pad_token_id)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [27]:
argmax_fake = generate_fake(texts_gpt2[:5], ('argmax', 23), tokenizer=tokenizer, model=model)

100%|██████████| 5/5 [00:11<00:00,  2.35s/it]


In [32]:
temperature08_fake = generate_fake(texts_gpt2[:5], ('temperature', 0.8), tokenizer=tokenizer, model=model)

100%|██████████| 5/5 [00:10<00:00,  2.03s/it]


In [33]:
temperature09_fake = generate_fake(texts_gpt2[:5], ('temperature', 0.9), tokenizer=tokenizer, model=model)

100%|██████████| 5/5 [00:10<00:00,  2.09s/it]


In [34]:
temperature2_fake = generate_fake(texts_gpt2[:5], ('temperature', 2), tokenizer=tokenizer, model=model)

100%|██████████| 5/5 [00:11<00:00,  2.35s/it]


In [35]:
top_k20_fake = generate_fake(texts_gpt2[:5], ('top_k', 20), tokenizer=tokenizer, model=model)

100%|██████████| 5/5 [00:09<00:00,  1.99s/it]


In [36]:
top_k100_fake = generate_fake(texts_gpt2[:5], ('top_k', 100), tokenizer=tokenizer, model=model)

100%|██████████| 5/5 [00:12<00:00,  2.45s/it]


In [44]:
nucleus09_fake = generate_fake(texts_gpt2[:5], ('nucleus', 0.9), tokenizer=tokenizer, model=model)

100%|██████████| 5/5 [00:10<00:00,  2.03s/it]


In [37]:
nucleus08_fake = generate_fake(texts_gpt2[:5], ('nucleus', 0.8), tokenizer=tokenizer, model=model)

100%|██████████| 5/5 [00:12<00:00,  2.55s/it]


In [0]:
argmax_tuple =  ('Argmax', argmax_fake)
temperature08_tuple = ('Temperature', temperature08_fake)
temperature09_tuple = ('Temperature', temperature09_fake)
temperature2_tuple = ('Temperature', temperature2_fake)
top_k20_tuple = ('Top K', top_k20_fake)
top_k100_tuple = ('Top K', top_k100_fake)
nucleus08_tuple = ('Nucleus', nucleus08_fake)
nucleus09_tuple = ('Nucleus', nucleus09_fake)

In [39]:
texts_r[:5]

['Candlestick Park has been my part-time office for more than two decades. But, from a sentimental standpoint, it has always felt more like a room in my childhood home, full of faded memories, familiar smells and an odd kind of (cold, windy and damp) comfort.',
 "Damien Hirst's The Physical Impossibility of Death in the Mind of Someone Living (1991) at the Tate Modern. Photograph: Kerim Okten/EPA A major retrospective of Damien Hirst's work helped Tate Modern attract a record-breaking 5.3 million people last year.",
 'Governor Deval Patrick will commit more than $50 million on Tuesday to help Massachusetts communities and utilities prepare for and protect themselves from the increasing number of destructive storms and rising sea levels blamed on climate change. Most of the money, about $40 million, will be distributed as grants to help cities and towns install backup power systems using clean technologies, such as advanced batteries that store energy from solar panels.',
 'Friend and B

In [38]:
argmax_fake

(["Candlestick Park has been my part-time office for more than two decades. I've been a member of the Board of Directors since the early 1990s, and I've been a member of the Board since the early 2000s. I've been a member of the Board since the early 2000s, and I've been a",
  "Damien Hirst's The Physical Impossibility of Death in the Mind of Someone Living (1991) at the Tate Modern.  The book is a collection of essays by Hirst, who has written about the subject of death in the mind of someone living. The essays are divided into three parts:  The first part is about the subject of death. The second",
  'Governor Deval Patrick will commit more than $50 million on Tuesday to help Massachusetts communities and utilities prepare for and protect themselves from the increasing number of destructive storms and rising sea levels blamed on climate change.  The governor will also announce a $100 million grant to help communities in the state prepare for and protect themselves from the rising sea

In [40]:
top_k20_fake

(['Candlestick Park has been my part-time office for more than two decades. On November 27-22 my youngest sister called in calls throughout my adult, life journey seeking more help than I and two brothers that met this need would normally handle; no question for those not connected in families whose support to each son needed be tested today',
  "Damien Hirst's The Physical Impossibility of Death in the Mind of Someone Living (1991) at the Tate Modern.",
  'Governor Deval Patrick will commit more than $50 million on Tuesday to help Massachusetts communities and utilities prepare for and protect themselves from the increasing number of destructive storms and rising sea levels blamed on climate change. By doing something together across nearly all parts the world "that has real life promise on that occasion. Let no energy question ever again occur on that record... that won to America." GovTee-President James Wilson declared March 28 and March 23 the',
  'Friend and Barber continued to w

In [41]:
top_k100_fake

(["Candlestick Park has been my part-time office for more than two decades. Even beyond The White Collar – at that venue only 3 inches beyond New Delhi city line's main concanning station of Chambachur Chai Pulkuk-based IAF on a day dedicated a quarter to touring. There I could",
  "Damien Hirst's The Physical Impossibility of Death in the Mind of Someone Living (1991) at the Tate Modern. Now here go go take out aspera flourballs or use any nonfiction genre of storytelling when researching horror books; these writers often fall off a hill because it hurts someone (read this for further reasons!). Heh it is. You might try",
  'Governor Deval Patrick will commit more than $50 million on Tuesday to help Massachusetts communities and utilities prepare for and protect themselves from the increasing number of destructive storms and rising sea levels blamed on climate change. By his authority, Governor Perry could impose measures related specifically for people within 20 meters, which also in

In [42]:
nucleus08_fake

(["Candlestick Park has been my part-time office for more than two decades. My wife and I have been running our business there ever since our wedding. We've never really let the weather affect our business, but we love being able to stay here and run our business while our customers enjoy a good deal on their tickets. ",
  'Damien Hirst\'s The Physical Impossibility of Death in the Mind of Someone Living (1991) at the Tate Modern. It is no wonder that the title of Hirst\'s book is the title of the next film in the series, The Killing of a Sacred Deer.  I was invited to speak by Hirst for his "The Art of Writing" lecture and',
  'Governor Deval Patrick will commit more than $50 million on Tuesday to help Massachusetts communities and utilities prepare for and protect themselves from the increasing number of destructive storms and rising sea levels blamed on climate change.  The proposal would cover the state\'s electricity supply and utility costs in Massachusetts, as well as its water 

In [45]:
nucleus09_fake

(['Candlestick Park has been my part-time office for more than two decades. I have worked on the development of the park for the last six years and we are continuing to improve it. So I am grateful for all of your support and interest in getting us going.  My main hope is to put on a good show',
  'Damien Hirst\'s The Physical Impossibility of Death in the Mind of Someone Living (1991) at the Tate Modern. "So I\'m on a roll when I see the head on the desk. Then I\'m at the table with my friend. But it wasn\'t that hard to see what I looked like and my hands."',
  'Governor Deval Patrick will commit more than $50 million on Tuesday to help Massachusetts communities and utilities prepare for and protect themselves from the increasing number of destructive storms and rising sea levels blamed on climate change.  Patrick will pledge $25 million to help municipalities and homeowners develop an emergency plan to protect their buildings and properties from flooding and storm surges, according 

In [46]:
temperature08_fake

(['Candlestick Park has been my part-time office for more than two decades. Over the past decade, it grew into a complex of offices, offices that have provided me with a living, a living history, a living memory of the past, and an environment that has helped me feel at home.  In my home office',
  'Damien Hirst\'s The Physical Impossibility of Death in the Mind of Someone Living (1991) at the Tate Modern. He says the film was a "very important piece of work" to her. "I was a huge critic. I would go out of my way to talk about it. It\'s so much work to get to that point of no return." As',
  'Governor Deval Patrick will commit more than $50 million on Tuesday to help Massachusetts communities and utilities prepare for and protect themselves from the increasing number of destructive storms and rising sea levels blamed on climate change.  The $50 million grant was provided to the Massachusetts Commission on Environmental Quality, which has the power to approve projects. The agency also h

In [47]:
temperature09_fake

(['Candlestick Park has been my part-time office for more than two decades. A few months ago, I took the plunge and opened a new house on the corner of 14th Street and S. 26th Street. The place was completely vacant. I moved it to the top right of the shop and had to put up with',
  'Damien Hirst\'s The Physical Impossibility of Death in the Mind of Someone Living (1991) at the Tate Modern. The following week we read the review of Jules Verne\'s "The Other Side of the Sea" and it was clear his intent in reading them was to show us that the mind is made up of many different components that have to be kept perfectly',
  'Governor Deval Patrick will commit more than $50 million on Tuesday to help Massachusetts communities and utilities prepare for and protect themselves from the increasing number of destructive storms and rising sea levels blamed on climate change.  The bill, which takes effect on Sept. 3, includes $75 million over 10 years to upgrade storm surge defenses, replace power l

In [48]:
temperature2_fake

(['Candlestick Park has been my part-time office for more than two decades. The staff here are committed to a wide variety of jobs as well. We will provide a space conducive to professional leadership, but as one of the busiest times for all employees in Downtown, working at least twice week would greatly increase customer satisfaction from just 2',
  "Damien Hirst's The Physical Impossibility of Death in the Mind of Someone Living (1991) at the Tate Modern.  This quote seems obvious given Mr O'Regan will not be appearing to tell you much. However, as long as someone keeps to saying that his argument won't stop after death, as well as if he makes that more salient. The",
  'Governor Deval Patrick will commit more than $50 million on Tuesday to help Massachusetts communities and utilities prepare for and protect themselves from the increasing number of destructive storms and rising sea levels blamed on climate change.   Patrick will announce a State-of-the-Union Address Monday evening f

# Comparison

In [0]:
# TODO Переделать под новый формат если нужно

In [0]:
def compare(real, *args):
    for ind, el in enumerate(real):
        print('REAL TEXT: {}'.format(el))
        
        for type_fake, fake_list in args:
            for fake_text in fake_list:
                if fake_text.startswith(el[:20]):
                    print('GENERATED TEXT ({}): {}'.format(type_fake, fake_text))
        print('=======================================')

In [0]:
compare(texts[:10], argmax_tuple, temperature_tuple, top_k_tuple, beam_search_tuple, nucleus_tuple)

REAL TEXT: HAMBURG, Germany, June 3  As he left the soccer field after a club match in the eastern German city of Halle on March 25, the Nigerian forward Adebowale Ogungbure was spit upon, jeered with racial remarks and mocked with monkey noises.

GENERATED TEXT (Argmax): HAMBURG, Germany, June 3  As he left the soccer field after a club match in the eastern German city of Halle on March 25, the Nigerian forward Adebowale Ogungbure was spit upon, jeered with racial remarks and mocked with monkey noises.Ogungbure, who was born in Nigeria, was arrested in Germany on March 25 and charged with "racial discrimination" after!
GENERATED TEXT (Temperature): HAMBURG, Germany, June 3  As he left the soccer field after a club match in the eastern German city of Halle on March 25, the Nigerian forward Adebowale Ogungbure was spit upon, jeered with racial remarks and mocked with monkey noises.The incident sparked a national debate, with many Nigerian supporters condemning the act and others call