# Movie Critics 
This notebook generates positive and negative movie reviews from a fine-tuned GPT model.
Code from Hugging Face has been used as a base and adapted to this task.

In [None]:
#The code in this module uses and extends the Huggingface chatbot Code 
#Original source for the chatbot code can be found at https://github.com/huggingface/transfer-learning-conv-ai
#The full fine-tuning training run of the model needs to be executed from the bash shell once the conv-ai libraries have been installed
#The python bash command to use is:
#python ./train.py --gradient_accumulation_steps=2 --lm_coef=2.0 --max_history=1 --n_epochs=1 --num_candidates=2 --personality_permutations=1 --train_batch_size=2 --valid_batch_size=2 --dataset_path="./../notebooks/review_train.txt" --dataset_cache="./../notebooks/review_train.bin" --device="cuda"


from datetime import datetime
import json
import logging
import os
import tarfile
import tempfile
import socket
import re
import torch
import random

from transformers import cached_path

# `__file__` is overwritten with a fixed string which is then used only as the
# logger's *name*; nothing in this cell attaches a file handler, so this code
# does not by itself write a "log.txt" file.
# NOTE(review): presumably set by hand because the upstream script relied on
# __file__ being defined - confirm.
__file__ ="log.txt"
logger = logging.getLogger(__file__)

def download_pretrained_model():
    """Fetch the fine-tuned model archive and unpack it into a new temp directory.

    The archive location is taken from ``HF_FINETUNED_MODEL`` and resolved
    through ``cached_path``.
    NOTE(review): ``HF_FINETUNED_MODEL`` is not defined in the visible part of
    this file - confirm it exists before calling.

    Returns:
        Path of the temporary directory that holds the extracted files. The
        directory is not removed by this function.
    """
    archive_path = cached_path(HF_FINETUNED_MODEL)
    target_dir = tempfile.mkdtemp()
    logger.info("extracting archive file {} to temp dir {}".format(archive_path, target_dir))
    with tarfile.open(archive_path, 'r:gz') as tar:
        tar.extractall(target_dir)
    return target_dir


def get_dataset(tokenizer, dataset_path, dataset_cache):
    """Return the dataset with every string replaced by its token ids.

    Args:
        tokenizer: object providing ``tokenize`` and ``convert_tokens_to_ids``.
        dataset_path: path or URL of the JSON dataset. When falsy,
            ``PERSONACHAT_URL`` is used instead.
            NOTE(review): ``PERSONACHAT_URL`` is not defined in the visible
            part of this file - confirm it exists if you rely on the fallback.
        dataset_cache: base path of the cache file, or ``None``/``""`` to
            disable caching. The tokenizer class name is appended so that
            caches built with different tokenizers are never mixed up.

    Returns:
        The dataset with the same dict/list nesting as the JSON file, where
        each string has been converted to a list of token ids.
    """
    dataset_path = dataset_path or PERSONACHAT_URL
    # Only derive a cache file name when caching was requested; concatenating
    # onto None used to raise a TypeError before the guard below was reached.
    cache_file = f"{dataset_cache}_{type(tokenizer).__name__}" if dataset_cache else None

    if cache_file and os.path.isfile(cache_file):
        logger.info("Load tokenized dataset from cache at %s", cache_file)
        # SECURITY: torch.load unpickles the file - only load caches you created yourself.
        return torch.load(cache_file)

    logger.info("Download dataset from %s", dataset_path)
    personachat_file = cached_path(dataset_path)
    with open(personachat_file, "r", encoding="utf-8") as f:
        dataset = json.load(f)

    logger.info("Tokenize and encode the dataset")

    def tokenize(obj):
        # Recurse through the JSON structure, encoding the string leaves only.
        if isinstance(obj, str):
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
        if isinstance(obj, dict):
            return {n: tokenize(o) for n, o in obj.items()}
        return [tokenize(o) for o in obj]

    dataset = tokenize(dataset)
    if cache_file:
        torch.save(dataset, cache_file)
    return dataset


class AttrDict(dict):
    """Dictionary whose entries can also be read and written as attributes."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Use the mapping itself as the instance namespace, so ``d.key`` and
        # ``d['key']`` always refer to the same stored value.
        self.__dict__ = self


def make_logdir(model_name: str):
    """Build a run directory name such as runs/Sep22_19-45-59_gpu-7_gpt2.

    The name combines the current local time, this machine's host name and
    ``model_name``. Only the path string is produced; no directory is created.
    """
    # Naming scheme copied from the ignite repo
    timestamp = datetime.now().strftime('%b%d_%H-%M-%S')
    run_name = '_'.join((timestamp, socket.gethostname(), model_name))
    return os.path.join('runs', run_name)

#shuffle two lists in unison
def unison_shuffles(a,b):
    """Shuffle two equal-length sequences with one shared random permutation.

    Args:
        a: first sequence.
        b: second sequence, same length as ``a``.

    Returns:
        A pair ``(shuffled_a, shuffled_b)`` of tuples in which the element at
        every position of ``shuffled_a`` is still paired with its original
        partner in ``shuffled_b``. Empty inputs give ``((), ())``.
    """
    assert len(a) == len(b), "both sequences must have the same length"
    if len(a) == 0:
        # zip(*[]) yields nothing, so `x, y = unison_shuffles([], [])` would
        # fail to unpack; return an explicit empty pair instead.
        return (), ()
    pairs = list(zip(a, b))
    random.shuffle(pairs)
    shuffled_a, shuffled_b = zip(*pairs)
    return shuffled_a, shuffled_b

#parse a line of text
def parse_line(line):
    """Normalise one raw review string.

    The text is lower-cased, ``&nbsp;`` entities and HTML line breaks
    (``<br>``, ``<br/>``, ``<br />``) are turned into spaces, runs of spaces
    are collapsed to one, and surrounding whitespace is removed.

    Args:
        line: raw text.

    Returns:
        The cleaned, lower-cased text.
    """
    line = line.lower()
    line = line.replace("&nbsp;", " ")
    # `\s*/?` also covers the compact `<br/>` form, which requires no space before the slash
    line = re.sub(r'<br\s*/?>', ' ', line)
    line = re.sub(r' +', ' ', line)  # merge multiple spaces into one
    # Strip last: replacing a leading/trailing tag above can leave a space at either end.
    return line.strip()


In [None]:
import logging
import random
from argparse import ArgumentParser
from itertools import chain
from pprint import pformat
import warnings

import torch
import torch.nn.functional as F

from transformers import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer, GPT2LMHeadModel, GPT2Tokenizer
# Special tokens in a fixed order: build_input_from_segments unpacks the first
# four positionally as (bos, eos, speaker1, speaker2), so the order matters.
SPECIAL_TOKENS = ["<bos>", "<eos>", "<speaker1>", "<speaker2>", "<pad>"]
# The same tokens arranged as the mapping handed to tokenizer.add_special_tokens()
# in add_special_tokens_.
ATTR_TO_SPECIAL_TOKEN = {'bos_token': '<bos>', 'eos_token': '<eos>', 'pad_token': '<pad>',
                         'additional_special_tokens': ['<speaker1>', '<speaker2>']}

def add_special_tokens_(model, tokenizer):
    """Register the special tokens on the tokenizer and grow the model embeddings to match.

    Both arguments are modified in place; nothing is returned. The embedding
    matrix is resized only when the tokenizer reports newly added tokens.
    """
    vocab_size_before = len(tokenizer.encoder)
    # tokens that are already present are not added again
    added = tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN)
    if added <= 0:
        return
    model.resize_token_embeddings(new_num_tokens=vocab_size_before + added)

def build_input_from_segments(persona, history, reply, tokenizer, lm_labels=False, with_eos=True):
    """Assemble one model input from persona, dialogue history and the last reply.

    Args:
        persona: list of token-id lists; they are flattened behind ``<bos>``.
        history: list of token-id lists, one per earlier utterance.
        reply: token-id list of the final utterance.
        tokenizer: used only to look up the ids of the special tokens.
        lm_labels: when True, the tokens of the final segment (minus its
            leading speaker token) become the labels; everything else is -100.
            When False, every label is -100.
        with_eos: append ``<eos>`` to the reply when True.

    Returns:
        dict with the keys ``input_ids``, ``token_type_ids``, ``mc_token_ids``
        (index of the last input token) and ``lm_labels``.
    """
    bos, eos, speaker1, speaker2 = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[:-1])

    header = [bos] + list(chain.from_iterable(persona))
    turns = history + [reply + ([eos] if with_eos else [])]
    n_segments = len(turns) + 1  # the persona header counts as a segment too

    # Prefix each turn with a speaker token; the parity is taken relative to
    # the total number of segments, exactly as in the one-line original.
    segments = [header]
    for idx, turn in enumerate(turns):
        speaker = speaker2 if (n_segments - idx) % 2 else speaker1
        segments.append([speaker] + turn)

    input_ids = list(chain.from_iterable(segments))

    # One type id per token, alternating by segment position.
    token_type_ids = []
    for pos, segment in enumerate(segments):
        token_type_ids.extend([speaker2 if pos % 2 else speaker1] * len(segment))

    if lm_labels:
        n_context = sum(len(segment) for segment in segments[:-1])
        labels = ([-100] * n_context) + [-100] + segments[-1][1:]
    else:
        labels = [-100] * len(input_ids)

    return {
        "input_ids": input_ids,
        "token_type_ids": token_type_ids,
        "mc_token_ids": len(input_ids) - 1,
        "lm_labels": labels,
    }

def top_filtering(logits, top_k=0., top_p=0.9, threshold=-float('Inf'), filter_value=-float('Inf')):
    """Mask out unlikely entries of a 1-D logits vector (top-k, nucleus and threshold).

    The tensor is modified in place and also returned.

    Args:
        logits: 1-D tensor of logits over the vocabulary.
        top_k: when > 0, keep only the ``top_k`` highest logits.
        top_p: when > 0.0, keep the highest-probability tokens up to and
            including the first one whose cumulative probability exceeds
            ``top_p``.
        threshold: logits strictly below this value are masked.
        filter_value: value written into every masked position.

    Returns:
        The same ``logits`` tensor, after masking.
    """
    assert logits.dim() == 1  # batch size 1 only; batching would obscure the code
    vocab_size = logits.size(-1)
    top_k = min(top_k, vocab_size)
    if top_k > 0:
        # Everything below the k-th best logit is dropped
        kth_best = torch.topk(logits, top_k)[0][..., -1, None]
        logits[logits < kth_best] = filter_value

    if top_p > 0.0:
        ranked_logits, ranked_idx = torch.sort(logits, descending=True)
        cumulative = torch.cumsum(F.softmax(ranked_logits, dim=-1), dim=-1)
        over_budget = cumulative > top_p

        # Shift the mask one step right so that the first token that crosses
        # the threshold is kept; the best token is therefore never removed.
        drop_ranked = torch.zeros_like(over_budget)
        drop_ranked[1:] = over_budget[:-1]

        # Translate ranked positions back to vocabulary positions
        logits[ranked_idx[drop_ranked]] = filter_value

    logits[logits < threshold] = filter_value
    return logits


def sample_sequence(personality, history, tokenizer, model, args, current_output=None):
    """Generate one reply token by token and return its token ids.

    Args:
        personality: list of token-id lists describing the persona.
        history: list of token-id lists with the preceding utterances.
        tokenizer: used to look up the ids of the special tokens.
        model: language model called as ``model(input_ids, token_type_ids=...)``.
        args: mapping with the keys ``max_length``, ``min_length``, ``device``,
            ``temperature``, ``top_k``, ``top_p`` and ``no_sample``.
        current_output: optional list of already generated token ids; it is
            extended in place when given.

    Returns:
        The list of generated token ids (special tokens are never appended).
    """
    special_tokens_ids = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS)
    if current_output is None:
        current_output = []

    for i in range(args['max_length']):
        instance = build_input_from_segments(personality, history, current_output, tokenizer, with_eos=False)

        input_ids = torch.tensor(instance["input_ids"], device=args['device']).unsqueeze(0)
        token_type_ids = torch.tensor(instance["token_type_ids"], device=args['device']).unsqueeze(0)

        outputs = model(input_ids, token_type_ids=token_type_ids)
        # Models may return a bare tensor, a tuple, or a ModelOutput-style
        # container; in the latter two cases the logits are the first element.
        logits = outputs if torch.is_tensor(outputs) else outputs[0]
        logits = logits[0, -1, :] / args['temperature']
        logits = top_filtering(logits, top_k=args['top_k'], top_p=args['top_p'])
        probs = F.softmax(logits, dim=-1)

        prev = torch.topk(probs, 1)[1] if args['no_sample'] else torch.multinomial(probs, 1)

        # Before min_length is reached, resample instead of ending on a special token.
        while i < args['min_length'] and prev.item() in special_tokens_ids:
            if probs.max().item() == 1:
                warnings.warn("Warning: model generating special token with probability 1.")
                break  # avoid infinitely looping over special token
            prev = torch.multinomial(probs, num_samples=1)

        if prev.item() in special_tokens_ids:
            break
        current_output.append(prev.item())

    return current_output



In [None]:
import numpy as np
from glob import glob
import string

#We build a custom dataset from the IMDB reviews (negative and positive) stored under PATH.
PATH='aclImdb/'
#Sub-folder names; load_texts_labels_from_folders uses the position in this list as the integer label.
names = ['neg','pos']

#function to load the labels
def load_texts_labels_from_folders(path, folders):
    """Read every file below ``path/<folder>`` and label it with the folder's index.

    Args:
        path: directory that contains one sub-directory per class.
        folders: sub-directory names; the position in this list is the label.

    Returns:
        ``(texts, labels)`` where ``texts`` is a list of file contents and
        ``labels`` is an ``np.int8`` array of the same length. Files are
        grouped by folder in the order given, and sorted by name inside each
        folder.
    """
    texts, labels = [], []
    for idx, label in enumerate(folders):
        # glob returns files in arbitrary order; sorting keeps runs reproducible
        for fname in sorted(glob(os.path.join(path, label, '*.*'))):
            # context manager closes the handle; explicit encoding avoids
            # platform-dependent decoding
            with open(fname, 'r', encoding='utf-8') as f:
                texts.append(f.read())
            labels.append(idx)
    # stored as np.int8 to save space
    return texts, np.array(labels).astype(np.int8)

negative_reviews_train = []
positive_reviews_train = []
negative_reviews_dev = []
positive_reviews_dev = []

# Keep the labels returned by the loader: they decide which list a review goes
# into, instead of assuming that exactly the first 12500 files are negative.
X_train_raw, y_train_raw = load_texts_labels_from_folders(f'{PATH}train',names)
X_dev_raw, y_dev_raw = load_texts_labels_from_folders(f'{PATH}test',names)

all_sets = {'train': {'positives':positive_reviews_train, 'negatives':negative_reviews_train,
                      'raw':X_train_raw, 'labels':y_train_raw},
            'dev': {'positives':positive_reviews_dev, 'negatives':negative_reviews_dev,
                    'raw':X_dev_raw, 'labels':y_dev_raw}}

#limits applied to every sentence fragment
MAX_SENTENCE_CHARS = 170
MAX_SENTENCE_TOKENS = 60
#integer label assigned to the 'pos' folder by the loader
POSITIVE_LABEL = names.index('pos')

for key, data_items in all_sets.items():
    for raw_review, label in zip(data_items['raw'], data_items['labels']):
        review = parse_line(raw_review)
        #split into sentences on '.', re-attaching the full stop as its own token
        s =  [e.strip()+' .' for e in review.split('.') if e]
        #further split the sentences up so that they are at most 170 characters or 60 tokens
        #deal with max characters first
        s_maxlen =[]
        for si in s:
            while len(si) > MAX_SENTENCE_CHARS:
                s_maxlen.append(si[:MAX_SENTENCE_CHARS])
                si = si[MAX_SENTENCE_CHARS:]
            s_maxlen.append(si)
        #now with max tokens
        s_maxtkn =[]
        for si in s_maxlen:
            tokens = si.split(' ')
            while len(tokens) > MAX_SENTENCE_TOKENS:
                s_maxtkn.append(' '.join(tokens[:MAX_SENTENCE_TOKENS]))
                tokens = tokens[MAX_SENTENCE_TOKENS:]
            s_maxtkn.append(' '.join(tokens))

        #each review is stored as a list of sentence fragments
        if label == POSITIVE_LABEL:
            data_items['positives'].append(s_maxtkn)
        else:
            data_items['negatives'].append(s_maxtkn)

#now we shuffle
random.shuffle(positive_reviews_train)
random.shuffle(negative_reviews_train)
random.shuffle(positive_reviews_dev)
random.shuffle(negative_reviews_dev)

#X_train,y_train=unison_shuffles(X_train,y_train)
#X_dev,y_dev=unison_shuffles(X_dev,y_dev)


In [None]:
import json
#we need to create the data to finetune the model

train_entries = []

#the two movie critic personas: each has its own reviews, a fixed list of
#persona sentences and the opposite-sentiment reviews used as distractors
personalities = {
    'Positive': {
        'train_data': positive_reviews_train,
        'personality': [
            "I'm an optimist .",
            "I'm rarely disappointed by a film .",
            "I am a film critic .",
            "I love watching movies ."
        ],
        'distractions': negative_reviews_train
    },
    'Negative': {
        'train_data': negative_reviews_train,
        'personality': [
            "I'm very critical .",
            "I find bad acting disappointing .",
            "I am a film critic .",
            "I watch a lot of bad movies."
        ],
        'distractions': positive_reviews_train
    }
}

#build one entry per review in the persona / utterances layout
for traits in personalities.values():
    distractor_pool = traits['distractions']
    for review in traits['train_data']:
        utterances = []
        history = []
        for position, sentence in enumerate(review):
            #sentences at odd positions are the "replies" to be predicted
            if position % 2:
                #19 random sentences from random opposite-sentiment reviews,
                #followed by the true sentence as the last candidate
                candidates = [random.choice(random.choice(distractor_pool)) for _ in range(19)]
                candidates.append(sentence)
                utterances.append({"candidates": candidates, "history": history.copy()})
            #every sentence becomes part of the history for later utterances
            history.append(sentence)
        train_entries.append({"personality": traits['personality'], "utterances": utterances})

#mix positive and negative entries
random.shuffle(train_entries)

format_entries = {'train': train_entries[:10000], 'valid': train_entries[10000:11000]}

###Commented out - text to save the training results
#with open('review_train.txt', 'w') as outfile:
#    json.dump(format_entries, outfile)


In [None]:
#Show the first entry of every split of the training data
for split_name, entries in format_entries.items():
    for entry in entries[:1]:
        for field, items in entry.items():
            for number, item in enumerate(items, start=1):
                if type(item) is dict:
                    #an utterance: list each of its fields with numbered values
                    for sub_field, sub_items in item.items():
                        print("")
                        print(f"{split_name}->{field}->{sub_field}")
                        for sub_number, sub_item in enumerate(sub_items, start=1):
                            print(f"({sub_number}) {sub_item}")
                else:
                    #a plain value such as a personality sentence
                    print("")
                    print(f"{split_name}->{field}")
                    print(f"({number}) {item}")

In [None]:
#we want to generate reviews

#use a random seed, or -1 to leave the random number generators unseeded
RNDSEED=42
#directory of the fine-tuned checkpoint; tokenizer and model are both loaded from it
SAVED_MODEL='./../transfer-learning-conv-ai/runs/Mar08_13-02-33_ip-172-31-5-229_openai-gpt/'
#base name of the tokenized-dataset cache; get_dataset appends the tokenizer class name
CACHED_PERSONALITY_TOKENS='./review_train.bin'
#JSON file with the persona data that get_dataset tokenizes when no cache exists
PERSONALITY_DATA='./review_train.txt'

#seed Python's, torch's and the CUDA random number generators
if RNDSEED>-1:
    random.seed(RNDSEED)
    torch.random.manual_seed(RNDSEED)
    torch.cuda.manual_seed(RNDSEED)

#setup the model
tokenizer_class, model_class = (OpenAIGPTTokenizer, OpenAIGPTLMHeadModel)
tokenizer = tokenizer_class.from_pretrained(SAVED_MODEL)
model = model_class.from_pretrained(SAVED_MODEL)
#the model is moved to the GPU unconditionally, so this cell needs a CUDA device
model.to("cuda")
#registers the special tokens and resizes the embeddings if any were newly added
add_special_tokens_(model, tokenizer)

#load the critic personalities
dataset = get_dataset(tokenizer, PERSONALITY_DATA, CACHED_PERSONALITY_TOKENS)

#positive critic encoding
#NOTE(review): assumes entry 0 of the train split is a positive-critic entry and
#entry 2 a negative one. The prints show the text next to the token ids for a
#check by eye, but format_entries is rebuilt and reshuffled in an earlier cell
#and may not match the file on disk - confirm.
print(format_entries['train'][0]['personality'])
print(dataset['train'][0]['personality'])
positive_critic = dataset['train'][0]['personality']

#negative critic encoding
print(format_entries['train'][2]['personality'])
print(dataset['train'][2]['personality'])
negative_critic = dataset['train'][2]['personality']



In [None]:
# Read in a list of films to seed the model when generating random positive and negative reviews
import pandas as pd
df_movies = pd.read_csv('movies_metadata.csv')
# Missing values become '' so that a missing title can still be concatenated
# into the seed sentence in the generation cell.
df_movies=df_movies.fillna('')

In [None]:
#create empty files
#Set the flag to True and run this cell once to (re)create both output files.
#WARNING: opening with 'w' truncates the files, discarding any saved reviews.
RESET_GENERATED_FILES = False
if RESET_GENERATED_FILES:
    for _reset_path in ('generated_positive_reviews.csv', 'generated_negative_reviews.csv'):
        with open(_reset_path, 'w') as myfile:
            myfile.write('')


In [None]:
#generate text
import csv
args={
    'min_length': 50,
    'max_length': 150,
    'top_k': 0,
    'temperature': 0.7,
    'top_p' :0.95,
    'device': 'cuda',
    'no_sample':False,
    'max_history': 5
}

#generation limits
TARGET_REVIEWS = 15000      #reviews to produce per critic
MAX_TURNS = 10              #maximum number of sampled continuations per review
MAX_REVIEW_TOKENS = 400     #stop extending a review once this many tokens exist

#reload the generated output so that an interrupted run resumes where it stopped.
#The files hold one raw review per line (written below without CSV quoting),
#so they are read line by line: parsing them as CSV would split reviews at
#commas and could merge lines at quote characters, distorting the count.
with open('generated_positive_reviews.csv') as review_file:
    positive_reviews = [line.rstrip('\n') for line in review_file]

with open('generated_negative_reviews.csv') as review_file:
    negative_reviews = [line.rstrip('\n') for line in review_file]

personalities =[{'critic': positive_critic, 'save' :positive_reviews, 'file': 'generated_positive_reviews.csv'}
                ,{'critic':negative_critic, 'save': negative_reviews, 'file' : 'generated_negative_reviews.csv'}]

###Uncomment the next row to skip the generation (it is slow)
#personalities =[]

for personality in personalities:
    while len(personality['save']) < TARGET_REVIEWS:
        #one review per movie: the number of reviews already saved selects the next title
        row_id = len(personality['save'])
        seed_text = 'the movie was called ' + df_movies['title'][row_id] +'.'
        review = seed_text
        history = [tokenizer.encode(seed_text)]
        tokcnt = len(history[-1])
        for i in range(MAX_TURNS):
            #the token count only grows, so stop as soon as the budget is used up
            if tokcnt >= MAX_REVIEW_TOKENS:
                break
            with torch.no_grad():
                out_ids = sample_sequence(personality['critic'], history, tokenizer, model, args)
            history.append(out_ids)
            tokcnt += len(out_ids)
            #keep only the most recent utterances as context
            history = history[-(2*args['max_history']+1):]
            out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
            review += ' ' + out_text
        #a line break inside a review would corrupt the one-review-per-line file
        review = review.replace('\r', ' ').replace('\n', ' ')
        if row_id % 100 == 0:
            print(row_id)
            print("\n" +review+"\n")
        personality['save'].append(review)
        #append immediately so that finished reviews survive an interruption
        with open(personality['file'], 'a') as myfile:
            myfile.write(review+'\n')

            

In [None]:
#show the second stored item (index 1) of the first critic's 'save' list
print(personalities[0]['save'][1])

In [None]:
#reload the generated output
#The generation cell writes one raw review per line (no CSV quoting), so each
#non-empty line is one review. Taking row 0 of a CSV parse returned only the
#comma-separated fragments of the first review.
with open('generated_positive_reviews.csv') as review_file:
    positive_reviews = [line.rstrip('\n') for line in review_file if line.strip()]

with open('generated_negative_reviews.csv') as review_file:
    negative_reviews = [line.rstrip('\n') for line in review_file if line.strip()]

In [None]:
## show one randomly chosen review per sentiment
for heading, pool in (
    ('A positive example, \n', positive_reviews),
    ('\nA negative example, \n', negative_reviews),
):
    print(heading)
    print(random.choice(pool))