In [4]:
!pip install -q gpt_2_simple
import gpt_2_simple as gpt2
from datetime import datetime
from google.colab import files
import json
import numpy as np
import random
from os import listdir
from os.path import isfile, join, splitext
import tensorflow as tf
import nltk
import nltk.sentiment.vader as vader
import shutil



In [0]:
# Download the VADER lexicon resource; the SentimentIntensityAnalyzer created
# later in this notebook comes from nltk.sentiment.vader.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [0]:
# Fetch the pretrained GPT-2 model files (checkpoint, encoder, hparams,
# model.ckpt.*, vocab) listed in the cell output.
gpt2.download_gpt2()

Fetching checkpoint: 1.00kit [00:00, 204kit/s]                                                      
Fetching encoder.json: 1.04Mit [00:00, 45.2Mit/s]                                                   
Fetching hparams.json: 1.00kit [00:00, 293kit/s]                                                    
Fetching model.ckpt.data-00000-of-00001: 498Mit [00:07, 63.2Mit/s]                                  
Fetching model.ckpt.index: 6.00kit [00:00, 1.07Mit/s]                                               
Fetching model.ckpt.meta: 472kit [00:00, 30.4Mit/s]                                                 
Fetching vocab.bpe: 457kit [00:00, 36.8Mit/s]                                                       


In [9]:
# Mount Google Drive in the Colab runtime, then copy a checkpoint from it.
# NOTE(review): test_single fine-tunes with restore_from='fresh', so it is
# unclear whether the copied checkpoint is ever used — confirm.
gpt2.mount_gdrive()
gpt2.copy_checkpoint_from_gdrive()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [14]:
# Clone the repo that holds the test_cases/ and training_cases/ directories
# passed to run_tests at the end of this notebook.
!git clone https://github.com/danster101/moviescriptmining.git

Cloning into 'moviescriptmining'...
remote: Enumerating objects: 32, done.[K
remote: Counting objects: 100% (32/32), done.[K
remote: Compressing objects: 100% (22/22), done.[K
remote: Total 14054 (delta 12), reused 24 (delta 8), pack-reused 14022[K
Receiving objects: 100% (14054/14054), 152.19 MiB | 22.47 MiB/s, done.
Resolving deltas: 100% (4953/4953), done.
Checking out files: 100% (5463/5463), done.


In [0]:
# Shared VADER analyzer. sample_sentiment reads this as a module-level global,
# and run_tests passes it on to test_single.
sid = vader.SentimentIntensityAnalyzer()

In [0]:
def sample_random(sess, prefix, temp=0.7, top_k=0):
  """Generate one continuation for `prefix` and return it as a string.

  Issues a single gpt2.generate call with return_as_list=True and hands back
  the first element of the returned list.

  Args:
    sess: session object forwarded unchanged to gpt2.generate.
    prefix: prompt text; its character count is also passed as `length`.
    temp: value forwarded as gpt2.generate's `temperature`.
    top_k: value forwarded as gpt2.generate's `top_k`.

  Returns:
    The first sample produced by gpt2.generate.
  """
  # NOTE(review): `length` here is len(prefix) in characters; gpt-2-simple
  # appears to interpret `length` in tokens — confirm this is the intent.
  # NOTE(review): the samples printed by this notebook still begin with the
  # prompt text, so include_prefix=False may have no effect without
  # `truncate` — confirm against the gpt-2-simple documentation.
  generated = gpt2.generate(
      sess,
      prefix=prefix,
      length=len(prefix),
      temperature=temp,
      top_k=top_k,
      include_prefix=False,
      return_as_list=True,
  )
  first_sample = generated[0]
  return first_sample

def sample_sentiment(sess, prefix, sentiment_vec, max_iter=5, thresh=0):
  """
  Sentiment based rejection sampling.

  Draws up to `max_iter` samples with sample_random, scores each one with the
  module-level VADER analyzer `sid`, and keeps the sample whose
  [compound, neg, neu, pos] vector is closest (Euclidean distance) to
  `sentiment_vec`.

  Args:
    sess: session forwarded to sample_random.
    prefix: prompt text forwarded to sample_random.
    sentiment_vec: target numpy vector ordered [compound, neg, neu, pos].
    max_iter: maximum number of samples to draw.
    thresh: early cutoff; a sample whose distance is strictly below this value
      is returned immediately. The default of 0 disables the early exit.

  Returns:
    The closest non-empty sample, or None when no non-empty sample was drawn
    (including max_iter == 0).
  """
  # Start from infinity rather than 2: the compound score alone can differ by
  # up to 2, so with the other three components the distance can exceed 2.
  # A fixed starting bound of 2 would discard such samples and return None.
  best_score = float('inf')
  best_result = None
  for i in range(max_iter):
    print(f'Sentiment sample {i}')
    # Vary the temperature per draw: random.random() / 3 + 0.66 lies in
    # [0.66, ~0.993).
    sample = sample_random(sess, prefix, temp=(random.random() / 3 + 0.66))
    if not sample:
      # Nothing to score for an empty generation.
      continue
    ss = sid.polarity_scores(sample)
    curr_sent = np.array([ss["compound"], ss["neg"], ss["neu"], ss["pos"]])
    dist = np.linalg.norm(sentiment_vec - curr_sent)
    if dist < thresh:
      return sample
    if dist < best_score:
      best_result = sample
      best_score = dist
  return best_result

In [0]:
def test_single(sid, train_file, test_instance, save_dest):
  """
  Takes the single testing JSON file as a dict.
  Runs sampling on all 3, saves results to dest dir, each training example to
  its own file
  """
  tf.reset_default_graph()
  sess = gpt2.start_tf_sess()
#   shutil.copyfile('drive/My Drive/checkpoint/run1/checkpoint', 'checkpoint/run1/checkpoint')
  
  gpt2.finetune(sess,
                dataset=train_file,
                steps=2,
                print_every=5,
                sample_every=1000,
                sample_length=1,
                restore_from='fresh',
                save_every=30,
                ) # don't want to save
  print('finetune done')
  results = []
  for i in range(1,4):
    inp = test_instance["test{}_input".format(i)]
    tru = test_instance["test{}_true".format(i)]
    id_ = test_instance["test{}_id".format(i)]
    # Shorten input
    inp = inp[:inp.find('\n', 200)] + '\n'
    # Generate random sample
    print('Random sample')
    samp_rand = sample_random(sess, inp)
    print(samp_rand)
    # Generate sentiment sample
    ss = sid.polarity_scores(tru)
    sentiment_vec = np.array([ss["compound"], ss["neg"], ss["neu"], ss["pos"]])
    samp_sent = sample_sentiment(sess, inp, sentiment_vec)
    sample = {"id": id_,
              "prefix": inp,
              "truth": tru,
              "random": samp_rand,
              "sentiment": samp_sent
             }
    results.append(sample)
  with open(save_dest, "w") as f:
    json.dump(results, f)
  print('finished test single')

def run_tests(test_dir, train_dir,
              results_dir="/content/drive/My Drive/text mining results",
              stride=4):
  """
  Run test_single for a subset of the training files in `train_dir`.

  Training files are listed in sorted order and every `stride`-th one is
  used. A file is processed only if a JSON file with the same base name
  exists in `test_dir`.

  Args:
    test_dir: directory containing `<name>.json` test instances.
    train_dir: directory containing the training files.
    results_dir: directory where `<training filename>_sample.json` is written.
      It is not created here, so it must already exist.
    stride: step used when slicing the sorted file list (default: every 4th).
  """
  for f in sorted(listdir(train_dir))[0::stride]:
    fname = splitext(f)[0]
    train = join(train_dir, f)
    test = join(test_dir, fname + ".json")
    if not (isfile(train) and isfile(test)):
      continue
    # Read the test instance and close the file before the long-running
    # fine-tune/sampling step, so the handle is not held open throughout.
    with open(test, "r") as test_file:
      test_instance = json.load(test_file)
    print(f'Testing {fname}')
    # The output name keeps the full training filename (extension included),
    # matching the names this notebook has produced so far.
    save_dest = join(results_dir, "{}_sample.json".format(f))
    test_single(sid, train, test_instance, save_dest)

In [0]:
# Run the evaluation over the cloned repository's test and training cases.
# Each processed script triggers a fresh fine-tune inside test_single.
run_tests('moviescriptmining/test_cases', 'moviescriptmining/training_cases')

Testing 10-Things-I-Hate-About-You
Loading checkpoint models/117M/model.ckpt
INFO:tensorflow:Restoring parameters from models/117M/model.ckpt


100%|██████████| 1/1 [00:00<00:00, 722.53it/s]

Loading dataset...
dataset has 13286 tokens
Training...





Saving checkpoint/run1/model-2
finetune done
Random sample
EGLISH CLASS - DAY 
A room full of bored seniors doodle and scare off into space MS. BLAISE, the one-step-away-from-medication English Teacher, tries to remember what she's talking about. 
MRS. BLAISE Well, then.  Oh, yes.  I guess that does it for our analysis of The Old Man and the Sea.  Any other comments? (with dread) Kat? 
MRS. BLAISE "I was a little worried you were going to do something like that, but having your first Ph.D. is a special privilege. I'm still learning. I'm just worried about my country. I just want to go home and be a queen. I know you're a royal, but I'm not going to let you have your second Ph.D. When I get home, I'll be giving you a big hug, and I'll be standing there at the window, saying, 'I'm so glad you're here. You're the one where I work. I'm the one who knew my dad's name. I'm the one who got to be a part of the story. I'm the one who got to be the one to get the job done. I'm the one who got to

100%|██████████| 1/1 [00:00<00:00, 194.54it/s]

Loading dataset...
dataset has 11396 tokens
Training...





Saving checkpoint/run1/model-2
finetune done
Random sample
KISTI ... and suddenly there's a guy right behind you and `wait a minute, why is he wearing a HOCKEY 
MASK!' They all laugh. 
ARON Let me guess... You're here for the paintings or the Cathedral? 
KRISTI The Cathedral? We've got a bit disorientated and the map isn't great. Where is it? 
ARON I can't see it. It's a picture of the Cathedral, but I've got a map of the cathedral. I've been here for 2 hours. They're looking at me. And I've never seen anything like that. I can see the cathedral - but it's only a picture of the Cathedral. I can see the Cathedral and I can see the Cathedral and I can see the Cathedral and I can see the Cathedral, and I can see the Cathedral and I can see the Cathedral and I can see the Cathedral and I can see the Cathedral and I can see the Cathedral and I can see the Cathedral and I can see the Cathedral and I can see the Cathedral and I can see the Cathedral and I can see the Cathedral and I can see t

  0%|          | 0/1 [00:00<?, ?it/s]

Loading dataset...


100%|██████████| 1/1 [00:00<00:00,  4.29it/s]


dataset has 24759 tokens
Training...
Saving checkpoint/run1/model-2
finetune done
Random sample
Trevor hastily rifles through the teacher's drawers.     A startling discovery abruptly halts his search... 
CLOSER now INTO a brown paper bag Trevor has opened. Inside the bag is a .357 Magnum. 
OFF Trevor's narrowing gaze, an offstage voice interrupts... 
"What the hell is this?"
"Well, I'm not sure - I was just thinking about the one thing I saw that killed my friend, and I'm getting a little tired of it. 
"It's a .357, a .357, this thing. 
"I'm going to go get it, then I'll go get it. 
"I'm going to get it. 
"What, what? 
"This is the one thing I really don't know. 
"I don't know what it is. 
"I don't know what it is. 
"I don't know what it is. 
"I don't know what it is. 
"I don't know what it is. 
"I don't know what it is. 
"I don't know what it is. 
"I don't know what it is. 
"I don't know what it is. 
"I don't know what it is. 
"I don't know what it is. 
"I don't know what it is. 
"I 

100%|██████████| 1/1 [00:00<00:00,  6.27it/s]

Loading dataset...
dataset has 15004 tokens
Training...





Saving checkpoint/run1/model-2
finetune done
Random sample
DWAYNE (CONT'D) You go anywhere near a police station, and Fourth of July comes early this year. 
Dwayne nods at the timer on the side of the vest. 
DWAYNE It's 9:00 AM. You got 8 hours. 
He hands Will a slip of paper. 
EVEY (CONT'D) It's just a piece of paper. I'll take it back. Let's go. I can't forget it. 
DWAYNE (CONT'D) We're not here to kill you, you know. We're here to help you. We're here to get you to safety. We're here so you can help yourself. We're here to take care of you. We're here to be your friend. You're never going to ask me for anything. We're there to help you. We're here to help you. We're here to make sure that you're safe. We're here to make sure that you're safe. And we're here because we got to. We're here because we got to. We got to make sure you're safe. We're here to make sure that you're safe. We're here because we got to. We're there because we got to. We got to make sure you're safe. We're here 