In [35]:
!pip install transformers
!pip install sentence_transformers
!pip install spacy
!python -m spacy download en_core_web_lg
!pip install simpleneighbors

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [0]:
import re
import pandas as pd
import numpy as np
import random
from simpleneighbors import SimpleNeighbors
import spacy, random
from sentence_transformers import SentenceTransformer

In [0]:
def sentence_mean(nlp, s):
    """Return the element-wise mean of the token vectors of ``s``.

    Parameters
    ----------
    nlp : callable
        Called as ``nlp(text, disable=['tagger', 'parser'])``; must return an
        iterable of tokens that each expose a ``.vector`` attribute.
    s : str
        Text to summarise. An empty string is replaced by a single space
        before it is passed to ``nlp``.

    Returns
    -------
    numpy.ndarray
        Mean over axis 0 of the stacked token vectors.
    """
    text = " " if s == "" else s
    doc = nlp(text, disable=['tagger', 'parser'])
    token_vectors = np.array([token.vector for token in doc])
    return np.mean(token_vectors, axis=0)

In [0]:
def processScript(scriptFile) -> tuple:
    """Parse a script transcript into dialogue rows, a line lookup and scene contexts.

    Stage directions enclosed in ``(...)`` or ``[...]`` are removed from every
    line first. A line that starts with one of the scene markers (``SCENE``,
    ``Scene``, ``END``, ``THE END``, ``Picture``) closes the current scene. A
    line containing a colon that does not start with ``SCENE``/``Scene`` is
    treated as ``Character: Dialog``.

    Parameters
    ----------
    scriptFile : file-like
        Object exposing ``name`` and ``readlines()``. The first four characters
        of ``name`` (e.g. ``"POC1"``) prefix every generated label.

    Returns
    -------
    dialog_df : pd.DataFrame
        One row per dialogue line with columns ``Conv`` (label of the form
        ``<title>_SC<scene>_<line>``), ``Character`` and ``Dialog``.
    poc_lines : dict
        Maps each label to its dialogue text (trailing whitespace stripped).
    context_list : list of list of str
        Labels grouped per scene, in script order.
    """
    m_title = scriptFile.name[:4]
    scene_markers = ('SCENE', 'Scene', 'END', 'THE END', 'Picture')
    bracketed = re.compile(r'[\(\[].*?[\)\]]')

    records = []
    poc_lines = {}
    context_list = []
    current_context = []
    line_count, scene_count = 1, 1

    for raw_line in scriptFile.readlines():
        text = bracketed.sub('', raw_line)
        # Split on the FIRST colon only, so dialogue that itself contains ':'
        # (e.g. "WILL: The time: now") is kept whole instead of truncated.
        speaker, colon, speech = text.partition(':')

        if speaker.startswith(scene_markers):
            scene_count += 1
            context_list.append(current_context)
            current_context = []

        if colon and not speaker.startswith(('SCENE', 'Scene')):
            label = '{}_SC{}_{}'.format(m_title, scene_count, line_count)
            dialog = speech.rstrip()
            records.append({'Conv': label, 'Character': speaker, 'Dialog': dialog})
            poc_lines[label] = dialog
            current_context.append(label)
            line_count += 1

    # A script that does not finish on a marker line would otherwise lose the
    # dialogue of its final scene from the context list.
    if current_context:
        context_list.append(current_context)

    # Explicit columns keep the frame's schema stable even for an empty script.
    dialog_df = pd.DataFrame(records, columns=['Conv', 'Character', 'Dialog'])
    return dialog_df, poc_lines, context_list

In [0]:
def sentence_vector_trnf(embedder, s):
    """Encode ``s`` with ``embedder`` and return the single resulting embedding.

    Parameters
    ----------
    embedder : object
        Must provide ``encode(list_of_str)`` returning an indexable collection
        with one embedding per input string.
    s : str
        Sentence to embed. An empty string is replaced by a single space
        before encoding.

    Returns
    -------
    The first (and only) element of ``embedder.encode([text])``.
    """
    text = s if s != "" else " "
    batch_embeddings = embedder.encode([text])
    return batch_embeddings[0]

In [0]:
# Process POC scripts
# Each script is opened in a context manager so its file handle is closed as
# soon as the script has been parsed (the handles were previously left open).
with open("POC1.txt", "r") as f1:
    poc1_df, poc1_lines, poc1_context_list = processScript(f1)
with open("POC2.txt", "r") as f2:
    poc2_df, poc2_lines, poc2_context_list = processScript(f2)
with open("POC3.txt", "r") as f3:
    poc3_df, poc3_lines, poc3_context_list = processScript(f3)
# poc 4 - character conversations are not on a new line. Scene change marked by 'Picture'?
# poc 5 - no scene changes

In [0]:
# Merge Content
# Scene contexts of the three scripts, concatenated in script order.
all_context_list = [*poc1_context_list, *poc2_context_list, *poc3_context_list]

# Single label -> dialogue lookup covering all three scripts.
all_poc_lines = dict(poc1_lines)
all_poc_lines.update(poc2_lines)
all_poc_lines.update(poc3_lines)

In [0]:
# Build: map every line label to the label of the line that follows it
# within the same scene context.
responses = {
    first: second
    for line_ids in all_context_list
    for first, second in zip(line_ids[:-1], line_ids[1:])
}

# Pull Responses from all context
# Keep at most 2431 prompt lines, but never request more than exist:
# random.sample raises ValueError when the sample is larger than the population.
# A dedicated seeded generator makes the sample (and the index built from it)
# reproducible across kernel restarts without touching the global RNG state.
response_keys = list(responses.keys())
response_sample = random.Random(42).sample(
    response_keys, min(2431, len(response_keys))
)

In [0]:
#  Using GLOVE
# Load the spaCy 'en_core_web_lg' pipeline; sentence_mean() averages the
# per-token vectors that this pipeline produces.
nlp = spacy.load('en_core_web_lg')

In [9]:
# Nearest-neighbour index over the averaged word vectors of the sampled lines.
nns = SimpleNeighbors(300)
for i, line_id in enumerate(response_sample):
    line_text = all_poc_lines[line_id]
    # show progress
    if not i % 10:
        print(i, line_id, line_text)
    summary_vector = sentence_mean(nlp, line_text)
    if not np.any(summary_vector):
        # All-zero summary vector: leave this line out of the index.
        continue
    nns.add_one(line_id, summary_vector)
nns.build()

0 POC3_SC6_109  The Dutchman sails as its captain commands.
10 POC1_SC15_419  Stop rubbing it.
20 POC3_SC15_842  Your chariot awaits, your highness. The oars are inside.
30 POC2_SC11_516  What do you want from me?
40 POC2_SC8_347  And if there are Crewmen?
50 POC2_SC8_317  No no no no, I heard it was the sea he fell in love with.
60 POC2_SC14_646  It’s a mythological creature; I can calls it what I wants!
70 POC2_SC3_27   Sorry, mate.  Do you mind if we make a little side trip? I didn’t think so.
80 POC3_SC9_297  Blessed sweet westerlies, we’re back!
90 POC1_SC22_692   Hold fire.
100 POC1_SC13_380  A better one.
110 POC2_SC15_663  How did you get here?
120 POC1_SC20_636  Cotton ?ere says you missed a bit.
130 POC1_SC3_98  Unless, of course, he knew you wouldn?t believe the truth even if he told it to you.
140 POC1_SC3_75  Yes, it is, I?ve seen it.
150 POC2_SC12_524  Agreed.
160 POC1_SC10_322  It'll linger.
170 POC2_SC15_680   Lord Beckett desires the contents of that chest. I deliver i

In [10]:
# Glove Predictions
sentence = "Good evening, Captain."

# Look up the indexed line closest to the query, then the line that follows it.
glove_query = sentence_mean(nlp, sentence)
glove_neighbours = nns.nearest(glove_query, 5)
picked = glove_neighbours[0]
response_line_id = responses[picked]

similar_turn = all_poc_lines[picked]
similar_turn_response = all_poc_lines[response_line_id]

print("Your line:\n\t", sentence)
print("Most similar turn:\n\t", similar_turn)
print("Response to most similar turn:\n\t", similar_turn_response)

Your line:
	 Good evening, Captain.
Most similar turn:
	  Hehehe, welcome to the Crew, lad.
Response to most similar turn:
	  Twelve five’s.  Twelve five’s. Call me a liar, I upped the bid.


In [11]:
# Using Sentence Embedder
# Load the pretrained 'bert-base-nli-mean-tokens' sentence-transformers model;
# sentence_vector_trnf() calls its encode() method to embed each line.
embedder = SentenceTransformer('bert-base-nli-mean-tokens')

100%|██████████| 405M/405M [00:47<00:00, 8.47MB/s]


In [12]:
# Nearest-neighbour index over the sentence-transformer embeddings of the sampled lines.
nns_trf = SimpleNeighbors(768)
for i, line_id in enumerate(response_sample):
    line_text = all_poc_lines[line_id]
    # show progress
    if not i % 10:
        print(i, line_id, line_text)
    summary_vector = sentence_vector_trnf(embedder, line_text)
    if not np.any(summary_vector):
        # All-zero embedding: leave this line out of the index.
        continue
    nns_trf.add_one(line_id, summary_vector)
nns_trf.build()

0 POC3_SC6_109  The Dutchman sails as its captain commands.
10 POC1_SC15_419  Stop rubbing it.
20 POC3_SC15_842  Your chariot awaits, your highness. The oars are inside.
30 POC2_SC11_516  What do you want from me?
40 POC2_SC8_347  And if there are Crewmen?
50 POC2_SC8_317  No no no no, I heard it was the sea he fell in love with.
60 POC2_SC14_646  It’s a mythological creature; I can calls it what I wants!
70 POC2_SC3_27   Sorry, mate.  Do you mind if we make a little side trip? I didn’t think so.
80 POC3_SC9_297  Blessed sweet westerlies, we’re back!
90 POC1_SC22_692   Hold fire.
100 POC1_SC13_380  A better one.
110 POC2_SC15_663  How did you get here?
120 POC1_SC20_636  Cotton ?ere says you missed a bit.
130 POC1_SC3_98  Unless, of course, he knew you wouldn?t believe the truth even if he told it to you.
140 POC1_SC3_75  Yes, it is, I?ve seen it.
150 POC2_SC12_524  Agreed.
160 POC1_SC10_322  It'll linger.
170 POC2_SC15_680   Lord Beckett desires the contents of that chest. I deliver i

In [30]:
# SBert predictions
sentence = "Where is my ship ?"

# Look up the indexed line closest to the query, then the line that follows it.
sbert_query = sentence_vector_trnf(embedder, sentence)
sbert_neighbours = nns_trf.nearest(sbert_query, 5)
picked = sbert_neighbours[0]
response_line_id = responses[picked]

similar_turn = all_poc_lines[picked]
similar_turn_response = all_poc_lines[response_line_id]

print("Your line:\n\t", sentence)
print("Most similar turn:\n\t", similar_turn)
print("Response to most similar turn:\n\t", similar_turn_response)

Your line:
	 Where is my ship ?
Most similar turn:
	  Where does it make berth?
Response to most similar turn:
	  Where does it make berth? Have you not heard the stories? Captain Barbossa and his crew of miscreants sail from the dreaded Isla de Muerta . It?s an island that cannot be found except by those who already know where it is.


In [0]:
# Get 5 nearest semantic responses
# Sentence-transformer index first, then the averaged-word-vector index,
# both queried with the current `sentence`.
sbert_sentence_vector = sentence_vector_trnf(embedder, sentence)
picked_list_nns_trf = nns_trf.nearest(sbert_sentence_vector, 5)

glove_sentence_vector = sentence_mean(nlp, sentence)
picked_list_nns = nns.nearest(glove_sentence_vector, 5)

In [34]:
# Five closest responses using SBert
# enumerate(start=1) numbers the candidates, replacing the hand-maintained counter.
for choice_count, choice in enumerate(picked_list_nns_trf, start=1):
    response_line_id = responses[choice]
    print(choice_count)
    print("Most similar turn:\n\t", all_poc_lines[choice])
    print("Response to most similar turn:\n\t", all_poc_lines[response_line_id])


1
Most similar turn:
	  Where does it make berth?
Response to most similar turn:
	  Where does it make berth? Have you not heard the stories? Captain Barbossa and his crew of miscreants sail from the dreaded Isla de Muerta . It?s an island that cannot be found except by those who already know where it is.
2
Most similar turn:
	  Which ship do we follow?
Response to most similar turn:
	  Signal the Dutchman to track down Sao Feng. We follow the Pearl. How soon can we have the ship ready to pursue?
3
Most similar turn:
	  Where are ye goin’?
Response to most similar turn:
	   You’ve got to tell me what’s happening.
4
Most similar turn:
	   Where’s the Commodore?
Response to most similar turn:
	   Fell behind.
5
Most similar turn:
	  Which port?
Response to most similar turn:
	  I didn’t say port. I said land, any land.  Ah!
