# 1. Generation of Locations

In [37]:
import sys
import pandas as pd
import os
import wikipediaapi
import numpy as np
import json
import random
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
import re
import requests

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/lauraluckert/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/lauraluckert/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
PATH = "~/Desktop/"
FILENAME = "tmdb_5000_credits.csv"

full_path = os.path.expanduser(PATH)
os.chdir(full_path)

### 0. Example Structure for New Game Field

In [4]:
new_field = {
    "streets": {"1-3": [],"4-6": [],"7-9":[] , "10-12": [], \
                "13-15": [], "16-18": [], "expensive": [], "cheap": []},
    "stations": [],
    "prison": [],
    "free_parking": [],
    "special": {"1": [], "2": []}
}

### 1. Clean and read-in Movie Data
Data Source:
https://www.kaggle.com/tmdb/tmdb-movie-metadata?select=tmdb_5000_movies.csv

In [8]:
def clean_movie_dataset(movie_data):
    """
    :movie_data: Pandas DataFrame holding movie titles, character cast
    Preprocess the data, we only need the characters from the movie in a dict
    
    :returns: dictionary with key = movie title, value = 
    """
    
    cast_rows = []

    for malformed_string in movie_data.cast:
        imd_string = list(malformed_string[1:(len(malformed_string)-1)].split("}"))
    
        new_list = []

        for item in imd_string:
            try: 
                if item[0] != "{":
                    item = item[2:(len(item))]
                item += "}"
                new_item =json.loads(item)
                person = new_item["character"]
                #gender = new_item["gender"]
                new_list.append(person)
            except IndexError:
                break
        cast_rows.append(new_list)

    
    cast_dict = {}
    for movie, cast in zip(movie_data.title,cast_rows):
        cast_dict[movie] = cast
    
    return cast_dict

## preprocess dataset
movie_characters = pd.read_csv(FILENAME, sep=",")
cast_dict = clean_movie_dataset(movie_characters)

### 3. Random Selection of Topic

In [12]:
## get random movie from our dictionary
random_key = random.choice(list(cast_dict))

## topic and cast is selected
topic = random_key
cast = cast_dict[topic]

## for testing purposes, we set the topic manually
topic = "Furious 7"
cast = cast_dict[topic]
print(topic, cast)

Furious 7 ['Dominic Toretto', "Brian O'Conner", 'Hobbs', 'Letty', 'Roman', "Tej (as Chris 'Ludacris' Bridges)", 'Mia', 'Jakande', 'Kiet', 'Kara', 'Ramsey', 'Mr. Nobody', 'Deckard Shaw', 'Han', 'Gisele', 'Sean Boswell', 'Elena', 'Hector', 'Sheppard', 'Owen Shaw', 'Safar', 'Jack', 'Jack', 'Samantha Hobbs', 'Letty Fan', 'Female Racer', 'Male Racer', 'Race Starter', 'Hot Teacher', 'Doctor', 'Priest', 'Merc Tech', 'Weapons Tech', 'Billionaire', 'Dominican Priest', 'Hana', 'Merc Driver (as Ben Blankenship)', 'DJ', 'DJ', 'Drone Tech', 'Jasmine', 'Mando', 'Advisor', 'Field Reporter', 'Cop', 'Leo (uncredited / archive)', 'Neela (uncredited / archive)', 'Twinkie (uncredited)', 'Santos (uncredited / archive)', 'Race Wars Racer (uncredited)', "Brian O'Conner (uncredited)", "Brian O'Connor (uncredited)"]


### 4. Select Characters as New Locations
#### 4.1 *To Do*:  Cleaning of character names (no brakets in names etc.)
#### 4.2 *To Do*: Useful combination of streetnames with selected characters

In [24]:
## possible street names for combination with characters
street_names = [#'Avenue', 
                'Park', 'Street', 'Boulevard', 'Road', 'Main Street', 'Drive', 'Lane', 'Alley']

## fill location entries with characters from film cast (cast_dict)
## example default combination
new_field["streets"]["expensive"] = [x + " Avenue" for x in cast[0:2]]
new_field["streets"]["cheap"] = [x + " Drive" for x in cast[8:10]]
new_field["streets"]["1-3"] = [x + " " + random.choice(street_names) for x in cast[11:14]]
new_field["streets"]["4-6"] = [x + " " + random.choice(street_names) for x in cast[15:18]]
new_field["streets"]["7-9"] = [x + " " + random.choice(street_names) for x in cast[19:22]]
new_field["streets"]["10-12"] = [x + " " + random.choice(street_names) for x in cast[23:26]]
new_field["streets"]["13-15"] = [x + " " + random.choice(street_names) for x in cast[27:30]]
new_field["streets"]["16-18"] = [x + " " + random.choice(street_names) for x in cast[31:33]]
new_field["stations"] = [x + " Station" for x in cast[3:7]]

print(new_field)

{'streets': {'1-3': ['Mr. Nobody Boulevard', 'Deckard Shaw Lane', 'Han Road'], '4-6': ['Sean Boswell Alley', 'Elena Alley', 'Hector Main Street'], '7-9': ['Owen Shaw Boulevard', 'Safar Road', 'Jack Park'], '10-12': ['Samantha Hobbs Alley', 'Letty Fan Alley', 'Female Racer Park'], '13-15': ['Race Starter Main Street', 'Hot Teacher Lane', 'Doctor Street'], '16-18': ['Merc Tech Road', 'Weapons Tech Road'], 'expensive': ['Dominic Toretto Avenue', "Brian O'Conner Avenue"], 'cheap': ['Kiet Drive', 'Kara Drive']}, 'stations': ['Letty Station', 'Roman Station', "Tej (as Chris 'Ludacris' Bridges) Station", 'Mia Station'], 'prison': 'Walker', 'free_parking': 'Tokyo', 'special': {'1': "God's Eye", '2': 'Los Angeles'}}


### 5. Question Answering to Select Characters/Locations for Special Places
#### 5.1 Get Wikipedia Data as Q&A Basis Data
https://pypi.org/project/Wikipedia-API/0.3.5/
- *To Do* : Select only "Plot" Section from Wikipedia Data/Find a way to get relevant data only

In [20]:
## for regular text output
wiki_en_wiki = wikipediaapi.Wikipedia(
        language='en',
        extract_format=wikipediaapi.ExtractFormat.WIKI)

## check if page for topic exists
if wiki_en_wiki.page(topic).exists():
    print("Topic is ok.")
    wiki_page = wiki_en_wiki.page(topic)
    topic_text = wiki_page.text
else:
    print("Find a new topic")

Topic is ok.


In [18]:
model_name = "deepset/roberta-base-squad2"
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

q_a = pipeline('question-answering', model=model_name, tokenizer=model_name)

#### 5.2 Select Questions for Q&A model to get wikipedia responses

In [16]:
## Prepare questions (examples, to discuss)
question_dict = {"special_1": "What is an important monument in the movie ",
                "special_2": "What is an expensive location in the movie ",
                "prison": "Which one is a tragic area in the movie ",
                "free_parking": "What is the loveliest place in the movie "}

In [21]:
for category, question_body in question_dict.items():
    question = question_body + topic + "?"
    print(question)
    
    QA_input = {
        'question': question,
        'context': topic_text
    }
    response = q_a(QA_input)
    print(response)
    
    if category == "special_1":
        new_field["special"]["1"] = response["answer"]
    elif category == "special_2":
        new_field["special"]["2"] = response["answer"]
    else:
        new_field[category] = response["answer"]

What is an import monument in the movie Furious 7?


  tensor = as_tensor(value)
  p_mask = np.asarray(


{'score': 0.7775615453720093, 'start': 4598, 'end': 4607, 'answer': "God's Eye"}
What is an expensive location in the movie Furious 7?
{'score': 0.8790752291679382, 'start': 4684, 'end': 4695, 'answer': 'Los Angeles'}
Which one is a tragic area in the movie Furious 7?
{'score': 0.7376387715339661, 'start': 29316, 'end': 29322, 'answer': 'Walker'}
What is the loveliest place in the movie Furious 7?
{'score': 0.568051815032959, 'start': 3437, 'end': 3442, 'answer': 'Tokyo'}


#### 5.3 Evaluate Responses:
- To Do: Check if location/character already exists in new_field
- To Do: filter for bad scores, retrigger question generation

In [25]:
new_field

{'streets': {'1-3': ['Mr. Nobody Boulevard', 'Deckard Shaw Lane', 'Han Road'],
  '4-6': ['Sean Boswell Alley', 'Elena Alley', 'Hector Main Street'],
  '7-9': ['Owen Shaw Boulevard', 'Safar Road', 'Jack Park'],
  '10-12': ['Samantha Hobbs Alley', 'Letty Fan Alley', 'Female Racer Park'],
  '13-15': ['Race Starter Main Street', 'Hot Teacher Lane', 'Doctor Street'],
  '16-18': ['Merc Tech Road', 'Weapons Tech Road'],
  'expensive': ['Dominic Toretto Avenue', "Brian O'Conner Avenue"],
  'cheap': ['Kiet Drive', 'Kara Drive']},
 'stations': ['Letty Station',
  'Roman Station',
  "Tej (as Chris 'Ludacris' Bridges) Station",
  'Mia Station'],
 'prison': 'Walker',
 'free_parking': 'Tokyo',
 'special': {'1': "God's Eye", '2': 'Los Angeles'}}

# 2. Generation of Action Cards 

### 1. Plagiarism: Read in Monopoly Data
- Get action verbs from monopoly data
- use real action cards for few-shot learning

In [26]:
FILENAME_MONOPOLY = "monopoly_action_cards_keywords.csv"
monopoly_data = pd.read_csv(FILENAME_MONOPOLY, sep=";")

#### 1.1 Keyword Preparation from Monopoly Data

In [34]:
## get pos tags
def preprocess(sent):
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)
    return sent

## Get action verbs from real monopoly action cards
text_data = ""
for text_item in monopoly_data["content"]:
    text_data += ". " + text_item
    
inspect_actions = preprocess(text_data)

## use only verbs
keyword_list_verbs = []

for pos_tag in inspect_actions:
    if re.match("VB.*", pos_tag[1]):
        if pos_tag[0] == "DM":
            keyword_list_verbs.append("Euro")
        else:
            keyword_list_verbs.append(pos_tag[0])  
        
print("The topic's keyword_list is: ", keyword_list_verbs)

The topic's keyword_list is:  ['Pay', 'take', 'come', '..', 'Go', 'get', '..', 'pays', 'are', 'pays', 'Go', 'receive', 'inherit', 'receive', '..', 'is', 'win', 'won', 'is', '..', 'receives', 'has', 'buy', 'get', 'have', 'been', 'elected', 'Have', 'renovated', 'Euro', 'be', 'called', 'do', 'Pay', 'Do', 'pass', 'collect', 'be', 'released', 'keep', 'need', 'sell', 'Do', 'pass', 'collect', 'be', 'released', 'keep', 'need', 'sell']


Data Source Further Action Words:
https://www.citationmachine.net/resources/grammar-guides/verb/list-verbs/

In [35]:
## POS Tag == "VB.*" from real monopoly action cards
action_verbs_monopoly = ["Pay","Take","Come","Go","Get","Receive","Inherit","Win","Pass",
                         "Collect","being released","Keep","Sell"]
action_verbs = ["Act","Answer","Approve","Arrange","Break","Build","Buy","Coach","Color","Cough","Create", 
                "Complete","Cry","Dance","Describe","Draw","Drink","Eat","Edit","Enter","Exit",
                "Imitate","Invent","Jump","Laugh","Lie","Listen","Paint","Plan","Play","Read","Replace",
                "Run","Scream","See","Shop","Shout","Sing","Skip","Sleep","Sneeze","Solve","Study","Teach",
                "Touch","Turn","Walk","Win","Write","Whistle","Yank","Zip"]



In [52]:
## locations into flat list
locations = []
for _, value in new_field["streets"].items():
    for item in value:
        locations.append(item)
for _, value in new_field["special"].items():
    for item in value:
        locations.append(item)
for item in new_field["stations"]:
    locations.append(item)
locations.append(new_field["prison"])
locations.append(new_field["free_parking"])        

#### 1.2 Few-Shot Learning Training Data Preparation

In [36]:
## generate few shot training data for text generation
prompt_text = ""

for text, keywords in zip(monopoly_data["content"], monopoly_data["keywords"]):
    imd = "key: " + keywords + "\ntweet: " + text + "\n###"
    prompt_text += imd

### 2. Random Keyword Generation for New Action Cards

In [53]:
## once locations available, randomly select location
#LOCATION = "Dominic Toretto Avenue"

## randomly select verbs, pronouns, locations, fixed number
first_verb = random.choice(action_verbs_monopoly).lower()
second_verb = random.choice(action_verbs).lower()
pronoun = random.choice(["you","your","yours"]).lower()
LOCATION = random.choice(locations)
number = 2000

## special case for prison
if LOCATION == new_field["prison"]:
    keyword_list = [LOCATION, "not pass", "not collect"]

else:
    ## randomly select if second verb, pronoun and location should be considered
    select_second_verb = random.choice([0,1])
    select_pronoun = random.choice([0,1])
    select_location = random.choice([0,1])

    print("\nfirst_verb:", first_verb, "\nsecond_verb:", second_verb, "\npronoun:",  \
          pronoun, "\nLOCATION:",LOCATION, "\nnumber:",number )

    print("\nselect_second_verb:", select_second_verb, "\nselect_pronoun:", select_pronoun, \
          "\nselect_location:",  select_location,)

    if select_second_verb and select_pronoun and select_location:
        keyword_list = [first_verb, second_verb, pronoun, LOCATION]
    elif select_second_verb == 0 and select_pronoun and select_location:
        keyword_list = [first_verb, pronoun, LOCATION]
    elif select_second_verb == 0 and select_pronoun == 0 and select_location:
        keyword_list = [first_verb, LOCATION]
    elif select_second_verb == 0 and select_pronoun == 1 and select_location == 0:
        keyword_list = [first_verb, pronoun, number]
    elif select_second_verb == 0 and select_pronoun == 0 and select_location == 1:
        keyword_list = [first_verb, LOCATION]
    elif select_second_verb == 1 and select_pronoun == 0 and select_location == 0:
        keyword_list = [first_verb, second_verb]
    elif select_second_verb == 1 and select_pronoun == 0 and select_location == 1:
        keyword_list = [first_verb, second_verb, LOCATION]
    elif select_second_verb == 1 and select_pronoun == 1 and select_location == 0:
        keyword_list = [first_verb, second_verb, pronoun]
    elif select_second_verb == 0 and select_pronoun == 0 and select_location == 0:
        keyword_list = [first_verb, number]


keyword_string = ""

for item in keyword_list:
    if keyword_string == "":
        keyword_string += str(item)
    else:
        keyword_string += ", " + str(item)
        
print(keyword_string)


first_verb: take 
second_verb: dance 
pronoun: you 
LOCATION: Female Racer Park 
number: 2000

select_second_verb: 0 
select_pronoun: 0 
select_location: 0
take, 2000


### 2. Few-Shot Learning Key-to-Text Generation for Action Cards
Source Inference API: https://huggingface.co/blog/few-shot-learning-gpt-neo-and-inference-api

In [54]:
## api token can be generated via free huggingface account
API_TOKEN = "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"

def query(payload='',parameters=None,options={'use_cache': False}):
    API_URL = "https://api-inference.huggingface.co/models/EleutherAI/gpt-neo-2.7B"
    headers = {"Authorization": f"Bearer {API_TOKEN}"}
    body = {"inputs":payload,'parameters':parameters,'options':options}
    response = requests.request("POST", API_URL, headers=headers, data= json.dumps(body))
    try:
      response.raise_for_status()
    except requests.exceptions.HTTPError:
        return "Error:"+" ".join(response.json()['error'])
    else:
      return response.json()[0]['generated_text']

In [55]:
parameters = {
    'max_new_tokens':25,  # number of generated tokens
    'temperature': 1,   # controlling the randomness of generations
    'end_sequence': "###" # stopping sequence for generation
}

prompt = prompt_text + "\nkey: " + keyword_string + "\ntweet:"


data = query(prompt,parameters)

action_card = re.findall(r"(?<=tweet:\s).*", data)[-1] 
#print(data)
print(keyword_string)
print(action_card)

take, 2000
Take 2000,000 from the bank. 


### 3. Evaluation of Generated Action Card

- To Do: Which action cards should be used as reference for which input tokens?

In [39]:
def pos_distribution(pos_tuples_of_sentence):
    """
    :pos_tuple_of_sentences: tuple (token, pos_tag) as returned from preprocess function
    
    crop pos tags into relevant groups (first two letters)
    count occurences of pos tags in input sentence
    
    :returns: pandas DataFrame with pos_tag and its frequency
    
    """
    pos_df = pd.DataFrame(pos_tuples_of_sentence,columns=["token","long_pos_tag"])
    pos_df["pos_tag"] = [x[0:2] for x in pos_df["long_pos_tag"]]
    freq_df = pos_df["pos_tag"].value_counts()
    
    return freq_df

def evaluate_generated_sentence(reference, new_sentence):

    ## preprocess both
    reference = preprocess(reference)
    new_sentence = preprocess(new_sentence)
    
    ## pos distribution
    reference = pos_distribution(reference)
    new_sentence = pos_distribution(new_sentence)
    
    ## merge vectors
    merged_df = pd.merge(reference,new_sentence,how="outer", left_index=True,right_index=True).fillna(0)
    merged_df.columns=["reference","target"]
    
    ## calc cosine similarity 
    cos_similarity = 1 - cosine(merged_df["reference"], merged_df["target"])
    
    ## calc scalar product between vectors
    return cos_similarity