In [2]:
import json
import numpy as np
import scipy.stats
import matplotlib.pyplot as plt
from collections import Counter

In [3]:
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize 
import string
from nltk.corpus import stopwords 

In [4]:
# Shared Porter stemmer instance; process_text calls ps.stem on every token.
ps = PorterStemmer() 
# English stop words held in a set so the per-token membership test is cheap.
# NOTE: needs the NLTK "stopwords" corpus to be available locally
# (e.g. via nltk.download('stopwords')) -- confirm in a fresh environment.
stop_words = set(stopwords.words('english')) 

In [5]:
## ADDED STEMMING

def process_text(s, typ='unigram', rem_stop=True, do_stem=True):
    """Normalise a raw sentence into a list of comparison tokens.

    Steps, in order: lowercase, delete every character listed in
    ``string.punctuation``, tokenise with ``word_tokenize``, optionally drop
    stop words, optionally Porter-stem, optionally pair neighbours into
    bigrams.

    Args:
        s: Raw sentence.
        typ: ``'bigram'`` returns adjacent token pairs joined with ``'_'``;
            any other value returns the plain token list.
        rem_stop: When true, tokens found in the module-level ``stop_words``
            set are removed.
        do_stem: When true, each remaining token is replaced by
            ``ps.stem(token)``.

    Returns:
        list[str]: The processed tokens (empty for bigrams when fewer than
        two tokens remain).

    NOTE: punctuation is deleted *before* stop-word filtering, so a
    contraction such as "i've" reaches the filter as "ive" and is only
    removed if that exact string is in ``stop_words``.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    tokens = word_tokenize(s.lower().translate(punctuation_table))

    if rem_stop:
        tokens = [tok for tok in tokens if tok not in stop_words]
    if do_stem:
        tokens = [ps.stem(tok) for tok in tokens]

    if typ != 'bigram':
        return tokens
    # Pair each token with its right-hand neighbour.
    return ['_'.join(pair) for pair in zip(tokens, tokens[1:])]
    

In [6]:
DATA_FILE = '../../data/personachat_self_original_comet_validation.json'  # ** VALIDATION SPLIT ONLY **
# Top-level key of the loaded JSON under which the list of dialogues is stored.
split = 'valid'
def load_data(dump_fname):
    """Read the annotated dialogue dump from a JSON file.

    Args:
        dump_fname: Path of the JSON file to read.

    Returns:
        The decoded JSON document (whatever top-level object the file holds).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # The context manager closes the handle even when decoding fails; the
    # previous `json.load(open(...))` left it open until garbage collection.
    # JSON text is UTF-8 by specification, so do not rely on the locale default.
    with open(dump_fname, 'r', encoding='utf-8') as dump_file:
        return json.load(dump_file)
# Load the dump once; later cells index into annotated_data[split].
annotated_data = load_data(DATA_FILE)
# Sanity check: number of top-level keys, number of dialogues in the split,
# and the field names of the first dialogue record.
len(annotated_data), len(annotated_data[split]), annotated_data[split][0].keys()

(1, 1000, dict_keys(['personality', 'utterances', 'coment_annotation']))

In [35]:
# Inspect the raw 'utterances' records of the first dialogue (the output is long).
annotated_data[split][0]['utterances']

[{'candidates': ['oh really ? i am actually in high school and i am graduating as class of 2019 !',
   "that's an interesting choice . i'd have to pick french fries",
   'i just got a pet fish for my 18th birthday yesterday from my parents .',
   'yeah , well what about you ?',
   'my favorite watch is the rolex ? what is yours ?',
   "what is in spain that's so interesting",
   "i don't like clowns . they are scary to a kid like me",
   'poetry . roses are red . violet are . . . ?',
   'my father is a member of the army , served for 10 years now .',
   'oh i like mexican food , but my favorite food are cheeseburgers',
   'hey there , are you a mother ?',
   "it sure is . i'd like to see more of the city though .",
   'it is not so fun i have 2 friend who speak a different langues',
   "i'd like some honey though . do you sell it ?",
   'i am a recovering heavy drinker . full time . how about you ?',
   'hi ! i have three kids . how many do you have ?',
   'awesome ! i own 2 dogs , lov

In [16]:
# Inspect the raw 'utterances' records of the second dialogue (the output is long).
annotated_data[split][1]['utterances']

[{'candidates': ["yes i do one of the reason i'm in a poly amorous relationship",
   'i really like oranges a lot',
   'wow impressive . not sure you would be impressed with my rainbow coloured hair',
   'usually older generation , like ps1 and super nintendo . do you watch twitch ?',
   "i'm in law school , i will be practicing in manhattan when i graduate .",
   'what is your favorite type of music , mine is country music',
   "congratulations ! i'm a pro wrestler .",
   "i'm mostly good . tired and some wicked heartburn . is that over sharing ? you ?",
   'me too , and i already have the long red hair . it goes to my waist .',
   'just finished working in the supermarket',
   'hey how are you today ?',
   "yes we do . i'm glad you like to workout .",
   'how are you doing today',
   "i'm trying . i do a lot of training . four hours every day . are you active ?",
   'really what teams . and who are you rooting for .',
   'well , i did have cancer , but it is gone now .',
   'only whe

In [9]:
def get_scores(s1, s2, idx=None):
    """Jaccard overlap between two already-processed token sequences.

    Args:
        s1: Tokens of the first sentence (duplicates are ignored).
        s2: Tokens of the second sentence (duplicates are ignored).
        idx: Opaque identifier copied into the result unchanged.

    Returns:
        dict with keys ``'score'`` (|s1 ∩ s2| / |s1 ∪ s2|, or 0.0 when both
        inputs are empty), ``'idx'``, and ``'s1'`` / ``'s2'`` holding the
        de-duplicated tokens as lists (in set iteration order).
    """
    tokens_a, tokens_b = set(s1), set(s2)
    shared = tokens_a & tokens_b
    combined = tokens_a | tokens_b
    # An empty union means both inputs were empty; report no overlap rather
    # than dividing by zero.
    score = float(len(shared)) / len(combined) if combined else 0.0
    return {'score': score, 'idx': idx, 's1': list(tokens_a), 's2': list(tokens_b)}
    

In [10]:
def match_sentence(sentence, reference, k=3, score_thresh=0.1):
    """Return the best-scoring reference sentences for ``sentence``.

    Args:
        sentence: Processed tokens of the query sentence.
        reference: Sequence of processed reference sentences; the position of
            each one is passed to ``get_scores`` as its ``idx``.
        k: Maximum number of results to return.
        score_thresh: Minimum score (inclusive) a result must reach.

    Returns:
        list of ``get_scores`` result dicts, highest score first, ties kept
        in reference order, at most ``k`` entries.
    """
    candidates = []
    for ref_idx, ref_tokens in enumerate(reference):
        scored = get_scores(sentence, ref_tokens, ref_idx)
        if scored['score'] >= score_thresh:
            candidates.append(scored)
    # list.sort is stable, also with reverse=True, so equal scores stay in
    # their original reference order.
    candidates.sort(key=lambda cand: cand['score'], reverse=True)
    return candidates[:k]
    

In [30]:
def heuristic_matching(item, typ='unigram', rem_stop=True, persona_idx=None, k=3, score_thresh=0.1, alternate=True):
    """Weakly label each dialogue turn with the persona sentences it overlaps.

    The dialogue is rebuilt as the ``'history'`` of the last utterance record
    followed by that record's last candidate (presumably the gold reply --
    confirm against the dataset format). Every labelled turn is tokenised
    with ``process_text`` and compared against each persona sentence with
    ``match_sentence``.

    Args:
        item: One dialogue record; must provide ``'utterances'`` (non-empty
            list of dicts with ``'history'`` and ``'candidates'``) and
            ``'personality'`` (list of persona sentences).
        typ: Token type forwarded to ``process_text`` (``'unigram'`` or
            ``'bigram'``) for both persona and dialogue sentences.
        rem_stop: Stop-word removal flag forwarded to ``process_text`` for
            both persona and dialogue sentences.
        persona_idx: Currently unused; kept for interface compatibility.
        k: Maximum number of persona matches kept per turn.
        score_thresh: Minimum overlap score for a match to be kept.
        alternate: When true, turns at even positions get an empty label list
            and only odd positions are matched (presumably the turns of the
            speaker owning the persona -- confirm).

    Returns:
        list of dicts, one per turn in dialogue order, each with
        ``'label_persona'`` (list of match dicts, possibly empty) and
        ``'sentence'`` (the raw turn text).
    """
    last_turn = item['utterances'][-1]
    all_dialog = last_turn['history'] + [last_turn['candidates'][-1]]

    personality_processed = [
        process_text(p, typ=typ, rem_stop=rem_stop) for p in item['personality']
    ]

    weak_label_persona = []
    for h, sent_h in enumerate(all_dialog):
        if alternate and h % 2 == 0:
            # Skipped turn: no tokenising or matching work is done for it.
            label_persona = []
        else:
            # Use the same tokenisation settings as the persona sentences;
            # otherwise e.g. bigram personas are compared with unigram turns
            # and can never overlap.
            sent_h_processed = process_text(sent_h, typ=typ, rem_stop=rem_stop)
            label_persona = match_sentence(
                sent_h_processed, personality_processed, k=k, score_thresh=score_thresh
            )
        weak_label_persona.append({'label_persona': label_persona, 'sentence': sent_h})

    return weak_label_persona
    

In [31]:
# Weak persona labels for dialogue 0, printed one turn per line.
tmp = heuristic_matching(annotated_data[split][0], k=3, score_thresh=0.15)
for j, turn in enumerate(tmp):
    print(j, turn)

0 {'label_persona': [], 'sentence': 'hello what are doing today ?'}
1 {'label_persona': [], 'sentence': 'i am good , i just got off work and tired , i have two jobs .'}
2 {'label_persona': [], 'sentence': 'i just got done watching a horror movie'}
3 {'label_persona': [{'score': 0.42857142857142855, 'idx': 0, 's1': ['rather', 'book', 'year', '20', 'read', 'ive'], 's2': ['read', 'book', 'twenti', 'year']}], 'sentence': "i rather read , i've read about 20 books this year ."}
4 {'label_persona': [], 'sentence': 'wow ! i do love a good horror movie . loving this cooler weather'}
5 {'label_persona': [], 'sentence': 'but a good movie is always good .'}
6 {'label_persona': [], 'sentence': 'yes ! my son is in junior high and i just started letting him watch them too'}
7 {'label_persona': [], 'sentence': 'i work in the movies as well .'}
8 {'label_persona': [], 'sentence': 'neat ! ! i used to work in the human services field'}
9 {'label_persona': [{'score': 0.18181818181818182, 'idx': 1, 's1': [

In [32]:
# Weak persona labels for dialogue 123, printed one turn per line.
tmp = heuristic_matching(annotated_data[split][123], k=3, score_thresh=0.15)
for j, turn in enumerate(tmp):
    print(j, turn)

0 {'label_persona': [], 'sentence': 'do you want to hear something strange ?'}
1 {'label_persona': [], 'sentence': "sure ! i'd love to ! what is it ?"}
2 {'label_persona': [], 'sentence': "i'm due to have my fifth set of twins in two months !"}
3 {'label_persona': [], 'sentence': 'wow . my boyfriend would go crazy . we just moved into a house .'}
4 {'label_persona': [], 'sentence': 'we just bought our first home . we were living in a two bedroom apartment before .'}
5 {'label_persona': [{'score': 0.25, 'idx': 1, 's1': ['nice', 'take', 'neighbor', 'enjoy', 'hood', 'walk', 'around'], 's2': ['like', 'take', 'walk']}], 'sentence': 'nice . i enjoy taking walks around my neighbor hood .'}
6 {'label_persona': [], 'sentence': 'i wish i had time for walks . my job at the bank keeps me busy .'}
7 {'label_persona': [{'score': 0.4444444444444444, 'idx': 0, 's1': ['fri', 'french', 'hamburg', 'get', 'eat', 'yeah', 'ever', 'busi'], 's2': ['fri', 'french', 'hamburg', 'eat', 'like']}], 'sentence': 'yea

In [33]:
# Weak persona labels for dialogue 153, printed one turn per line.
tmp = heuristic_matching(annotated_data[split][153], k=3, score_thresh=0.15)
for j, turn in enumerate(tmp):
    print(j, turn)

0 {'label_persona': [], 'sentence': 'hello , how are you today ?'}
1 {'label_persona': [], 'sentence': 'tired . spent all night trying to do algebra homework i don t understand .'}
2 {'label_persona': [], 'sentence': 'oh no , how old are you . i am 60 , retiring in a few years .'}
3 {'label_persona': [{'score': 0.5, 'idx': 3, 's1': ['older', '17', '13', 'brother'], 's2': ['older', 'brother']}], 'sentence': '13 . i have an older brother who is 17 .'}
4 {'label_persona': [], 'sentence': 'nice , i love kids , i work as a librarian'}
5 {'label_persona': [], 'sentence': 'i like our school librarian . she s also our soccer coach . really nice .'}
6 {'label_persona': [], 'sentence': "that's great , does your family like to travel ? i do"}
7 {'label_persona': [{'score': 0.2, 'idx': 0, 's1': ['thanksgiv', 'take', 'mostli', 'much', 'home', 'stay', 'bu', 'except', 'school'], 's2': ['bu', 'ride', 'school']}], 'sentence': 'no . not too much . i take a school bus but we mostly stay at home . except 

In [34]:
# Weak persona labels for dialogue 113, printed one turn per line.
tmp = heuristic_matching(annotated_data[split][113], k=3, score_thresh=0.15)
for j, turn in enumerate(tmp):
    print(j, turn)

0 {'label_persona': [], 'sentence': 'what do you do for work ?'}
1 {'label_persona': [{'score': 0.5, 'idx': 0, 's1': ['teacher', 'school', 'im'], 's2': ['elementari', 'school', 'teacher']}], 'sentence': "i'm a school teacher , who about you ?"}
2 {'label_persona': [], 'sentence': 'i work on my parents farm'}
3 {'label_persona': [], 'sentence': 'i bet that is hard work'}
4 {'label_persona': [], 'sentence': 'it is , but its a lifestyle'}
5 {'label_persona': [], 'sentence': 'true do you have any hobbies outside of the farm ?'}
6 {'label_persona': [], 'sentence': 'i love listening to country music , its the best genre'}
7 {'label_persona': [], 'sentence': 'i love photography and foreign language'}
8 {'label_persona': [], 'sentence': 'those sound like fun hobbies'}
9 {'label_persona': [], 'sentence': 'they are very simple but enjoyable'}
10 {'label_persona': [], 'sentence': 'do you have any pets ?'}
11 {'label_persona': [], 'sentence': 'unfortunately no , do you have any ?'}
12 {'label_pers