In [43]:
import json
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from numpy.linalg import norm
import pandas as pd
import re

In [44]:
# Turn json into dataframe for easier data cleaning
with open("corpus/train.json") as f:
    s = f.read()
    data1 = json.loads(s)
df1 = pd.DataFrame.from_dict(data1)

with open("corpus/test.json") as f:
    s = f.read()
    data2 = json.loads(s)
df2 = pd.DataFrame.from_dict(data2)

with open("corpus/val.json") as f:
    s = f.read()
    data3 = json.loads(s)
df3 = pd.DataFrame.from_dict(data3)

df = pd.concat([df1, df2, df3])

In [45]:
df

Unnamed: 0,id,summary,dialogue
0,13818513,Amanda baked cookies and will bring Jerry some...,Amanda: I baked cookies. Do you want some?\r\...
1,13728867,Olivia and Olivier are voting for liberals in ...,Olivia: Who are you voting for in this electio...
2,13681000,Kim may try the pomodoro technique recommended...,"Tim: Hi, what's up?\r\nKim: Bad mood tbh, I wa..."
3,13730747,Edward thinks he is in love with Bella. Rachel...,"Edward: Rachel, I think I'm in ove with Bella...."
4,13728094,"Sam is confused, because he overheard Rick com...",Sam: hey overheard rick say something\r\nSam:...
...,...,...,...
813,13829423,Carla's date for graduation is on June 4th. Di...,Carla: I've got it...\r\nDiego: what?\r\nCarla...
814,13727710,Bev is going on the school trip with her son. ...,"Gita: Hello, this is Beti's Mum Gita, I wanted..."
815,13829261,Greg cheated on Julia. He apologises to her. R...,"Julia: Greg just texted me\r\nRobert: ugh, del..."
816,13680226,Marry broke her nail and has a party tomorrow....,"Marry: I broke my nail ;(\r\nTina: oh, no!\r\n..."


In [46]:
# quick test
def get_people(txt):
    names = re.findall(r"(?:^|\n)(\w+):", txt, re.MULTILINE)
    people = list(set(names))
    return people

In [47]:
df['people'] = df['dialogue'].map(get_people)

In [48]:
df1 = df.loc[df['people'].str.len() == 2]

In [49]:
df1['people'].str.len().max()

2

In [67]:
names = [n.lower() for n in set.union(*[set(p) for p in df1['people']])]
vectorizer = TfidfVectorizer(ngram_range = (1, 1), token_pattern = r"\b([\w']{2,}|I)\b", stop_words = names, norm='l1')
tfidf = vectorizer.fit_transform(df1['dialogue'])
scores = tfidf.toarray()
# df1['tfidf'] = scores
scores

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [69]:
def get_size(txt):
    tokens = re.findall(r"\b([\w']{2,}|I)\b", txt, re.MULTILINE)
    names = set(re.findall(r"(?:^|\n)(\w+):", txt, re.MULTILINE))
    return len([i for i in tokens if i not in names])
df1['size'] = df1['dialogue'].map(get_size)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['size'] = df1['dialogue'].map(get_size)


In [70]:
df1

Unnamed: 0,id,summary,dialogue,people,size
0,13818513,Amanda baked cookies and will bring Jerry some...,Amanda: I baked cookies. Do you want some?\r\...,"[Jerry, Amanda]",12
1,13728867,Olivia and Olivier are voting for liberals in ...,Olivia: Who are you voting for in this electio...,"[Olivia, Oliver]",14
2,13681000,Kim may try the pomodoro technique recommended...,"Tim: Hi, what's up?\r\nKim: Bad mood tbh, I wa...","[Tim, Kim]",87
3,13730747,Edward thinks he is in love with Bella. Rachel...,"Edward: Rachel, I think I'm in ove with Bella....","[Edward, rachel]",23
4,13728094,"Sam is confused, because he overheard Rick com...",Sam: hey overheard rick say something\r\nSam:...,"[Naomi, Sam]",143
...,...,...,...,...,...
813,13829423,Carla's date for graduation is on June 4th. Di...,Carla: I've got it...\r\nDiego: what?\r\nCarla...,"[Diego, Carla]",154
814,13727710,Bev is going on the school trip with her son. ...,"Gita: Hello, this is Beti's Mum Gita, I wanted...","[Bev, Gita]",276
815,13829261,Greg cheated on Julia. He apologises to her. R...,"Julia: Greg just texted me\r\nRobert: ugh, del...","[Julia, Robert]",213
816,13680226,Marry broke her nail and has a party tomorrow....,"Marry: I broke my nail ;(\r\nTina: oh, no!\r\n...","[Tina, Marry]",47


In [50]:
def size_cost(new_dialogue, pool):
    pass

In [51]:
def content_cost(new_dialogue, pool):
    pass

In [52]:
def sort_by_cost(pool, new_dialogue, w_size=0.5, w_content=0.5):
    pass