In [1]:
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from numpy.linalg import norm
import pandas as pd
import re


In [2]:
# Turn json into a dataframe for easier data cleaning
with open("corpus/train.json") as f:
    s = f.read()
    data1 = json.loads(s)
df1 = pd.DataFrame.from_dict(data1)

with open("corpus/test.json") as f:
    s = f.read()
    data2 = json.loads(s)
df2 = pd.DataFrame.from_dict(data2)

with open("corpus/val.json") as f:
    s = f.read()
    data3 = json.loads(s)
df3 = pd.DataFrame.from_dict(data3)

df = pd.concat([df1, df2, df3])

In [3]:
df

Unnamed: 0,id,summary,dialogue
0,13818513,Amanda baked cookies and will bring Jerry some...,Amanda: I baked cookies. Do you want some?\r\...
1,13728867,Olivia and Olivier are voting for liberals in ...,Olivia: Who are you voting for in this electio...
2,13681000,Kim may try the pomodoro technique recommended...,"Tim: Hi, what's up?\r\nKim: Bad mood tbh, I wa..."
3,13730747,Edward thinks he is in love with Bella. Rachel...,"Edward: Rachel, I think I'm in ove with Bella...."
4,13728094,"Sam is confused, because he overheard Rick com...",Sam: hey overheard rick say something\r\nSam:...
...,...,...,...
813,13829423,Carla's date for graduation is on June 4th. Di...,Carla: I've got it...\r\nDiego: what?\r\nCarla...
814,13727710,Bev is going on the school trip with her son. ...,"Gita: Hello, this is Beti's Mum Gita, I wanted..."
815,13829261,Greg cheated on Julia. He apologises to her. R...,"Julia: Greg just texted me\r\nRobert: ugh, del..."
816,13680226,Marry broke her nail and has a party tomorrow....,"Marry: I broke my nail ;(\r\nTina: oh, no!\r\n..."


In [4]:
# get people in the dialogue
def get_people(txt):
    names = re.findall(r"(?:^|\n)(\w+):", txt, re.MULTILINE)
    people = list(set(names))
    return people
df['people'] = df['dialogue'].map(get_people)

In [5]:
# Filter out data that is not equal to 2-people conversation
df1 = df.loc[df['people'].str.len() == 2]

# Checking whether is a valid operation
df1['people'].str.len().max()

# reset index
df1.reset_index(drop=True, inplace=True)

In [6]:
# Get TFIDF for each word
names = [n.lower() for n in set.union(*[set(p) for p in df1['people']])]
vectorizer = TfidfVectorizer(ngram_range = (1, 1), token_pattern = r"\b((?:[A-Za-z]{2,}|I)(?:'[A-Za-z]+)?)\b", stop_words = names, norm='l1')
tfidf = vectorizer.fit_transform(df1['dialogue'])
scores = tfidf.toarray()
# df1['tfidf'] = scores
tfi_df = pd.DataFrame(scores, columns=vectorizer.get_feature_names_out(), index=df1['id'])
tfi_df

Unnamed: 0_level_0,aa,aaa,aaaa,aaaaa,aaaaaaa,aaaaaaaa,aaaaaaaaaa,aaaaaaaaaaaa,aaaaaaaaaaaaaa,aaaaaaaaaaaaaand,...,zoomed,zorba,zowie,zubac,zucchini,zulczyk,zumba,zur,zzz,zzzzzz
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
13818513,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13728867,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13681000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13730747,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13728094,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13829423,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13727710,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13829261,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13680226,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# def modify(txt):
#     tokens = re.findall(r"\b([\w']{2,}|I)\b", txt, re.MULTILINE)
#     names = set(re.findall(r"(?:^|\n)(\w+):", txt, re.MULTILINE))
#     return [i for i in tokens if i not in names]
# df1['modify_dialogue'] = df1['dialogue'].apply(modify)

In [8]:
# df1

In [9]:
def get_size(txt):
    tokens = re.findall(r"\b([\w']{2,}|I)\b", txt, re.MULTILINE)
    names = set(re.findall(r"(?:^|\n)(\w+):", txt, re.MULTILINE))
    return len([i for i in tokens if i not in names])
    
# Size of the diaglogue, names not included
df1['size'] = df1['dialogue'].map(get_size)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['size'] = df1['dialogue'].map(get_size)


In [10]:
df1.reset_index(drop=True, inplace=True)