# Imports

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sentence_transformers import SentenceTransformer
from datasets import Dataset, load_metric, DatasetDict, load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, pipeline, T5Tokenizer, T5ForConditionalGeneration, DataCollatorForSeq2Seq
import evaluate
from rouge_score import rouge_scorer
import spacy
import re
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from tqdm.auto import tqdm
tqdm.pandas()


pd.options.plotting.backend = "plotly"
#!python -m spacy download en_core_web_trf
nlp = spacy.load('en_core_web_trf')

# Utils

In [2]:
EMOJI_PATTERN = re.compile(
    "["
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U00002702-\U000027B0"  # Dingbats
    "\U000024C2-\U0001F251" 
    "]+"
)

def data_preprocess(df:pd.DataFrame, columns: list[str]) -> pd.DataFrame:
    for column in columns:
        df[column] = df[column].str.replace('\r\n', '. ') # Replacing newline characters with dot
        df[column] = df[column].str.replace(r'([?!\.]+)(\.)', r'\1', n=1, regex=True) # Removing unnecessary dots
        df[column] = df[column].str.replace(EMOJI_PATTERN, '', regex=True) # Removing Emojis
        df[column] = df[column].str.replace(r'\s{2,}', ' ', regex=True) # removing multiple space

    return df

# EDA

### Dialogues

In [3]:
df_d = pd.read_csv('data/dialogues.csv')
df_d = data_preprocess(df_d, ['dialogue'])
df_d.head()

Unnamed: 0,id,dialogue
0,13728094,Sam: hey overheard rick say something. Sam: i ...
1,13611672,John: Ave. Was there any homework for tomorrow...
2,13809912,Matt: Do you want to go for date? Agnes: Wow! ...
3,13828741,Aria: You won't believe who I've just met! Ari...
4,13828591,Victor: do you want to go to the museum tonigh...


In [4]:
df_d.isna().sum()

id          0
dialogue    0
dtype: int64

In [5]:
df_d['dialogue_len'] = df_d.dialogue.apply(len)
fig = px.box(df_d, y="dialogue_len", title="Boxplot of dialogue length")
fig.show()

### Summary pieces

In [6]:
df_s = pd.read_csv('data/summary_pieces.csv')
df_s = data_preprocess(df_s, ['summary_piece'])
df_s.head()

Unnamed: 0,summary_piece
0,Stacy isn't sure vodka is a good gift.
1,"Veronica, Tom and Caroline will play the follo..."
2,Angella has a birthday.
3,Helen recommends Betty her accountant.
4,They'll meet at Rylen's and help her mother ma...


In [7]:
df_s.isna().sum()

summary_piece    0
dtype: int64

We notice that summaries were chunked into sentences

In [8]:
df_s['nb_sentences'] = df_s.summary_piece.str.split(pat=".").apply(len)
df_s[df_s.nb_sentences > 2]

Unnamed: 0,summary_piece,nb_sentences
25,They decide to go to see a documentary about A...,3
56,The guest will receive Natalie's contact detai...,3
71,Carson also wants to watch Maple Leafs vs. LA ...,3
103,Jo will be in around 6.30 as the train is dela...,3
173,Jazz is going to see Bohemian Rapsody with Sar...,3
214,Sharly will collect 1 kg of onion and 0.5 kg o...,3
249,Mark will cancel his day off on Mr. Benson's r...,3
327,Rick was missing 200.14 points.,3
339,Guy needs size L. There is only one size for w...,3
489,Chris asks Tom to go in to school at 7.10 am.,3


In [9]:
df_s['summary_len'] = df_s.summary_piece.apply(len)
fig = px.box(df_s, y="summary_len", title="Boxplot of summaries length")
fig.show()

In [10]:
df_s.summary_piece.iloc[1]

"Veronica, Tom and Caroline will play the following songs: Hit the Road Jack, I Will Survive , Sugar, I Can't Feel My Face and I'm So Excited."

# Reference

In [11]:
df_r = pd.read_csv('data/reference.csv')
df_r = data_preprocess(df_r, ['dialogue', 'summary'])
df_r.head()

Unnamed: 0,id,dialogue,summary
0,13682362-1,Sam: guess what?! I went on a date last night!...,"Sam went on a blind date, with a guy she met o..."
1,13819891,"Linda: you remember Mariusz, my English studen...",Linda found out that her ex student had passed...
2,13828522,Corin: Did you hear about the outing tonight? ...,Ersin has been sick and hasn't heard about Ros...
3,13810987,Cam: Any plans for Monday? Sam: Yes. I'm busy....,Cam wants to catch up with Sam. Sam is very bu...
4,13681991,Emma: Hi! Kate: Hello! Long time. Emma: Too lo...,"For the visit of Jim's sister this weekend, Ka..."


In [12]:
df_r.isna().sum()

id          0
dialogue    0
summary     0
dtype: int64

In [13]:
df_r['dialogue_len'] = df_r.dialogue.apply(len)
df_r['summary_len'] = df_r.summary.apply(len)

vars = ['dialogue_len', 'summary_len']
fig = make_subplots(rows=1, cols=len(vars))
for i, var in enumerate(vars):
    fig.add_trace(
        go.Box(y=df_r[var],
        name=var),
        row=1, col=i+1
    )

fig.update_traces(boxpoints='all', jitter=.3)

In [14]:
df_r['entities'] = df_r['summary'].progress_apply(lambda sent: list(set((ent.text.lower(), ent.label_) for ent in nlp(sent).ents if ent.label_ == 'PERSON')))
df_r


  0%|          | 0/362 [00:00<?, ?it/s]

Unnamed: 0,id,dialogue,summary,dialogue_len,summary_len,entities
0,13682362-1,Sam: guess what?! I went on a date last night!...,"Sam went on a blind date, with a guy she met o...",1456,99,"[(sam, PERSON)]"
1,13819891,"Linda: you remember Mariusz, my English studen...",Linda found out that her ex student had passed...,839,95,"[(linda, PERSON)]"
2,13828522,Corin: Did you hear about the outing tonight? ...,Ersin has been sick and hasn't heard about Ros...,946,189,"[(ersin, PERSON), (corin, PERSON), (rose, PERS..."
3,13810987,Cam: Any plans for Monday? Sam: Yes. I'm busy....,Cam wants to catch up with Sam. Sam is very bu...,923,130,"[(sam, PERSON), (cam, PERSON)]"
4,13681991,Emma: Hi! Kate: Hello! Long time. Emma: Too lo...,"For the visit of Jim's sister this weekend, Ka...",1428,154,"[(kate, PERSON), (emma, PERSON), (jim, PERSON)]"
...,...,...,...,...,...,...
357,13717011,Perry: have you thought about holiday yet? Mar...,"Perry, Marlow, Janet and Forster discuss their...",972,187,"[(forster, PERSON), (perry, PERSON), (janet, P..."
358,13811786,Lee: I'm scared and confused. The digital wiza...,The client manager suited up. He has a present...,1447,181,"[(lee, PERSON)]"
359,13828346,Adam: good morning!! Rachel: good morning adam...,"Rachel has a new siamese cat, a girl called Po...",1058,216,"[(rachel, PERSON), (adam, PERSON), (portia, PE..."
360,13818744,Amka: Have you ever been to Erasmus? . Amka: a...,"Mick's never done any student exchange before,...",1035,238,"[(mick, PERSON), (amka, PERSON)]"


In [15]:
def calculate_score(row):
    # Count the number of entities in summaries that are present in the dialogue
    common_entities = [entity[0] for entity in row['entities'] if entity[0].lower() in row['dialogue'].lower()]

    # Calculate the score based on the number of common words
    # I there is no person entities we put a score of 1. We don't want to penalize summary model misclasification
    return len(common_entities) / len(row['entities']) if len(row['entities']) > 0 else 1

df_r['entity_score'] = df_r.apply(calculate_score, axis=1)
df_r

Unnamed: 0,id,dialogue,summary,dialogue_len,summary_len,entities,entity_score
0,13682362-1,Sam: guess what?! I went on a date last night!...,"Sam went on a blind date, with a guy she met o...",1456,99,"[(sam, PERSON)]",1.0
1,13819891,"Linda: you remember Mariusz, my English studen...",Linda found out that her ex student had passed...,839,95,"[(linda, PERSON)]",1.0
2,13828522,Corin: Did you hear about the outing tonight? ...,Ersin has been sick and hasn't heard about Ros...,946,189,"[(ersin, PERSON), (corin, PERSON), (rose, PERS...",1.0
3,13810987,Cam: Any plans for Monday? Sam: Yes. I'm busy....,Cam wants to catch up with Sam. Sam is very bu...,923,130,"[(sam, PERSON), (cam, PERSON)]",1.0
4,13681991,Emma: Hi! Kate: Hello! Long time. Emma: Too lo...,"For the visit of Jim's sister this weekend, Ka...",1428,154,"[(kate, PERSON), (emma, PERSON), (jim, PERSON)]",1.0
...,...,...,...,...,...,...,...
357,13717011,Perry: have you thought about holiday yet? Mar...,"Perry, Marlow, Janet and Forster discuss their...",972,187,"[(forster, PERSON), (perry, PERSON), (janet, P...",1.0
358,13811786,Lee: I'm scared and confused. The digital wiza...,The client manager suited up. He has a present...,1447,181,"[(lee, PERSON)]",1.0
359,13828346,Adam: good morning!! Rachel: good morning adam...,"Rachel has a new siamese cat, a girl called Po...",1058,216,"[(rachel, PERSON), (adam, PERSON), (portia, PE...",1.0
360,13818744,Amka: Have you ever been to Erasmus? . Amka: a...,"Mick's never done any student exchange before,...",1035,238,"[(mick, PERSON), (amka, PERSON)]",1.0


We can see that name entity recognition on PERSON is not 100% accurate. This number decreases if we start taking in consideration other labels. This few misclassification are due the to some outputs errors of the summarization.

However, we an conclude that there is a 'PERSON' entity in each and single entity.

In [16]:
df_r[df_r.entity_score < 1]

Unnamed: 0,id,dialogue,summary,dialogue_len,summary_len,entities,entity_score
142,13731151,"Brian: hey love, how long till you get home. R...",Brian needs Melissa's toy. Mellisa looks upset...,690,154,"[(mellisa, PERSON), (melissa, PERSON), (brian,...",0.75
154,13681324,Andy: Hi! Thanks for the recommendation! Jane:...,Andy and Jane both liked the new film by Quent...,1169,152,"[(quentin tarantino, PERSON), (jane, PERSON), ...",0.666667
156,13716459,Pete: I need a present for Katie. Pete: Any su...,"Pete and Kate have anniversary. Pete, Mark and...",517,154,"[(kate, PERSON), (mark, PERSON), (john, PERSON...",0.75
197,13729605,"Anton: Hey, need you to come back to the offic...",Eve has to come back to the office on her day ...,1683,221,"[(anton, PERSON), (charlotte briggs, PERSON), ...",0.666667
203,13728829,"Jaca: Hi, we are in the Hospital now. Ostoja: ...",Daca was supposed to have an operation but he ...,1158,191,"[(jaca, PERSON), (daca, PERSON), (ostoja, PERS...",0.666667
223,13811659,"ALLISON: HI SIS, HOW ARE YOU? JUAN : WELL, I'M...",Juan is going to travel to Brazil. Hew went th...,1019,173,"[(hew, PERSON), (juan, PERSON)]",0.5
248,13728910,Tom: Little bit of rain. Tom: Wiem from office...,Internet connection is bad where Tom is. Tom w...,1226,300,"[(tom, PERSON), (vicki ellen's, PERSON)]",0.5
249,13828916,Amy: listen. Amy: maybe we'll throw Julie a su...,Amy and Jessica are planning a surprise birthd...,886,216,"[(julie, PERSON), (jessica, PERSON), (julia, P...",0.8
272,13728958,"Deirdre: Hi Beth, how are you love? Beth: Hi A...",Beth wants to organize a girls weekend to cele...,1884,227,"[(deidre, PERSON), (beth, PERSON)]",0.5
297,13729680,Malik: have you heard of that paleo diet? Mali...,Malik and Samanta want to lose weight. They wi...,952,109,"[(malik, PERSON), (samanta, PERSON)]",0.5


### rouge score in reference

In [17]:
def calculate_rouge(text, summary):
    # score = rouge.compute(references=[text], predictions=[summary])
    scorer = rouge_scorer.RougeScorer(['rouge1'])
    score = scorer.score(text, summary)
    return score['rouge1'].fmeasure
df_r['rouge1'] = df_r.progress_apply(lambda row: calculate_rouge(row['dialogue'], row['summary']), axis=1)
print(f'Reference dataset rouge1 score : {df_r.rouge1.describe()["mean"]}')

  0%|          | 0/362 [00:00<?, ?it/s]

Reference dataset rouge1 score : 0.18544601639417493


### cosine similarity score in reference

In [18]:
model = SentenceTransformer('all-MiniLM-L6-v2')
def calculate_cos_sim(text, summary):
    embeddings_d = model.encode(text)
    embeddings_s = model.encode(summary)
    cos_sim = cosine_similarity([embeddings_d], [embeddings_s])[0][0]
    norm_cos_sim = (cos_sim - (-1)) / (1 - (-1))
    return  norm_cos_sim
df_r['cos_sim'] = df_r.progress_apply(lambda row: calculate_cos_sim(row['dialogue'], row['summary']), axis=1)
print(f'Reference dataset cosine similarity score : {df_r.cos_sim.describe()["mean"]}')

  0%|          | 0/362 [00:00<?, ?it/s]

Reference dataset cosine similarity score : 0.8258682524795691


# Strategy

## Name Entity proportion score and similarity matrix

In [19]:
df_s['entities'] = df_s['summary_piece'].progress_apply(lambda sent: list(set(ent.text.lower() for ent in nlp(sent).ents if ent.label_ == 'PERSON')))  
df_s[['summary_piece','entities']]

  0%|          | 0/4053 [00:00<?, ?it/s]

Unnamed: 0,summary_piece,entities
0,Stacy isn't sure vodka is a good gift.,[stacy]
1,"Veronica, Tom and Caroline will play the follo...","[tom, caroline, veronica]"
2,Angella has a birthday.,[angella]
3,Helen recommends Betty her accountant.,"[betty, helen]"
4,They'll meet at Rylen's and help her mother ma...,[]
...,...,...
4048,"Jasna's son is 9 months old, he smiles and he'...",[jasna]
4049,Quentin has had another delivery and is organi...,[quentin]
4050,Bret was sick last week and hit a deer with hi...,[bret]
4051,Audrey is after a tv series to watch and Tony ...,"[audrey, tony]"


In [20]:
# We create a matrix to store the proportion of entities from summaries present in dialogues
proportion_matrix = np.zeros((len(df_s), len(df_d)))

# Iterate through each row in A and B
for i, entities in enumerate(tqdm(df_s['entities'])):
    for j, text in enumerate(df_d['dialogue']):
        entities = set(entities)
        common_entities = entities.intersection(set(text.split()))
        proportion_matrix[i, j] = len(common_entities) / len(entities) if len(entities) > 0 else 0

  0%|          | 0/4053 [00:00<?, ?it/s]

### Similarity score

In [21]:
model = "philschmid/flan-t5-base-samsum"

# tokenizer = T5Tokenizer.from_pretrained(model)
# model = T5ForConditionalGeneration.from_pretrained(model)

# def decode(text: str, tokenizer=tokenizer, model=model) -> str:
#     inputs = tokenizer.batch_encode_plus(["summarize: " + text], max_length=3300, return_tensors="pt", padding='longest')  # Batch size 1
#     outputs = model.generate(inputs['input_ids'], num_beams=4, max_length=300, early_stopping=True)
#     return [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in outputs][0]

# decode(text)


# df_d['summary'] = df_d.progress_apply(lambda row: decode(row['dialogue']), axis=1)
# df_d.to_pickle('df_d.pkl')
# df_d = pd.read_pickle('df_d.pkl')
model = SentenceTransformer('all-MiniLM-L6-v2')

dialogues = df_d.dialogue.values.tolist()
embeddings_d = model.encode(dialogues)
summaries = df_s.summary_piece.values.tolist()
embeddings_s = model.encode(summaries)

df_d['embeddings'] = embeddings_d.tolist()
df_s['embeddings'] = embeddings_s.tolist()

# Convert the 'embeddings' column to numpy arrays
vectors_d = np.array(df_d['embeddings'].tolist())
vectors_s = np.array(df_s['embeddings'].tolist())

# Create the similarity_matrix
similarity_matrix = cosine_similarity(vectors_s, vectors_d)

# Let's normalize the matrix
mat_min, mat_max = similarity_matrix.min(), similarity_matrix.max()
# similarity_matrix_norm = (similarity_matrix - mat_min) / (mat_max - mat_min)
similarity_matrix_norm = (similarity_matrix - (-1)) / (1 - (-1))
np.savetxt('similarity_matrix.txt', similarity_matrix_norm)


In [59]:
df_d['rouge1'] = df_d.progress_apply(lambda row: calculate_rouge(row['dialogue'], row['summary']), axis=1)
print(f'Reconstructed dialogue dataset rouge1 score : {df_d.rouge1.describe()["mean"]}')


  0%|          | 0/1345 [00:00<?, ?it/s]

Reconstructed dialogue dataset rouge1 score : 0.2130646905792265


We now create our final matrix wich is the combination of both scores. The 2 scores have the same importance.

In [24]:
score_matrix = proportion_matrix * 0.5 + similarity_matrix_norm * 0.5

# Keep the best dialogue for each summary base on the homemade score
most_similar_indices = np.argmax(score_matrix, axis=1)
scores = np.max(score_matrix, axis=1)

# Save the dialogues id
df_s['dialogue_id'] = most_similar_indices
df_s['cosim_nes'] = scores

# Aggergate the results
# res = pd.merge(df_d, df_s[df_s['cosim_nes'] > 0.6], left_index=True, right_on='dialogue_id')[['dialogue', 'summary_piece', 'dialogue_id']]
res = pd.merge(df_d, df_s, left_index=True, right_on='dialogue_id')[['dialogue', 'summary_piece', 'dialogue_id']]
res = res.groupby('dialogue', as_index=False)['summary_piece'].apply(list)
res

Unnamed: 0,dialogue,summary_piece
0,"A: How was your Oriental trip? . B: OMG, it wa...",[A went to Vietnam for a surgery conference he...
1,A: One box from the fist. :D. M: it's been lik...,"[Mash works about 12-15 hours per week, which ..."
2,Aaron: Hi Abbey! :). Abbey: Hi Aaron! It's bee...,"[He saw Abbey at the bus station., Aaron quit ..."
3,"Aaron: Hi Bob, are you sending Katie to the sc...",[Bob and Aaron want Sandra to convince Marie t...
4,"Abby: <file_video>. Abby: check out this song,...","[Olivia has found a new group of Kpop dance., ..."
...,...,...
1206,Zack: It's good to see you online! Lea: Hah! H...,"[They had a lot of fun., Lea is back from her ..."
1207,Zack: My guitar's busted!! Wilson: What happen...,"[John's is already broken., Wilson will lend h..."
1208,Zoe: Good morning. Caleb: good morning. Zoe: h...,[Caleb will come back at 11pm.]
1209,"Zoe: Hi Alan, i hope i'll be on time next mond...",[Alan will pick her up in 2 hours anyway.]


In [25]:
res['summary_raw'] = res.summary_piece.apply(' '.join)
res['rouge1'] = res.progress_apply(lambda row: calculate_rouge(row['dialogue'], row['summary_raw']), axis=1)
print(f'Reference dataset rouge1 score : {df_r.rouge1.describe()["mean"]}')
print(f'Re constructed dataset rouge1 score : {res.rouge1.describe()["mean"]}')

  0%|          | 0/1211 [00:00<?, ?it/s]

Reference dataset rouge1 score : 0.18544601639417493
Re constructed dataset rouge1 score : 0.15415890210389394


## Rouge matrix strategy

In [26]:
# df_d = pd.read_pickle('df_d.pkl')
# We create a matrix to store the proportion of entities from summaries present in dialogues
rouge_matrix = np.zeros((len(df_s), len(df_d)))
scorer = rouge_scorer.RougeScorer(['rouge1'])
dialogues = df_d.dialogue.values.tolist()
summaries = df_s.summary_piece.values.tolist()


# Iterate through each row in summaries and dialogues
for i, summary in enumerate(tqdm(summaries)):
    for j, text in enumerate(dialogues):
        score = scorer.score(text, summary)
        # score = rouge.compute(references=[text], predictions=[summary])
        rouge_matrix[i, j] = score['rouge1'].precision
np.savetxt('score_matrix.txt', rouge_matrix)

  0%|          | 0/4053 [00:00<?, ?it/s]

In [27]:
rouge_matrix = np.loadtxt('score_matrix.txt')
# Keep the best dialogue for each summary base on the rouge1-precision score
most_similar_indices = np.argmax(rouge_matrix, axis=1)
rouge_scores = np.max(rouge_matrix, axis=1)

# Save the dialogues id
df_s['dialogue_id'] = most_similar_indices
df_s['rouge_score'] = rouge_scores

# Aggergate the results
# res = pd.merge(df_d, df_s[df_s['rouge_score'] > 0.6], left_index=True, right_on='dialogue_id')[['dialogue', 'summary_piece']]
res_rouge = pd.merge(df_d, df_s, left_index=True, right_on='dialogue_id')[['dialogue', 'summary_piece','dialogue_id']]
res_rouge = res_rouge.groupby('dialogue', as_index=False)['summary_piece'].apply(list)
res_rouge.to_csv('check.csv')

In [28]:
res_rouge['summary_raw'] = res_rouge.summary_piece.apply(' '.join)
res_rouge['rouge1'] = res_rouge.progress_apply(lambda row: calculate_rouge(row['dialogue'], row['summary_raw']), axis=1)
print(f'Reference dataset rouge1 score : {df_r.rouge1.describe()["mean"]}')
print(f'Re constructed dataset rouge1 score : {res_rouge.rouge1.describe()["mean"]}')

  0%|          | 0/1203 [00:00<?, ?it/s]

Reference dataset rouge1 score : 0.18544601639417493
Re constructed dataset rouge1 score : 0.18623855025157252


# Summaries re-order

In [35]:
import math

import itertools
import sys
import munkres
import numpy as np

res_rouge['nb_summaries'] = res_rouge.summary_piece.apply(len)

def maximize_trace(a):
    """
    Maximize trace by minimizing the Frobenius norm of 
    `np.dot(p, a)-np.eye(a.shape[0])`, where `a` is square and
    `p` is a permutation matrix. Returns permuted version of `a` with
    maximal trace.
    """

    assert a.shape[0] == a.shape[1]
    d = np.zeros_like(a)
    n = a.shape[0]
    b = np.eye(n, dtype=int)
    for i, j in itertools.product(range(n), range(n)):
        d[j, i] = sum((b[j, :]-a[i, :])**2)
    m = munkres.Munkres()
    inds = m.compute(d)
    return inds

def split_text(text: str, n: int) -> list:
    step = math.ceil(len(text)/n)
    return [text[i:i + step] for i in range(0, len(text), step)]

def order_summary(text, summaries):
    nb_summaries = len(summaries)
    splits = split_text(text, nb_summaries)
    return splits

def dual_similarity(dialogues, summaries):
    embeddings_d = model.encode(dialogues)
    embeddings_s = model.encode(summaries)

    # Create the similarity_matrix
    similarity_matrix = cosine_similarity(embeddings_s, embeddings_d)
    # most_similar_indices = np.argmax(similarity_matrix, axis=1)
    inds = maximize_trace(similarity_matrix)
    inds.sort(key=lambda x: x[1])
    ordered_summaries = [summaries[i[0]] for i in inds]
    return ordered_summaries

# remove wrong matches
res = res_rouge[res_rouge['nb_summaries'] < 10]    

res['splitted_dialogue'] = res.apply(lambda row: order_summary(row['dialogue'], row['summary_piece']), axis=1)
res['ordered_summary_piece'] = res.progress_apply(lambda row: dual_similarity(row['splitted_dialogue'], row['summary_piece']), axis=1)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



  0%|          | 0/1148 [00:00<?, ?it/s]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [36]:
res['final_summary'] = res_rouge.summary_piece.apply(' '.join)
res[['dialogue', 'final_summary']]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,dialogue,final_summary
0,"A: How was your Oriental trip? . B: OMG, it wa...",A went to Vietnam for a surgery conference hel...
1,Aaron: Hi Abbey! :). Abbey: Hi Aaron! It's bee...,He saw Abbey at the bus station. Aaron quit hi...
2,"Aaron: Hi Bob, are you sending Katie to the sc...",Bob and Aaron want Sandra to convince Marie to...
3,"Abby: <file_video>. Abby: check out this song,...",The lead singer is Abby's former schoolmate.
4,Abby: Jack! Send me the link to that website! ...,"Since the website is not ready yet, Jack sends..."
...,...,...
1198,Zack: It's good to see you online! Lea: Hah! H...,He enjoyed his flight but didn't like the food...
1199,Zack: My guitar's busted!! Wilson: What happen...,Wilson will lend him one of his guitars so he ...
1200,Zoe: Good morning. Caleb: good morning. Zoe: h...,Caleb will visit Zoe today at 4pm. Caleb will ...
1201,"Zoe: Hi Alan, i hope i'll be on time next mond...",Oscar is free at 7pm and will come to Isla and...


In [37]:
result = res.progress_apply(lambda row: calculate_rouge(row['dialogue'], row['final_summary']), axis=1)
print(f'final dataset rouge1 score : {result.describe()["mean"]}')

  0%|          | 0/1148 [00:00<?, ?it/s]

final dataset rouge1 score : 0.1779096282055624


In [38]:
result = res.progress_apply(lambda row: calculate_cos_sim(row['dialogue'], row['final_summary']), axis=1)
print(f'final dataset cosine similarity score : {result.describe()["mean"]}')

  0%|          | 0/1148 [00:00<?, ?it/s]

final dataset cosine similarity score : 0.7954361644015365


In [47]:
res_ = res.drop_duplicates(['dialogue'])

In [52]:
res_ = res.drop_duplicates(['dialogue'])
last = df_d.merge(res_, on='dialogue', validate='many_to_one')
last = last[['id', 'dialogue', 'final_summary']]
last.to_csv('final.csv')

## DRAFT : Summarization fine tuning tentative
The following code is a draft. I tried to fine tune a bart and T5 model on the reference dataset.
I couldn't achieve it because I do not have enough computer power

In [None]:
# Dataset creation
ds = Dataset.from_pandas(df_r.drop(columns=['id','dialogue_len','summary_len'])).rename_column('dialogue', 'text')
ds_train = ds.train_test_split(test_size=0.2)
ds_eval_test = ds_train['test'].train_test_split(test_size=0.5)
ds = DatasetDict({
   'train': ds_train['train'],
   'eval': ds_eval_test['train'],
   'test': ds_eval_test['test']
})

# preprocess
max_input = 3300
max_target = 300
model = "ainize/bart-base-cnn"
tokenizer = AutoTokenizer.from_pretrained(model)
def preprocess_data(data_to_process):
  # get all the dialogues
  inputs = [dialogue for dialogue in data_to_process['text']]
  outputs = [summary for summary in data_to_process['summary']]
  #tokenize the dialogues
  model_inputs = tokenizer(inputs,  max_length=max_input, padding='max_length', truncation=True)
  # tokenize the summaries
  targets = tokenizer(text_target=outputs, max_length=max_target, padding='max_length', truncation=True)
    
  # set labels
  model_inputs['labels'] = targets['input_ids']
  # return the tokenized data
  # input_ids, attention_mask and labels
  return model_inputs

tokenize_data = ds.map(preprocess_data, batched = True)

# training
rouge = load_metric("rouge")

def compute_metrics(prediction):
    labels_ids = prediction.label_ids
    pred_ids = prediction.predictions

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=['rouge2'])['rouge2'].mid

    return {
        'rouge2_precision': round(rouge_output.precision, 4),
        'rouge2_recall': round(rouge_output.recall, 4),
        'rouge2_f_measure': round(rouge_output.fmeasure, 4)
    }

model = AutoModelForSeq2SeqLM.from_pretrained(model)
args = Seq2SeqTrainingArguments(
    'bart-base-cnn-dialogue', #save directory
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size= 2,
    gradient_accumulation_steps=2,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=3,
    predict_with_generate=True,
    eval_accumulation_steps=3
    )

trainer = Seq2SeqTrainer(
    model, 
    args,
    train_dataset=tokenize_data['train'],
    eval_dataset=tokenize_data['eval'],
    # data_collator=collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)
trainer.train()



In [None]:
ds = Dataset.from_pandas(df)
ds = ds.train_test_split(test_size=0.2)


checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

prefix = "summarize: "


def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=3300, truncation=True)

    labels = tokenizer(text_target=examples["summary"], max_length=300, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_ds = ds.map(preprocess_function, batched=True)

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

training_args = Seq2SeqTrainingArguments(
    output_dir="dialogue_sum",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    fp16=False,
    push_to_hub=False,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
