In [1]:
# %pip install py7zr

In [2]:
import re
import pandas as pd
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
dataset = load_dataset("samsum")

train_data = dataset['train']
valid_data = dataset['validation']
test_data = dataset['test']

print(f"Train set size: {len(train_data)}")
print(f"Validation set size: {len(valid_data)}")
print(f"Test set size: {len(test_data)}")

Train set size: 14732
Validation set size: 818
Test set size: 819


In [4]:
print(train_data[0])

{'id': '13818513', 'dialogue': "Amanda: I baked  cookies. Do you want some?\r\nJerry: Sure!\r\nAmanda: I'll bring you tomorrow :-)", 'summary': 'Amanda baked cookies and will bring Jerry some tomorrow.'}


In [5]:
print(train_data[0]['dialogue'], '\n')
print(train_data[0]['summary'])

Amanda: I baked  cookies. Do you want some?
Jerry: Sure!
Amanda: I'll bring you tomorrow :-) 

Amanda baked cookies and will bring Jerry some tomorrow.


In [6]:
train_df = pd.DataFrame(train_data)
valid_df = pd.DataFrame(valid_data)
test_df = pd.DataFrame(test_data)

train_df.to_csv("../dataset/samsum_train.csv", index=False)
valid_df.to_csv("../dataset/samsum_valid.csv", index=False)
test_df.to_csv("../dataset/samsum_test.csv", index=False)

In [7]:
train_df['is_train'] = 1
valid_df['is_train'] = 0
test_df['is_train'] = -1
total_df = pd.concat([train_df, valid_df, test_df], ignore_index=True)

In [8]:
def merge_same_speaker_turns(dialogue):
    lines = dialogue.split('\r\n')
    merged_lines = []
    current_speaker = None
    current_sentence = []

    for line in lines:
        match = re.match(r"(\w+): (.+)", line)
        if match:
            speaker = match.group(1)
            sentence = match.group(2)

            if speaker == current_speaker:
                current_sentence.append(sentence)
            else:
                if current_speaker:
                    merged_lines.append(f"{current_speaker}: {' '.join(current_sentence)}")
                current_speaker = speaker
                current_sentence = [sentence]
        else:
            merged_lines.append(line)

    if current_speaker:
        merged_lines.append(f"{current_speaker}: {' '.join(current_sentence)}")

    return '\r\n'.join(merged_lines)

total_df['processed_dialogue'] = total_df['dialogue'].apply(merge_same_speaker_turns)

In [9]:
org = total_df.iloc[2]['dialogue']
prc = total_df.iloc[2]['processed_dialogue']

print(f"{org}\n\n{prc}")

Tim: Hi, what's up?
Kim: Bad mood tbh, I was going to do lots of stuff but ended up procrastinating
Tim: What did you plan on doing?
Kim: Oh you know, uni stuff and unfucking my room
Kim: Maybe tomorrow I'll move my ass and do everything
Kim: We were going to defrost a fridge so instead of shopping I'll eat some defrosted veggies
Tim: For doing stuff I recommend Pomodoro technique where u use breaks for doing chores
Tim: It really helps
Kim: thanks, maybe I'll do that
Tim: I also like using post-its in kaban style

Tim: Hi, what's up?
Kim: Bad mood tbh, I was going to do lots of stuff but ended up procrastinating
Tim: What did you plan on doing?
Kim: Oh you know, uni stuff and unfucking my room Maybe tomorrow I'll move my ass and do everything We were going to defrost a fridge so instead of shopping I'll eat some defrosted veggies
Tim: For doing stuff I recommend Pomodoro technique where u use breaks for doing chores It really helps
Kim: thanks, maybe I'll do that
Tim: I also like usin

In [10]:
def extract_speakers(dialogue):
    speakers = set(re.findall(r"([A-Za-z]+):", dialogue))
    return speakers

all_speakers = set()
total_df['processed_dialogue'].apply(lambda x: all_speakers.update(extract_speakers(x)))

for speaker in all_speakers:
    print(speaker)

Garrett
Claude
Sasuka
Lucky
use
Jere
Erikah
Shannon
Temple
Denver
Magdalene
Tallia
Aldo
Timu
Forlan
Lang
written
Princeton
yet
necessarily
Dell
Ike
Michal
top
Gael
Audree
got
Thea
add
Job
Alisa
Iris
Ester
Ayden
Hedge
Debra
Leonor
Sinclair
Pietr
Ostoja
Hazel
Zoli
excellent
Apollo
Horace
Adrew
Barbra
Anastasia
Keano
Tessie
Odin
Zendaya
Keaton
calculated
Total
Khadija
Tex
Neva
options
Farah
Niko
Trinny
Carry
Dolloway
AJ
study
me
Siddhi
Alek
pics
Irwin
Siobhan
tonight
Bettina
Yen
look
Leonard
Viv
Mauri
Brooke
Pierce
Daria
Lexie
Younes
Cadence
phone
Twister
Dineo
Alianna
Olga
Mags
Palin
Elektra
Matilda
Charlotte
Nathalie
Bernard
Byron
Bryant
Emi
Mick
Mela
Geoffrey
Xenna
Corin
Jesus
Serena
Ely
Ana
Jannette
Dempsey
Sylvia
Wojtek
Anabella
Tyree
Tess
Jashua
Fisher
Thomson
Blaine
fundings
Marti
Eleonore
Ethel
Binni
Oona
Gavyn
do
Aurelia
Aneta
Pedro
Keifer
Gabriella
obviously
Lynda
Stephen
Hayley
Gwendolen
Jacopo
Angelo
Connor
haha
Kraig
Rosemary
Corinne
Felicity
beans
Agnes
Engelbert
Rylen
Jamis

In [11]:
def remove_emoticons(dialogue):
    emoticon_pattern = r"[:;=Xx8]-?[()PD3/\|\\*oO]|<3|:<|:<<<|;\)|;\)\)|<\|:‑\)|<<<<|<---|;>|<<|\)|<`ヘ´>"
    return re.sub(emoticon_pattern, '', dialogue)

total_df['processed_dialogue'] = total_df['processed_dialogue'].apply(remove_emoticons)

In [12]:
org = total_df.iloc[4]['dialogue']
prc = total_df.iloc[4]['processed_dialogue']

print(f"{org}\n\n{prc}")

Sam: hey  overheard rick say something
Sam: i don't know what to do :-/
Naomi: what did he say??
Sam: he was talking on the phone with someone
Sam: i don't know who
Sam: and he was telling them that he wasn't very happy here
Naomi: damn!!!
Sam: he was saying he doesn't like being my roommate
Naomi: wow, how do you feel about it?
Sam: i thought i was a good rommate
Sam: and that we have a nice place
Naomi: that's true man!!!
Naomi: i used to love living with you before i moved in with me boyfriend
Naomi: i don't know why he's saying that
Sam: what should i do???
Naomi: honestly if it's bothering you that much you should talk to him
Naomi: see what's going on
Sam: i don't want to get in any kind of confrontation though
Sam: maybe i'll just let it go
Sam: and see how it goes in the future
Naomi: it's your choice sam
Naomi: if i were you i would just talk to him and clear the air

Sam: hey  overheard rick say something i don't know what to do 
Naomi: what did he say??
Sam: he was talking o

In [13]:
def extract_special_tokens(dialogue):
    # 특수 토큰 패턴 정의 (<>로 감싸진 토큰)
    special_tokens = set(re.findall(r"<[^>]+>", dialogue))
    return special_tokens

all_special_tokens = set()
total_df['processed_dialogue'].apply(lambda x: all_special_tokens.update(extract_special_tokens(x)))

for tok in all_special_tokens:
    print(tok)

<file_link>
<file photo>
<file_other_>
<file_doc>
<file_zip>
<file_photo? or <file_photo>
<file:video>
<file:URGENT>
<file-photo>
<file:Amelia.doc>
<photo_file>
<thumb up>
<file other>
<file:jpg>
<send_file>
<foto>
<moa>
<picutre>
<file_ photo>
<gif>
<file _other>
<File:Excelsheet>
<File_photo>
<other_file>
<location>
<file_song>
<file_gif>
<link_video>
<flie_photo>
<video>
<emoticon>
<file_GIF>
<file_photo >
<OMG>
<File_link>
< link_photo>
<othre_file>
<file_movie>
<emoticon_:smiley:>
<other>
<emoticon_thumbup>
<file:photo>
<waves>
<file_photo_screenshot_from_phone>
<file_pic>
<thumbsup>
<file_photo>
<love>
<photo>
<file_git>
<file-other>
<link_other>
<file_gps>
<file_location>
<emoticon_stuck_out_tongue>
<link_photo>
<file>
<video_file>
<file_image>
<File_line>
<file_record>
<file _gif>
<file_video>
<file_others>
<file_photos>
<fIie_others>
<file.other>
<gif_file>
<file_audio>
<fole_other>
<file_docx>
<# <file_othetr>
< link>
<link>
<emoticon_smile>
<file_foto>
<‎file_photo>
<file_ o

In [14]:
total_df['dialogue'] = total_df['processed_dialogue']
total_df = total_df.drop(columns=['processed_dialogue'])

In [15]:
cleaned_train_df = total_df[total_df['is_train'] == 1].reset_index(drop=True)
cleaned_valid_df = total_df[total_df['is_train'] == 0].reset_index(drop=True)
cleaned_test_df = total_df[total_df['is_train'] == -1].reset_index(drop=True)

In [16]:
print(cleaned_train_df.columns)
print(cleaned_valid_df.columns)
print(cleaned_test_df.columns)

Index(['id', 'dialogue', 'summary', 'is_train'], dtype='object')
Index(['id', 'dialogue', 'summary', 'is_train'], dtype='object')
Index(['id', 'dialogue', 'summary', 'is_train'], dtype='object')


In [17]:
cleaned_train_df.to_csv("../dataset/cleaned_samsum_train.csv", index=False)
cleaned_valid_df.to_csv("../dataset/cleaned_samsum_valid.csv", index=False)
cleaned_test_df.to_csv("../dataset/cleaned_samsum_test.csv", index=False)