In [127]:
import pandas as pd
import re
from datasets import Dataset

In [128]:
pd.reset_option("display.max_column", None)
pd.reset_option("display.max_row", None)

In [129]:
data_path = "merged_transcripts2.xlsx"
friends_transcript_df = pd.read_excel(data_path)

In [130]:
friends_transcript_df

Unnamed: 0,Speaker,Dialogue
0,Scene,"Central Perk, Chandler, Joey, Phoebe, and Moni..."
1,Monica,There's nothing to tell! He's just some guy I ...
2,Joey,"C'mon, you're going out with the guy! There's ..."
3,Chandler,"All right Joey, be nice. So does he have a hu..."
4,Phoebe,"Wait, does he eat chalk?\n\n(They all stare, b..."
...,...,...
62853,Ross,I think I know too but I've been really wrong ...
62854,Ross,I'm sorry... we... we can't.
62855,Charlie,"All right, all right."
62856,Ross,"I mean, you just went out with my best friend,..."


In [131]:
# Remove actions from transcript
def remove_paranthesis(text):
    result = re.sub(r'\(.*?\)','',text)
    return result


In [132]:
friends_transcript_df['Dialogue'] = friends_transcript_df['Dialogue'].apply(remove_paranthesis)

In [133]:
friends_transcript_df['Dialogue'] 

0        Central Perk, Chandler, Joey, Phoebe, and Moni...
1        There's nothing to tell! He's just some guy I ...
2        C'mon, you're going out with the guy! There's ...
3        All right Joey, be nice.  So does he have a hu...
4                             Wait, does he eat chalk?\n\n
                               ...                        
62853    I think I know too but I've been really wrong ...
62854                         I'm sorry... we... we can't.
62855                                All right, all right.
62856    I mean, you just went out with my best friend,...
62857    What?\n\n\n\nEnd\n\n[1] Homo ergaster: Some sc...
Name: Dialogue, Length: 62858, dtype: object

In [134]:
friends_transcript_df['number_of_words']  = friends_transcript_df['Dialogue'].str.strip().str.split(" ")
friends_transcript_df['number_of_words'] = friends_transcript_df['number_of_words'].apply(lambda x: len(x))

In [135]:
friends_transcript_df.sample(5)

Unnamed: 0,Speaker,Dialogue,number_of_words
18772,Quartet,Your loyal loving boyfriend Ross..... Ross!\n\...,7
26212,Devon,Do you realise that we have not seen each othe...,17
35682,Chandler,Really?,1
37937,Chandler,Can I check out what she did to my room?,10
45213,Chandler,Mm! Night Gar’!,3


### Correction of Speaker names

In [136]:
pd.reset_option('display.max_columns', None)
pd.reset_option('display.max_rows', None)

friends_transcript_df['Speaker'].value_counts()

Speaker
Rachel            8876
Ross              8623
Chandler          8021
Monica            7967
Joey              7828
                  ... 
Marge                1
WRITER               1
INTERCOM             1
AMGER                1
Paleontologist       1
Name: count, Length: 404, dtype: int64

In [137]:
# Standardizing speaker names
friends_transcript_df['Speaker'] = friends_transcript_df['Speaker'].replace({
    'RACHEL': 'Rachel',
    'ROSS': 'Ross',
    'CHANDLER': 'Chandler',
    'MONICA': 'Monica',
    'JOEY': 'Joey',
    'PHOEBE': 'Phoebe'
})

# Verifying the changes
friends_transcript_df['Speaker'].value_counts()


Speaker
Rachel            9335
Ross              9231
Chandler          8505
Monica            8481
Joey              8319
                  ... 
JANICE               1
Ashley               1
Janitor              1
Commercial           1
Paleontologist       1
Name: count, Length: 398, dtype: int64

In [138]:
friends_transcript_df['Speaker'].unique()

array(['Scene', 'Monica', 'Joey', 'Chandler', 'Phoebe', 'All', 'Ross',
       'Rachel', 'Waitress', 'Paul', 'Frannie', 'Customer', 'by', 'Max',
       'David', 'Janice', 'Sandy', 'Charlie', 'Both', 'Mike', 'Precious',
       'Colleen', 'Bill', 'Owen', 'Assistant', 'Amanda', 'Glenda',
       'Written', 'Produced', 'Transcribed', 'Jack', 'Judy', 'Everybody',
       'Estelle', 'Emma', 'Amy', 'Announcer', 'Benjamin', 'Laura', 'Host',
       'Erica', 'Waiter', 'Sarah', 'Girl', 'Missy', 'Voice', 'Donny',
       'Gene', 'Man', 'Roy', 'Henrietta', 'Adrienne', 'Nurse', 'Tape',
       'Director', 'Clerk', 'Gunther', 'Realtor', 'Mackenzie', 'Mark',
       'Woman', 'Rita', 'Jennifer', 'Lady', 'Doctor', 'Guy', 'Paolo',
       'Carol', 'Susan', 'Guys', 'Girls', 'Roger', 'Ronni', 'Lorraine',
       'Kristin', 'Celia', 'Steve', 'Ursula', 'Teacher', 'Helen', 'Nina',
       'Jamie', 'Fran', 'Intercom', 'Luisa', 'Barry', 'Marsha', 'Robbie',
       'Bernice', 'Bobby', 'Mindy', 'Danielle', 'Julie', 'Paulo'

In [139]:
pd.reset_option('display.max_rows', None)
friends_transcript_df[friends_transcript_df['Speaker'] == 'Scene']

Unnamed: 0,Speaker,Dialogue,number_of_words
0,Scene,"Central Perk, Chandler, Joey, Phoebe, and Moni...",9
107,Scene,"The Subway, Phoebe is singing for change.]",7
109,Scene,"Ross's Apartment, the guys are there assemblin...",8
127,Scene,"A Restaurant, Monica and Paul are eating.]",7
137,Scene,Ross's Apartment; Ross is pacing while Joey an...,15
...,...,...,...
62728,Scene,Rachel's hotel room. Joey is standing at the d...,11
62763,Scene,the hotel game room. Monica and Mike are still...,12
62789,Scene,"hotel's bar, Ross and Mr. Oberblau are talking...",35
62806,Scene,"game room, Monica and Mike are still playing p...",10


In [140]:
friends_transcript_df_demo1 = friends_transcript_df 
print(friends_transcript_df_demo1.shape)

(62858, 3)


In [141]:
friends_transcript_df_demo1 = friends_transcript_df_demo1[friends_transcript_df_demo1['Speaker'] != 'Scene']

In [142]:
friends_transcript_df_demo1 = friends_transcript_df_demo1[friends_transcript_df_demo1['Speaker'] != 'All']

In [143]:
friends_transcript_df_demo1 = friends_transcript_df_demo1[friends_transcript_df_demo1['Speaker'] != 'by']

In [144]:
friends_transcript_df_demo1.shape

(59222, 3)

In [145]:
friends_transcript_df_demo1['Speaker'].value_counts()

Speaker
Rachel            9335
Ross              9231
Chandler          8505
Monica            8481
Joey              8319
                  ... 
Referee              1
Fireman              1
Anchorwoman          1
Employee             1
Paleontologist       1
Name: count, Length: 395, dtype: int64

In [146]:
pd.reset_option('display.max_colwidth', None)
friends_transcript_df_demo1['Speaker'].unique()

array(['Monica', 'Joey', 'Chandler', 'Phoebe', 'Ross', 'Rachel',
       'Waitress', 'Paul', 'Frannie', 'Customer', 'Max', 'David',
       'Janice', 'Sandy', 'Charlie', 'Both', 'Mike', 'Precious',
       'Colleen', 'Bill', 'Owen', 'Assistant', 'Amanda', 'Glenda',
       'Written', 'Produced', 'Transcribed', 'Jack', 'Judy', 'Everybody',
       'Estelle', 'Emma', 'Amy', 'Announcer', 'Benjamin', 'Laura', 'Host',
       'Erica', 'Waiter', 'Sarah', 'Girl', 'Missy', 'Voice', 'Donny',
       'Gene', 'Man', 'Roy', 'Henrietta', 'Adrienne', 'Nurse', 'Tape',
       'Director', 'Clerk', 'Gunther', 'Realtor', 'Mackenzie', 'Mark',
       'Woman', 'Rita', 'Jennifer', 'Lady', 'Doctor', 'Guy', 'Paolo',
       'Carol', 'Susan', 'Guys', 'Girls', 'Roger', 'Ronni', 'Lorraine',
       'Kristin', 'Celia', 'Steve', 'Ursula', 'Teacher', 'Helen', 'Nina',
       'Jamie', 'Fran', 'Intercom', 'Luisa', 'Barry', 'Marsha', 'Robbie',
       'Bernice', 'Bobby', 'Mindy', 'Danielle', 'Julie', 'Paulo',
       'Frankie', 'A

In [147]:
friends_transcript_df_demo1[friends_transcript_df_demo1['Speaker'] == 'Sandy']

Unnamed: 0,Speaker,Dialogue,number_of_words
472,Sandy,"Hi, I'm Sandy.",3
474,Sandy,"Yeah. That's okay, right?\n\n",4
486,Sandy,"Y'know, when I saw you at the store last week,...",22
488,Sandy,Yeah.\n\n,1
57575,Sandy,Hi... I'm Sandy.,3
57625,Sandy,I really do understand how hard it's gotta be ...,26
57629,Sandy,It's okay. I get that a lot doing what I do. B...,18
57631,Sandy,Her name is Deliah.,4
57634,Sandy,I realise how it's... a bit unorthodox for som...,30
57636,Sandy,"Like in my last job, I met Daniel when he was ...",54


### Corretion in Dialogue

In [148]:
# Set option to display the full text
pd.reset_option('display.max_colwidth', None)

# Displaying a sample of the dialogues again
friends_transcript_df_demo1['Dialogue'].sample(7)


27332    Ooh, three points. Both fine answers, but we w...
9500                                            You bitch.
24322                          Yeah, she broke up with me.
62124    I think he is fine! It's just that we don't kn...
14157    Na, forget it, it's probably stripped and sold...
49611    Nono, me neither. Although, uh, y'know, back i...
18915          So, you wanna get a hamburger or something?
Name: Dialogue, dtype: object

In [149]:
import re

# Function to clean the dialogues
def clean_dialogue(text):
    # 1. Remove everything within braces or brackets {} or [] or ()
    text = re.sub(r'\[.*?\]|\{.*?\}|\(.*?\)', '', text)
    
    # 2. Remove everything before ] or } or )
    text = re.sub(r'.*[\]\}\)]', '', text)
    
    # 3. Remove everything after [, {, (
    text = re.sub(r'[\[\{\(].*', '', text)
    
    # 4. Remove all \n and other escape characters, including those in between
    text = text.replace('\n', ' ').strip()
    
    # 5. Remove extra spaces that may be left behind after removing \n
    text = re.sub(r'\s+', ' ', text)
    
    return text

# Apply the function to the 'Dialogue' column
friends_transcript_df_demo1['Dialogue'] = friends_transcript_df_demo1['Dialogue'].apply(clean_dialogue)

# Displaying a sample of the cleaned dialogues
friends_transcript_df_demo1['Dialogue'].sample(7)


60869                                     What's going on?
13528    Yeah, he's lived here for years, I don't, I do...
33283                             Sounds like a fun party.
34471                           Hey! Is the show still on?
47967                 She looks exactly like Aunt Marilyn.
35932                                          Oh. Hel-lo!
21511    Hi! Yeah! Tommy’s in line for the bathroom and...
Name: Dialogue, dtype: object

In [150]:
friends_transcript_df_demo1['Dialogue'].sample(7)

21364    Okay, now when you come back I hope you rememb...
33672    Yep! As long as you understand that I'm going ...
29464              Not even, say, breaking up with Janice?
37604     Well, Phoebe that’s fine because I’m not moving.
25123          Wait, wait, why were you at the courthouse?
62316    Why not? It's brilliant! Goodbye Mike, we'll s...
1036     Well, just you know, for argument's sake, you ...
Name: Dialogue, dtype: object

In [151]:
friends_transcript_df_demo1.to_excel("merged_transcripts3.xlsx", index=False)

In [152]:
pd.reset_option("display.max_rows", None)
friends_transcript_df_demo1.sample(3)

Unnamed: 0,Speaker,Dialogue,number_of_words
45222,Tag,"No, not my tushie.",4
5256,Phoebe,"Yeah, you got to get to the hospital.",8
43853,Monica,That’s right! You lose sucker!! Please still m...,10


### Ross transcripts data analysis

In [153]:
friends_transcript_df_demo1['Ross_response_flag'] = 0
friends_transcript_df_demo1.loc[(friends_transcript_df_demo1['Speaker'] == 'Ross')   &  (friends_transcript_df_demo1['number_of_words']>5), 'Ross_response_flag'] = 1

In [154]:
friends_transcript_df_demo1.sample(2)

Unnamed: 0,Speaker,Dialogue,number_of_words,Ross_response_flag
15086,Rachel,"Yeah, why?",2,0
9484,Monica,"Of course I wouldn't approve, I mean, you were...",27,0


In [155]:
friends_transcript_df_demo1.sample(15)

Unnamed: 0,Speaker,Dialogue,number_of_words,Ross_response_flag
21283,Ross,Look...,1,0
43133,Phoebe,"What a great night, Chandler can’t do it, thes...",11,0
17373,Janice,"Oh, I cannot believe he’s using our divorce to...",11,0
20155,Rachel,"No you guys, I am not getting in a car with hi...",19,0
25126,Frank,"Uhh, yeah.",2,0
22279,Rachel,Really?!,1,0
24141,Phoebe,"Yeah, ooh, I like that! Yeah. Wait! How do you...",13,0
36889,Ross,Fighting?!,1,0
1758,Amy,Oh! He's ok. Do you remember my old boyfriend ...,10,0
4871,Joey,There’s a part in a TV movie that I would be p...,28,0


In [156]:
friends_transcript_df_demo1 = friends_transcript_df_demo1.reset_index(drop=True)

In [157]:
friends_transcript_df_demo1.sample(5)

Unnamed: 0,Speaker,Dialogue,number_of_words,Ross_response_flag
12715,Rachel,"Not uh, not to my recollection.",6,0
53055,Phoebe,And so engaged.,3,0
53065,Phoebe,But you didn’t give it to her?,7,0
32111,Joey,"Wait-whoa-whoa, you lost me.",4,0
30303,Rachel,"I didn’t! Even when I found out…umm, all right...",28,0


In [158]:
friends_transcript_df_demo1 = friends_transcript_df_demo1.assign(new_index=range(1, len(friends_transcript_df_demo1) + 1))


In [159]:
indexes_to_take = list( friends_transcript_df_demo1[ (friends_transcript_df_demo1['Ross_response_flag'] == 1) & (friends_transcript_df_demo1.index>0)].index)

In [160]:
len(indexes_to_take)

5745

In [161]:
indexes_to_take[:2]

[19, 26]

In [162]:
system_promt = """"You are Ross from the Friends TV Show. Your responses should reflect his personality and speech patterns.\n"""

prompts = []
for ind in indexes_to_take:
    prompt = system_promt
    
    # Ensure `ind - 1` is within bounds
    if ind - 1 >= 0:
        prompt += friends_transcript_df_demo1.iloc[ind - 1]['Dialogue'] + '\n'
    
    # Check if `ind` is within bounds
    if ind < len(friends_transcript_df_demo1):
        prompt += friends_transcript_df_demo1.iloc[ind]['Dialogue']
    
    prompts.append(prompt)


In [163]:
print(prompts[6])

"You are Ross from the Friends TV Show. Your responses should reflect his personality and speech patterns.
Strip joint! C'mon, you're single! Have some hormones!
I don't want to be single, okay? I just... I just- I just wanna be married again!


In [164]:
df = pd.DataFrame({"prompt":prompts})
df.head()

Unnamed: 0,prompt
0,"""You are Ross from the Friends TV Show. Your r..."
1,"""You are Ross from the Friends TV Show. Your r..."
2,"""You are Ross from the Friends TV Show. Your r..."
3,"""You are Ross from the Friends TV Show. Your r..."
4,"""You are Ross from the Friends TV Show. Your r..."


In [165]:
dataset = Dataset.from_pandas(df)

##### optinal_cleaning

In [39]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

friends_transcript_df['Speaker'].value_counts()

Speaker
Rachel            9335
Ross              9231
Chandler          8505
Monica            8481
Joey              8319
Phoebe            7567
Scene             3120
Mike               360
All                306
Janice             215
by                 210
Emily              202
Charlie            190
Carol              156
Tag                146
David              137
Paul               133
Frank              132
RICHARD            128
Richard            128
Gunther            123
Amy                119
Woman              111
Pete               103
Joshua              98
Gary                96
Eric                95
Elizabeth           94
Mona                92
Janine              92
Kathy               91
Susan               90
MNCA                87
Ursula              85
Jill                83
RACH                80
Joanna              73
Ben                 72
Erica               66
Gavin               64
EDDIE               64
Nurse               62
Kate                61
Dan

In [40]:
pd.reset_option('display.max_columns', None)
pd.reset_option('display.max_rows', None)

In [41]:
friends_transcript_df[friends_transcript_df['Speaker'] == 'All']

Unnamed: 0,Speaker,Dialogue,number_of_words
9,All,"Oh, yeah. Had that dream.",5
89,All,Hey! Paul! Hi! The Wine Guy! Hey!,7
176,All,Morning. Good morning.,3
188,All,Okayyy!,1
258,All,"Cut, cut, cut, cut, cut, cut, cut...",7
...,...,...,...
60139,All,"Hey, hey!",2
60207,All,Hey!\n\n,1
61008,All,"Oh, so sorry man! Sorry!",5
61030,All,Wow!,1


In [42]:
friends_transcript_df[friends_transcript_df['Speaker'] == 'Scene']

Unnamed: 0,Speaker,Dialogue,number_of_words
0,Scene,"Central Perk, Chandler, Joey, Phoebe, and Monica are there.]",9
107,Scene,"The Subway, Phoebe is singing for change.]",7
109,Scene,"Ross's Apartment, the guys are there assembling furniture.]",8
127,Scene,"A Restaurant, Monica and Paul are eating.]",7
137,Scene,Ross's Apartment; Ross is pacing while Joey and Chandler are working on some more furniture.],15
...,...,...,...
62728,Scene,"Rachel's hotel room. Joey is standing at the door, facing Rachel]",11
62763,Scene,the hotel game room. Monica and Mike are still playing ping pong],12
62789,Scene,"hotel's bar, Ross and Mr. Oberblau are talking]\n\nMr. Oberblau: I'm just saying, I have a cabin in the Adirondacks if you ever want to get away from the city, well, that'd be just nifty!",35
62806,Scene,"game room, Monica and Mike are still playing ping pong]\n\n",10


In [43]:
friends_transcript_df = friends_transcript_df[friends_transcript_df['Speaker'] != 'Scene']

In [44]:
friends_transcript_df = friends_transcript_df[friends_transcript_df['Speaker'] != 'All']

In [45]:
pd.set_option("display.max_rows",None)
friends_transcript_df['Speaker'].value_counts()

Speaker
Rachel            9335
Ross              9231
Chandler          8505
Monica            8481
Joey              8319
Phoebe            7567
Mike               360
Janice             215
by                 210
Emily              202
Charlie            190
Carol              156
Tag                146
David              137
Paul               133
Frank              132
RICHARD            128
Richard            128
Gunther            123
Amy                119
Woman              111
Pete               103
Joshua              98
Gary                96
Eric                95
Elizabeth           94
Janine              92
Mona                92
Kathy               91
Susan               90
MNCA                87
Ursula              85
Jill                83
RACH                80
Joanna              73
Ben                 72
Erica               66
EDDIE               64
Gavin               64
Nurse               62
Kate                61
Danny               58
Guy                 58
Dir

In [46]:
friends_transcript_df[friends_transcript_df['Speaker'] == 'by']

Unnamed: 0,Speaker,Dialogue,number_of_words
306,by,Adam Chase & Ira Ungerleider\nTranscribed by: guineapig,7
540,by,Andrew Reich & Ted Cohen\nTranscribed by: Coffee Mug,8
893,by,Sherry Bilsing-Graham & Ellen Plummer\nDirected by: Ben Weiss\nTranscribed by: Kreidy,10
1127,by,"Brian Buckner\nProduced by: Robert Carlock and Wendy Knoller\nTranscribed by: Coffee Mug, Eleonora, Sebastiano & Vanessa",15
1669,by,"Dana Klein\nProduced by: Robert Carlock and Wendy Knoller\nTranscribed by: Coffee Mug, Eleonora, Sebastiano & Vanessa",15
2016,by,"Sebastian Jones\nProduced by: Robert Carlock and Wendy Knoller\nTranscribed by: Cof fee Mug, Ele onora, Seba stiano & Vaness a\nFinal check by Kim\n\n--------------------------------------------------------------------------------",22
2330,by,"Mark Kunerth\nProduced by: Robert Carlock and Wendy Knoller\nTranscribed by: Coffee Mug, Eleonora, Sebastiano & Vanessa\nFinal check by Kim",18
2600,by,"Shana Goldberg-Meehan\nProduced by: Robert Carlock & Wendy Knoller\nTranscribed by: Coffee Mug, Eleonora, Sebastiano & Vanessa\nFinal check by Kim",18
2936,by,"Scott Silveri\nProduced by: Robert Carlock and Wendy Knoller\nTranscribed by: Coffee Mug, Eleonora, Sebastiano & Vanessa\nFinal check by Kim",18
3210,by,"Doty Abrams\nProduced by: Wendy Knoller\nTranscribed by: Coffee Mug, Eleonora, Sebastiano & Vanessa\nFlashback clips transcribed by: Guineapig, Dan Silverstein, Ruth Curran, Eric Aasen and Mindy Mattingly Phillips, and compiled by Eleonora.\nFinal check by Kim",33


In [47]:
friends_transcript_df = friends_transcript_df[friends_transcript_df['Speaker'] != 'by']

In [48]:
pd.set_option("display.max_rows",None)
friends_transcript_df['Speaker'].value_counts()

Speaker
Rachel            9335
Ross              9231
Chandler          8505
Monica            8481
Joey              8319
Phoebe            7567
Mike               360
Janice             215
Emily              202
Charlie            190
Carol              156
Tag                146
David              137
Paul               133
Frank              132
Richard            128
RICHARD            128
Gunther            123
Amy                119
Woman              111
Pete               103
Joshua              98
Gary                96
Eric                95
Elizabeth           94
Janine              92
Mona                92
Kathy               91
Susan               90
MNCA                87
Ursula              85
Jill                83
RACH                80
Joanna              73
Ben                 72
Erica               66
EDDIE               64
Gavin               64
Nurse               62
Kate                61
Guy                 58
Danny               58
Director            57
PHO

In [49]:
friends_transcript_df['Speaker'].unique()

array(['Monica', 'Joey', 'Chandler', 'Phoebe', 'Ross', 'Rachel',
       'Waitress', 'Paul', 'Frannie', 'Customer', 'Max', 'David',
       'Janice', 'Sandy', 'Charlie', 'Both', 'Mike', 'Precious',
       'Colleen', 'Bill', 'Owen', 'Assistant', 'Amanda', 'Glenda',
       'Written', 'Produced', 'Transcribed', 'Jack', 'Judy', 'Everybody',
       'Estelle', 'Emma', 'Amy', 'Announcer', 'Benjamin', 'Laura', 'Host',
       'Erica', 'Waiter', 'Sarah', 'Girl', 'Missy', 'Voice', 'Donny',
       'Gene', 'Man', 'Roy', 'Henrietta', 'Adrienne', 'Nurse', 'Tape',
       'Director', 'Clerk', 'Gunther', 'Realtor', 'Mackenzie', 'Mark',
       'Woman', 'Rita', 'Jennifer', 'Lady', 'Doctor', 'Guy', 'Paolo',
       'Carol', 'Susan', 'Guys', 'Girls', 'Roger', 'Ronni', 'Lorraine',
       'Kristin', 'Celia', 'Steve', 'Ursula', 'Teacher', 'Helen', 'Nina',
       'Jamie', 'Fran', 'Intercom', 'Luisa', 'Barry', 'Marsha', 'Robbie',
       'Bernice', 'Bobby', 'Mindy', 'Danielle', 'Julie', 'Paulo',
       'Frankie', 'A