In [2]:
import pandas as pd
import re
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
data_path = "merged_transcripts.xlsx"
friends_transcript_df = pd.read_excel(data_path)

In [4]:
friends_transcript_df

Unnamed: 0,Speaker,Dialogue
0,Scene,"Central Perk, Chandler, Joey, Phoebe, and Moni..."
1,Monica,There's nothing to tell! He's just some guy I ...
2,Joey,"C'mon, you're going out with the guy! There's ..."
3,Chandler,"All right Joey, be nice. So does he have a hu..."
4,Phoebe,"Wait, does he eat chalk?\n\n(They all stare, b..."
...,...,...
6195,All,That's so sweet.
6196,Ross,"And hey, here's to a lousy Christmas."
6197,Rachel,And a crappy New Year.
6198,Chandler,"Here, here!\n\nClosing Credits"


In [5]:
# Remove actions from transcript
def remove_paranthesis(text):
    result = re.sub(r'\(.*?\)','',text)
    return result


In [6]:
friends_transcript_df['Dialogue'] = friends_transcript_df['Dialogue'].apply(remove_paranthesis)

In [7]:
friends_transcript_df['Dialogue'] 

0       Central Perk, Chandler, Joey, Phoebe, and Moni...
1       There's nothing to tell! He's just some guy I ...
2       C'mon, you're going out with the guy! There's ...
3       All right Joey, be nice.  So does he have a hu...
4                            Wait, does he eat chalk?\n\n
                              ...                        
6195                                     That's so sweet.
6196                And hey, here's to a lousy Christmas.
6197                               And a crappy New Year.
6198                       Here, here!\n\nClosing Credits
6199    The Subway, Joey sees his poster and he peels ...
Name: Dialogue, Length: 6200, dtype: object

In [11]:
friends_transcript_df['number_of_words']  = friends_transcript_df['Dialogue'].str.strip().str.split(" ")
friends_transcript_df['number_of_words'] = friends_transcript_df['number_of_words'].apply(lambda x: len(x))

In [14]:
friends_transcript_df.sample(5)

Unnamed: 0,Speaker,Dialogue,number_of_words
4955,Phoebe,Yeah?,1
3311,Phoebe,Oh my God! Go away! Stop looking in here!,11
5203,Ross,"Uh-oh, uh-oh, the laundry's done. It's, uh, i...",23
3353,Monica,"Uh, Rach... how come you have dental floss in...",11
2976,Ross,"Oh, forget it, okay?",4


In [15]:
friends_transcript_df['Ross_response_flag'] = 0
friends_transcript_df.loc[(friends_transcript_df['Speaker'] == 'Ross')   &  (friends_transcript_df['number_of_words']>5), 'Ross_response_flag'] = 1

In [17]:
friends_transcript_df.sample(8)

Unnamed: 0,Speaker,Dialogue,number_of_words,Ross_response_flag
2317,Chandler,"OK, so now we draw cards.",6,0
148,Paul,"No, it's, it's more of a fifth date kinda reve...",10,0
5035,Chandler,"So, Saturday night, the big night, date night,...",12,0
3853,Ross,"Was there...uh, huh, huh, huh... andybody, any...",9,1
1425,Chandler,"Hi, Just Janice.",3,0
4467,Ross,"No, no, no... why, because it might get weird ...",32,1
3728,Rachel,Get off.,2,0
4875,Rachel,They wanna know if I'm okay. Okay.. they wanna...,65,0


In [26]:
indexes_to_take = list( friends_transcript_df[ (friends_transcript_df['Ross_response_flag'] == 1) & (friends_transcript_df.index>0)].index)

In [27]:
len(indexes_to_take)

626

In [36]:
indexes_to_take[:19]

[21,
 28,
 30,
 32,
 34,
 36,
 41,
 54,
 60,
 86,
 99,
 101,
 110,
 119,
 138,
 141,
 143,
 145,
 163]

In [37]:
system_promt = """" Your are Ross from the Friends Tv Show". Your responses should reflect his personality and speech patterns \n"""

prompts = []
for ind in indexes_to_take:
    prompt = system_promt

    prompt += friends_transcript_df.iloc[ind -1]['Dialogue']
    prompt += '\n'
    prompt += friends_transcript_df.iloc[ind]['Dialogue']
    prompts.append(prompt)

In [40]:
print(prompts[6])

" Your are Ross from the Friends Tv Show". Your responses should reflect his personality and speech patterns 
Strip joint! C'mon, you're single! Have some hormones!
I don't want to be single, okay? I just... I just- I just wanna be married again!




In [32]:
df = pd.DataFrame({"prompt":prompts})
df.head()

Unnamed: 0,prompt
0,""" Your are Ross from the Friends Tv Show"". You..."
1,""" Your are Ross from the Friends Tv Show"". You..."
2,""" Your are Ross from the Friends Tv Show"". You..."
3,""" Your are Ross from the Friends Tv Show"". You..."
4,""" Your are Ross from the Friends Tv Show"". You..."


In [33]:
dataset = Dataset.from_pandas(df)

##### optinal_cleaning

In [41]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

friends_transcript_df['Speaker'].value_counts()

Speaker
Ross            984
Monica          924
Rachel          899
Chandler        836
Phoebe          668
Joey            666
Scene           330
All              91
Carol            79
Susan            69
Janice           38
Barry            35
Ursula           34
Aurora           29
David            28
Nurse            26
Roger            24
Steve            23
Paolo            23
Mindy            22
Lydia            21
Ronni            19
by               18
Paul             17
Teacher          16
Luisa            16
Woman            14
Jill             14
Nina             14
Angela           13
Max              13
Shelley          12
Lizzie           11
Jamie            11
Melanie           8
Girl              8
Director          7
Fran              7
Lowell            7
Receptionist      6
Alan              6
Paula             6
Carl              6
Celia             6
Guy               5
Frannie           5
Robbie            5
Joanne            5
Danielle          5
Kristin     

In [26]:
pd.reset_option('display.max_columns', None)
pd.reset_option('display.max_rows', None)

In [27]:
friends_transcript_df[friends_transcript_df['Speaker'] == 'All']

Unnamed: 0,Speaker,Dialogue,number of words
9,All,"Oh, yeah. Had that dream.",5
89,All,Hey! Paul! Hi! The Wine Guy! Hey!,7
176,All,Morning. Good morning.,3
188,All,Okayyy!,1
258,All,"Cut, cut, cut, cut, cut, cut, cut...",7
...,...,...,...
5570,All,Come on.,2
5781,All,Yeah! Right!,2
5784,All,Yes. Absolutely. A quality.,4
5893,All,God! Ross!,2


In [28]:
friends_transcript_df[friends_transcript_df['Speaker'] == 'Scene']

Unnamed: 0,Speaker,Dialogue,number of words
0,Scene,"Central Perk, Chandler, Joey, Phoebe, and Moni...",9
107,Scene,"The Subway, Phoebe is singing for change.]",7
109,Scene,"Ross's Apartment, the guys are there assemblin...",8
127,Scene,"A Restaurant, Monica and Paul are eating.]",7
137,Scene,Ross's Apartment; Ross is pacing while Joey an...,15
...,...,...,...
6112,Scene,"Carol and Susan's, Ross is preparing to talk t...",11
6119,Scene,"Monica and Rachel's, the group is coming back ...",11
6136,Scene,"Carol and Susan's, Carol is reading, Ross is t...",12
6149,Scene,"The Hallway, Joey has a tray full of keys, and...",17


In [31]:
friends_transcript_df = friends_transcript_df[friends_transcript_df['Speaker'] != 'Scene']

In [32]:
friends_transcript_df = friends_transcript_df[friends_transcript_df['Speaker'] != 'All']

In [35]:
pd.set_option("display.max_rows",None)
friends_transcript_df['Speaker'].value_counts()

Speaker
Ross            984
Monica          924
Rachel          899
Chandler        836
Phoebe          668
Joey            666
Carol            79
Susan            69
Janice           38
Barry            35
Ursula           34
Aurora           29
David            28
Nurse            26
Roger            24
Steve            23
Paolo            23
Mindy            22
Lydia            21
Ronni            19
by               18
Paul             17
Teacher          16
Luisa            16
Woman            14
Jill             14
Nina             14
Angela           13
Max              13
Shelley          12
Lizzie           11
Jamie            11
Melanie           8
Girl              8
Fran              7
Lowell            7
Director          7
Paula             6
Receptionist      6
Alan              6
Carl              6
Celia             6
Robbie            5
Danielle          5
Frannie           5
Kristin           5
Joanne            5
Guy               5
Marsha            4
Lorraine    

In [36]:
friends_transcript_df[friends_transcript_df['Speaker'] == 'by']

Unnamed: 0,Speaker,Dialogue,number of words
306,by,Adam Chase & Ira Ungerleider\nTranscribed by: ...,7
540,by,Alexa Junge\nTranscribed by: guineapig,4
796,by,"Jeffrey Astrof, Mike Sikowitz, Adam Chase & Ir...",19
1056,by,Alexa Junge\nTranscribed by: guineapig,4
1282,by,Bill Lawrence\nTranscribed by: Mindy Mattingly...,11
1478,by,Jeff Greenstein & Jeff Strauss\nTranscribed by...,8
1736,by,Marta Kauffman & David Crane\nTranscribed by: ...,13
2004,by,Marta Kauffman & David Crane\nTranscribed by: ...,14
2258,by,Jeffrey Astrof and Mike Sikowitz. .\nTranscrib...,9
2788,by,Jeffrey Astrof & Mike Sikowitz\nTranscribed by...,44


In [37]:
friends_transcript_df = friends_transcript_df[friends_transcript_df['Speaker'] != 'by']

In [38]:
pd.set_option("display.max_rows",None)
friends_transcript_df['Speaker'].value_counts()

Speaker
Ross            984
Monica          924
Rachel          899
Chandler        836
Phoebe          668
Joey            666
Carol            79
Susan            69
Janice           38
Barry            35
Ursula           34
Aurora           29
David            28
Nurse            26
Roger            24
Steve            23
Paolo            23
Mindy            22
Lydia            21
Ronni            19
Paul             17
Teacher          16
Luisa            16
Jill             14
Woman            14
Nina             14
Angela           13
Max              13
Shelley          12
Lizzie           11
Jamie            11
Melanie           8
Girl              8
Fran              7
Lowell            7
Director          7
Paula             6
Receptionist      6
Alan              6
Carl              6
Celia             6
Robbie            5
Danielle          5
Frannie           5
Kristin           5
Joanne            5
Guy               5
Marsha            4
Lorraine          4
Bob         

In [39]:
friends_transcript_df['Speaker'].unique()

array(['Monica', 'Joey', 'Chandler', 'Phoebe', 'Ross', 'Rachel',
       'Waitress', 'Paul', 'Frannie', 'Customer', 'Max', 'David',
       'Janice', 'Sandy', 'Guy', 'Paolo', 'Carol', 'Susan', 'Guys',
       'Girls', 'Roger', 'Ronni', 'Woman', 'Lorraine', 'Kristin', 'Celia',
       'Steve', 'Ursula', 'Teacher', 'Helen', 'Nina', 'Jamie', 'Fran',
       'Nurse', 'Intercom', 'Luisa', 'Both', 'Barry', 'Marsha', 'Robbie',
       'Bernice', 'Bobby', 'Mindy', 'Danielle', 'PA', 'Actor', 'Gerston',
       'Santos', 'Petrie', 'Lydia', 'Janitor', 'Melanie', 'Man', 'Carl',
       'Julie', 'Paula', 'Alan', 'Lizzie', 'Leslie', 'Kiki', 'Joanne',
       'Receptionist', 'Kid', 'Angela', 'Bob', 'Aurora', 'Director',
       'Jill', 'Shelley', 'Andrea', 'Lowell', 'Terry', 'Girl'],
      dtype=object)