In [1]:
import pandas as pd
import re
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
pd.reset_option("display.max_column", None)
pd.reset_option("display.max_row", None)

In [9]:
data_path = "../data_webscrapped/dataset_transcripts/merged_transcripts.xlsx"
friends_transcript_df = pd.read_excel(data_path)

In [10]:
friends_transcript_df

Unnamed: 0,Speaker,Dialogue
0,Scene,"Central Perk, Chandler, Joey, Phoebe, and Moni..."
1,Monica,There's nothing to tell! He's just some guy I ...
2,Joey,"C'mon, you're going out with the guy! There's ..."
3,Chandler,"All right Joey, be nice. So does he have a hu..."
4,Phoebe,"Wait, does he eat chalk?\n\n(They all stare, b..."
...,...,...
62853,Ross,I think I know too but I've been really wrong ...
62854,Ross,I'm sorry... we... we can't.
62855,Charlie,"All right, all right."
62856,Ross,"I mean, you just went out with my best friend,..."


In [11]:
friends_transcript_df['Dialogue'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 62858 entries, 0 to 62857
Series name: Dialogue
Non-Null Count  Dtype 
--------------  ----- 
62858 non-null  object
dtypes: object(1)
memory usage: 491.2+ KB


In [10]:
# Remove actions from transcript
def remove_paranthesis(text):
    result = re.sub(r'\(.*?\)','',text)
    return result


In [13]:
friends_transcript_df['Dialogue'] = friends_transcript_df['Dialogue'].apply(remove_paranthesis)

In [14]:
friends_transcript_df['Dialogue'] 

0        Central Perk, Chandler, Joey, Phoebe, and Moni...
1        There's nothing to tell! He's just some guy I ...
2        C'mon, you're going out with the guy! There's ...
3        All right Joey, be nice.  So does he have a hu...
4                             Wait, does he eat chalk?\n\n
                               ...                        
62853    I think I know too but I've been really wrong ...
62854                         I'm sorry... we... we can't.
62855                                All right, all right.
62856    I mean, you just went out with my best friend,...
62857    What?\n\n\n\nEnd\n\n[1] Homo ergaster: Some sc...
Name: Dialogue, Length: 62858, dtype: object

In [15]:
friends_transcript_df['number_of_words']  = friends_transcript_df['Dialogue'].str.strip().str.split(" ")
friends_transcript_df['number_of_words'] = friends_transcript_df['number_of_words'].apply(lambda x: len(x))

In [16]:
friends_transcript_df.sample(5)

Unnamed: 0,Speaker,Dialogue,number_of_words
40967,Ross,"Yeah, it kinda grows on you. Actually, I want...",19
43532,Chandler,Our kids are gonna be fat aren’t they.,8
51164,Eric,"It’s just so weird, two people look so much al...",13
38570,Joey,"Yeah, yeah, okay. Uh, look buddy, I came with ...",42
33125,Phoebe,"Okay, Rachel, get me perfume!",5


### Correction of Speaker names

In [17]:
pd.reset_option('display.max_columns', None)
pd.reset_option('display.max_rows', None)

friends_transcript_df['Speaker'].value_counts()

Speaker
Rachel            8876
Ross              8623
Chandler          8021
Monica            7967
Joey              7828
                  ... 
Marge                1
WRITER               1
INTERCOM             1
AMGER                1
Paleontologist       1
Name: count, Length: 404, dtype: int64

In [18]:
# Standardizing speaker names
friends_transcript_df['Speaker'] = friends_transcript_df['Speaker'].replace({
    'RACHEL': 'Rachel',
    'ROSS': 'Ross',
    'CHANDLER': 'Chandler',
    'MONICA': 'Monica',
    'JOEY': 'Joey',
    'PHOEBE': 'Phoebe'
})

# Verifying the changes
friends_transcript_df['Speaker'].value_counts()


Speaker
Rachel            9335
Ross              9231
Chandler          8505
Monica            8481
Joey              8319
                  ... 
JANICE               1
Ashley               1
Janitor              1
Commercial           1
Paleontologist       1
Name: count, Length: 398, dtype: int64

In [19]:
friends_transcript_df['Speaker'].unique()

array(['Scene', 'Monica', 'Joey', 'Chandler', 'Phoebe', 'All', 'Ross',
       'Rachel', 'Waitress', 'Paul', 'Frannie', 'Customer', 'by', 'Max',
       'David', 'Janice', 'Sandy', 'Charlie', 'Both', 'Mike', 'Precious',
       'Colleen', 'Bill', 'Owen', 'Assistant', 'Amanda', 'Glenda',
       'Written', 'Produced', 'Transcribed', 'Jack', 'Judy', 'Everybody',
       'Estelle', 'Emma', 'Amy', 'Announcer', 'Benjamin', 'Laura', 'Host',
       'Erica', 'Waiter', 'Sarah', 'Girl', 'Missy', 'Voice', 'Donny',
       'Gene', 'Man', 'Roy', 'Henrietta', 'Adrienne', 'Nurse', 'Tape',
       'Director', 'Clerk', 'Gunther', 'Realtor', 'Mackenzie', 'Mark',
       'Woman', 'Rita', 'Jennifer', 'Lady', 'Doctor', 'Guy', 'Paolo',
       'Carol', 'Susan', 'Guys', 'Girls', 'Roger', 'Ronni', 'Lorraine',
       'Kristin', 'Celia', 'Steve', 'Ursula', 'Teacher', 'Helen', 'Nina',
       'Jamie', 'Fran', 'Intercom', 'Luisa', 'Barry', 'Marsha', 'Robbie',
       'Bernice', 'Bobby', 'Mindy', 'Danielle', 'Julie', 'Paulo'

In [20]:
pd.reset_option('display.max_rows', None)
friends_transcript_df[friends_transcript_df['Speaker'] == 'Scene']

Unnamed: 0,Speaker,Dialogue,number_of_words
0,Scene,"Central Perk, Chandler, Joey, Phoebe, and Moni...",9
107,Scene,"The Subway, Phoebe is singing for change.]",7
109,Scene,"Ross's Apartment, the guys are there assemblin...",8
127,Scene,"A Restaurant, Monica and Paul are eating.]",7
137,Scene,Ross's Apartment; Ross is pacing while Joey an...,15
...,...,...,...
62728,Scene,Rachel's hotel room. Joey is standing at the d...,11
62763,Scene,the hotel game room. Monica and Mike are still...,12
62789,Scene,"hotel's bar, Ross and Mr. Oberblau are talking...",35
62806,Scene,"game room, Monica and Mike are still playing p...",10


In [21]:
friends_transcript_df_demo1 = friends_transcript_df 
print(friends_transcript_df_demo1.shape)

(62858, 3)


In [22]:
friends_transcript_df_demo1 = friends_transcript_df_demo1[friends_transcript_df_demo1['Speaker'] != 'Scene']

In [23]:
friends_transcript_df_demo1 = friends_transcript_df_demo1[friends_transcript_df_demo1['Speaker'] != 'All']

In [24]:
friends_transcript_df_demo1 = friends_transcript_df_demo1[friends_transcript_df_demo1['Speaker'] != 'by']

In [25]:
friends_transcript_df_demo1.shape

(59222, 3)

In [26]:
friends_transcript_df_demo1['Speaker'].value_counts()

Speaker
Rachel            9335
Ross              9231
Chandler          8505
Monica            8481
Joey              8319
                  ... 
Referee              1
Fireman              1
Anchorwoman          1
Employee             1
Paleontologist       1
Name: count, Length: 395, dtype: int64

In [27]:
pd.reset_option('display.max_colwidth', None)
friends_transcript_df_demo1['Speaker'].unique()

array(['Monica', 'Joey', 'Chandler', 'Phoebe', 'Ross', 'Rachel',
       'Waitress', 'Paul', 'Frannie', 'Customer', 'Max', 'David',
       'Janice', 'Sandy', 'Charlie', 'Both', 'Mike', 'Precious',
       'Colleen', 'Bill', 'Owen', 'Assistant', 'Amanda', 'Glenda',
       'Written', 'Produced', 'Transcribed', 'Jack', 'Judy', 'Everybody',
       'Estelle', 'Emma', 'Amy', 'Announcer', 'Benjamin', 'Laura', 'Host',
       'Erica', 'Waiter', 'Sarah', 'Girl', 'Missy', 'Voice', 'Donny',
       'Gene', 'Man', 'Roy', 'Henrietta', 'Adrienne', 'Nurse', 'Tape',
       'Director', 'Clerk', 'Gunther', 'Realtor', 'Mackenzie', 'Mark',
       'Woman', 'Rita', 'Jennifer', 'Lady', 'Doctor', 'Guy', 'Paolo',
       'Carol', 'Susan', 'Guys', 'Girls', 'Roger', 'Ronni', 'Lorraine',
       'Kristin', 'Celia', 'Steve', 'Ursula', 'Teacher', 'Helen', 'Nina',
       'Jamie', 'Fran', 'Intercom', 'Luisa', 'Barry', 'Marsha', 'Robbie',
       'Bernice', 'Bobby', 'Mindy', 'Danielle', 'Julie', 'Paulo',
       'Frankie', 'A

In [28]:
friends_transcript_df_demo1[friends_transcript_df_demo1['Speaker'] == 'Sandy']

Unnamed: 0,Speaker,Dialogue,number_of_words
472,Sandy,"Hi, I'm Sandy.",3
474,Sandy,"Yeah. That's okay, right?\n\n",4
486,Sandy,"Y'know, when I saw you at the store last week,...",22
488,Sandy,Yeah.\n\n,1
57575,Sandy,Hi... I'm Sandy.,3
57625,Sandy,I really do understand how hard it's gotta be ...,26
57629,Sandy,It's okay. I get that a lot doing what I do. B...,18
57631,Sandy,Her name is Deliah.,4
57634,Sandy,I realise how it's... a bit unorthodox for som...,30
57636,Sandy,"Like in my last job, I met Daniel when he was ...",54


### Corretion in Dialogue

In [29]:
# Set option to display the full text
pd.reset_option('display.max_colwidth', None)

# Displaying a sample of the dialogues again
friends_transcript_df_demo1['Dialogue'].sample(7)


17059    Yeah, y'know what I got a better idea. How-how...
40155                      God I’m just a horrible person.
53749     Happy Valentine’s Day! …Or something to remem...
21181    Well, the brown one brings out your eyes, but ...
40790                       No! No! No! I was so careful! 
12719    Alright, you're on. I can take two minutes out...
78                               He finally asked you out?
Name: Dialogue, dtype: object

In [30]:
import re

# Function to clean the dialogues
def clean_dialogue(text):
    # 1. Remove everything within braces or brackets {} or [] or ()
    text = re.sub(r'\[.*?\]|\{.*?\}|\(.*?\)', '', text)
    
    # 2. Remove everything before ] or } or )
    text = re.sub(r'.*[\]\}\)]', '', text)
    
    # 3. Remove everything after [, {, (
    text = re.sub(r'[\[\{\(].*', '', text)
    
    # 4. Remove all \n and other escape characters, including those in between
    text = text.replace('\n', ' ').strip()
    
    # 5. Remove extra spaces that may be left behind after removing \n
    text = re.sub(r'\s+', ' ', text)
    
    return text

# Apply the function to the 'Dialogue' column
friends_transcript_df_demo1['Dialogue'] = friends_transcript_df_demo1['Dialogue'].apply(clean_dialogue)

# Displaying a sample of the cleaned dialogues
friends_transcript_df_demo1['Dialogue'].sample(7)


49786                                              Really.
14724    Yeah, Karen. I'm thinking about having an affa...
21875                                                 Wow.
6061                         Alright, whadyou do with him?
997         You want me to wash my hands first, don't you?
53404    Hey, come on Phoebe, you can understand why th...
36450    No. When it comes to sweets, he’s surprisingly...
Name: Dialogue, dtype: object

In [31]:
friends_transcript_df_demo1['Dialogue'].sample(7)

60632                              This is your cellphone?
26194                                             Bye-bye.
25285    Yeah, so we just thought we’d stop by and let ...
53451    Hey! So? Dr. Long: She’s fine. She’s experienc...
52328       Yes! I’d love to! Have her come by the office.
59841    Nah, I don't know if I should. I don't wanna b...
26443    Oh yeah, she’s-she’s amazing. And-and she’s so...
Name: Dialogue, dtype: object

In [32]:
friends_transcript_df_demo1=friends_transcript_df_demo1.drop(columns='number_of_words')

In [50]:
friends_transcript_df_demo1.to_excel("merged_transcripts3.xlsx", index=True)

#remove the 'number_of_words' , 'Ross_response_flag' , 'new_index'

In [51]:
friends_transcript_df_demo1.to_csv("merged_transcripts3.csv", index=True)

In [49]:
pd.reset_option("display.max_rows", None)
friends_transcript_df_demo1.head(8919)

Unnamed: 0,Speaker,Dialogue
0,Monica,There's nothing to tell! He's just some guy I ...
1,Joey,"C'mon, you're going out with the guy! There's ..."
2,Chandler,"All right Joey, be nice. So does he have a hum..."
3,Phoebe,"Wait, does he eat chalk?"
4,Phoebe,"Just, 'cause, I don't want her to go through w..."
...,...,...
8914,Phoebe,"Ok, Ross, could you just open your mind like t..."
8915,Ross,There might be…a teeny…tiny…possibility.
8916,Phoebe,I can't believe you caved.
8917,Ross,What?


In [46]:
print(friends_transcript_df_demo1.loc[89])


Speaker             Ross
Dialogue    A wandering?
Name: 89, dtype: object


In [47]:
print(friends_transcript_df_demo1.iloc[88])  # 0-based index, so row 89 is at position 88


Speaker                                                Monica
Dialogue    Okay, umm-umm, I'll just--I'll be right back, ...
Name: 88, dtype: object


In [41]:
# friends_transcript_df_demo1.reset_index(drop=True, inplace=True)


In [48]:
# Check if ind is valid
if 1 <= friends_transcript_df_demo1.index <= friends_transcript_df_demo1.shape[0]:  # Adjust based on your indexing
    # Access the Dialogue at the given index
    prompt = friends_transcript_df_demo1.iloc[ind - 1]['Dialogue']  # Use iloc for 0-based index
    print(f"Accessed Dialogue: {prompt}")
else:
    print(f"Index {friends_transcript_df_demo1.index} is out of bounds. Valid range: 1 to {friends_transcript_df_demo1.shape[0]}.")

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [38]:
# #error is the indexing is there
# friends_transcript_df_demo1 = friends_transcript_df_demo1.assign(new_index=range(1, len(friends_transcript_df_demo1) + 1))
# friends_transcript_df_demo1.head(8800)

Unnamed: 0,Speaker,Dialogue,new_index
1,Monica,There's nothing to tell! He's just some guy I ...,1
2,Joey,"C'mon, you're going out with the guy! There's ...",2
3,Chandler,"All right Joey, be nice. So does he have a hum...",3
4,Phoebe,"Wait, does he eat chalk?",4
5,Phoebe,"Just, 'cause, I don't want her to go through w...",5
...,...,...,...
9293,Ross,We'll give you Janice.,8796
9294,Phoebe,"I miss Janice though. ""Hello, Chandler Bing.""",8797
9295,Rachel,"""Oh, my, god.""",8798
9296,Joey,"""Oh, Chandler, now, now, that's it. There, fas...",8799


### Ross transcripts data analysis

In [52]:
friends_transcript_df_demo1['number_of_words']  = friends_transcript_df_demo1['Dialogue'].str.strip().str.split(" ")
friends_transcript_df_demo1['number_of_words'] = friends_transcript_df_demo1['number_of_words'].apply(lambda x: len(x))

In [53]:
friends_transcript_df_demo1['Ross_response_flag'] = 0
friends_transcript_df_demo1.loc[(friends_transcript_df_demo1['Speaker'] == 'Ross')   &  (friends_transcript_df_demo1['number_of_words']>5), 'Ross_response_flag'] = 1

In [54]:
friends_transcript_df_demo1.sample(2)

Unnamed: 0,Speaker,Dialogue,number_of_words,Ross_response_flag
46159,Richard,Well of course I am!,5,0
11763,Rachel,What's this.,2,0


In [55]:
# Remove the automatic index by resetting it and then set the new_index column as the index
friends_transcript_df_demo1.set_index('new_index', inplace=True)

# Display the dataframe to verify the changes
print(friends_transcript_df_demo1.head())


KeyError: "None of ['new_index'] are in the columns"

In [56]:
friends_transcript_df_demo1.head(5500)

Unnamed: 0,Speaker,Dialogue,number_of_words,Ross_response_flag
0,Monica,There's nothing to tell! He's just some guy I ...,11,0
1,Joey,"C'mon, you're going out with the guy! There's ...",14,0
2,Chandler,"All right Joey, be nice. So does he have a hum...",16,0
3,Phoebe,"Wait, does he eat chalk?",5,0
4,Phoebe,"Just, 'cause, I don't want her to go through w...",16,0
...,...,...,...,...
5495,Chandler,"Aww, we were worried about you! Hm. I guess I ...",20,0
5496,Joey,I'm gonna miss these little guys. It was nice ...,13,0
5497,Chandler,"Hey, you know what? Maybe we should keep them ...",12,0
5498,Joey,What?,1,0


In [57]:
# Extract indexes where 'Ross_response_flag' is 1 and the index is greater than 0
indexes_to_take = list(friends_transcript_df_demo1[
    (friends_transcript_df_demo1['Ross_response_flag'] == 1) & 
    (friends_transcript_df_demo1.index > 0)
].index)


In [58]:
len(indexes_to_take)

5666

In [59]:
indexes_to_take[:2]

[19, 26]

In [60]:
system_promt = """"You are Ross from the Friends TV Show. Your responses should reflect his personality and speech patterns.\n"""

prompts = []
for ind in indexes_to_take:
    prompt = system_promt
    
    # Ensure `ind - 1` is within bounds
    if ind - 1 >= 0:
        prompt += friends_transcript_df_demo1.iloc[ind - 1]['Dialogue'] + '\n'
    
    # Check if `ind` is within bounds
    if ind < len(friends_transcript_df_demo1):
        prompt += friends_transcript_df_demo1.iloc[ind]['Dialogue']
    
    prompts.append(prompt)


In [61]:
print(prompts[6])

"You are Ross from the Friends TV Show. Your responses should reflect his personality and speech patterns.
Strip joint! C'mon, you're single! Have some hormones!
I don't want to be single, okay? I just... I just- I just wanna be married again!


In [62]:
df = pd.DataFrame({"prompt":prompts})
df.head()

Unnamed: 0,prompt
0,"""You are Ross from the Friends TV Show. Your r..."
1,"""You are Ross from the Friends TV Show. Your r..."
2,"""You are Ross from the Friends TV Show. Your r..."
3,"""You are Ross from the Friends TV Show. Your r..."
4,"""You are Ross from the Friends TV Show. Your r..."


In [63]:
dataset = Dataset.from_pandas(df)

##### optinal_cleaning

In [39]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

friends_transcript_df['Speaker'].value_counts()

Speaker
Rachel            9335
Ross              9231
Chandler          8505
Monica            8481
Joey              8319
Phoebe            7567
Scene             3120
Mike               360
All                306
Janice             215
by                 210
Emily              202
Charlie            190
Carol              156
Tag                146
David              137
Paul               133
Frank              132
RICHARD            128
Richard            128
Gunther            123
Amy                119
Woman              111
Pete               103
Joshua              98
Gary                96
Eric                95
Elizabeth           94
Mona                92
Janine              92
Kathy               91
Susan               90
MNCA                87
Ursula              85
Jill                83
RACH                80
Joanna              73
Ben                 72
Erica               66
Gavin               64
EDDIE               64
Nurse               62
Kate                61
Dan

In [40]:
pd.reset_option('display.max_columns', None)
pd.reset_option('display.max_rows', None)

In [41]:
friends_transcript_df[friends_transcript_df['Speaker'] == 'All']

Unnamed: 0,Speaker,Dialogue,number_of_words
9,All,"Oh, yeah. Had that dream.",5
89,All,Hey! Paul! Hi! The Wine Guy! Hey!,7
176,All,Morning. Good morning.,3
188,All,Okayyy!,1
258,All,"Cut, cut, cut, cut, cut, cut, cut...",7
...,...,...,...
60139,All,"Hey, hey!",2
60207,All,Hey!\n\n,1
61008,All,"Oh, so sorry man! Sorry!",5
61030,All,Wow!,1


In [42]:
friends_transcript_df[friends_transcript_df['Speaker'] == 'Scene']

Unnamed: 0,Speaker,Dialogue,number_of_words
0,Scene,"Central Perk, Chandler, Joey, Phoebe, and Monica are there.]",9
107,Scene,"The Subway, Phoebe is singing for change.]",7
109,Scene,"Ross's Apartment, the guys are there assembling furniture.]",8
127,Scene,"A Restaurant, Monica and Paul are eating.]",7
137,Scene,Ross's Apartment; Ross is pacing while Joey and Chandler are working on some more furniture.],15
...,...,...,...
62728,Scene,"Rachel's hotel room. Joey is standing at the door, facing Rachel]",11
62763,Scene,the hotel game room. Monica and Mike are still playing ping pong],12
62789,Scene,"hotel's bar, Ross and Mr. Oberblau are talking]\n\nMr. Oberblau: I'm just saying, I have a cabin in the Adirondacks if you ever want to get away from the city, well, that'd be just nifty!",35
62806,Scene,"game room, Monica and Mike are still playing ping pong]\n\n",10


In [43]:
friends_transcript_df = friends_transcript_df[friends_transcript_df['Speaker'] != 'Scene']

In [44]:
friends_transcript_df = friends_transcript_df[friends_transcript_df['Speaker'] != 'All']

In [45]:
pd.set_option("display.max_rows",None)
friends_transcript_df['Speaker'].value_counts()

Speaker
Rachel            9335
Ross              9231
Chandler          8505
Monica            8481
Joey              8319
Phoebe            7567
Mike               360
Janice             215
by                 210
Emily              202
Charlie            190
Carol              156
Tag                146
David              137
Paul               133
Frank              132
RICHARD            128
Richard            128
Gunther            123
Amy                119
Woman              111
Pete               103
Joshua              98
Gary                96
Eric                95
Elizabeth           94
Janine              92
Mona                92
Kathy               91
Susan               90
MNCA                87
Ursula              85
Jill                83
RACH                80
Joanna              73
Ben                 72
Erica               66
EDDIE               64
Gavin               64
Nurse               62
Kate                61
Danny               58
Guy                 58
Dir

In [46]:
friends_transcript_df[friends_transcript_df['Speaker'] == 'by']

Unnamed: 0,Speaker,Dialogue,number_of_words
306,by,Adam Chase & Ira Ungerleider\nTranscribed by: guineapig,7
540,by,Andrew Reich & Ted Cohen\nTranscribed by: Coffee Mug,8
893,by,Sherry Bilsing-Graham & Ellen Plummer\nDirected by: Ben Weiss\nTranscribed by: Kreidy,10
1127,by,"Brian Buckner\nProduced by: Robert Carlock and Wendy Knoller\nTranscribed by: Coffee Mug, Eleonora, Sebastiano & Vanessa",15
1669,by,"Dana Klein\nProduced by: Robert Carlock and Wendy Knoller\nTranscribed by: Coffee Mug, Eleonora, Sebastiano & Vanessa",15
2016,by,"Sebastian Jones\nProduced by: Robert Carlock and Wendy Knoller\nTranscribed by: Cof fee Mug, Ele onora, Seba stiano & Vaness a\nFinal check by Kim\n\n--------------------------------------------------------------------------------",22
2330,by,"Mark Kunerth\nProduced by: Robert Carlock and Wendy Knoller\nTranscribed by: Coffee Mug, Eleonora, Sebastiano & Vanessa\nFinal check by Kim",18
2600,by,"Shana Goldberg-Meehan\nProduced by: Robert Carlock & Wendy Knoller\nTranscribed by: Coffee Mug, Eleonora, Sebastiano & Vanessa\nFinal check by Kim",18
2936,by,"Scott Silveri\nProduced by: Robert Carlock and Wendy Knoller\nTranscribed by: Coffee Mug, Eleonora, Sebastiano & Vanessa\nFinal check by Kim",18
3210,by,"Doty Abrams\nProduced by: Wendy Knoller\nTranscribed by: Coffee Mug, Eleonora, Sebastiano & Vanessa\nFlashback clips transcribed by: Guineapig, Dan Silverstein, Ruth Curran, Eric Aasen and Mindy Mattingly Phillips, and compiled by Eleonora.\nFinal check by Kim",33


In [47]:
friends_transcript_df = friends_transcript_df[friends_transcript_df['Speaker'] != 'by']

In [48]:
pd.set_option("display.max_rows",None)
friends_transcript_df['Speaker'].value_counts()

Speaker
Rachel            9335
Ross              9231
Chandler          8505
Monica            8481
Joey              8319
Phoebe            7567
Mike               360
Janice             215
Emily              202
Charlie            190
Carol              156
Tag                146
David              137
Paul               133
Frank              132
Richard            128
RICHARD            128
Gunther            123
Amy                119
Woman              111
Pete               103
Joshua              98
Gary                96
Eric                95
Elizabeth           94
Janine              92
Mona                92
Kathy               91
Susan               90
MNCA                87
Ursula              85
Jill                83
RACH                80
Joanna              73
Ben                 72
Erica               66
EDDIE               64
Gavin               64
Nurse               62
Kate                61
Guy                 58
Danny               58
Director            57
PHO

In [49]:
friends_transcript_df['Speaker'].unique()

array(['Monica', 'Joey', 'Chandler', 'Phoebe', 'Ross', 'Rachel',
       'Waitress', 'Paul', 'Frannie', 'Customer', 'Max', 'David',
       'Janice', 'Sandy', 'Charlie', 'Both', 'Mike', 'Precious',
       'Colleen', 'Bill', 'Owen', 'Assistant', 'Amanda', 'Glenda',
       'Written', 'Produced', 'Transcribed', 'Jack', 'Judy', 'Everybody',
       'Estelle', 'Emma', 'Amy', 'Announcer', 'Benjamin', 'Laura', 'Host',
       'Erica', 'Waiter', 'Sarah', 'Girl', 'Missy', 'Voice', 'Donny',
       'Gene', 'Man', 'Roy', 'Henrietta', 'Adrienne', 'Nurse', 'Tape',
       'Director', 'Clerk', 'Gunther', 'Realtor', 'Mackenzie', 'Mark',
       'Woman', 'Rita', 'Jennifer', 'Lady', 'Doctor', 'Guy', 'Paolo',
       'Carol', 'Susan', 'Guys', 'Girls', 'Roger', 'Ronni', 'Lorraine',
       'Kristin', 'Celia', 'Steve', 'Ursula', 'Teacher', 'Helen', 'Nina',
       'Jamie', 'Fran', 'Intercom', 'Luisa', 'Barry', 'Marsha', 'Robbie',
       'Bernice', 'Bobby', 'Mindy', 'Danielle', 'Julie', 'Paulo',
       'Frankie', 'A

### testing of def_load_data




In [50]:
friends_transcript_df = pd.read_csv("merged_transcripts3.csv")

In [51]:
friends_transcript_df.shape

(59222, 2)

In [52]:
friends_transcript_df = friends_transcript_df.dropna()


In [54]:
friends_transcript_df['number_of_words']  = friends_transcript_df['Dialogue'].str.strip().str.split(" ")

In [55]:
friends_transcript_df['Dialogue'] = friends_transcript_df['Dialogue'].apply(remove_paranthesis)
friends_transcript_df['number_of_words'] = friends_transcript_df['number_of_words'].apply(lambda x: len(x))


In [56]:
friends_transcript_df.head(5)

Unnamed: 0,Speaker,Dialogue,number_of_words
0,Monica,There's nothing to tell! He's just some guy I ...,11
1,Joey,"C'mon, you're going out with the guy! There's ...",14
2,Chandler,"All right Joey, be nice. So does he have a hum...",16
3,Phoebe,"Wait, does he eat chalk?",5
4,Phoebe,"Just, 'cause, I don't want her to go through w...",16


In [57]:
friends_transcript_df[(friends_transcript_df['Speaker']== 'Chandler') & (friends_transcript_df['number_of_words'] > 5)]

Unnamed: 0,Speaker,Dialogue,number_of_words
2,Chandler,"All right Joey, be nice. So does he have a hum...",16
6,Chandler,Sounds like a date to me.,6
7,Chandler,"Alright, so I'm back in high school, I'm stand...",22
8,Chandler,"Then I look down, and I realize there's a phon...",11
13,Chandler,"All of a sudden, the phone starts to ring. Now...",21
...,...,...,...
59175,Chandler,"Monica, that was also true an hour ago! I mean...",60
59189,Chandler,Because I'm gonna play for ya.,6
59194,Chandler,"Yes, I do. Now, I may not understand why you h...",30
59197,Chandler,"All right Mike, let's get this over with. Sudd...",15


In [58]:
friends_transcript_df['response_flag'] = 0

In [59]:
friends_transcript_df.loc[
                (friends_transcript_df['Speaker'] == 'Chandler') & 
                (friends_transcript_df['number_of_words'] > 5), 
                 'response_flag'] = 1

In [61]:
friends_transcript_df.head(3)

Unnamed: 0,Speaker,Dialogue,number_of_words,response_flag
0,Monica,There's nothing to tell! He's just some guy I ...,11,0
1,Joey,"C'mon, you're going out with the guy! There's ...",14,0
2,Chandler,"All right Joey, be nice. So does he have a hum...",16,1


In [74]:
index_req = list(friends_transcript_df[ (friends_transcript_df['response_flag'] == 1) & (friends_transcript_df.index>0)].index)

In [78]:
len(index_req)

5336

In [79]:
index_req

[2,
 6,
 7,
 8,
 13,
 15,
 33,
 40,
 55,
 70,
 78,
 87,
 99,
 109,
 116,
 135,
 156,
 163,
 177,
 185,
 187,
 224,
 245,
 280,
 282,
 285,
 293,
 299,
 307,
 310,
 311,
 320,
 338,
 340,
 343,
 345,
 347,
 354,
 356,
 375,
 378,
 394,
 396,
 398,
 400,
 466,
 470,
 487,
 489,
 492,
 496,
 502,
 507,
 517,
 522,
 542,
 548,
 550,
 563,
 570,
 613,
 642,
 644,
 660,
 668,
 670,
 675,
 763,
 765,
 769,
 771,
 773,
 813,
 820,
 822,
 841,
 845,
 847,
 882,
 886,
 935,
 936,
 944,
 945,
 952,
 954,
 956,
 960,
 995,
 1000,
 1005,
 1007,
 1064,
 1067,
 1068,
 1069,
 1071,
 1073,
 1075,
 1077,
 1083,
 1089,
 1091,
 1097,
 1108,
 1110,
 1119,
 1167,
 1169,
 1195,
 1199,
 1207,
 1264,
 1293,
 1295,
 1298,
 1299,
 1319,
 1321,
 1324,
 1330,
 1338,
 1341,
 1346,
 1376,
 1378,
 1393,
 1395,
 1451,
 1455,
 1457,
 1463,
 1468,
 1474,
 1490,
 1514,
 1521,
 1526,
 1552,
 1554,
 1557,
 1560,
 1573,
 1575,
 1604,
 1618,
 1623,
 1711,
 1716,
 1746,
 1759,
 1763,
 1773,
 1802,
 1806,
 1811,
 1813,
 1817,


In [80]:
system_prompt = f"""\nYou are Chandler from the Friends TV Show. Your responses should reflect Chandler's personality and speech patterns.\n"""

In [82]:
prompts = []
for ind in index_req:
    prompt = system_prompt
            
    # Insert the index validation here
    if 0 <= ind - 1 < len(friends_transcript_df):
        prompt += friends_transcript_df.iloc[ind - 1]['Dialogue']
    else:
         # Handle the case where the index is out of bounds
        print(f"Index {ind - 1} is out of bounds")
        continue  # Skip this iteration if the index is out of bounds

    prompt += '\n'
    prompt += friends_transcript_df.iloc[ind]['Dialogue']
    prompts.append(prompt)

Index 59196 is out of bounds
Index 59202 is out of bounds


In [89]:
prompts[0:2]

["\nYou are Chandler from the Friends TV Show. Your responses should reflect Chandler's personality and speech patterns.\nC'mon, you're going out with the guy! There's gotta be something wrong with him!\nAll right Joey, be nice. So does he have a hump? A hump and a hairpiece?",
 "\nYou are Chandler from the Friends TV Show. Your responses should reflect Chandler's personality and speech patterns.\nOkay, everybody relax. This is not even a date. It's just two people going out to dinner and- not having sex.\nSounds like a date to me."]