In [1]:
# introduce dependencies
import pandas as pd
import numpy as np
import datetime as dt

In [2]:
# read in csv and convert to dataframe
scripts_csv = "../Data/scripts.csv"
scripts_df = pd.read_csv(scripts_csv, encoding = "utf-8")
scripts_df.columns

Index(['Unnamed: 0', 'Character', 'Dialogue', 'EpisodeNo', 'SEID', 'Season'], dtype='object')

In [3]:
# replace ... with , to prevent in proper spliting a sentence into multiple sentences improperly
# scripts_df['Dialogue'] = scripts_df['Dialogue'].replace({'...': ','}, regex=True)

In [4]:
pilot = scripts_df[scripts_df["SEID"]=="PILOT"]
pilot

Unnamed: 0.1,Unnamed: 0,Character,Dialogue,EpisodeNo,SEID,Season
0,0,JERRY,Do you know what this is all about? Do you kno...,1,PILOT,1
1,1,JERRY,"(pointing at Georges shirt) See, to me, that b...",1,PILOT,1
2,2,GEORGE,Are you through?,1,PILOT,1
3,3,JERRY,"You do of course try on, when you buy?",1,PILOT,1
4,4,GEORGE,"Yes, it was purple, I liked it, I dont actuall...",1,PILOT,1
...,...,...,...,...,...,...
209,209,JERRY,Me too!,1,PILOT,1
210,210,JERRY,"I swear, I have absolutely no idea what women ...",1,PILOT,1
211,211,JERRY,"So, Im on line at the supermarket. Two women i...",1,PILOT,1
212,212,AUDIENCE,Cheque.,1,PILOT,1


In [5]:
# convert Character to title format
scripts_df['Character'] = scripts_df['Character'].apply(lambda x: x.title())
scripts_df.head()
print(scripts_df["Dialogue"][0])


Do you know what this is all about? Do you know, why were here? To be out, this is out...and out is one of the single most enjoyable experiences of life. People...did you ever hear people talking about We should go out? This is what theyre talking about...this whole thing, were all out now, no one is home. Not one person here is home, were all out! There are people tryin to find us, they dont know where we are. (on an imaginary phone) Did you ring?, I cant find him. Where did he go? He didnt tell me where he was going. He must have gone out. You wanna go out you get ready, you pick out the clothes, right? You take the shower, you get all ready, get the cash, get your friends, the car, the spot, the reservation...Then youre standing around, whatta you do? You go We gotta be getting back. Once youre out, you wanna get back! You wanna go to sleep, you wanna get up, you wanna go out again tomorrow, right? Where ever you are in life, its my feeling, youve gotta go.


In [6]:
# replace and with , to allow singular split
scripts_df['Dialogue'] = scripts_df['Dialogue'].str.replace('?', '?@', regex = True)
scripts_df['Dialogue'] = scripts_df['Dialogue'].str.replace('.', '.+', regex = True)
scripts_df['Dialogue'] = scripts_df['Dialogue'].str.replace('!', '!$', regex = True)

In [7]:
# assess dialogue to understand various components on which to split 
print(scripts_df["Dialogue"][0])

Do you know what this is all about?@ Do you know, why were here?@ To be out, this is out.+.+.+and out is one of the single most enjoyable experiences of life.+ People.+.+.+did you ever hear people talking about We should go out?@ This is what theyre talking about.+.+.+this whole thing, were all out now, no one is home.+ Not one person here is home, were all out!$ There are people tryin to find us, they dont know where we are.+ (on an imaginary phone) Did you ring?@, I cant find him.+ Where did he go?@ He didnt tell me where he was going.+ He must have gone out.+ You wanna go out you get ready, you pick out the clothes, right?@ You take the shower, you get all ready, get the cash, get your friends, the car, the spot, the reservation.+.+.+Then youre standing around, whatta you do?@ You go We gotta be getting back.+ Once youre out, you wanna get back!$ You wanna go to sleep, you wanna get up, you wanna go out again tomorrow, right?@ Where ever you are in life, its my feeling, youve gotta 

In [8]:
# create split of unknown quanities and create n new columns based on split
sentences_df = scripts_df['Dialogue'].str.split('[+|@|$]', expand=True).add_prefix('Sentence_')
sentences_df.head()

Unnamed: 0,Sentence_0,Sentence_1,Sentence_2,Sentence_3,Sentence_4,Sentence_5,Sentence_6,Sentence_7,Sentence_8,Sentence_9,...,Sentence_40,Sentence_41,Sentence_42,Sentence_43,Sentence_44,Sentence_45,Sentence_46,Sentence_47,Sentence_48,Sentence_49
0,Do you know what this is all about?,"Do you know, why were here?","To be out, this is out.",.,.,and out is one of the single most enjoyable ex...,People.,.,.,did you ever hear people talking about We shou...,...,,,,,,,,,,
1,"(pointing at Georges shirt) See, to me, that b...",The second button literally makes or breaks t...,Its too high!,Its in no-mans-land.,You look like you live with your mother.,,,,,,...,,,,,,,,,,
2,Are you through?,,,,,,,,,,...,,,,,,,,,,
3,"You do of course try on, when you buy?",,,,,,,,,,...,,,,,,,,,,
4,"Yes, it was purple, I liked it, I dont actuall...",,,,,,,,,,...,,,,,,,,,,


In [9]:
# transpose the df to create a list of the unknown number of sentences for each individual dialogue
sentences_transpose = sentences_df.transpose()
sentences_transpose.reset_index(inplace = True)
sentences_transpose.rename(columns = ({"index": "sentence_number"}), inplace = True)
sentences_list = sentences_transpose["sentence_number"].unique()
sentences_list

array(['Sentence_0', 'Sentence_1', 'Sentence_2', 'Sentence_3',
       'Sentence_4', 'Sentence_5', 'Sentence_6', 'Sentence_7',
       'Sentence_8', 'Sentence_9', 'Sentence_10', 'Sentence_11',
       'Sentence_12', 'Sentence_13', 'Sentence_14', 'Sentence_15',
       'Sentence_16', 'Sentence_17', 'Sentence_18', 'Sentence_19',
       'Sentence_20', 'Sentence_21', 'Sentence_22', 'Sentence_23',
       'Sentence_24', 'Sentence_25', 'Sentence_26', 'Sentence_27',
       'Sentence_28', 'Sentence_29', 'Sentence_30', 'Sentence_31',
       'Sentence_32', 'Sentence_33', 'Sentence_34', 'Sentence_35',
       'Sentence_36', 'Sentence_37', 'Sentence_38', 'Sentence_39',
       'Sentence_40', 'Sentence_41', 'Sentence_42', 'Sentence_43',
       'Sentence_44', 'Sentence_45', 'Sentence_46', 'Sentence_47',
       'Sentence_48', 'Sentence_49'], dtype=object)

In [10]:
# create split of unknown quanities and create n new columns based on split
scripts_df[sentences_list] = scripts_df['Dialogue'].str.split('[+|@|$]', expand=True).add_prefix('Sentence_')
scripts_df.head()

Unnamed: 0.1,Unnamed: 0,Character,Dialogue,EpisodeNo,SEID,Season,Sentence_0,Sentence_1,Sentence_2,Sentence_3,...,Sentence_40,Sentence_41,Sentence_42,Sentence_43,Sentence_44,Sentence_45,Sentence_46,Sentence_47,Sentence_48,Sentence_49
0,0,Jerry,Do you know what this is all about?@ Do you kn...,1,PILOT,1,Do you know what this is all about?,"Do you know, why were here?","To be out, this is out.",.,...,,,,,,,,,,
1,1,Jerry,"(pointing at Georges shirt) See, to me, that b...",1,PILOT,1,"(pointing at Georges shirt) See, to me, that b...",The second button literally makes or breaks t...,Its too high!,Its in no-mans-land.,...,,,,,,,,,,
2,2,George,Are you through?@,1,PILOT,1,Are you through?,,,,...,,,,,,,,,,
3,3,Jerry,"You do of course try on, when you buy?@",1,PILOT,1,"You do of course try on, when you buy?",,,,...,,,,,,,,,,
4,4,George,"Yes, it was purple, I liked it, I dont actuall...",1,PILOT,1,"Yes, it was purple, I liked it, I dont actuall...",,,,...,,,,,,,,,,


In [11]:
# replace substitue symbols with proper puncutation to reintroduce
# use a for loop to run through the elements of the sentences_list, adding Dialogue as a list item

# np.append(sentences_list,'Dialogue')

# run a for loop to properly reintroduce the punctuation
# for row in scripts_df["Dialogue"]:
# #     try:
scripts_df["Dialogue"] = scripts_df["Dialogue"].str.replace('@', '')
scripts_df["Dialogue"] = scripts_df["Dialogue"].str.replace('$', '')
scripts_df["Dialogue"] = scripts_df["Dialogue"].str.replace('+', '')           
#     except:
        

  scripts_df["Dialogue"] = scripts_df["Dialogue"].str.replace('$', '')
  scripts_df["Dialogue"] = scripts_df["Dialogue"].str.replace('+', '')


In [12]:
scripts_df.head()

Unnamed: 0.1,Unnamed: 0,Character,Dialogue,EpisodeNo,SEID,Season,Sentence_0,Sentence_1,Sentence_2,Sentence_3,...,Sentence_40,Sentence_41,Sentence_42,Sentence_43,Sentence_44,Sentence_45,Sentence_46,Sentence_47,Sentence_48,Sentence_49
0,0,Jerry,Do you know what this is all about? Do you kno...,1,PILOT,1,Do you know what this is all about?,"Do you know, why were here?","To be out, this is out.",.,...,,,,,,,,,,
1,1,Jerry,"(pointing at Georges shirt) See, to me, that b...",1,PILOT,1,"(pointing at Georges shirt) See, to me, that b...",The second button literally makes or breaks t...,Its too high!,Its in no-mans-land.,...,,,,,,,,,,
2,2,George,Are you through?,1,PILOT,1,Are you through?,,,,...,,,,,,,,,,
3,3,Jerry,"You do of course try on, when you buy?",1,PILOT,1,"You do of course try on, when you buy?",,,,...,,,,,,,,,,
4,4,George,"Yes, it was purple, I liked it, I dont actuall...",1,PILOT,1,"Yes, it was purple, I liked it, I dont actuall...",,,,...,,,,,,,,,,
