In [2]:
import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno # missing data visualisation
%matplotlib inline
sns.set_style("darkgrid")
sns.set_palette("Set1")

import nltk
import spacy
import contractions
import unidecode
import unicodedata
from dateutil.parser import parse

In [3]:
transcripts_df = pd.read_csv('../data/interim_data/st_transcripts_w_metadata.csv')

In [4]:
transcripts_df.head()

Unnamed: 0,title,series,stardate,airdate,title_clean,transcript
0,The Cage,TOS,Unknown,1988-10-04,the cage,\n\n\n\n\n\nThe Star Trek Transcripts - The Ca...
1,Where No Man Has Gone Before,TOS,1312.4 - 1313.8,1966-09-22,where no man has gone before,\n\n\n\n\n\nThe Star Trek Transcripts - Where ...
2,The Corbomite Maneuver,TOS,1512.2 - 1514.1,1966-11-10,the corbomite maneuver,\n\n\n\n\n\nThe Star Trek Transcripts - The Co...
3,Mudd's Women,TOS,1329.8 - 1330.1,1966-10-13,mudds women,\n\n\n\n\n\nThe Star Trek Transcripts - Mudd's...
4,The Enemy Within,TOS,1672.1 - 1673.1,1966-10-06,the enemy within,\n\n\n\n\n\nThe Star Trek Transcripts - The En...


In [5]:
transcripts_df.sort_values(by='series')

Unnamed: 0,title,series,stardate,airdate,title_clean,transcript
730,Will You Take My Hand?,DIS,Unknown,2018-02-11,will you take my hand,\n\n\n\n\n\nStar Trek Discovery Transcripts - ...
716,The Vulcan Hello,DIS,1207.3,2017-09-24,the vulcan hello,\n\n\n\n\n\nStar Trek Discovery Transcripts - ...
717,Battle at the Binary Stars,DIS,1207.3,2017-09-24,battle at the binary stars,\n\n\n\n\n\nStar Trek Discovery Transcripts - ...
718,Context Is for Kings,DIS,Unknown,2017-10-01,context is for kings,\n\n\n\n\n\nStar Trek Discovery Transcripts - ...
719,The Butcher's Knife Cares Not for the Lamb's Cry,DIS,Unknown,2017-10-08,the butchers knife cares not for the lambs cry,\n\n\n\n\n\nStar Trek Discovery Transcripts - ...
...,...,...,...,...,...,...
580,Blink of an Eye,VOY,Unknown,2000-01-19,blink of an eye,\n\n\n\n\n\nThe Voyager Transcripts - Blink Of...
581,Virtuoso,VOY,53556.4,2000-01-26,virtuoso,\n\n\n\n\n\nThe Voyager Transcripts - Virtuoso...
582,Memorial,VOY,Unknown,2000-02-02,memorial,\n\n\n\n\n\nThe Voyager Transcripts - Memorial...
570,Survival Instinct,VOY,53049.2,1999-09-29,survival instinct,\n\n\n\n\n\nThe Voyager Transcripts - Survival...


In [6]:
transcripts_df.iloc[0]['transcript']

"\n\n\n\n\n\nThe Star Trek Transcripts - The Cage\n\n\n\nThe\r\nCage\nUnaired\r\npilot\n\n\n\n\n\n\n [Bridge]\n\nSPOCK: Check the circuit. \r\nTYLER: All operating, sir. \r\nSPOCK: It can't be the screen then. Definitely something out there,\r\nCaptain, headed this way. \r\nTYLER: It could be these meteorites. \r\nONE: No, it's something else. There's still something out there. \r\nTYLER: It's coming at the speed of light, collision course. The\r\nmeteorite beam has not deflected it, Captain.\r\nONE: Evasive manoeuvres, sir?\r\nPIKE: Steady as we go.\r\nGARISON: It's a radio wave, sir. We're passing through an old-style\r\ndistress signal.\r\nPIKE: They were keyed to cause interference and attract attention this\r\nway.\r\nGARISON: A ship in trouble making a forced landing, sir. That's it. No\r\nother message.\r\nTYLER: I have a fix. It comes from the Talos star group.\r\nONE: We've no ships or Earth colonies that far out.\r\nSPOCK: Their call letters check with a survey expedition. SS

In [119]:
test_txt = transcripts_df.iloc[1]['transcript']

In [109]:
# remove space characters, e.g. newlines
def remove_spaces(script):
    script_clean = script.replace("\r", " ") # replace \r with whitespace
    script_clean = re.sub(r'\n+', ' ', script_clean) # replace multiple newlines with single whitespace
    script_clean = re.sub('\s+',' ', script_clean) # replace multiple whitespaces with single whitespace
    script_clean = script_clean.strip() #remove leading/trailing whitespace
    
    return script_clean

In [169]:
# Website specific text needs to be removed:

# Heading: "The Star Trek Transcripts - "
# Original Title, "Unaired"/"pilot" (first occurence before first character)

# Closing line with Copyrights:
# <Back\r\nto the episode listing\nStar\r\nTrek ® is copyright of  CBS\r\nStudios Inc. 
# Copyright © 1966, Present.\r\nThe Star Trek web pages on this site are for educational 
# and\r\nentertainment purposes only. All other copyrights property of their\r\nrespective holders.\n"

def remove_site_specs(text):
    # replace \r\n character sequence with space
    clean_text = text.replace('\r\n', ' ')
    clean_text = clean_text.replace('<', '')
    clean_text = remove_spaces(clean_text)
    
    # rm 'The Star Trek Transcripts - ' etc. line 
    clean_text = clean_text.replace('The Star Trek Transcripts - ', '')
    clean_text = clean_text.replace('The Animated Star Trek Transcripts - ', '')
    clean_text = clean_text.replace('The Next Generation Transcripts - ', '')
    clean_text = clean_text.replace('The Deep Space Nine Transcripts - ', '')
    clean_text = clean_text.replace('The Voyager Transcripts - ', '')
    clean_text = clean_text.replace('The Enterprise Transcripts - ', '')
    clean_text = clean_text.replace('Star Trek Discovery Transcripts - ', '')
    
    # remove reference at the end of document
    clean_text = clean_text.replace('Back to the episode listing Star Trek ® is copyright of CBS Studios Inc. Copyright © 1966, Present. The Star Trek web pages on this site are for educational and entertainment purposes only. All other copyrights property of their respective holders.', '')
    clean_text = clean_text.replace('Star Trek ® is copyright of CBS Studios Inc . Copyright © 1966, Present. The Star Trek web pages on this site are for educational and entertainment purposes only. All other copyrights property of their respective holders.', '')
    clean_text = clean_text.replace('Star Trek is copyright of CBS Studios Inc . Copyright 1966, Present. The Star Trek web pages on this site are for educational and entertainment purposes only. All other copyrights property of their respective holders.', '')
    clean_text = clean_text.replace('Back to the episode listing', '')
    clean_text = clean_text.replace('Star Trek is copyright of CBS Studios Inc. Copyright 1966, Present. The Star Trek web pages on this site are for educational and entertainment purposes only. All other copyrights property of their respective holders.', '')
    return clean_text

In [141]:
# Replace character sequence from Website
# Replace accented characters
# Remove extra spaces
# Remove website specific strings, e.g. Copyright
# Expand contactions

def preclean_transcript(transcript):
    # replace \r\n character sequence with space
    clean_text = transcript.replace('\r\n', ' ')
    
    # remove accented characters
    clean_text = unicodedata.normalize('NFKD', clean_text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    
    # remove extra spaces & website specific strings, e.g. copyright
    clean_text = remove_site_specs(clean_text)
    
    # Expand contractions to easier deal with apostrophes (Klingon names)
    # https://www.geeksforgeeks.org/nlp-expand-contractions-in-text-processing/
    expanded_words = []    
    for word in clean_text.split():
      # using contractions.fix to expand the shotened words
      expanded_words.append(contractions.fix(word))   

    expanded_text = ' '.join(expanded_words)
    
    # text inside [] signifies location
    # remove all text inside []
    expanded_text = re.sub("[\[].*?[\]]", "", expanded_text)
    
    # remove space before :
    expanded_text = re.sub(' :', ':', expanded_text)
    
    # remove repeated whitespace inside a string
    final_text = re.sub('\s+',' ', expanded_text)
    
    return final_text

In [170]:
transcripts_df['clean_transcript'] = transcripts_df['transcript'].map(lambda x: preclean_transcript(x))

In [144]:
transcripts_df

Unnamed: 0,title,series,stardate,airdate,title_clean,transcript,clean_transcript
0,The Cage,TOS,Unknown,1988-10-04,the cage,\n\n\n\n\n\nThe Star Trek Transcripts - The Ca...,The Cage The Cage Unaired pilot SPOCK: Check t...
1,Where No Man Has Gone Before,TOS,1312.4 - 1313.8,1966-09-22,where no man has gone before,\n\n\n\n\n\nThe Star Trek Transcripts - Where ...,Where No Man Has Gone Before Where No Man Has ...
2,The Corbomite Maneuver,TOS,1512.2 - 1514.1,1966-11-10,the corbomite maneuver,\n\n\n\n\n\nThe Star Trek Transcripts - The Co...,The Corbomite Maneuver The Corbomite Maneuver ...
3,Mudd's Women,TOS,1329.8 - 1330.1,1966-10-13,mudds women,\n\n\n\n\n\nThe Star Trek Transcripts - Mudd's...,Mudd's Women Mudd's Women Stardate: 1329.8 Ori...
4,The Enemy Within,TOS,1672.1 - 1673.1,1966-10-06,the enemy within,\n\n\n\n\n\nThe Star Trek Transcripts - The En...,The Enemy Within The Enemy WIthin Stardate: 16...
...,...,...,...,...,...,...,...
726,The Wolf Inside,DIS,Unknown,2018-01-14,the wolf inside,\n\n\n\n\n\nStar Trek Discovery Transcripts - ...,The Wolf Inside The Wolf Inside Stardate: Unkn...
727,Vaulting Ambition,DIS,Unknown,2018-01-21,vaulting ambition,\n\n\n\n\n\nStar Trek Discovery Transcripts - ...,Vaulting Ambition Vaulting Ambition Stardate: ...
728,What's Past Is Prologue,DIS,1834.2,2018-01-28,whats past is prologue,\n\n\n\n\n\nStar Trek Discovery Transcripts - ...,what is Past is Prologue what is Past is Prolo...
729,"The War Without, The War Within",DIS,Unknown,2018-02-04,the war without the war within,\n\n\n\n\n\nStar Trek Discovery Transcripts - ...,"The War Without, The War Within The War Withou..."


In [171]:
transcripts_df.iloc[0]['clean_transcript']

"The Cage The Cage Unaired pilot SPOCK: Check the circuit. TYLER: All operating, sir. SPOCK: It can not be the screen then. Definitely something out there, Captain, headed this way. TYLER: It could be these meteorites. ONE: No, it is something else. there is still something out there. TYLER: it is coming at the speed of light, collision course. The meteorite beam has not deflected it, Captain. ONE: Evasive manoeuvres, sir? PIKE: Steady as we go. GARISON: it is a radio wave, sir. we are passing through an old-style distress signal. PIKE: They were keyed to because interference and attract attention this way. GARISON: A ship in trouble making a forced landing, sir. that is it. No other message. TYLER: I have a fix. It comes from the Talos star group. ONE: we have no ships or Earth colonies that far out. SPOCK: Their call letters check with a survey expedition. SS Columbia. It disappeared in that region approximately eighteen years ago. TYLER: It would take that long for a radio beam to t

----

In [173]:
def create_lines_list(transcript, title):
    # remove apostrophes
    no_apos = transcript.replace("'", "")
    
    # add newline before uppercase words (=names) & split at newlines
    lines_list = re.sub(r"(\w+):", r'\n\1:', no_apos).split('\n')
    
    # filter for non-speech strings
    matchers = ['Transcripts', 'Airdate', 'Unaired', 'CBS', 'Stardate', title]
    matching = [s for s in lines_list if any(xs in s for xs in matchers)]
    print(matching)
    
    # create list of all speech items
    lines_list = [x for x in lines_list if x not in matching]
    return lines_list

In [174]:
lines_list = create_lines_list(transcripts_df.iloc[0]['clean_transcript'], transcripts_df.iloc[0]['title'])

['The Cage The Cage Unaired pilot ']


In [175]:
len(lines_list)

333

In [176]:
lines_list

['SPOCK: Check the circuit. ',
 'TYLER: All operating, sir. ',
 'SPOCK: It can not be the screen then. Definitely something out there, Captain, headed this way. ',
 'TYLER: It could be these meteorites. ',
 'ONE: No, it is something else. there is still something out there. ',
 'TYLER: it is coming at the speed of light, collision course. The meteorite beam has not deflected it, Captain. ',
 'ONE: Evasive manoeuvres, sir? ',
 'PIKE: Steady as we go. ',
 'GARISON: it is a radio wave, sir. we are passing through an old-style distress signal. ',
 'PIKE: They were keyed to because interference and attract attention this way. ',
 'GARISON: A ship in trouble making a forced landing, sir. that is it. No other message. ',
 'TYLER: I have a fix. It comes from the Talos star group. ',
 'ONE: we have no ships or Earth colonies that far out. ',
 'SPOCK: Their call letters check with a survey expedition. SS Columbia. It disappeared in that region approximately eighteen years ago. ',
 'TYLER: It wou

In [177]:
character_list = []
line_list = []

for line in lines_list:
    line_splitted = line.split(':')
    character_list.append(line_splitted[0])
    line_list.append(line_splitted[1])

In [178]:
pd.DataFrame(list(zip(character_list, line_list)),
               columns =['character', 'line'])

Unnamed: 0,character,line
0,SPOCK,Check the circuit.
1,TYLER,"All operating, sir."
2,SPOCK,It can not be the screen then. Definitely som...
3,TYLER,It could be these meteorites.
4,ONE,"No, it is something else. there is still some..."
...,...,...
328,TYLER,"Eve, sir? Yes, sir."
329,BOYCE,Eve as in Adam?
330,PIKE,As in all ships doctors are dirty old men. Wh...
331,ONE,"All decks show ready, sir."


----

In [90]:
transcripts_df.head()

Unnamed: 0,title,series,stardate,airdate,title_clean,transcript,clean_transcript
0,The Cage,TOS,Unknown,1988-10-04,the cage,\n\n\n\n\n\nThe Star Trek Transcripts - The Ca...,The Cage The Cage Unaired pilot SPOCK: Check t...
1,Where No Man Has Gone Before,TOS,1312.4 - 1313.8,1966-09-22,where no man has gone before,\n\n\n\n\n\nThe Star Trek Transcripts - Where ...,Where No Man Has Gone Before Where No Man Has ...
2,The Corbomite Maneuver,TOS,1512.2 - 1514.1,1966-11-10,the corbomite maneuver,\n\n\n\n\n\nThe Star Trek Transcripts - The Co...,The Corbomite Maneuver The Corbomite Maneuver ...
3,Mudd's Women,TOS,1329.8 - 1330.1,1966-10-13,mudds women,\n\n\n\n\n\nThe Star Trek Transcripts - Mudd's...,Mudd's Women Mudd's Women Stardate: 1329.8 Ori...
4,The Enemy Within,TOS,1672.1 - 1673.1,1966-10-06,the enemy within,\n\n\n\n\n\nThe Star Trek Transcripts - The En...,The Enemy Within The Enemy WIthin Stardate: 16...


In [70]:
lines_df = pd.DataFrame(columns=['title', 'series', 'stardate', 'airdate', 'title_clean', 'character', 'line'])

In [106]:
def create_lines(title, title_clean, series, stardate, airdate, transcript):
    character_list = []
    line_list = []
    
    lines_list = create_lines_list(transcript, title)
    
    title_list = [title] * len(lines_list)
    title_clean_list = [title_clean] * len(lines_list)
    series_list = [series] * len(lines_list)
    stardate_list = [stardate] * len(lines_list)
    airdate_list = [airdate] * len(lines_list)
    
    
    for line in lines_list:
        line_splitted = line.split(':')
        character_list.append(line_splitted[0])
        line_list.append(line_splitted[1])
        
    # dictionary of lists 
    dict = {'title': title_list, 'series': series_list, 'stardate': stardate_list, 
            'airdate': airdate_list, 'title_clean': title_clean_list, 
            'character': character_list, 'line': line_list} 

    df = pd.DataFrame(dict)
        
    return df
    

In [179]:
character_list = []
line_list = []
    
lines_list = create_lines_list(transcripts_df.iloc[730]['clean_transcript'], transcripts_df.iloc[730]['title'])

['Will You Take My Hand? Will You Take My Hand? ', 'Stardate: 2257Original ', 'Airdate: 11 Feb 2018 ']


In [180]:
title_list = [transcripts_df.iloc[730]['title']] * len(lines_list)
title_clean_list = [transcripts_df.iloc[730]['title_clean']] * len(lines_list)
series_list = [transcripts_df.iloc[730]['series']] * len(lines_list)
stardate_list = [transcripts_df.iloc[730]['stardate']] * len(lines_list)
airdate_list = [transcripts_df.iloc[730]['airdate']] * len(lines_list)

In [181]:
lines_list

['KLINGON: We have acquired the target. (Our viewpoint moves in from the Klingon fleet past Jupiter and Mars to a blue planet with a single large moon, a spacedock and plenty of small satellites.) ',
 'BURNHAM: On the eve of battle, on a cold and windless night, an old general turned to a young soldier. "Tomorrow," said the master, "you will know fear." The young soldier, who had not yet experienced the agony of war, looked at the general with quizzical eyes. "How will I know fear if I do not know what it looks like?" The general replied, "You will know fear because it speaks very fast and it speaks very loud." ',
 'GEORGIOU: Status, helm? ',
 'DETMER: Bearing 94 mark 21, 12 light-years from Klingon homeworld ',
 'GEORGIOU: Do not show respect by referring to that green dot as homeworld. Klingons are animals, and they do not have homes. Call it QonoS, or the enemy planet. Ops? ',
 'OWOSEKUN: Shields holding at 100%. Our scans show ',
 'GEORGIOU: I have little interest in what we are sc

In [182]:
for line in lines_list:
    line_splitted = line.split(':')
    print(line_splitted[0])
    #print(line_splitted[1])
    #character_list.append(line_splitted[0])
    #line_list.append(line_splitted[1])

KLINGON
BURNHAM
GEORGIOU
DETMER
GEORGIOU
OWOSEKUN
GEORGIOU
BRYCE
GEORGIOU
BURNHAM
SARU
SARU
BURNHAM
GEORGIOU
GEORGIOU
SARU
GEORGIOU
BURNHAM
GEORGIOU
BURNHAM
GEORGIOU
BURNHAM
GEORGIOU
GEORGIOU
BURNHAM
GEORGIOU
BURNHAM
GEORGIOU
BURNHAM
LRELL
GEORGIOU
LRELL
LRELL
GEORGIOU
LRELL
BURNHAM
LRELL
GEORGIOU
BURNHAM
GEORGIOU
LRELL
BURNHAM
GEORGIOU
BURNHAM
GEORGIOU
TYLER
GEORGIOU
BURNHAM
TYLER
BURNHAM
TYLER
GEORGIOU
BURNHAM
GEORGIOU
BURNHAM
TYLER
GEORGIOU
TILLY
GEORGIOU
TILLY
TYLER
TILLY
BURNHAM
TILLY
BURNHAM
GEORGIOU
TILLY
BURNHAM
TILLY
GEORGIOU
TYLER
BURNHAM
SARU
STAMETS
SARU
AIRIAM
COMPUTER
OWOSEKUN
DETMER
STAMETS
SARU
SARU
GEORGIOU
TILLY
GEORGIOU
TYLER
WOMAN
GEORGIOU
TILLY
GEORGIOU
TILLY
GEORGIOU
WOMAN
TILLY
STALLHOLDER
TILLY
BURNHAM
TILLY
BURNHAM
TILLY
BURNHAM
TILLY
BURNHAM
GEORGIOU
TILLY
TYLER
TILLY
GEORGIOU
SHAVO
GEORGIOU
TILLY
GEORGIOU
GEORGIOU
SHAVO
GEORGIOU
TILLY
GEORGIOU
TYLER
BURNHAM
TYLER
BURNHAM
TYLER
BURNHAM
TYLER
KLINGON
TYLER
KLINGON
TYLER
CROUPIER
TYLER
CROUPIER
TYLER
ORION
TILLY

In [128]:
transcripts_df.iloc[730]

title                                          Will You Take My Hand?
series                                                            DIS
stardate                                                      Unknown
airdate                                                    2018-02-11
title_clean                                     will you take my hand
transcript          \n\n\n\n\n\nStar Trek Discovery Transcripts - ...
clean_transcript    Will You Take My Hand? Will You Take My Hand? ...
Name: 730, dtype: object

In [190]:
create_lines(transcripts_df.iloc[3]['title'], transcripts_df.iloc[3]['title_clean'],
             transcripts_df.iloc[3]['series'], transcripts_df.iloc[3]['stardate'], 
             transcripts_df.iloc[3]['airdate'], transcripts_df.iloc[3]['clean_transcript'])

['Stardate: 1329.8 Original ', 'Airdate: 13 Oct, 1966 Captains log, Stardate 1329.8. The you.S.S. Enterprise in pursuit of an unidentified vessel. ', 'MUDD: Well, no, Captain. This is me cargo. Captains log, Stardate 1329.1. we have taken aboard from unregistered transport vessel its captain and, and three unusual females. These women have a mysterious magnetic effect on the male members of my crew, including myself. Explanation unknown at present. ', 'SPOCK: Rigel 12, Mister Farrell. You have the course. Captains log, Stardate 1329.2. On board the you.S.S. Enterprise, a ships hearing is being convened against the transport vessels captain. I am becoming concerned about the almost-hypnotic effect produced by the women. ', 'KIRK: This hearing is convened. Stardate 1329.2, on board starship you.S.S. Enterprise. Formal hearings against transport captain Leo Walsh. Start computer. ', 'COMPUTER: Incorrect. Masters license revoked Stardate 1116.4. ', 'MUDD: Oh, you beautiful galaxy! Oh, that

IndexError: list index out of range

In [188]:
line_dfs_list = []

for i, row in transcripts_df.iterrows():
    print('Processing lines in ' + row['series'] + ' - ' + row['title'] + '...')
    # for each episode:
    # create lines_list
    # create characters & line list
    # create lists w/ title, series, airdate, title_clean - [x] * len(line_list)
    # convert all lists to a df via dict
    # append to new df: transcripts_df = pd.concat([transcripts_df, multipart_df]).reset_index(drop=True)
    df = create_lines(row['title'], row['title_clean'], row['series'], row['stardate'], \
                      row['airdate'], row['clean_transcript'])
    
    line_dfs_list.append(df)

Processing lines in TOS - The Cage...
['The Cage The Cage Unaired pilot ']
Processing lines in TOS - Where No Man Has Gone Before...
Processing lines in TOS - The Corbomite Maneuver...
['The Corbomite Maneuver The Corbomite Maneuver ', 'Stardate: 1512.2 Original ', 'Airdate: 10 Nov, 1966 ', 'BALOK: (laughs) I see. We think much alike, Captain, you and I. Now, before I bring back the Fesarius, let me show you my vessel. It is not often I have this pleasure. Yes, we are very much alike, Captain. Both proud of our ships. Star Trek is copyright of CBS Studios INC. Copyright 1966, Present. The Star Trek web pages on this site are for educational and entertainment purposes only. All other copyrights property of their respective holders.']
Processing lines in TOS - Mudd's Women...
['Stardate: 1329.8 Original ', 'Airdate: 13 Oct, 1966 Captains log, Stardate 1329.8. The you.S.S. Enterprise in pursuit of an unidentified vessel. ', 'MUDD: Well, no, Captain. This is me cargo. Captains log, Stardat

IndexError: list index out of range

In [189]:
line_dfs_list

[        title series stardate     airdate title_clean character  \
 0    The Cage    TOS  Unknown  1988-10-04    the cage     SPOCK   
 1    The Cage    TOS  Unknown  1988-10-04    the cage     TYLER   
 2    The Cage    TOS  Unknown  1988-10-04    the cage     SPOCK   
 3    The Cage    TOS  Unknown  1988-10-04    the cage     TYLER   
 4    The Cage    TOS  Unknown  1988-10-04    the cage       ONE   
 ..        ...    ...      ...         ...         ...       ...   
 328  The Cage    TOS  Unknown  1988-10-04    the cage     TYLER   
 329  The Cage    TOS  Unknown  1988-10-04    the cage     BOYCE   
 330  The Cage    TOS  Unknown  1988-10-04    the cage      PIKE   
 331  The Cage    TOS  Unknown  1988-10-04    the cage       ONE   
 332  The Cage    TOS  Unknown  1988-10-04    the cage      PIKE   
 
                                                   line  
 0                                  Check the circuit.   
 1                                 All operating, sir.   
 2     I

In [68]:
# characters' list
list(set(re.findall(r"(\w+):", ' '.join(lines_list))))

['SHAVA',
 'CORNWELL',
 'TILLY',
 'Federation',
 'BRUCE',
 'WOMAN',
 'BURNHAM',
 'STALLHOLDER',
 'DETMER',
 'BRYCE',
 'AIRIAM',
 'GEORGIOU',
 'KLINGON',
 'SARU',
 'SHAVO',
 'CROUPIER',
 'ORION',
 'TYLER',
 'STAMETS',
 'AMANDA',
 'OWOSEKUN',
 'COMPUTER',
 'LRELL',
 'SAREK']

In [191]:
def clean_text(script):
	script_clean = script.strip()
	script_clean = script_clean.replace("\n", "")
	script_clean = script_clean.replace("\r", " ")
	script_clean = script_clean.replace("\r\n", "")
	script_clean = re.sub("([\(\[]).*?([\)\]])", "", script_clean)
	script_clean = re.sub(r'\.([a-zA-Z])', r'. \1', script_clean) #remove missing whitespace between character lines.
	script_clean = re.sub(r'\!([a-zA-Z])', r'! \1', script_clean)
	script_clean = re.sub(r'\?([a-zA-Z])', r'? \1', script_clean)
	return script_clean

In [214]:
gh_test = clean_text(transcripts_df.iloc[730]['clean_transcript'])

In [215]:
gh_test

'Will You Take My Hand? Will You Take My Hand? Stardate: 2257Original Airdate: 11 Feb 2018 KLINGON: We have acquired the target.  BURNHAM: On the eve of battle, on a cold and windless night, an old general turned to a young soldier. "Tomorrow," said the master, "you will know fear." The young soldier, who had not yet experienced the agony of war, looked at the general with quizzical eyes. "How will I know fear if I do not know what it looks like?" The general replied, "You will know fear because it speaks very fast and it speaks very loud." GEORGIOU: Status, helm? DETMER: Bearing 94 mark 21, 12 light-years from Klingon homeworld GEORGIOU: Do not show respect by referring to that green dot as homeworld. Klingons are animals, and they do not have homes. Call it Qo\'noS, or the enemy planet. Ops? OWOSEKUN: Shields holding at 100%. Our scans show GEORGIOU: I have little interest in what we are scanning. I care what is scanning us. Communications? BRYCE: Discovery is running dark, Captain. 

In [216]:
def get_cast(script_clean):
	tokens = nltk.word_tokenize(script_clean)
	cast = []
	for word in tokens:
		if re.search("\\b[A-Z]{3,}\\b", word) is not None:
			cast.append(word)
	return list(set(cast))

In [217]:
gh_cast = get_cast(gh_test)

In [218]:
gh_cast

['SHAVA',
 'CORNWELL',
 'TILLY',
 'BRUCE',
 'WOMAN',
 'BURNHAM',
 'STALLHOLDER',
 'DETMER',
 'BRYCE',
 'AIRIAM',
 'GEORGIOU',
 'KLINGON',
 'SARU',
 'ORION',
 'SHAVO',
 'CROUPIER',
 'USS',
 'TYLER',
 "L'RELL",
 'STAMETS',
 'AMANDA',
 'OWOSEKUN',
 'III',
 'COMPUTER',
 'SAREK']

In [219]:
def get_lines(script_clean, cast):
	split_script = script_clean.split(':')
	lines_dict = dict.fromkeys(cast)
	for cast_member in cast:
		lines = []
		for i in range(len(split_script)-1):
			if cast_member in split_script[i].strip().split(" "):
				line = split_script[i+1].strip().split(" ")
				line = [word for word in line if word != '']
				for member in cast:
					if member in line:
						line.remove(member)
				line = ' '.join(line)
				lines.append(line)
		lines_dict[cast_member] = lines

	return lines_dict

In [220]:
gh_lines = get_lines(gh_test, gh_cast)

In [221]:
gh_lines

{'SHAVA': ['But we have to charge you.'],
 'CORNWELL': ['You want to do this here? Fine. Terms of atrocity are convenient after the fact. The Klingons are on the verge of wiping out the Federation.',
  'It very soon will be. We do not have the luxury of principles.',
  'What is it you are suggesting?',
  'Today, we honor Ensign Sylvia Tilly. Accepted into Starfleet Command Training Program.',
  'Lieutenant Commander Paul Stamets. Medical Officer Hugh Culber.',
  'Commander Saru. First Kelpien to receive the Medal of Honour.'],
 'TILLY': ['Thank you. I have heard and read so much about you. Michael can attest, I am always asking about Captain Georgiou.',
  'Oh, she is not',
  'So she is from',
  'you are the Terran Emperor.',
  'I know. What is happening?',
  'Oh. But just so you are clear, I am not the same person I was in your universe.',
  'Not a lot of humans around here.',
  'Insult her again, and your nose is going to be able to sniff the back of your head.',
  'Nausicaan disrupto

In [223]:
for key,val in gh_lines.items():
    print(key)
    print(val)

SHAVA
['But we have to charge you.']
CORNWELL
['You want to do this here? Fine. Terms of atrocity are convenient after the fact. The Klingons are on the verge of wiping out the Federation.', 'It very soon will be. We do not have the luxury of principles.', 'What is it you are suggesting?', 'Today, we honor Ensign Sylvia Tilly. Accepted into Starfleet Command Training Program.', 'Lieutenant Commander Paul Stamets. Medical Officer Hugh Culber.', 'Commander Saru. First Kelpien to receive the Medal of Honour.']
TILLY
['Thank you. I have heard and read so much about you. Michael can attest, I am always asking about Captain Georgiou.', 'Oh, she is not', 'So she is from', 'you are the Terran Emperor.', 'I know. What is happening?', 'Oh. But just so you are clear, I am not the same person I was in your universe.', 'Not a lot of humans around here.', 'Insult her again, and your nose is going to be able to sniff the back of your head.', 'Nausicaan disruptor pistols. Latest spec. Paralithium cell

In [225]:
gh_df_new = pd.DataFrame(list(gh_lines.items()))

In [229]:
gh_df_new.columns = ['character', 'lines']

In [230]:
gh_df_new

Unnamed: 0,character,lines
0,SHAVA,[But we have to charge you.]
1,CORNWELL,[You want to do this here? Fine. Terms of atro...
2,TILLY,[Thank you. I have heard and read so much abou...
3,BRUCE,[Incoming transmission.]
4,WOMAN,"[Keep walking, Federation. No one wants you he..."
5,BURNHAM,"[On the eve of battle, on a cold and windless ..."
6,STALLHOLDER,[Greetings.]
7,DETMER,"[Bearing 94 mark 21, 12 light-years from Kling..."
8,BRYCE,"[Discovery is running dark, Captain., I am hav..."
9,AIRIAM,"[Aye, Captain.]"


In [234]:
rows = []
_ = gh_df_new.apply(lambda row: [rows.append([row['character'], ln]) 
                         for ln in row['lines']], axis=1)

In [237]:
df_new = pd.DataFrame(rows)
df_new.columns = ['character', 'line']

In [238]:
df_new

Unnamed: 0,character,line
0,SHAVA,But we have to charge you.
1,CORNWELL,You want to do this here? Fine. Terms of atroc...
2,CORNWELL,It very soon will be. We do not have the luxur...
3,CORNWELL,What is it you are suggesting?
4,CORNWELL,"Today, we honor Ensign Sylvia Tilly. Accepted ..."
...,...,...
280,SAREK,Michael.
281,SAREK,What the Federation chose to do on Qo'noS was ...
282,SAREK,"And yet, you were able to find another way. I ..."
283,SAREK,"Commander Burnham, the Federation is as gratef..."


In [None]:
transcripts_df.iloc[730]['title']

In [239]:
df_new['title'] = transcripts_df.iloc[730]['title']

In [241]:
df_new['title_clean'] = transcripts_df.iloc[730]['title_clean']

In [243]:
df_new['series'] = transcripts_df.iloc[730]['series']

In [245]:
df_new['airdate'] = transcripts_df.iloc[730]['airdate']

In [247]:
df_new['stardate'] = transcripts_df.iloc[730]['stardate']

In [248]:
df_new

Unnamed: 0,character,line,title,title_clean,series,airdate,stardate
0,SHAVA,But we have to charge you.,Will You Take My Hand?,will you take my hand,DIS,2018-02-11,Unknown
1,CORNWELL,You want to do this here? Fine. Terms of atroc...,Will You Take My Hand?,will you take my hand,DIS,2018-02-11,Unknown
2,CORNWELL,It very soon will be. We do not have the luxur...,Will You Take My Hand?,will you take my hand,DIS,2018-02-11,Unknown
3,CORNWELL,What is it you are suggesting?,Will You Take My Hand?,will you take my hand,DIS,2018-02-11,Unknown
4,CORNWELL,"Today, we honor Ensign Sylvia Tilly. Accepted ...",Will You Take My Hand?,will you take my hand,DIS,2018-02-11,Unknown
...,...,...,...,...,...,...,...
280,SAREK,Michael.,Will You Take My Hand?,will you take my hand,DIS,2018-02-11,Unknown
281,SAREK,What the Federation chose to do on Qo'noS was ...,Will You Take My Hand?,will you take my hand,DIS,2018-02-11,Unknown
282,SAREK,"And yet, you were able to find another way. I ...",Will You Take My Hand?,will you take my hand,DIS,2018-02-11,Unknown
283,SAREK,"Commander Burnham, the Federation is as gratef...",Will You Take My Hand?,will you take my hand,DIS,2018-02-11,Unknown


In [71]:
lines_df

Unnamed: 0,title,series,stardate,airdate,title_clean,character,line


In [249]:
pd.concat([lines_df, df_new])

Unnamed: 0,title,series,stardate,airdate,title_clean,character,line
0,Will You Take My Hand?,DIS,Unknown,2018-02-11,will you take my hand,SHAVA,But we have to charge you.
1,Will You Take My Hand?,DIS,Unknown,2018-02-11,will you take my hand,CORNWELL,You want to do this here? Fine. Terms of atroc...
2,Will You Take My Hand?,DIS,Unknown,2018-02-11,will you take my hand,CORNWELL,It very soon will be. We do not have the luxur...
3,Will You Take My Hand?,DIS,Unknown,2018-02-11,will you take my hand,CORNWELL,What is it you are suggesting?
4,Will You Take My Hand?,DIS,Unknown,2018-02-11,will you take my hand,CORNWELL,"Today, we honor Ensign Sylvia Tilly. Accepted ..."
...,...,...,...,...,...,...,...
280,Will You Take My Hand?,DIS,Unknown,2018-02-11,will you take my hand,SAREK,Michael.
281,Will You Take My Hand?,DIS,Unknown,2018-02-11,will you take my hand,SAREK,What the Federation chose to do on Qo'noS was ...
282,Will You Take My Hand?,DIS,Unknown,2018-02-11,will you take my hand,SAREK,"And yet, you were able to find another way. I ..."
283,Will You Take My Hand?,DIS,Unknown,2018-02-11,will you take my hand,SAREK,"Commander Burnham, the Federation is as gratef..."
