In [2]:
import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno # missing data visualisation
%matplotlib inline
sns.set_style("darkgrid")
sns.set_palette("Set1")

import nltk
import spacy
import contractions
import unidecode
import unicodedata
from dateutil.parser import parse

In [3]:
transcripts_df = pd.read_csv('../data/interim_data/st_transcripts_w_metadata.csv')

In [4]:
transcripts_df.head()

Unnamed: 0,title,series,stardate,airdate,title_clean,transcript
0,The Cage,TOS,Unknown,1988-10-04,the cage,\n\n\n\n\n\nThe Star Trek Transcripts - The Ca...
1,Where No Man Has Gone Before,TOS,1312.4 - 1313.8,1966-09-22,where no man has gone before,\n\n\n\n\n\nThe Star Trek Transcripts - Where ...
2,The Corbomite Maneuver,TOS,1512.2 - 1514.1,1966-11-10,the corbomite maneuver,\n\n\n\n\n\nThe Star Trek Transcripts - The Co...
3,Mudd's Women,TOS,1329.8 - 1330.1,1966-10-13,mudds women,\n\n\n\n\n\nThe Star Trek Transcripts - Mudd's...
4,The Enemy Within,TOS,1672.1 - 1673.1,1966-10-06,the enemy within,\n\n\n\n\n\nThe Star Trek Transcripts - The En...


In [5]:
transcripts_df.sort_values(by='series')

Unnamed: 0,title,series,stardate,airdate,title_clean,transcript
730,Will You Take My Hand?,DIS,Unknown,2018-02-11,will you take my hand,\n\n\n\n\n\nStar Trek Discovery Transcripts - ...
716,The Vulcan Hello,DIS,1207.3,2017-09-24,the vulcan hello,\n\n\n\n\n\nStar Trek Discovery Transcripts - ...
717,Battle at the Binary Stars,DIS,1207.3,2017-09-24,battle at the binary stars,\n\n\n\n\n\nStar Trek Discovery Transcripts - ...
718,Context Is for Kings,DIS,Unknown,2017-10-01,context is for kings,\n\n\n\n\n\nStar Trek Discovery Transcripts - ...
719,The Butcher's Knife Cares Not for the Lamb's Cry,DIS,Unknown,2017-10-08,the butchers knife cares not for the lambs cry,\n\n\n\n\n\nStar Trek Discovery Transcripts - ...
...,...,...,...,...,...,...
580,Blink of an Eye,VOY,Unknown,2000-01-19,blink of an eye,\n\n\n\n\n\nThe Voyager Transcripts - Blink Of...
581,Virtuoso,VOY,53556.4,2000-01-26,virtuoso,\n\n\n\n\n\nThe Voyager Transcripts - Virtuoso...
582,Memorial,VOY,Unknown,2000-02-02,memorial,\n\n\n\n\n\nThe Voyager Transcripts - Memorial...
570,Survival Instinct,VOY,53049.2,1999-09-29,survival instinct,\n\n\n\n\n\nThe Voyager Transcripts - Survival...


In [6]:
transcripts_df.iloc[0]['transcript']

"\n\n\n\n\n\nThe Star Trek Transcripts - The Cage\n\n\n\nThe\r\nCage\nUnaired\r\npilot\n\n\n\n\n\n\n [Bridge]\n\nSPOCK: Check the circuit. \r\nTYLER: All operating, sir. \r\nSPOCK: It can't be the screen then. Definitely something out there,\r\nCaptain, headed this way. \r\nTYLER: It could be these meteorites. \r\nONE: No, it's something else. There's still something out there. \r\nTYLER: It's coming at the speed of light, collision course. The\r\nmeteorite beam has not deflected it, Captain.\r\nONE: Evasive manoeuvres, sir?\r\nPIKE: Steady as we go.\r\nGARISON: It's a radio wave, sir. We're passing through an old-style\r\ndistress signal.\r\nPIKE: They were keyed to cause interference and attract attention this\r\nway.\r\nGARISON: A ship in trouble making a forced landing, sir. That's it. No\r\nother message.\r\nTYLER: I have a fix. It comes from the Talos star group.\r\nONE: We've no ships or Earth colonies that far out.\r\nSPOCK: Their call letters check with a survey expedition. SS

In [83]:
test_txt = transcripts_df.iloc[720]['transcript']

In [34]:
# remove space characters, e.g. newlines
def remove_spaces(script):
    script_clean = script.replace("\r", " ") # replace \r with whitespace
    script_clean = re.sub(r'\n+', ' ', script_clean) # replace multiple newlines with single whitespace
    script_clean = re.sub('\s+',' ', script_clean) # replace multiple whitespaces with single whitespace
    script_clean = script_clean.strip() #remove leading/trailing whitespace
    
    return script_clean

In [39]:
# Website specific text needs to be removed:

# Heading: "The Star Trek Transcripts - "
# Original Title, "Unaired"/"pilot" (first occurence before first character)

# Closing line with Copyrights:
# <Back\r\nto the episode listing\nStar\r\nTrek ® is copyright of  CBS\r\nStudios Inc. 
# Copyright © 1966, Present.\r\nThe Star Trek web pages on this site are for educational 
# and\r\nentertainment purposes only. All other copyrights property of their\r\nrespective holders.\n"

def remove_site_specs(text):
    # replace \r\n character sequence with space
    clean_text = text.replace('\r\n', ' ')
    clean_text = remove_spaces(clean_text)
    
    # rm 'The Star Trek Transcripts - ' etc. line 
    clean_text = clean_text.replace('The Star Trek Transcripts - ', '')
    clean_text = clean_text.replace('The Animated Star Trek Transcripts - ', '')
    clean_text = clean_text.replace('The Next Generation Transcripts - ', '')
    clean_text = clean_text.replace('The Deep Space Nine Transcripts - ', '')
    clean_text = clean_text.replace('The Voyager Transcripts - ', '')
    clean_text = clean_text.replace('The Enterprise Transcripts - ', '')
    clean_text = clean_text.replace('Star Trek Discovery Transcripts - ', '')
    
    # remove reference at the end of document
    clean_text = clean_text.replace('<Back to the episode listing Star Trek ® is copyright of CBS Studios Inc. Copyright © 1966, Present. The Star Trek web pages on this site are for educational and entertainment purposes only. All other copyrights property of their respective holders.', '')
    clean_text = clean_text.replace('Star Trek ® is copyright of CBS Studios Inc . Copyright © 1966, Present. The Star Trek web pages on this site are for educational and entertainment purposes only. All other copyrights property of their respective holders.', '')
    clean_text = clean_text.replace('Star Trek is copyright of CBS Studios Inc . Copyright 1966, Present. The Star Trek web pages on this site are for educational and entertainment purposes only. All other copyrights property of their respective holders.', '')
    return clean_text

In [86]:
# Replace character sequence from Website
# Replace accented characters
# Remove extra spaces
# Remove website specific strings, e.g. Copyright
# Expand contactions

def preclean_transcript(transcript):
    # replace \r\n character sequence with space
    clean_text = transcript.replace('\r\n', ' ')
    
    # remove accented characters
    clean_text = unicodedata.normalize('NFKD', clean_text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    
    # remove extra spaces & website specific strings, e.g. copyright
    clean_text = remove_site_specs(clean_text)
    
    # Expand contractions to easier deal with apostrophes (Klingon names)
    # https://www.geeksforgeeks.org/nlp-expand-contractions-in-text-processing/
    expanded_words = []    
    for word in clean_text.split():
      # using contractions.fix to expand the shotened words
      expanded_words.append(contractions.fix(word))   

    expanded_text = ' '.join(expanded_words)
    
    # text inside [] signifies location
    # remove all text inside []
    expanded_text = re.sub("[\[].*?[\]]", "", expanded_text)
    
    # remove space before :
    expanded_text = re.sub(' :', ':', expanded_text)
    
    # remove repeated whitespace inside a string
    final_text = re.sub('\s+',' ', expanded_text)
    
    return final_text

In [87]:
preclean_transcript(test_txt)



In [88]:
transcripts_df['clean_transcript'] = transcripts_df['transcript'].map(lambda x: preclean_transcript(x))

In [89]:
transcripts_df

Unnamed: 0,title,series,stardate,airdate,title_clean,transcript,clean_transcript
0,The Cage,TOS,Unknown,1988-10-04,the cage,\n\n\n\n\n\nThe Star Trek Transcripts - The Ca...,The Cage The Cage Unaired pilot SPOCK: Check t...
1,Where No Man Has Gone Before,TOS,1312.4 - 1313.8,1966-09-22,where no man has gone before,\n\n\n\n\n\nThe Star Trek Transcripts - Where ...,Where No Man Has Gone Before Where No Man Has ...
2,The Corbomite Maneuver,TOS,1512.2 - 1514.1,1966-11-10,the corbomite maneuver,\n\n\n\n\n\nThe Star Trek Transcripts - The Co...,The Corbomite Maneuver The Corbomite Maneuver ...
3,Mudd's Women,TOS,1329.8 - 1330.1,1966-10-13,mudds women,\n\n\n\n\n\nThe Star Trek Transcripts - Mudd's...,Mudd's Women Mudd's Women Stardate: 1329.8 Ori...
4,The Enemy Within,TOS,1672.1 - 1673.1,1966-10-06,the enemy within,\n\n\n\n\n\nThe Star Trek Transcripts - The En...,The Enemy Within The Enemy WIthin Stardate: 16...
...,...,...,...,...,...,...,...
726,The Wolf Inside,DIS,Unknown,2018-01-14,the wolf inside,\n\n\n\n\n\nStar Trek Discovery Transcripts - ...,The Wolf Inside The Wolf Inside Stardate: Unkn...
727,Vaulting Ambition,DIS,Unknown,2018-01-21,vaulting ambition,\n\n\n\n\n\nStar Trek Discovery Transcripts - ...,Vaulting Ambition Vaulting Ambition Stardate: ...
728,What's Past Is Prologue,DIS,1834.2,2018-01-28,whats past is prologue,\n\n\n\n\n\nStar Trek Discovery Transcripts - ...,what is Past is Prologue what is Past is Prolo...
729,"The War Without, The War Within",DIS,Unknown,2018-02-04,the war without the war within,\n\n\n\n\n\nStar Trek Discovery Transcripts - ...,"The War Without, The War Within The War Withou..."


----

In [55]:
no_apos = expanded_text.replace("'", "")
lines_list = re.sub(r"(\w+):", r'\n\1:', no_apos).split('\n')
#lines_list = lines_list.split('<')
matchers = ['Transcripts', 'Airdate', 'CBS', 'Stardate', transcripts_df.iloc[730]['title']]
matching = [s for s in lines_list if any(xs in s for xs in matchers)]
print(matching)

['Will You Take My Hand? Will You Take My Hand? ', 'Stardate: 2257Original ', 'Airdate: 11 Feb 2018 ']


In [56]:
lines_list = [x for x in lines_list if x not in matching]
lines_list

['KLINGON: We have acquired the target. (Our viewpoint moves in from the Klingon fleet past Jupiter and Mars to a blue planet with a single large moon, a spacedock and plenty of small satellites.) ',
 'BURNHAM: On the eve of battle, on a cold and windless night, an old general turned to a young soldier. "Tomorrow," said the master, "you will know fear." The young soldier, who had not yet experienced the agony of war, looked at the general with quizzical eyes. "How will I know fear if I do not know what it looks like?" The general replied, "You will know fear because it speaks very fast and it speaks very loud." ',
 'GEORGIOU: Status, helm? ',
 'DETMER: Bearing 94 mark 21, 12 light-years from Klingon homeworld ',
 'GEORGIOU: Do not show respect by referring to that green dot as homeworld. Klingons are animals, and they do not have homes. Call it QonoS, or the enemy planet. Ops? ',
 'OWOSEKUN: Shields holding at 100%. Our scans show ',
 'GEORGIOU: I have little interest in what we are sc

In [57]:
len(lines_list)

284

In [68]:
# characters' list
list(set(re.findall(r"(\w+):", ' '.join(lines_list))))

['SHAVA',
 'CORNWELL',
 'TILLY',
 'Federation',
 'BRUCE',
 'WOMAN',
 'BURNHAM',
 'STALLHOLDER',
 'DETMER',
 'BRYCE',
 'AIRIAM',
 'GEORGIOU',
 'KLINGON',
 'SARU',
 'SHAVO',
 'CROUPIER',
 'ORION',
 'TYLER',
 'STAMETS',
 'AMANDA',
 'OWOSEKUN',
 'COMPUTER',
 'LRELL',
 'SAREK']

In [62]:
for line in lines_list:
    line_splitted = line.split(':')
    print(line_splitted[0])
    print(len(line_splitted[1]))
    print(line_splitted[1])

KLINGON
187
 We have acquired the target. (Our viewpoint moves in from the Klingon fleet past Jupiter and Mars to a blue planet with a single large moon, a spacedock and plenty of small satellites.) 
BURNHAM
409
 On the eve of battle, on a cold and windless night, an old general turned to a young soldier. "Tomorrow," said the master, "you will know fear." The young soldier, who had not yet experienced the agony of war, looked at the general with quizzical eyes. "How will I know fear if I do not know what it looks like?" The general replied, "You will know fear because it speaks very fast and it speaks very loud." 
GEORGIOU
15
 Status, helm? 
DETMER
59
 Bearing 94 mark 21, 12 light-years from Klingon homeworld 
GEORGIOU
157
 Do not show respect by referring to that green dot as homeworld. Klingons are animals, and they do not have homes. Call it QonoS, or the enemy planet. Ops? 
OWOSEKUN
41
 Shields holding at 100%. Our scans show 
GEORGIOU
93
 I have little interest in what we are scan

----

In [72]:
transcripts_df.head()

Unnamed: 0,title,series,stardate,airdate,title_clean,transcript
0,The Cage,TOS,Unknown,1988-10-04,the cage,\n\n\n\n\n\nThe Star Trek Transcripts - The Ca...
1,Where No Man Has Gone Before,TOS,1312.4 - 1313.8,1966-09-22,where no man has gone before,\n\n\n\n\n\nThe Star Trek Transcripts - Where ...
2,The Corbomite Maneuver,TOS,1512.2 - 1514.1,1966-11-10,the corbomite maneuver,\n\n\n\n\n\nThe Star Trek Transcripts - The Co...
3,Mudd's Women,TOS,1329.8 - 1330.1,1966-10-13,mudds women,\n\n\n\n\n\nThe Star Trek Transcripts - Mudd's...
4,The Enemy Within,TOS,1672.1 - 1673.1,1966-10-06,the enemy within,\n\n\n\n\n\nThe Star Trek Transcripts - The En...


In [70]:
lines_df = pd.DataFrame(columns=['title', 'series', 'stardate', 'airdate', 'title_clean', 'character', 'line'])

In [78]:
for i, row in transcripts_df.iterrows():
    print(row['title'], row['series'], row['airdate'], row['title_clean'])

The Cage TOS 1988-10-04 the cage
Where No Man Has Gone Before TOS 1966-09-22 where no man has gone before
The Corbomite Maneuver TOS 1966-11-10 the corbomite maneuver
Mudd's Women TOS 1966-10-13 mudds women
The Enemy Within TOS 1966-10-06 the enemy within
The Man Trap TOS 1966-09-08 the man trap
The Naked Time TOS 1966-09-29 the naked time
Charlie X TOS 1966-09-15 charlie x
Balance of Terror TOS 1966-12-15 balance of terror
What Are Little Girls Made Of? TOS 1966-10-20 what are little girls made of
Dagger of the Mind TOS 1966-11-03 dagger of the mind
Miri TOS 1966-10-27 miri
The Conscience of the King TOS 1966-12-08 the conscience of the king
The Galileo Seven TOS 1967-01-05 the galileo seven
Court Martial TOS 1967-02-02 court martial
The Menagerie, Part I TOS 1966-11-17 the menagerie part 1
The Menagerie, Part II TOS 1966-11-24 the menagerie part 2
Shore Leave TOS 1966-12-29 shore leave
The Squire of Gothos TOS 1967-01-12 the squire of gothos
Arena TOS 1967-01-19 arena
The Alternative

Necessary Evil DS9 1993-11-14 necessary evil
Second Sight DS9 1993-11-20 second sight
Sanctuary DS9 1993-11-28 sanctuary
Rivals DS9 1994-01-02 rivals
The Alternate DS9 1994-01-09 the alternate
Armageddon Game DS9 1994-01-30 armageddon game
Whispers DS9 1994-02-06 whispers
Paradise DS9 1994-02-13 paradise
Shadowplay DS9 1994-02-20 shadowplay
Playing God DS9 1994-02-27 playing god
Profit and Loss DS9 1994-03-20 profit and loss
Blood Oath DS9 1994-03-27 blood oath
The Maquis, Part I DS9 1994-04-24 the maquis part 1
The Maquis, Part II DS9 1994-05-01 the maquis part 2
The Wire DS9 1994-05-08 the wire
Crossover DS9 1994-05-15 crossover
The Collaborator DS9 1994-05-22 the collaborator
Tribunal DS9 1994-06-05 tribunal
The Jem'Hadar DS9 1994-06-12 the jemhadar
The Search, Part I DS9 1994-09-26 the search part 1
The Search, Part II DS9 1994-10-03 the search part 2
The House of Quark DS9 1994-10-10 the house of quark
Equilibrium DS9 1994-10-17 equilibrium
Second Skin DS9 1994-10-24 second skin
T

Similitude ENT 2003-11-19 similitude
Carpenter Street ENT 2003-11-26 carpenter street
Chosen Realm ENT 2004-01-14 chosen realm
Proving Ground ENT 2004-01-21 proving ground
Stratagem ENT 2004-02-04 stratagem
Harbinger ENT 2004-02-11 harbinger
Doctor's Orders ENT 2004-02-18 doctors orders
Hatchery ENT 2004-02-25 hatchery
Azati Prime ENT 2004-03-03 azati prime
Damage ENT 2004-04-21 damage
The Forgotten ENT 2004-04-28 the forgotten
E² ENT 2004-05-05 e2
The Council ENT 2004-05-12 the council
Countdown ENT 2004-05-19 countdown
Zero Hour ENT 2004-05-26 zero hour
Storm Front ENT 2004-10-08 storm front part 1
Storm Front, Part II ENT 2004-10-15 storm front part 2
Home ENT 2004-10-22 home
Borderland ENT 2004-10-29 borderland
Cold Station 12 ENT 2004-11-05 cold station 12
The Augments ENT 2004-11-12 the augments
The Forge ENT 2004-11-19 the forge
Awakening ENT 2004-11-26 awakening
Kir'Shara ENT 2004-12-03 kirshara
Daedalus ENT 2005-01-14 daedalus
Observer Effect ENT 2005-01-21 observer effect
Bab

In [71]:
lines_df

Unnamed: 0,title,series,stardate,airdate,title_clean,character,line
