In [2]:
import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno # missing data visualisation
%matplotlib inline
sns.set_style("darkgrid")
sns.set_palette("Set1")

import nltk
import spacy
import contractions
import unidecode
import unicodedata
from dateutil.parser import parse

In [3]:
transcripts_df = pd.read_csv('../data/interim_data/st_transcripts_w_metadata.csv')

In [4]:
transcripts_df.head()

Unnamed: 0,title,series,stardate,airdate,title_clean,transcript
0,The Cage,TOS,Unknown,1988-10-04,the cage,\n\n\n\n\n\nThe Star Trek Transcripts - The Ca...
1,Where No Man Has Gone Before,TOS,1312.4 - 1313.8,1966-09-22,where no man has gone before,\n\n\n\n\n\nThe Star Trek Transcripts - Where ...
2,The Corbomite Maneuver,TOS,1512.2 - 1514.1,1966-11-10,the corbomite maneuver,\n\n\n\n\n\nThe Star Trek Transcripts - The Co...
3,Mudd's Women,TOS,1329.8 - 1330.1,1966-10-13,mudds women,\n\n\n\n\n\nThe Star Trek Transcripts - Mudd's...
4,The Enemy Within,TOS,1672.1 - 1673.1,1966-10-06,the enemy within,\n\n\n\n\n\nThe Star Trek Transcripts - The En...


In [5]:
transcripts_df.sort_values(by='series')

Unnamed: 0,title,series,stardate,airdate,title_clean,transcript
730,Will You Take My Hand?,DIS,Unknown,2018-02-11,will you take my hand,\n\n\n\n\n\nStar Trek Discovery Transcripts - ...
716,The Vulcan Hello,DIS,1207.3,2017-09-24,the vulcan hello,\n\n\n\n\n\nStar Trek Discovery Transcripts - ...
717,Battle at the Binary Stars,DIS,1207.3,2017-09-24,battle at the binary stars,\n\n\n\n\n\nStar Trek Discovery Transcripts - ...
718,Context Is for Kings,DIS,Unknown,2017-10-01,context is for kings,\n\n\n\n\n\nStar Trek Discovery Transcripts - ...
719,The Butcher's Knife Cares Not for the Lamb's Cry,DIS,Unknown,2017-10-08,the butchers knife cares not for the lambs cry,\n\n\n\n\n\nStar Trek Discovery Transcripts - ...
...,...,...,...,...,...,...
580,Blink of an Eye,VOY,Unknown,2000-01-19,blink of an eye,\n\n\n\n\n\nThe Voyager Transcripts - Blink Of...
581,Virtuoso,VOY,53556.4,2000-01-26,virtuoso,\n\n\n\n\n\nThe Voyager Transcripts - Virtuoso...
582,Memorial,VOY,Unknown,2000-02-02,memorial,\n\n\n\n\n\nThe Voyager Transcripts - Memorial...
570,Survival Instinct,VOY,53049.2,1999-09-29,survival instinct,\n\n\n\n\n\nThe Voyager Transcripts - Survival...


In [6]:
transcripts_df.iloc[0]['transcript']

"\n\n\n\n\n\nThe Star Trek Transcripts - The Cage\n\n\n\nThe\r\nCage\nUnaired\r\npilot\n\n\n\n\n\n\n [Bridge]\n\nSPOCK: Check the circuit. \r\nTYLER: All operating, sir. \r\nSPOCK: It can't be the screen then. Definitely something out there,\r\nCaptain, headed this way. \r\nTYLER: It could be these meteorites. \r\nONE: No, it's something else. There's still something out there. \r\nTYLER: It's coming at the speed of light, collision course. The\r\nmeteorite beam has not deflected it, Captain.\r\nONE: Evasive manoeuvres, sir?\r\nPIKE: Steady as we go.\r\nGARISON: It's a radio wave, sir. We're passing through an old-style\r\ndistress signal.\r\nPIKE: They were keyed to cause interference and attract attention this\r\nway.\r\nGARISON: A ship in trouble making a forced landing, sir. That's it. No\r\nother message.\r\nTYLER: I have a fix. It comes from the Talos star group.\r\nONE: We've no ships or Earth colonies that far out.\r\nSPOCK: Their call letters check with a survey expedition. SS

In [None]:
# Replace character sequence from Website
# Replace accented characters
# Remove extra spaces
# Remove website specific strings, e.g. Copyright
# Expand contactions

In [31]:
test_txt = transcripts_df.iloc[730]['transcript']

In [32]:
# replace \r\n character sequence with space
clean_text = test_txt.replace('\r\n', ' ')

In [33]:
# remove accented characters
clean_text = unicodedata.normalize('NFKD', clean_text).encode('ascii', 'ignore').decode('utf-8', 'ignore')

In [34]:
# remove space characters, e.g. newlines
def remove_spaces(script):
    script_clean = script.replace("\r", " ") # replace \r with whitespace
    script_clean = re.sub(r'\n+', ' ', script_clean) # replace multiple newlines with single whitespace
    script_clean = re.sub('\s+',' ', script_clean) # replace multiple whitespaces with single whitespace
    script_clean = script_clean.strip() #remove leading/trailing whitespace
    
    return script_clean

In [39]:
# Website specific text needs to be removed:

# Heading: "The Star Trek Transcripts - "
# Original Title, "Unaired"/"pilot" (first occurence before first character)

# Closing line with Copyrights:
# <Back\r\nto the episode listing\nStar\r\nTrek ® is copyright of  CBS\r\nStudios Inc. 
# Copyright © 1966, Present.\r\nThe Star Trek web pages on this site are for educational 
# and\r\nentertainment purposes only. All other copyrights property of their\r\nrespective holders.\n"

def remove_site_specs(text):
    # replace \r\n character sequence with space
    clean_text = text.replace('\r\n', ' ')
    clean_text = remove_spaces(clean_text)
    
    # rm 'The Star Trek Transcripts - ' etc. line 
    clean_text = clean_text.replace('The Star Trek Transcripts - ', '')
    clean_text = clean_text.replace('The Animated Star Trek Transcripts - ', '')
    clean_text = clean_text.replace('The Next Generation Transcripts - ', '')
    clean_text = clean_text.replace('The Deep Space Nine Transcripts - ', '')
    clean_text = clean_text.replace('The Voyager Transcripts - ', '')
    clean_text = clean_text.replace('The Enterprise Transcripts - ', '')
    clean_text = clean_text.replace('Star Trek Discovery Transcripts - ', '')
    
    # remove reference at the end of document
    clean_text = clean_text.replace('<Back to the episode listing Star Trek ® is copyright of CBS Studios Inc. Copyright © 1966, Present. The Star Trek web pages on this site are for educational and entertainment purposes only. All other copyrights property of their respective holders.', '')
    clean_text = clean_text.replace('Star Trek ® is copyright of CBS Studios Inc . Copyright © 1966, Present. The Star Trek web pages on this site are for educational and entertainment purposes only. All other copyrights property of their respective holders.', '')
    clean_text = clean_text.replace('Star Trek is copyright of CBS Studios Inc . Copyright 1966, Present. The Star Trek web pages on this site are for educational and entertainment purposes only. All other copyrights property of their respective holders.', '')
    return clean_text

In [40]:
clean_text = remove_site_specs(clean_text)

In [41]:
# https://www.geeksforgeeks.org/nlp-expand-contractions-in-text-processing/
expanded_words = []    
for word in clean_text.split():
  # using contractions.fix to expand the shotened words
  expanded_words.append(contractions.fix(word))   
    
expanded_text = ' '.join(expanded_words)

In [45]:
# text inside [] signifies location
# remove all text inside []
expanded_text = re.sub("[\[].*?[\]]", "", expanded_text)

In [46]:
# remove space before :
expanded_text = re.sub(' :', ':', expanded_text)

In [49]:
# remove repeated whitespace inside a string
expanded_text = re.sub('\s+',' ', expanded_text)

In [50]:
expanded_text

'Will You Take My Hand? Will You Take My Hand? Stardate: 2257Original Airdate: 11 Feb 2018 KLINGON: We have acquired the target. (Our viewpoint moves in from the Klingon fleet past Jupiter and Mars to a blue planet with a single large moon, a spacedock and plenty of small satellites.) BURNHAM: On the eve of battle, on a cold and windless night, an old general turned to a young soldier. "Tomorrow," said the master, "you will know fear." The young soldier, who had not yet experienced the agony of war, looked at the general with quizzical eyes. "How will I know fear if I do not know what it looks like?" The general replied, "You will know fear because it speaks very fast and it speaks very loud." GEORGIOU: Status, helm? DETMER: Bearing 94 mark 21, 12 light-years from Klingon homeworld GEORGIOU: Do not show respect by referring to that green dot as homeworld. Klingons are animals, and they do not have homes. Call it Qo\'noS, or the enemy planet. Ops? OWOSEKUN: Shields holding at 100%. Our 

In [55]:
no_apos = expanded_text.replace("'", "")
lines_list = re.sub(r"(\w+):", r'\n\1:', no_apos).split('\n')
#lines_list = lines_list.split('<')
matchers = ['Transcripts', 'Airdate', 'CBS', 'Stardate', transcripts_df.iloc[730]['title']]
matching = [s for s in lines_list if any(xs in s for xs in matchers)]
print(matching)

['Will You Take My Hand? Will You Take My Hand? ', 'Stardate: 2257Original ', 'Airdate: 11 Feb 2018 ']


In [56]:
lines_list = [x for x in lines_list if x not in matching]
lines_list

['KLINGON: We have acquired the target. (Our viewpoint moves in from the Klingon fleet past Jupiter and Mars to a blue planet with a single large moon, a spacedock and plenty of small satellites.) ',
 'BURNHAM: On the eve of battle, on a cold and windless night, an old general turned to a young soldier. "Tomorrow," said the master, "you will know fear." The young soldier, who had not yet experienced the agony of war, looked at the general with quizzical eyes. "How will I know fear if I do not know what it looks like?" The general replied, "You will know fear because it speaks very fast and it speaks very loud." ',
 'GEORGIOU: Status, helm? ',
 'DETMER: Bearing 94 mark 21, 12 light-years from Klingon homeworld ',
 'GEORGIOU: Do not show respect by referring to that green dot as homeworld. Klingons are animals, and they do not have homes. Call it QonoS, or the enemy planet. Ops? ',
 'OWOSEKUN: Shields holding at 100%. Our scans show ',
 'GEORGIOU: I have little interest in what we are sc

In [57]:
len(lines_list)

284

In [61]:
for line in lines_list:
    line_splitted = line.split(':')
    print(line_splitted[0])
    print(line_splitted[1])

KLINGON
 We have acquired the target. (Our viewpoint moves in from the Klingon fleet past Jupiter and Mars to a blue planet with a single large moon, a spacedock and plenty of small satellites.) 
BURNHAM
 On the eve of battle, on a cold and windless night, an old general turned to a young soldier. "Tomorrow," said the master, "you will know fear." The young soldier, who had not yet experienced the agony of war, looked at the general with quizzical eyes. "How will I know fear if I do not know what it looks like?" The general replied, "You will know fear because it speaks very fast and it speaks very loud." 
GEORGIOU
 Status, helm? 
DETMER
 Bearing 94 mark 21, 12 light-years from Klingon homeworld 
GEORGIOU
 Do not show respect by referring to that green dot as homeworld. Klingons are animals, and they do not have homes. Call it QonoS, or the enemy planet. Ops? 
OWOSEKUN
 Shields holding at 100%. Our scans show 
GEORGIOU
 I have little interest in what we are scanning. I care what is sca