In [49]:
class Subtitle:
    def __init__(self, caption):
        self.caption = caption
        self.sublist = self.format_subtitle()
        self.text = self.compile_sub()
    
#     function to convert srt into a list of itmes in the format {"time": {time}, "subtitle": {subtitle}}
    def format_subtitle(self):
        subtitle = []
        caption = self.caption.split('\n\n')
        for section in caption:
            section = section.split('\n')
            subtitle.append({
                "section": int(section[0]),
                "time": section[1],
                "subtitle": section[2]
            })
        return subtitle
    
    def compile_sub(self):
        text = " ".join([section["subtitle"] for section in self.sublist])
        return text

In [72]:
with open('output.txt', 'r') as f:
    caption = f.read()
    
subtitle = Subtitle(caption)

In [73]:
subtitle.sublist

[{'section': 1,
  'time': '00:00:00,030 --> 00:00:04,680',
  'subtitle': "view j/s it's a JavaScript framework for"},
 {'section': 2,
  'time': '00:00:02,610 --> 00:00:06,420',
  'subtitle': 'building front-end UIs in view you can'},
 {'section': 3,
  'time': '00:00:04,680 --> 00:00:07,680',
  'subtitle': 'start simple and then progressively add'},
 {'section': 4,
  'time': '00:00:06,420 --> 00:00:10,050',
  'subtitle': 'in the tools and features that you need'},
 {'section': 5,
  'time': '00:00:07,680 --> 00:00:11,790',
  'subtitle': 'to build a complex web application at'},
 {'section': 6,
  'time': '00:00:10,050 --> 00:00:13,799',
  'subtitle': 'its core it provides a way to build'},
 {'section': 7,
  'time': '00:00:11,790 --> 00:00:15,660',
  'subtitle': 'components that encapsulate data or'},
 {'section': 8,
  'time': '00:00:13,799 --> 00:00:17,430',
  'subtitle': 'state in your JavaScript and then'},
 {'section': 9,
  'time': '00:00:15,660 --> 00:00:19,500',
  'subtitle': 'connec

In [63]:
text = subtitle.text

In [2]:
# import from nltk
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [75]:
# Word tokenise
tokens = word_tokenize(text)
# print(tokens)
# tag some text
tagged = nltk.pos_tag(tokens)
print(tagged[:5])
POS = {
    "CD": "cardinal numbers",
    "AT": "articles",
    "JJ": "adjectives",
    "NN": "nouns formed from oadjectives & nouns (default)",
    "RB": "adverbs",
    "NNS": "plural nouns",
    "VBG": "gerunds",
    "VBD": "past tense verbs"
}

[('view', 'NN'), ('j/s', 'VBP'), ('it', 'PRP'), ("'s", 'VBZ'), ('a', 'DT')]


In [76]:
# Stemming
stemmer = PorterStemmer()
stemmed = [stemmer.stem(w) for w in tokens]
print(" ".join(stemmed))

view j/ it 's a javascript framework for build front-end ui in view you can start simpl and then progress add in the tool and featur that you need to build a complex web applic at it core it provid a way to build compon that encapsul data or state in your javascript and then connect that state reactiv to a templat in html we call these compon declar view becaus the same data input will alway produc the same output in the visual ui when we declar data on thi data object it link or bind it to the html on the templat abov when the valu of the data chang the compon will automat rerend or in other word it 's reactiv and the framework doe a ton of work under the hood to make sure that thi process is perform across a huge compon tree we can work with thi data in the templat thank to view html base templat syntax we can interpol a valu for express use doubl brace and we also have a varieti of direct to control the behavior of the html base on the data we can use vf to onli render an element wh

In [59]:
# lemmatizing
wnl = WordNetLemmatizer()
lemmatized = [wnl.lemmatize(w) for w in tokens]
print(" ".join(lemmatized))

view j/s it 's a JavaScript framework for building front-end UIs in view you can start simple and then progressively add in the tool and feature that you need to build a complex web application at it core it provides a way to build component that encapsulate data or state in your JavaScript and then connect that state reactively to a template in HTML we call these component declarative view because the same data input will always produce the same output in the visual UI when we declare data on this data object it link or bind it to the HTML on the template above when the value of the data change the component will automatically rerender or in other word it 's reactive and the framework doe a ton of work under the hood to make sure that this process is performance across a huge component tree we can work with this data in the template thanks to view HTML based template syntax we can interpolate a value for expression using double brace and we also have a variety of directive to control 

In [29]:
# Stopword removal
sw = stopwords.words("English")
content = [w for w in tokens if w.lower() not in sw]
" ".join(content)

"view j/s 's JavaScript framework building front-end UIs view start simple progressively add tools features need build complex web application core provides way build components encapsulate data state JavaScript connect state reactively template HTML call components declarative views data inputs always produce output visual UI declare data data object links binds HTML template value data changes component automatically rerender words 's reactive framework ton work hood make sure process performance across huge component tree work data template thanks views HTML based template syntax interpolate value expression using double braces also variety directives control behavior HTML based data use VF render element value right side truthy might fallback element 's rendered values fall C V else make app interactive listening events using V directive listen event element run code handle event right side directly template define custom method components methods object method access reactive data

In [30]:
text

"view j/s it's a JavaScript framework for building front-end UIs in view you can start simple and then progressively add in the tools and features that you need to build a complex web application at its core it provides a way to build components that encapsulate data or state in your JavaScript and then connect that state reactively to a template in HTML we call these components declarative views because the same data inputs will always produce the same output in the visual UI when we declare data on this data object it links or binds it to the HTML on the template above when the value of the data changes the component will automatically rerender or in other words it's reactive and the framework does a ton of work under the hood to make sure that this process is performance across a huge component tree we can work with this data in the template thanks to views HTML based template syntax we can interpolate a value for expression using double braces and we also have a variety of directiv

In [43]:
synsets = wordnet.synsets('phone')

for synset in synsets:
    print(synset.lemma_names())

['telephone', 'phone', 'telephone_set']
['phone', 'speech_sound', 'sound']
['earphone', 'earpiece', 'headphone', 'phone']
['call', 'telephone', 'call_up', 'phone', 'ring']


In [45]:
synsets = wordnet.synsets('cell')

for synset in synsets:
    print(synset.lemma_names())

['cell']
['cell']
['cell', 'electric_cell']
['cell', 'cadre']
['cellular_telephone', 'cellular_phone', 'cellphone', 'cell', 'mobile_phone']
['cell', 'cubicle']
['cell', 'jail_cell', 'prison_cell']


In [49]:
synsets = wordnet.synsets('mobile phone')

for synset in synsets:
    print(synset.lemma_names())

In [10]:
phone = wordnet.synsets('car')
synsets = phone[0].hyponyms()
for synset in synsets:
    print(synset.lemma_names())
    print(synset.definition())
    print(synset.examples())

['ambulance']
a vehicle that takes people to and from hospitals
[]
['beach_wagon', 'station_wagon', 'wagon', 'estate_car', 'beach_waggon', 'station_waggon', 'waggon']
a car that has a long body and rear door with space behind rear seat
[]
['bus', 'jalopy', 'heap']
a car that is old and unreliable
['the fenders had fallen off that old bus']
['cab', 'hack', 'taxi', 'taxicab']
a car driven by a person whose job is to take passengers where they want to go in exchange for money
[]
['compact', 'compact_car']
a small and economical car
[]
['convertible']
a car that has top that can be folded or removed
[]
['coupe']
a car with two doors and front seats and a luggage compartment
[]
['cruiser', 'police_cruiser', 'patrol_car', 'police_car', 'prowl_car', 'squad_car']
a car in which policemen cruise the streets; equipped with radiotelephonic communications to headquarters
[]
['electric', 'electric_automobile', 'electric_car']
a car that is powered by electricity
[]
['gas_guzzler']
a car with relati

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [77]:
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

In [15]:
synonyms = []
antonyms = []

for syn in wordnet.synsets('good'):
    for l in syn.lemmas():
        synonyms.append(l.name())
        if l.antonyms():
            antonyms.extend([n.name() for n in l.antonyms()])
            
print(set(synonyms), set(antonyms))

{'secure', 'skillful', 'dependable', 'just', 'unspoiled', 'skilful', 'undecomposed', 'commodity', 'trade_good', 'full', 'proficient', 'honest', 'honorable', 'sound', 'soundly', 'good', 'practiced', 'in_effect', 'serious', 'thoroughly', 'near', 'well', 'estimable', 'beneficial', 'in_force', 'upright', 'dear', 'salutary', 'safe', 'ripe', 'respectable', 'expert', 'goodness', 'adept', 'right', 'effective', 'unspoilt'} {'badness', 'evilness', 'ill', 'bad', 'evil'}


In [22]:
# similarities

w1 = wordnet.synset("agriculture.n.01")
w2 = wordnet.synset("farm.n.01")
print(w1.wup_similarity(w2))

0.11764705882352941


https://spacy.io/

In [41]:
# !pip install spacy
!python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_md
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.0.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.0.0/en_core_web_lg-3.0.0-py3-none-any.whl (778.8 MB)
[K     |████████████████████████████████| 778.8 MB 47 kB/s s eta 0:00:01
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.0.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [26]:
from spacy.lang.en import English

nlp = English()

doc = nlp("i like python programming")

import spacy

nlp = spacy.load("en_core_web_sm")

In [34]:
doc = nlp("its official Apple is the first U.S. public company to reach 1 trillion market value")
# for token in doc:
#     print(token.text, token.pos_, token.dep_) # token's text, part-of-speech tag, dependency label
    
# entity text, label
for ent in doc.ents:
    print(ent.text, ent.label_)

Apple ORG
first ORDINAL
U.S. GPE
1 trillion CARDINAL


## Pipeline
## Doc
## Token
## Span
## lexical attributes: like_num, is_alpha, is_punc

In [46]:
nlp = spacy.load("en_core_web_lg")

doc1 = nlp("farm")
doc2 = nlp("water")

print(doc1.similarity(doc2))

0.3350881239854339


In [51]:
with open('output.txt') as f:
    transcript = f.read()
    
subtitle = Subtitle(transcript)

In [53]:
subtitle.text

"view j/s it's a JavaScript framework for building front-end UIs in view you can start simple and then progressively add in the tools and features that you need to build a complex web application at its core it provides a way to build components that encapsulate data or state in your JavaScript and then connect that state reactively to a template in HTML we call these components declarative views because the same data inputs will always produce the same output in the visual UI when we declare data on this data object it links or binds it to the HTML on the template above when the value of the data changes the component will automatically rerender or in other words it's reactive and the framework does a ton of work under the hood to make sure that this process is performance across a huge component tree we can work with this data in the template thanks to views HTML based template syntax we can interpolate a value for expression using double braces and we also have a variety of directiv

In [None]:
from spacy import displacy

nlp = spacy.load('en_core_web_lg')
doc = nlp(subtitle.text)
displacy.serve(doc, style="ent")




Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

