<a href="https://colab.research.google.com/github/mer0mingian/statistics_for_data_science/blob/master/Discovering_the_Star_Trek_Universe_with_spaCy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import spacy
from spacy import displacy

In [0]:
# Print a greeting (a quick way to see that cells execute).
print("Hello Universe!")

 # 1. Tokenizer

In [0]:
import spacy

# Load the small English pipeline by its full package name. The "en" shortcut
# only resolves when a shortcut link has been created and no longer exists in
# spaCy v3, whereas the package name works wherever the model is installed.
# Alternative: install and load "en_core_web_lg", which ships word vectors.
nlp = spacy.load("en_core_web_sm")

In [0]:
# Run the pipeline on a short two-sentence text that contains a contraction
# ("I'm") and a name with an apostrophe ("O'Brien").
doc = nlp("I'm done here on planet Earth. Ready to beam, Miles O'Brien!")

In [0]:
# Text tokenizer: iterating over a Doc yields its tokens one by one
for tok in doc:
    print(tok)

In [0]:
# Sentence tokenizer: print every sentence below an empty line
for sentence in doc.sents:
    print("", sentence, sep="\n")

# 2. Named Entity Recognition


In [0]:
# Show each named entity's text next to its label (the type of entity)
for entity in doc.ents:
    print(f"{entity.text}  -  {entity.label_}")

In [0]:
## Task: Parse the sentence "United Federation of Planets is based on Earth in San Francisco"
## with the NLP pipeline. Which entities are in this sentence?

# Solution: the bare `doc = ` placeholder was a SyntaxError that stopped "Run All".
doc = nlp("United Federation of Planets is based on Earth in San Francisco")

In [0]:
# Highlight the recognised entities of `doc` inline in the notebook, using the
# imported `displacy` name like the other rendering cells.
displacy.render(doc, style="ent", jupyter=True)

# 3. Part of Speech Tagging

![alt text](https://www.startrek.com/sites/default/files/styles/content_full/public/images/inline/2019-01/a105e08e3af3eda31c9b970fcb04f265.jpg?itok=MxAB76pF)

In [0]:
# New example sentence for inspecting lemmas and part-of-speech tags.
doc = nlp("There is a fire on deck one!")

In [0]:
# One row per token: text - lemma - coarse part-of-speech tag
for tok in doc:
    print(tok.text, tok.lemma_, tok.pos_, sep=" - ")

In [0]:
# Draw the dependency parse of `doc` inline in the notebook.
displacy.render(doc, style='dep', jupyter=True)

In [0]:
# Look up the human-readable description of a tag or dependency label.
# "pobj" is just an example; substitute any label shown in the output above.
# (The original `---` placeholder was a SyntaxError.)
spacy.explain("pobj")

In [0]:
# The word "fire" appears again; compare how it is tagged here with the
# "There is a fire on deck one!" sentence.
doc = nlp("Good. They can't fire when they're cloaked.")

In [0]:
# Draw the dependency parse of the new `doc` inline in the notebook.
displacy.render(doc, style='dep', jupyter=True)

# 4. Matcher

[Open Matcher Demo](https://explosion.ai/demos/matcher?text=We%27re%20now%20at%20warp%20four%20point%20three.%0AMr.%20Crusher%2C%20warp%20eight%2C%20engage!&model=en_core_web_sm&pattern=%5B%5D)

In [0]:
from spacy.matcher import Matcher

matcher = Matcher(nlp.vocab)

# One dict per token to match. An empty pattern list is rejected by
# `matcher.add`, so spell out the phrase: "warp" <number> "point" <number>.
pattern = [
    {"LOWER": "warp"},
    {"LIKE_NUM": True},
    {"LOWER": "point"},
    {"LIKE_NUM": True},
]
# spaCy v2 signature (key, on_match callback, *patterns).
# NOTE(review): spaCy v3 expects `matcher.add("WARP", [pattern])` instead.
matcher.add("WARP", None, pattern)

doc = nlp("We're now at warp nine point three.")

# Each match is (match_id, start, end); slicing the doc yields the matched span.
for match_id, start, end in matcher(doc):
    print(doc[start:end])


# 5. Representations and similarities

In [0]:
# Two short sentences whose tokens are compared by vector similarity in the
# cells that follow.
doc = nlp("This is Data's cat. He does not have a dog!")

In [0]:
# Every token exposes a vector (its word embedding); as the last expression of
# the cell, the vector of the first token is displayed as the cell output.
doc[0].vector

In [0]:
# Show the tokens at positions 4 and 11 (presumably "cat" and "dog" — the exact
# positions depend on how the tokenizer splits "Data's").
print(doc[4], doc[11])

In [0]:
# Compare every token with the token at position 4 via cosine similarity
reference = doc[4]
for tok in doc:
    print(f"{tok.text}  -  {reference.text} : ", tok.similarity(reference))

# 6. Using custom models

In [0]:
# Install the custom model package en_tng_ner from its GitHub archive, then
# register it under the shortcut name "en_tng_ner".
# NOTE(review): `spacy link` is a spaCy v2 command — confirm the runtime is v2.
!pip install https://github.com/chssch/spacy-models/raw/master/en_tng_ner-0.0.1.tar.gz
!spacy link en_tng_ner en_tng_ner

In [0]:
# Rebind `nlp` to the custom model; every following cell uses this pipeline
# instead of the English model loaded at the start.
nlp = spacy.load("en_tng_ner")

In [0]:
# "Vulcan" modifies "ship" in this sentence; render the entities the custom
# model assigns.
doc = nlp("The Vulcan ship arrived at Earth")
displacy.render(doc, style='ent', jupyter=True)


In [0]:
# Here "Vulcan" follows "found on"; compare its label with the
# "The Vulcan ship arrived at Earth" example.
doc = nlp("This material can be found on Vulcan")
displacy.render(doc, style='ent', jupyter=True)

In [0]:
# Download TNG.csv (the file get_data reads) into the current working directory.
!wget https://github.com/RTrek/startrekTNGdataset/raw/master/data2/TNG.csv

In [0]:
def get_data(path="TNG.csv",
             characters=("PICARD", "RIKER", "WORF", "DATA", "BEVERLY", "TROI")):
    """Load the dialogue lines of selected speakers from a script CSV.

    Args:
        path: CSV file to read; it must provide the columns ``who`` and
            ``text``. Defaults to ``"TNG.csv"`` in the working directory.
        characters: Speaker names (compared after normalisation) whose rows
            are kept. Defaults to the six main characters.

    Returns:
        A DataFrame holding only the rows of the requested speakers, with
        missing values replaced by ``""``, ``who``/``text`` stripped of
        surrounding whitespace, and a fresh 0..n-1 index. The previous row
        labels are preserved in an ``index`` column.
    """
    import pandas as pd

    # Rows without a speaker are dropped; remaining gaps become empty strings
    # so the string operations below never meet NaN.
    lines = pd.read_csv(path).dropna(subset=["who"]).fillna("")

    # Normalise: remove the voice-over marker and surrounding whitespace.
    # regex=False keeps "V.O." a literal, so "." is not treated as a wildcard.
    lines = lines.assign(
        who=lines["who"].str.replace("V.O.", "", regex=False).str.strip(),
        text=lines["text"].str.strip(),
    )

    # Only keep the requested characters.
    selected = lines[lines["who"].isin(list(characters))]
    return selected.reset_index()

In [0]:
# Extra: using nlp.pipe for batch processing

In [0]:
# Batch processing: run the pipeline over a random 1% of the dialogue lines and
# render every doc in which entities were found.
ENTITY_DISPLAY = {
    "ents": ["ALIEN_SPECIES", "SPACE_DESTINATION"],
    "colors": {"ALIEN_SPECIES": "#AA3939", "SPACE_DESTINATION": "#2D882D"},
}

# A fixed random_state makes a re-run show the same sample.
df_subset = get_data().sample(frac=0.01, random_state=42)

# nlp.pipe streams the texts through the pipeline; passing the column directly
# avoids building a Series per row with iterrows().
for doc in nlp.pipe(df_subset["text"]):
    if doc.ents:
        displacy.render(doc, style="ent", jupyter=True, options=ENTITY_DISPLAY)