In [10]:
import spacy
# Build the small English pipeline ("en_core_web_sm"); per the original
# note it bundles the tokenizer, tagger, parser, NER and word vectors.
nlp = spacy.load("en_core_web_sm")

# Sample passage, assembled from adjacent string literals and parsed as a
# single document.
text = (
    "When Sebastian Thrun started working on self-driving cars at "
    "Google in 2007, few people outside of the company took him "
    "seriously. “I can tell you very senior CEOs of major American "
    "car companies would shake my hand and turn away because I wasn’t "
    "worth talking to,” said Thrun, in an interview with Recode earlier "
    "this week."
)
doc = nlp(text)

# Syntax view: the text of every noun chunk, then one (lemma, POS) pair
# per token in document order.
noun_phrases = [span.text for span in doc.noun_chunks]
print("Noun phrases:", noun_phrases)

lemma_pos_pairs = [(tok.lemma_, tok.pos_) for tok in doc]
print("Words:", lemma_pos_pairs)

# Entity view: one line per recognized entity, "<text> <label>".
for entity in doc.ents:
    print(f"{entity.text} {entity.label_}")

Noun phrases: ['Sebastian Thrun', 'self-driving cars', 'Google', 'few people', 'the company', 'him', 'I', 'you', 'very senior CEOs', 'major American car companies', 'my hand', 'I', 'Thrun', 'an interview', 'Recode']
Words: [('when', 'ADV'), ('Sebastian', 'PROPN'), ('Thrun', 'PROPN'), ('start', 'VERB'), ('work', 'VERB'), ('on', 'ADP'), ('self', 'NOUN'), ('-', 'PUNCT'), ('drive', 'VERB'), ('car', 'NOUN'), ('at', 'ADP'), ('Google', 'PROPN'), ('in', 'ADP'), ('2007', 'NUM'), (',', 'PUNCT'), ('few', 'ADJ'), ('people', 'NOUN'), ('outside', 'ADP'), ('of', 'ADP'), ('the', 'DET'), ('company', 'NOUN'), ('take', 'VERB'), ('-PRON-', 'PRON'), ('seriously', 'ADV'), ('.', 'PUNCT'), ('"', 'PUNCT'), ('-PRON-', 'PRON'), ('can', 'VERB'), ('tell', 'VERB'), ('-PRON-', 'PRON'), ('very', 'ADV'), ('senior', 'ADJ'), ('ceo', 'NOUN'), ('of', 'ADP'), ('major', 'ADJ'), ('american', 'ADJ'), ('car', 'NOUN'), ('company', 'NOUN'), ('would', 'VERB'), ('shake', 'VERB'), ('-PRON-', 'DET'), ('hand', 'NOUN'), ('and', 'CCONJ

In [11]:
# A deliberately repetitive passage: the same words recur in several
# inflected forms (told/tell, was/is/be, done/do, thought/think).
text_with_duplicates = (
    "Somebody once told me that "
    "something tell me this "
    "I will was away from to that "
    "which is to be done to do "
    "thought that i think a thought "
)
print(text_with_duplicates)

# Reuses the `nlp` pipeline created in an earlier cell.
doc_with_duplicates = nlp(text_with_duplicates)

# Syntax view: the text of every noun chunk, then one (lemma, POS) pair
# per token in document order.
noun_phrases_with_duplicates = [
    span.text for span in doc_with_duplicates.noun_chunks
]
print("Noun phrases:", noun_phrases_with_duplicates)

lemma_pos_pairs_with_duplicates = [
    (tok.lemma_, tok.pos_) for tok in doc_with_duplicates
]
print("Words:", lemma_pos_pairs_with_duplicates)

# Entity view: one line per recognized entity, "<text> <label>".
for entity in doc_with_duplicates.ents:
    print(f"{entity.text} {entity.label_}")

Somebody once told me that something tell me this I will was away from to that which is to be done to do thought that i think a thought 
Noun phrases: ['Somebody', 'me', 'something', 'me', 'I', 'to do thought', 'i', 'a thought']
Words: [('somebody', 'PRON'), ('once', 'ADV'), ('tell', 'VERB'), ('-PRON-', 'PRON'), ('that', 'SCONJ'), ('something', 'PRON'), ('tell', 'VERB'), ('-PRON-', 'PRON'), ('this', 'DET'), ('-PRON-', 'PRON'), ('will', 'VERB'), ('be', 'AUX'), ('away', 'ADV'), ('from', 'ADP'), ('to', 'ADP'), ('that', 'DET'), ('which', 'DET'), ('be', 'AUX'), ('to', 'PART'), ('be', 'AUX'), ('do', 'VERB'), ('to', 'PART'), ('do', 'AUX'), ('thought', 'NOUN'), ('that', 'SCONJ'), ('i', 'PRON'), ('think', 'VERB'), ('a', 'DET'), ('thought', 'NOUN')]
