In [4]:
import spacy

In [5]:
# Load the small English pipeline by its full package name. The bare 'en'
# shortcut link is no longer accepted by spaCy v3+, whereas the package name
# resolves on both v2 and v3 once the model package is installed.
nlp = spacy.load('en_core_web_sm')

In [6]:
# Sample input: three sentences mixing contractions, numbers and place names.
# NOTE: despite the plural name this is one string, not a list of documents.
documents = "I’ve been 2 times to New York in 2011, but did not have the constitution for it. It DIDN’T appeal to me. I preferred Los Angeles."

In [7]:
# Sentence-split the text and keep the raw surface form of every token,
# giving one inner list per sentence.
tokens = [
    [tok.text for tok in sent]
    for sent in nlp(documents).sents
]

In [8]:
# Show the per-sentence token lists.
print(tokens)

[['I', '’ve', 'been', '2', 'times', 'to', 'New', 'York', 'in', '2011', ',', 'but', 'did', 'not', 'have', 'the', 'constitution', 'for', 'it', '.'], ['It', 'DIDN’T', 'appeal', 'to', 'me', '.'], ['I', 'preferred', 'Los', 'Angeles', '.']]


In [9]:
# Same sentence-by-sentence walk as for the tokens, but collecting the
# lemma string (token.lemma_) of each token instead of its surface text.
lemmas = [
    [tok.lemma_ for tok in sent]
    for sent in nlp(documents).sents
]

In [10]:
# Show the per-sentence lemma lists.
print(lemmas)

[['-PRON-', 'have', 'be', '2', 'time', 'to', 'New', 'York', 'in', '2011', ',', 'but', 'do', 'not', 'have', 'the', 'constitution', 'for', '-PRON-', '.'], ['-PRON-', 'DIDN’T', 'appeal', 'to', '-PRON-', '.'], ['-PRON-', 'prefer', 'Los', 'Angeles', '.']]


In [11]:
from nltk import SnowballStemmer

In [13]:
# Snowball stemmer configured for English; reused by the stemming cell.
stemmer = SnowballStemmer(language="english")

In [14]:
# Stem every token of every sentence, preserving the nested
# sentence -> tokens structure of `tokens`.
stems = [list(map(stemmer.stem, sentence_tokens)) for sentence_tokens in tokens]

In [15]:
# Show the per-sentence stem lists.
print(stems)

[['i', 've', 'been', '2', 'time', 'to', 'new', 'york', 'in', '2011', ',', 'but', 'did', 'not', 'have', 'the', 'constitut', 'for', 'it', '.'], ['it', "didn't", 'appeal', 'to', 'me', '.'], ['i', 'prefer', 'los', 'angel', '.']]


In [16]:
from nltk import ngrams

In [17]:
# Adjacent token pairs (n = 2) of the first sentence only; list() realises
# the iterable returned by ngrams.
bigrams = list(ngrams(tokens[0], 2))

In [18]:
# Show the bigrams of the first sentence.
print(bigrams)

[('I', '’ve'), ('’ve', 'been'), ('been', '2'), ('2', 'times'), ('times', 'to'), ('to', 'New'), ('New', 'York'), ('York', 'in'), ('in', '2011'), ('2011', ','), (',', 'but'), ('but', 'did'), ('did', 'not'), ('not', 'have'), ('have', 'the'), ('the', 'constitution'), ('constitution', 'for'), ('for', 'it'), ('it', '.')]


In [19]:
# Coarse part-of-speech label (token.pos_) for every token, grouped by sentence.
pos = [
    [tok.pos_ for tok in sent]
    for sent in nlp(documents).sents
]

In [20]:
# Show the per-sentence part-of-speech tags.
print(pos)

[['PRON', 'VERB', 'AUX', 'NUM', 'NOUN', 'ADP', 'PROPN', 'PROPN', 'ADP', 'NUM', 'PUNCT', 'CCONJ', 'AUX', 'PART', 'AUX', 'DET', 'NOUN', 'ADP', 'PRON', 'PUNCT'], ['PRON', 'VERB', 'VERB', 'ADP', 'PRON', 'PUNCT'], ['PRON', 'VERB', 'PROPN', 'PROPN', 'PUNCT']]


In [22]:
# Keep only content words per sentence: tokens that are not stop words and
# whose part of speech is a noun, verb, proper noun, adjective or adverb.
content = [
    [
        tok.text
        for tok in sent
        if not tok.is_stop
        and tok.pos_ in {'NOUN', 'VERB', 'PROPN', 'ADJ', 'ADV'}
    ]
    for sent in nlp(documents).sents
]

In [23]:
# Show the content words that survived the filter, one list per sentence.
print(content)

[['times', 'New', 'York', 'constitution'], ['DIDN’T', 'appeal'], ['preferred', 'Los', 'Angeles']]


In [25]:
# Collect (text, label) pairs for the named entities of each sentence.
# Span.ents reads the entities from the document that has already been
# parsed, so sentences are not pushed through the pipeline a second time
# and the entity recogniser keeps the context of the whole text.
entities = [[(entity.text, entity.label_)
             for entity in sentence.ents]
            for sentence in nlp(documents).sents]


In [26]:
# Show the (text, label) entity pairs, one list per sentence.
print(entities)

[[('2', 'CARDINAL'), ('New York', 'GPE'), ('2011', 'DATE')], [], [('Los Angeles', 'GPE')]]
