# NLP - POS & NER

```
%conda install spacy
```


~~~
import spacy
spacy.cli.download("en_core_web_sm")
~~~

In [1]:
import spacy
from spacy import displacy

In [2]:
# Load the spaCy pipeline named "en_core_web_sm"; the resulting `nlp` object is
# reused by the cells and helper functions further down.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Example sentence analysed throughout the notebook. The adjacent string
# literals inside the parentheses are joined into a single string by Python.
wp = ('President Trump is questioning his administration’s aggressive strategy '
      'in Venezuela following the failure of a US-backed effort to oust '
      'President Nicolás Maduro, complaining he was misled about how easy '
      'it would be to replace the socialist strongman with a young opposition '
      'figure, according to administration officials and White House advisers.')

In [4]:
# Run the pipeline on the example sentence; `doc` is reused by the cells below.
doc = nlp(wp)

In [5]:
# One line per token: its part-of-speech tag (token.pos_), then its text.
for token in doc:
    print(f"{token.pos_} \t {token.text}")

PROPN 	 President
PROPN 	 Trump
VERB 	 is
VERB 	 questioning
ADJ 	 his
NOUN 	 administration
PART 	 ’s
ADJ 	 aggressive
NOUN 	 strategy
ADP 	 in
PROPN 	 Venezuela
VERB 	 following
DET 	 the
NOUN 	 failure
ADP 	 of
DET 	 a
PROPN 	 US
PUNCT 	 -
VERB 	 backed
NOUN 	 effort
PART 	 to
VERB 	 oust
PROPN 	 President
PROPN 	 Nicolás
PROPN 	 Maduro
PUNCT 	 ,
VERB 	 complaining
PRON 	 he
VERB 	 was
VERB 	 misled
ADP 	 about
ADV 	 how
ADJ 	 easy
PRON 	 it
VERB 	 would
VERB 	 be
PART 	 to
VERB 	 replace
DET 	 the
ADJ 	 socialist
NOUN 	 strongman
ADP 	 with
DET 	 a
ADJ 	 young
NOUN 	 opposition
NOUN 	 figure
PUNCT 	 ,
VERB 	 according
ADP 	 to
NOUN 	 administration
NOUN 	 officials
CCONJ 	 and
PROPN 	 White
PROPN 	 House
NOUN 	 advisers
PUNCT 	 .


In [6]:
# Draw the dependency parse of the full sentence inline in the notebook.
displacy.render(doc, style="dep", jupyter=True)

In [7]:
# A shortened version of the example sentence (its first clause only).
simple_wp = 'President Trump is questioning his administration’s aggressive strategy in Venezuela.'

In [8]:
# Parse the shortened sentence and draw its dependency parse inline.
doc2 = nlp(simple_wp)
displacy.render(doc2, style="dep", jupyter=True)

In [9]:
# One line per named entity: its text followed by its entity label.
for ent in doc.ents:
    print(f"{ent.text} {ent.label_}")

Trump PERSON
Venezuela GPE
US GPE
Nicolás Maduro PERSON
White House ORG


In [10]:
# Highlight the named entities of the full sentence inline in the notebook.
displacy.render(doc, style="ent", jupyter=True)

How can you use this?

In [11]:
def extract_people(text):
    """Return the text of every PERSON entity found in ``text``.

    The module-level ``nlp`` pipeline is run on ``text``; the ``text`` of each
    entity whose ``label_`` equals ``'PERSON'`` is collected in document order.

    Returns:
        list of str, empty when no such entity is found.
    """
    parsed = nlp(text)
    return [span.text for span in parsed.ents if span.label_ == 'PERSON']

In [12]:
# Try the helper on the example sentence.
extract_people(wp)

['Trump', 'Nicolás Maduro']

In [13]:
# Print the part-of-speech tag and text of every token in the example sentence.
for token in doc:
    print(f"{token.pos_} \t {token.text}")

PROPN 	 President
PROPN 	 Trump
VERB 	 is
VERB 	 questioning
ADJ 	 his
NOUN 	 administration
PART 	 ’s
ADJ 	 aggressive
NOUN 	 strategy
ADP 	 in
PROPN 	 Venezuela
VERB 	 following
DET 	 the
NOUN 	 failure
ADP 	 of
DET 	 a
PROPN 	 US
PUNCT 	 -
VERB 	 backed
NOUN 	 effort
PART 	 to
VERB 	 oust
PROPN 	 President
PROPN 	 Nicolás
PROPN 	 Maduro
PUNCT 	 ,
VERB 	 complaining
PRON 	 he
VERB 	 was
VERB 	 misled
ADP 	 about
ADV 	 how
ADJ 	 easy
PRON 	 it
VERB 	 would
VERB 	 be
PART 	 to
VERB 	 replace
DET 	 the
ADJ 	 socialist
NOUN 	 strongman
ADP 	 with
DET 	 a
ADJ 	 young
NOUN 	 opposition
NOUN 	 figure
PUNCT 	 ,
VERB 	 according
ADP 	 to
NOUN 	 administration
NOUN 	 officials
CCONJ 	 and
PROPN 	 White
PROPN 	 House
NOUN 	 advisers
PUNCT 	 .


In [15]:
def extract_adjectives(text):
    """Return the adjectives in ``text`` as one comma-separated string.

    The module-level ``nlp`` pipeline is run on ``text`` and the ``text`` of
    every token whose ``pos_`` equals ``'ADJ'`` is kept, in document order,
    joined with ``', '``.

    Args:
        text: The string to analyse. Any non-string value (for example the
            float NaN pandas uses for a missing cell, or None) is treated as
            having no adjectives.

    Returns:
        str: The adjectives joined by ``', '``; ``''`` when there are none or
        when ``text`` is not a string.
    """
    # Guard first: this function is applied to a whole DataFrame column, and a
    # missing value would otherwise raise inside the pipeline call.
    if not isinstance(text, str):
        return ''
    return ', '.join(token.text for token in nlp(text) if token.pos_ == 'ADJ')

In [16]:
# Try the helper on the example sentence.
extract_adjectives(wp)

'his, aggressive, easy, socialist, young'

In [18]:
import pandas as pd

# Load the full board-game table; the path is relative to the notebook's
# working directory.
bg_df_full = pd.read_csv('data/boardgames.csv')


In [22]:
# Work on a random subset of 500 games rather than the full table.
# random_state pins the draw so the sample, and every count derived from it
# below, is the same after a kernel restart.
bg_df = bg_df_full.sample(500, random_state=42)
bg_df.head()

Unnamed: 0,name,description,max_players,min_players,min_playtime,max_playtime,min_age,category,mechanics,year_published,...,mechanic_auctionbidding,mechanic_simulation,mechanic_areamovement,mechanic_simultaneousactionselection,mechanic_actionpointallowancesystem,mechanic_cooperativeplay,mechanic_pointtopointmovement,mechanic_partnerships,mechanic_memory,quality_game
10800,Firefly: The Game,"Players begin with a ship, and travel from pla...",5.0,1.0,120.0,240.0,13.0,"MoviesTVRadiotheme, ScienceFiction, SpaceExplo...","AreaMovement, CardDrafting, DiceRolling, Picku...",2013.0,...,False,False,True,False,False,False,False,False,False,True
8402,Bisikle,Bisikle is a flicking game. The players have t...,4.0,1.0,30.0,30.0,5.0,"ActionDexterity, Racing, Sports",,2009.0,...,False,False,False,False,False,False,False,False,False,True
8834,Panic Station,Panic Station is a paranoia-driven partly coop...,6.0,4.0,40.0,40.0,10.0,"Adventure, Bluffing, Exploration, PartyGame, S...","ActionPointAllowanceSystem, AreaMovement, Dice...",2011.0,...,False,False,True,False,True,False,False,True,False,True
916,Vegas,Vegas is set in a casino with a game board tha...,4.0,2.0,60.0,60.0,12.0,,"DiceRolling, TilePlacement",1996.0,...,False,False,False,False,False,False,False,False,False,False
3136,Mad Gab Card Game,Two teams compete to solve ten word puzzles co...,10.0,2.0,20.0,20.0,10.0,"CardGame, Puzzle, WordGame",Partnerships,2000.0,...,False,False,False,False,False,False,False,True,False,False


In [23]:
# Run the adjective extractor over every sampled description and store the
# result (one comma-separated string per row) in a new 'adjectives' column.
bg_df['adjectives'] = bg_df['description'].apply(extract_adjectives)

In [24]:
# Inspect the extracted adjective strings.
bg_df['adjectives']

10800    illegal, your, full, Most, First, popular, the...
8402     crazy, electrical, whole, unique, internal, ea...
8834     cooperative, which, fiendish, alien, their, th...
916               that, special, best, highest, total, new
3136                             unrelated, that, familiar
7484     your, fast, furious, old, traditional, differe...
7334     simple, little, different, that, their, talles...
13210    accessible, clever, tiered, that, his, other, ...
11575    formidable, Indian, ripe, his, more, his, succ...
7939     Last, major, large, second, your, Such, unprof...
2285                                 24th, his, own, other
9574     different, unknown, legendary, our, our, vast,...
7806     simple, British, Swordfish, incoming, its, Cre...
10337    vain, flamboyant, blind, skinny, common, your,...
9943     safe, elder, his, sunken, complete, ultimate, ...
2807                                                  last
530      customizable, boxed, which, supernatural, more.

In [27]:
from sklearn.feature_extraction.text import CountVectorizer

In [31]:
# Word counter over the comma-separated adjective strings.
# lowercase=True lower-cases the text and stop_words='english' drops
# scikit-learn's built-in English stop words. max_df=1.0 and min_df=0.0 are the
# loosest document-frequency bounds, so no term is removed for being too
# common or too rare.
vectorizer = CountVectorizer(lowercase = True,
                             stop_words= 'english',
                             max_df    = 1.0,
                             min_df    = 0.0)
# Learn the vocabulary; as the cell's last expression, fit()'s return value is
# what the notebook displays.
vectorizer.fit(bg_df['adjectives'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=0.0,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [32]:
# Document-term matrix: one row per row of bg_df, one column per vocabulary term.
wf_array = vectorizer.transform(bg_df['adjectives'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when it exists and fall back for older
# releases, so the cell runs on either.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# toarray() gives a plain ndarray; todense() returns the legacy numpy.matrix.
df = pd.DataFrame(wf_array.toarray(), columns=feature_names)


In [33]:
# Total count of each term across all rows, most frequent first.
df.sum().sort_values(ascending=False)

new             192
different       107
special          84
possible         66
second           49
german           49
best             48
various          47
simple           45
available        43
historical       40
certain          40
unique           39
large            38
small            35
strategic        35
powerful         35
high             34
right            33
good             33
highest          33
tactical         32
single           32
french           32
little           31
additional       31
great            31
original         31
real             31
able             30
               ... 
momentous         1
morbid            1
movable           1
multicolored      1
officious         1
octagonal         1
occasional        1
oblong            1
nuclear           1
northeastern      1
nordic            1
nonstop           1
nightfall         1
nicer             1
nice              1
net               1
near              1
nazi              1
naughty           1


<div class="alert alert-info">
<h3> Your turn</h3>
<p> 1. What are the most popular adjectives in hip-hop lyrics? Hint: create a new df with just the hip-hop genre.</p>
<p> 2. Are nouns, verbs or adjectives more useful in telling music genres apart? Focus on one part of speech at a time; extract just those words; create a vectorizer with the 400 most frequent words (excluding stop words); run a logistic regression model; record the accuracy; repeat.</p>
<p> 3. What are the most popular organizations in country music lyrics?</p>


</div>



