# NLP - POS & NER

```
%conda install spacy`
```


~~~
import spacy
spacy.cli.download("en") 
~~~

In [1]:
import spacy
from spacy import displacy

In [2]:
nlp = spacy.load("en_core_web_md")

In [3]:
wp = ('President Trump is questioning his administration’s aggressive strategy '
      'in Venezuela following the failure of a US-backed effort to oust '
      'President Nicolás Maduro, complaining he was misled about how easy '
      'it would be to replace the socialist strongman with a young opposition '
      'figure, according to administration officials and White House advisers.')

In [67]:
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)
    


President President PROPN NNP compound Xxxxx True False
Trump Trump PROPN NNP nsubj Xxxxx True False
is be AUX VBZ aux xx True True
questioning question VERB VBG ROOT xxxx True False
his -PRON- DET PRP$ poss xxx True True
administration administration NOUN NN poss xxxx True False
’s ’s PART POS case ’x False True
aggressive aggressive ADJ JJ amod xxxx True False
strategy strategy NOUN NN dobj xxxx True False
in in ADP IN prep xx True True
Venezuela Venezuela PROPN NNP pobj Xxxxx True False
following follow VERB VBG prep xxxx True False
the the DET DT det xxx True True
failure failure NOUN NN pobj xxxx True False
of of ADP IN prep xx True True
a a DET DT det x True True
US US PROPN NNP npadvmod XX True True
- - PUNCT HYPH punct - False False
backed back VERB VBN amod xxxx True False
effort effort NOUN NN pobj xxxx True False
to to PART TO aux xx True True
oust oust VERB VB acl xxxx True False
President President PROPN NNP compound Xxxxx True False
Nicolás Nicolás PROPN NNP compound Xxxx

In [68]:
props(token)

NameError: name 'props' is not defined

In [7]:
for token in doc:
    print(token.pos_, '\t', token.text)

PROPN 	 President
PROPN 	 Trump
AUX 	 is
VERB 	 questioning
DET 	 his
NOUN 	 administration
PART 	 ’s
ADJ 	 aggressive
NOUN 	 strategy
ADP 	 in
PROPN 	 Venezuela
VERB 	 following
DET 	 the
NOUN 	 failure
ADP 	 of
DET 	 a
PROPN 	 US
PUNCT 	 -
VERB 	 backed
NOUN 	 effort
PART 	 to
VERB 	 oust
PROPN 	 President
PROPN 	 Nicolás
PROPN 	 Maduro
PUNCT 	 ,
VERB 	 complaining
PRON 	 he
AUX 	 was
VERB 	 misled
ADP 	 about
ADV 	 how
ADJ 	 easy
PRON 	 it
VERB 	 would
AUX 	 be
PART 	 to
VERB 	 replace
DET 	 the
ADJ 	 socialist
NOUN 	 strongman
ADP 	 with
DET 	 a
ADJ 	 young
NOUN 	 opposition
NOUN 	 figure
PUNCT 	 ,
VERB 	 according
ADP 	 to
NOUN 	 administration
NOUN 	 officials
CCONJ 	 and
PROPN 	 White
PROPN 	 House
NOUN 	 advisers
PUNCT 	 .


In [8]:
displacy.render(doc, style="dep", jupyter=True)

In [10]:
simple_wp = 'President Trump is questioning his administration’s aggressive strategy in Venezuela.'

In [11]:
doc2 = nlp(simple_wp)
displacy.render(doc2, style="dep", jupyter=True)

In [12]:
for ent in doc.ents:
    print(ent.text, ent.label_)

Trump PERSON
Venezuela GPE
US GPE
Nicolás Maduro PERSON
White House ORG


In [13]:
displacy.render(doc, style="ent", jupyter=True)

How can you use this?

In [14]:
def extract_people(text):
    people = []
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ == 'PERSON':
            people.append(ent.text)
    return people

In [15]:
extract_people(wp)

['Trump', 'Nicolás Maduro']

In [16]:
for token in doc:
    display(token.pos_, '\t', token.text)

'PROPN'

'\t'

'President'

'PROPN'

'\t'

'Trump'

'AUX'

'\t'

'is'

'VERB'

'\t'

'questioning'

'DET'

'\t'

'his'

'NOUN'

'\t'

'administration'

'PART'

'\t'

'’s'

'ADJ'

'\t'

'aggressive'

'NOUN'

'\t'

'strategy'

'ADP'

'\t'

'in'

'PROPN'

'\t'

'Venezuela'

'VERB'

'\t'

'following'

'DET'

'\t'

'the'

'NOUN'

'\t'

'failure'

'ADP'

'\t'

'of'

'DET'

'\t'

'a'

'PROPN'

'\t'

'US'

'PUNCT'

'\t'

'-'

'VERB'

'\t'

'backed'

'NOUN'

'\t'

'effort'

'PART'

'\t'

'to'

'VERB'

'\t'

'oust'

'PROPN'

'\t'

'President'

'PROPN'

'\t'

'Nicolás'

'PROPN'

'\t'

'Maduro'

'PUNCT'

'\t'

','

'VERB'

'\t'

'complaining'

'PRON'

'\t'

'he'

'AUX'

'\t'

'was'

'VERB'

'\t'

'misled'

'ADP'

'\t'

'about'

'ADV'

'\t'

'how'

'ADJ'

'\t'

'easy'

'PRON'

'\t'

'it'

'VERB'

'\t'

'would'

'AUX'

'\t'

'be'

'PART'

'\t'

'to'

'VERB'

'\t'

'replace'

'DET'

'\t'

'the'

'ADJ'

'\t'

'socialist'

'NOUN'

'\t'

'strongman'

'ADP'

'\t'

'with'

'DET'

'\t'

'a'

'ADJ'

'\t'

'young'

'NOUN'

'\t'

'opposition'

'NOUN'

'\t'

'figure'

'PUNCT'

'\t'

','

'VERB'

'\t'

'according'

'ADP'

'\t'

'to'

'NOUN'

'\t'

'administration'

'NOUN'

'\t'

'officials'

'CCONJ'

'\t'

'and'

'PROPN'

'\t'

'White'

'PROPN'

'\t'

'House'

'NOUN'

'\t'

'advisers'

'PUNCT'

'\t'

'.'

In [19]:
def extract_adjectives(text):
    adjectives = []
    doc = nlp(text)
    for token in doc:
        if token.pos_ == 'ADJ':
            adjectives.append(token.text)
    adjectives = ', '.join(adjectives)
    return adjectives

In [20]:
extract_adjectives(wp)

'aggressive, easy, socialist, young'

In [22]:
import pandas as pd

bg_df_full = pd.read_csv('data/boardgames.csv')


In [23]:
bg_df = bg_df_full.sample(500)
bg_df.head()

Unnamed: 0,name,description,max_players,min_players,min_playtime,max_playtime,min_age,category,mechanics,year_published,...,mechanic_auctionbidding,mechanic_simulation,mechanic_areamovement,mechanic_simultaneousactionselection,mechanic_actionpointallowancesystem,mechanic_cooperativeplay,mechanic_pointtopointmovement,mechanic_partnerships,mechanic_memory,quality_game
12656,Zombicide: Black Plague,Description from the publisher:\n\nZombicide: ...,6.0,1.0,60.0,180.0,10.0,"Adventure, Fantasy, Fighting, Horror, Medieval...","CooperativePlay, DiceRolling, ModularBoard, Va...",2015.0,...,False,False,False,False,False,True,False,False,False,True
8471,Gonzaga,Description from BoardgameNews.com:\n\nThe Gon...,4.0,2.0,30.0,60.0,8.0,"AgeofReason, CityBuilding, Renaissance","AreaControlAreaInfluence, HandManagement, Rout...",2009.0,...,False,False,False,True,False,False,False,False,False,True
4629,Patience,Solitaire or Patience are a family of solitair...,1.0,1.0,10.0,10.0,8.0,CardGame,PatternBuilding,1783.0,...,False,False,False,False,False,False,False,False,False,False
11223,Rampaging Jotunn,In the distant past the Vikings came upon a gr...,2.0,2.0,15.0,45.0,8.0,"Ancient, CardGame, Dice, Fantasy, Fighting, Me...","ActionPointAllowanceSystem, AreaMovement, Dice...",2016.0,...,False,False,True,False,True,False,False,False,False,False
2399,Tac Air,(from the back of the box:)\n\nIf the balloon ...,4.0,2.0,120.0,120.0,12.0,"AviationFlight, ModernWarfare, Wargame",HexandCounter,1987.0,...,False,False,False,False,False,False,False,False,False,True


In [24]:
bg_df['adjectives'] = bg_df['description'].apply(extract_adjectives)

In [25]:
bg_df['adjectives']

12656    fantastical, arcane, alive, dark, responsible,...
8471     northern, expansionist, secret, specific, fief...
4629     standard, most, few, commercial, available, fu...
11223    distant, great, hidden, uninhabited, verdant, ...
2399     Armored, tense, smoky, intense, advanced, deta...
                               ...                        
6739                         upcoming, big, European, such
285      mega, -, front, regular, special, special, oth...
394      geometrical, dead, other, long, other, subsequ...
7813     60th, 20th, special, gorgeous, simplified, abl...
1479                           plastic, alternate, Various
Name: adjectives, Length: 500, dtype: object

In [26]:
from sklearn.feature_extraction.text import CountVectorizer

In [27]:
vectorizer = CountVectorizer(lowercase = True,
                             stop_words= 'english',
                             max_df    = 1.0,
                             min_df    = 0.0)
vectorizer.fit(bg_df['adjectives'])

CountVectorizer(min_df=0.0, stop_words='english')

In [28]:
wf_array = vectorizer.transform(bg_df['adjectives'])

df= pd.DataFrame(wf_array.todense(),
                 columns=vectorizer.get_feature_names())


In [29]:
df.sum().sort_values(ascending=False)

new            187
different      138
special         85
possible        64
german          61
              ... 
goooood          1
gooey            1
respectable      1
respectful       1
zulu             1
Length: 1347, dtype: int64

<div class="alert alert-info">
<h3> Your turn</h3>
<p> 1. What are the most popular adjectives in hip hop lyrics ? Hint: create a new df with just the hip-hop genre.
<p> 2. Are nouns, verbs or adjectives more useful in telling music genres apart? Focus on one part of speech at time; extract just those words; create a vectorizer with the 400 most frequent (excluding stop words); run a logistic regression model; record the accuracy; repeat.
    <p> 3. What are the most popular organizations in country music lyrics ? 


</div>





In [34]:
df['quality_game']  = bg_df['quality_game'].values

In [53]:
df.groupby('quality_game').sum().T.sort_values(by=False, ascending=False)

quality_game,False,True
new,53,134
different,53,85
special,40,45
possible,28,36
simple,24,24
...,...,...
mutant,0,2
mutual,0,2
mythic,0,2
brutal,0,5


In [38]:
df[df['quality_game']==1].sum().sort_values(ascending=False)

quality_game    244.0
new             134.0
different        85.0
special          45.0
german           40.0
                ...  
psychic           0.0
promotional       0.0
energetic         0.0
engaging          0.0
lightweight       0.0
Length: 1348, dtype: float64

In [39]:
df[df['quality_game']==0].sum().sort_values(ascending=False)

different        53.0
new              53.0
special          40.0
possible         28.0
simple           24.0
                 ... 
infinite          0.0
infantry          0.0
inexperienced     0.0
inexorable        0.0
quality_game      0.0
Length: 1348, dtype: float64