# NLP – Part-of-Speech (POS) Tagging with spaCy

In [24]:
import spacy


In [25]:
# Load the pretrained `en_core_web_md` spaCy pipeline; every tagging step
# below goes through this `nlp` object.
nlp = spacy.load("en_core_web_md")

In [26]:
sentence = "A riot is the language of the unheard."

In [27]:
doc = nlp(sentence)

In [28]:
# Show each token's coarse-grained POS tag (`pos_`) next to its text.
for token in doc:
    print(f"{token.pos_} \t {token.text}")

DET 	 A
NOUN 	 riot
AUX 	 is
DET 	 the
NOUN 	 language
ADP 	 of
DET 	 the
ADJ 	 unheard
PUNCT 	 .


Reference for the universal POS tags stored in `token.pos_`: https://universaldependencies.org/u/pos/


| Tag      | Definition                  |
|:------|:--------------------------|
| ADJ   | adjective                 |
| ADP   | adposition                |
| ADV   | adverb                    |
| AUX   | auxiliary                 |
| CCONJ | coordinating conjunction  |
| DET   | determiner                |
| INTJ  | interjection              |
| NOUN  | noun                      |
| NUM   | numeral                   |
| PART  | particle                  |
| PRON  | pronoun                   |
| PROPN | proper noun               |
| PUNCT | punctuation               |
| SCONJ | subordinating conjunction |
| SPACE | space                     |
| SYM   | symbol                    |
| VERB  | verb                      |
| X     | other                     |

In [29]:
# Print each token with its fine-grained tag (`tag_`) and a human-readable
# description of that tag, separated by tabs.
for token in doc:
    # spacy.explain() returns None for labels that are missing from its
    # glossary; joining None into a string would raise TypeError, so fall
    # back to an empty description instead.
    print(token.text, token.tag_, spacy.explain(token.tag_) or "", sep="\t")

A	DT	determiner
riot	NN	noun, singular or mass
is	VBZ	verb, 3rd person singular present
the	DT	determiner
language	NN	noun, singular or mass
of	IN	conjunction, subordinating or preposition
the	DT	determiner
unheard	JJ	adjective
.	.	punctuation mark, sentence closer


Reference for the Penn Treebank tags stored in `token.tag_`: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

| Tag      | Definition                  |
|:------|:--------------------------|
|CC|Coordinating conjunction|
|CD|Cardinal number|
|DT|Determiner|
|EX|Existential there|
|FW|Foreign word|
|IN|Preposition or subordinating conjunction|
|JJ|Adjective|
|JJR|Adjective, comparative|
|JJS|Adjective, superlative|
|LS|List item marker|
|MD|Modal|
|NN|Noun, singular or mass|
|NNS|Noun, plural|
|NNP|Proper noun, singular|
|NNPS|Proper noun, plural|
|PDT|Predeterminer|
|POS|Possessive ending|
|PRP|Personal pronoun|
|PRP\$|Possessive pronoun|
|RB|Adverb|
|RBR|Adverb, comparative|
|RBS|Adverb, superlative|
|RP|Particle|
|SYM|Symbol|
|TO|to|
|UH|Interjection|
|VB|Verb, base form|
|VBD|Verb, past tense|
|VBG|Verb, gerund or present participle|
|VBN|Verb, past participle|
|VBP|Verb, non-3rd person singular present|
|VBZ|Verb, 3rd person singular present|
|WDT|Wh-determiner|
|WP|Wh-pronoun|
|WP\$|Possessive wh-pronoun|
|WRB|Wh-adverb|




In [30]:
# A longer, multi-clause example sentence; the fragments are joined with
# single spaces to form one string.
long_sentence = " ".join(
    [
        "The strongest rain ever recorded in India shut down",
        "the financial hub of Mumbai, snapped communication lines,",
        "closed airports and forced thousands of people to sleep",
        "in their offices or walk home during the night,",
        "officials said today.",
    ]
)

In [50]:
# Tag the longer sentence and list each token's fine-grained tag (`tag_`)
# next to its text.
doc = nlp(long_sentence)
for token in doc:
    print(f"{token.tag_} \t {token.text}")

DT 	 The
JJS 	 strongest
NN 	 rain
RB 	 ever
VBN 	 recorded
IN 	 in
NNP 	 India
VBD 	 shut
RP 	 down
DT 	 the
JJ 	 financial
NN 	 hub
IN 	 of
NNP 	 Mumbai
, 	 ,
VBD 	 snapped
NN 	 communication
NNS 	 lines
, 	 ,
JJ 	 closed
NNS 	 airports
CC 	 and
VBD 	 forced
NNS 	 thousands
IN 	 of
NNS 	 people
TO 	 to
VB 	 sleep
IN 	 in
PRP$ 	 their
NNS 	 offices
CC 	 or
VB 	 walk
RB 	 home
IN 	 during
DT 	 the
NN 	 night
, 	 ,
NNS 	 officials
VBD 	 said
NN 	 today
. 	 .


In [32]:
import pandas as pd

# Collect one record per token (text, coarse POS tag, fine-grained tag) and
# build the DataFrame from the finished list in a single step.
tokens = [
    {"text": token.text, "pos": token.pos_, "tag": token.tag_}
    for token in nlp(long_sentence)
]

df = pd.DataFrame.from_records(tokens)

In [33]:
df

Unnamed: 0,text,pos,tag
0,The,DET,DT
1,strongest,ADJ,JJS
2,rain,NOUN,NN
3,ever,ADV,RB
4,recorded,VERB,VBN
5,in,ADP,IN
6,India,PROPN,NNP
7,shut,VERB,VBD
8,down,ADP,RP
9,the,DET,DT


How can you use this?

In [34]:
def extract_adjectives(text, pipeline=None):
    """Return the adjectives found in ``text``, in document order.

    Parameters
    ----------
    text : str
        Raw text to analyse. Missing values (``None`` or a float ``NaN``, as
        pandas uses for empty cells) yield an empty list instead of an error,
        so the function can be applied to a column with gaps.
    pipeline : callable, optional
        Callable that turns ``text`` into an iterable of tokens exposing
        ``pos_`` and ``text`` attributes. Defaults to the notebook-level
        ``nlp`` pipeline.

    Returns
    -------
    list of str
        The ``text`` of every token whose ``pos_`` is ``"ADJ"``.
    """
    # `text != text` is only true for NaN; skip missing values up front.
    if text is None or (isinstance(text, float) and text != text):
        return []
    if pipeline is None:
        # Resolved at call time so the default follows the notebook's `nlp`.
        pipeline = nlp
    return [token.text for token in pipeline(text) if token.pos_ == "ADJ"]

In [35]:
extract_adjectives(long_sentence)

['strongest', 'financial', 'closed']

In [36]:
import pandas as pd

# Read the wine reviews CSV; the path is relative to the notebook's working
# directory.
wine_df = pd.read_csv("data/wine_reviews_10k.csv")

In [37]:
wine_df.sample(5)

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,rating
5859,5859,Italy,"Heady floral scents of wisteria, jasmine and h...",Brut,87,26.0,Veneto,Conegliano Valdobbiadene Prosecco Superiore,,Kerin O’Keefe,@kerinokeefe,Masottina NV Brut (Conegliano Valdobbiadene P...,Glera,Masottina,Low
8880,8880,US,A Chardonnay at a decent price for what you ge...,,84,15.0,California,Monterey County,Central Coast,,,Bridlewood 2011 Chardonnay (Monterey County),Chardonnay,Bridlewood,Low
2892,2892,Italy,"Red berry, leather and aromatic herb aromas sl...",San Lorenzo di Verduno,92,65.0,Piedmont,Barolo,,Kerin O’Keefe,@kerinokeefe,Fratelli Alessandria 2013 San Lorenzo di Verdu...,Nebbiolo,Fratelli Alessandria,High
8995,8995,Spain,"Apple, lettuce and celery aromas are a bit har...",,86,17.0,Northern Spain,Valdeorras,,Michael Schachner,@wineschach,Pagos del Galir 2012 Godello (Valdeorras),Godello,Pagos del Galir,Low
7424,7424,US,"Rather overripe, this Cab has the flavors of s...",Le Vigne di San Domenico,84,25.0,California,Paso Robles,Central Coast,,,Sylvester 2005 Le Vigne di San Domenico Cabern...,Cabernet Sauvignon,Sylvester,Low


In [38]:
# Store, for every review, the list of adjectives found in its description.
# extract_adjectives runs the spaCy pipeline once per row.
# NOTE(review): batching the descriptions through nlp.pipe(...) would likely
# be faster on the full dataset — confirm before changing.
wine_df["adjectives"] = wine_df["description"].apply(extract_adjectives)

In [39]:
wine_df["adjectives"].sample(5)

1465    [Brawny, flavorful, black, prime, plush, struc...
4994    [good, full, ripe, integrated, high, sweet, wh...
2670    [high, Peppery, young, tight, compact, excepti...
7671     [Creamy, foamy, soft, cheerful, early, low, own]
6565                                 [flat, heavy, short]
Name: adjectives, dtype: object

In [40]:
# Number of adjectives per review: `.str.len()` returns the length of each
# list stored in the "adjectives" column.
wine_df["adj_count"] = wine_df["adjectives"].str.len()

In [53]:
wine_df["adj_count"].median()

7.0

In [52]:
wine_df.groupby("rating")["adj_count"].median()

rating
High    8
Low     6
Name: adj_count, dtype: int64

In [42]:
# Long format: one row per (review, adjective). explode() repeats the other
# columns — and the original index label — for every element of the list.
wine_df_long = wine_df.explode("adjectives")

In [55]:
wine_df_long.head(5)

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,rating,adjectives,adj_count
0,0,Italy,"Scents of citrus blossom, honey, sea breeze an...",Santigaini,94,157.0,Sicily & Sardinia,Isola dei Nuraghi,,Kerin O’Keefe,@kerinokeefe,Capichera 2011 Santigaini Vermentino (Isola de...,Vermentino,Capichera,High,aromatic,9
0,0,Italy,"Scents of citrus blossom, honey, sea breeze an...",Santigaini,94,157.0,Sicily & Sardinia,Isola dei Nuraghi,,Kerin O’Keefe,@kerinokeefe,Capichera 2011 Santigaini Vermentino (Isola de...,Vermentino,Capichera,High,stunning,9
0,0,Italy,"Scents of citrus blossom, honey, sea breeze an...",Santigaini,94,157.0,Sicily & Sardinia,Isola dei Nuraghi,,Kerin O’Keefe,@kerinokeefe,Capichera 2011 Santigaini Vermentino (Isola de...,Vermentino,Capichera,High,single,9
0,0,Italy,"Scents of citrus blossom, honey, sea breeze an...",Santigaini,94,157.0,Sicily & Sardinia,Isola dei Nuraghi,,Kerin O’Keefe,@kerinokeefe,Capichera 2011 Santigaini Vermentino (Isola de...,Vermentino,Capichera,High,Fresh,9
0,0,Italy,"Scents of citrus blossom, honey, sea breeze an...",Santigaini,94,157.0,Sicily & Sardinia,Isola dei Nuraghi,,Kerin O’Keefe,@kerinokeefe,Capichera 2011 Santigaini Vermentino (Isola de...,Vermentino,Capichera,High,citrus,9


In [43]:
wine_df_long['adjectives'].value_counts()

black         2188
ripe          1925
red           1460
rich          1341
fresh         1165
              ... 
Vintage          1
stupendous       1
duller           1
permissive       1
Thinner          1
Name: adjectives, Length: 2903, dtype: int64

In [63]:
# Within each rating class, compute every adjective's share of all adjective
# occurrences in that class and keep the ten largest shares.
adjectives_by_rating = wine_df_long.groupby("rating")["adjectives"]
ratings = adjectives_by_rating.apply(
    lambda adjs: adjs.value_counts(normalize=True).head(10)
)
ratings

rating       
High    black    0.040171
        ripe     0.030727
        rich     0.025737
        full     0.018202
        dark     0.017998
        red      0.017234
        fresh    0.014205
        dry      0.012525
        fine     0.011176
        juicy    0.011099
Low     red      0.025950
        ripe     0.023796
        dry      0.021542
        black    0.020217
        soft     0.020184
        fresh    0.020117
        sweet    0.019024
        crisp    0.015676
        light    0.014284
        good     0.013920
Name: adjectives, dtype: float64

In [46]:
# The 100 most frequent adjectives across all reviews (value_counts() sorts
# by descending count, so the first 100 index labels are the most common).
common_adjs = wine_df_long["adjectives"].value_counts().index[:100]

In [47]:
# For every adjective, compute the fraction of its occurrences that fall in
# each rating class, then pivot the adjective level into columns so that the
# rating classes form the rows.
rating_share_by_adj = wine_df_long.groupby("adjectives")["rating"].apply(
    lambda adj_ratings: adj_ratings.value_counts(normalize=True)
)
ratio_df = rating_share_by_adj.unstack(level=0)

In [48]:
# Among the 100 most frequent adjectives, show the ten whose occurrences are
# most concentrated in "High"-rated reviews (transpose puts adjectives on rows).
ratio_df[common_adjs].T.sort_values(by="High", ascending=False).head(10)

Unnamed: 0_level_0,High,Low
adjectives,Unnamed: 1_level_1,Unnamed: 2_level_1
impressive,0.966507,0.033493
beautiful,0.952381,0.047619
powerful,0.928571,0.071429
velvety,0.918129,0.081871
complex,0.894472,0.105528
lovely,0.875,0.125
long,0.859189,0.140811
concentrated,0.858038,0.141962
polished,0.857143,0.142857
dense,0.853012,0.146988


In [49]:
# Among the 100 most frequent adjectives, show the ten whose occurrences are
# most concentrated in "Low"-rated reviews (transpose puts adjectives on rows).
ratio_df[common_adjs].T.sort_values(by="Low", ascending=False).head(10)

Unnamed: 0_level_0,High,Low
adjectives,Unnamed: 1_level_1,Unnamed: 2_level_1
simple,0.017123,0.982877
easy,0.129278,0.870722
little,0.18662,0.81338
lean,0.189024,0.810976
attractive,0.224409,0.775591
bitter,0.253731,0.746269
sour,0.272727,0.727273
heavy,0.27907,0.72093
medium,0.290323,0.709677
nice,0.291925,0.708075
