# NLP - Part-of-Speech (POS) Tagging

In [3]:
import spacy


In [4]:
# Load the spaCy pipeline once; every later cell tags text through this `nlp` object.
nlp = spacy.load("en_core_web_md")

In [5]:
# Short example sentence for the first tagging demos.
sentence = "A riot is the language of the unheard."

In [6]:
# Run the pipeline on the sentence; `doc` is iterated token by token in the cells that follow.
doc = nlp(sentence)

In [7]:
# Coarse-grained tag (`token.pos_`) next to each token's text.
for token in doc:
    columns = (token.pos_, "\t", token.text)
    print(*columns)

DET 	 A
NOUN 	 riot
AUX 	 is
DET 	 the
NOUN 	 language
ADP 	 of
DET 	 the
ADJ 	 unheard
PUNCT 	 .


Universal POS tags (the coarse-grained `token.pos_` values): https://universaldependencies.org/u/pos/


| Tag      | Definition                  |
|:------|:--------------------------|
| ADJ   | adjective                 |
| ADP   | adposition                |
| ADV   | adverb                    |
| AUX   | auxiliary                 |
| CCONJ | coordinating conjunction  |
| DET   | determiner                |
| INTJ  | interjection              |
| NOUN  | noun                      |
| NUM   | numeral                   |
| PART  | particle                  |
| PRON  | pronoun                   |
| PROPN | proper noun               |
| PUNCT | punctuation               |
| SCONJ | subordinating conjunction |
| SPACE | space                     |
| SYM   | symbol                    |
| VERB  | verb                      |
| X     | other                     |

In [8]:
# Fine-grained tag (`token.tag_`) with spaCy's plain-English description of it.
for token in doc:
    # spacy.explain() returns None for a label that is missing from its
    # glossary, and str.join() raises TypeError on None, so fall back to an
    # empty description instead of crashing the loop.
    description = spacy.explain(token.tag_) or ""
    print("\t".join([token.text, token.tag_, description]))

A	DT	determiner
riot	NN	noun, singular or mass
is	VBZ	verb, 3rd person singular present
the	DT	determiner
language	NN	noun, singular or mass
of	IN	conjunction, subordinating or preposition
the	DT	determiner
unheard	JJ	adjective
.	.	punctuation mark, sentence closer


Penn Treebank POS tags (the basis of the fine-grained `token.tag_` values shown above): https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

| Tag      | Definition                  |
|:------|:--------------------------|
|CC|Coordinating conjunction|
|CD|Cardinal number|
|DT|Determiner|
|EX|Existential there|
|FW|Foreign word|
|IN|Preposition or subordinating conjunction|
|JJ|Adjective|
|JJR|Adjective, comparative|
|JJS|Adjective, superlative|
|LS|List item marker|
|MD|Modal|
|NN|Noun, singular or mass|
|NNS|Noun, plural|
|NNP|Proper noun, singular|
|NNPS|Proper noun, plural|
|PDT|Predeterminer|
|POS|Possessive ending|
|PRP|Personal pronoun|
|PRP\$|Possessive pronoun|
|RB|Adverb|
|RBR|Adverb, comparative|
|RBS|Adverb, superlative|
|RP|Particle|
|SYM|Symbol|
|TO|to|
|UH|Interjection|
|VB|Verb, base form|
|VBD|Verb, past tense|
|VBG|Verb, gerund or present participle|
|VBN|Verb, past participle|
|VBP|Verb, non-3rd person singular present|
|VBZ|Verb, 3rd person singular present|
|WDT|Wh-determiner|
|WP|Wh-pronoun|
|WP\$|Possessive wh-pronoun|
|WRB|Wh-adverb|




In [9]:
# Longer example sentence; the adjacent string literals are concatenated into one string.
long_sentence = (
    "The strongest rain ever recorded in India shut down "
    "the financial hub of Mumbai, snapped communication lines, "
    "closed airports and forced thousands of people to sleep "
    "in their offices or walk home during the night, "
    "officials said today."
)

In [10]:
# Re-tag with the longer sentence and list the fine-grained tag of every token.
doc = nlp(long_sentence)
for token in doc:
    columns = (token.tag_, "\t", token.text)
    print(*columns)

DT 	 The
JJS 	 strongest
NN 	 rain
RB 	 ever
VBN 	 recorded
IN 	 in
NNP 	 India
VBD 	 shut
RP 	 down
DT 	 the
JJ 	 financial
NN 	 hub
IN 	 of
NNP 	 Mumbai
, 	 ,
VBD 	 snapped
NN 	 communication
NNS 	 lines
, 	 ,
JJ 	 closed
NNS 	 airports
CC 	 and
VBD 	 forced
NNS 	 thousands
IN 	 of
NNS 	 people
TO 	 to
VB 	 sleep
IN 	 in
PRP$ 	 their
NNS 	 offices
CC 	 or
VB 	 walk
RB 	 home
IN 	 during
DT 	 the
NN 	 night
, 	 ,
NNS 	 officials
VBD 	 said
NN 	 today
. 	 .


In [11]:
import pandas as pd

# One record per token: surface text plus its coarse (`pos`) and fine (`tag`) label.
tokens = [
    {"text": token.text, "pos": token.pos_, "tag": token.tag_}
    for token in nlp(long_sentence)
]

df = pd.DataFrame.from_records(tokens)

In [12]:
# Bare last expression so the notebook renders the token table.
df

Unnamed: 0,text,pos,tag
0,The,DET,DT
1,strongest,ADJ,JJS
2,rain,NOUN,NN
3,ever,ADV,RB
4,recorded,VERB,VBN
5,in,ADP,IN
6,India,PROPN,NNP
7,shut,VERB,VBD
8,down,ADP,RP
9,the,DET,DT


How can you use this?

In [13]:
def extract_adjectives(text):
    """Return the adjectives found in ``text``, in order of appearance.

    The text is run through the notebook-level ``nlp`` pipeline and every
    token whose coarse-grained tag (``token.pos_``) equals ``"ADJ"`` is kept.

    Args:
        text: Raw text to tag.

    Returns:
        A list holding ``token.text`` for each adjective. Duplicates are kept,
        and the list is empty when no token is tagged ``"ADJ"``.
    """
    return [word.text for word in nlp(text) if word.pos_ == "ADJ"]

In [14]:
# Try it on the example sentence. "closed" is returned because the tagger
# labelled it JJ in the tag listing for this sentence.
extract_adjectives(long_sentence)

['strongest', 'financial', 'closed']

In [15]:
import pandas as pd

# NOTE(review): the name `wine_df` does not match the data. The file loaded is
# aita_20K.csv and its columns (title, body, verdict, is_asshole) are AITA
# posts. A rename needs a coordinated change in every later cell, so it is only
# flagged here.
wine_df = pd.read_csv("data/aita_20K.csv")

In [16]:
# Random preview of the raw posts. The fixed seed makes the same rows appear
# after Restart & Run All, so the saved output stays reproducible.
wine_df.sample(5, random_state=0)

Unnamed: 0,id,timestamp,title,body,edited,verdict,score,num_comments,is_asshole
13657,duvw71,1573494000.0,AITA for calling my partner an asshole?,"Hey yall,\n\nDidn't know I would one day turn ...",1573494876.0,not the asshole,5,27.0,0
10273,b7vpd3,1554083000.0,AITA Stopping a man from hiring staff,"Some background, I live and work as a bartende...",False,not the asshole,6,18.0,0
17824,dthesg,1573229000.0,"AITA for ""acquaintance zoning"" a former friend...",&#x200B;\n\nA coworker I work with is a notori...,False,not the asshole,10,37.0,0
1806,c98k3k,1562279000.0,AITA for thinking my mom overreacted when I sa...,Lemme make this short. I( 15F) was by my broth...,False,asshole,10,26.0,1
1196,a8dex3,1545420000.0,AITA for taking all of our presents away?,Edit: Dad and I have been texting about this (...,1545448574.0,asshole,26,115.0,1


In [18]:
# Drop every row that has a missing value in any column. Rebinding the name
# (instead of `inplace=True`) yields the same frame without mutating the
# object that earlier cells already displayed.
wine_df = wine_df.dropna()

In [19]:
# Tag every post body and store its adjectives as one list per row.
wine_df["adjectives"] = wine_df["body"].map(extract_adjectives)

In [20]:
# Random preview of the extracted adjective lists. The fixed seed keeps the
# preview reproducible across kernel restarts.
wine_df["adjectives"].sample(5, random_state=0)

1840     [best, female, rude, weird, happy, few, other,...
11836    [annoyed, first, sure, okay, sure, other, free...
5643     [creepy, low, more, higher, willing, more, muc...
6885     [many, other, experienced, fine, fun, exact, n...
13317    [long, respective, low, cheap, fine, perfect, ...
Name: adjectives, dtype: object

In [21]:
# `.str.len()` is applied to the per-post adjective lists, giving the number of
# adjectives found in each post.
wine_df["adj_count"] = wine_df["adjectives"].str.len()

In [22]:
# Overall median number of adjectives per post.
wine_df["adj_count"].median()

17.0

In [23]:
# Same median, split by the `is_asshole` label (0 / 1).
wine_df.groupby("is_asshole")["adj_count"].median()

is_asshole
0    17.5
1    16.0
Name: adj_count, dtype: float64

In [24]:
# Long format: one row per (post, adjective). The other columns and the
# original index label are repeated for every adjective of a post.
wine_df_long = wine_df.explode("adjectives")

In [25]:
# Preview: the first post now appears once per adjective.
wine_df_long.head(5)

Unnamed: 0,id,timestamp,title,body,edited,verdict,score,num_comments,is_asshole,adjectives,adj_count
0,d115xx,1567888000.0,AITA for hating a family friend’s son who has ...,Here’s some context. So ever since I was born ...,1567977353.0,asshole,8,27.0,1,few,29
0,d115xx,1567888000.0,AITA for hating a family friend’s son who has ...,Here’s some context. So ever since I was born ...,1567977353.0,asshole,8,27.0,1,whole,29
0,d115xx,1567888000.0,AITA for hating a family friend’s son who has ...,Here’s some context. So ever since I was born ...,1567977353.0,asshole,8,27.0,1,horrible,29
0,d115xx,1567888000.0,AITA for hating a family friend’s son who has ...,Here’s some context. So ever since I was born ...,1567977353.0,asshole,8,27.0,1,single,29
0,d115xx,1567888000.0,AITA for hating a family friend’s son who has ...,Here’s some context. So ever since I was born ...,1567977353.0,asshole,8,27.0,1,long,29


In [26]:
# Overall adjective frequencies.
# NOTE(review): counts are case-sensitive because `token.text` is stored as-is
# (e.g. "Wasteful" shows up as its own entry), and typos count as separate
# adjectives. Consider lowercasing or using lemmas if that matters.
wine_df_long['adjectives'].value_counts()

other            13639
few               8094
more              5990
good              5950
last              5533
                 ...  
grownup              1
unsuited             1
Wasteful             1
infintie             1
uncoordinated        1
Name: adjectives, Length: 8600, dtype: int64

In [27]:
def _top_adjective_shares(adjectives):
    """Relative frequency of the 10 most common adjectives within one group."""
    return adjectives.value_counts(normalize=True).head(10)


# Top-10 adjective shares for each `is_asshole` class. Despite its name,
# `ratings` holds these per-class relative frequencies.
adjectives_by_class = wine_df_long.groupby("is_asshole")["adjectives"]
ratings = adjectives_by_class.apply(_top_adjective_shares)
ratings

is_asshole        
0           other     0.035723
            few       0.022076
            more      0.015644
            last      0.015162
            good      0.015006
            bad       0.014539
            new       0.013274
            first     0.013180
            same      0.013040
            little    0.012962
1           other     0.037000
            few       0.021032
            good      0.016752
            more      0.016297
            same      0.014467
            last      0.014302
            bad       0.014231
            little    0.013579
            first     0.013157
            new       0.011694
Name: adjectives, dtype: float64

In [28]:
# The 100 most frequent adjectives overall (value_counts sorts by count, descending).
adjective_totals = wine_df_long["adjectives"].value_counts()
common_adjs = adjective_totals.index[:100]

In [40]:
def _class_shares(labels):
    """Share of each `is_asshole` value among the rows of one adjective."""
    return labels.value_counts(normalize=True)


# For every adjective, the fraction of its occurrences that come from class-0
# and class-1 posts. unstack(level=0) moves the adjectives into the columns,
# leaving one row per `is_asshole` value.
labels_by_adjective = wine_df_long.groupby("adjectives")["is_asshole"]
ratio_df = labels_by_adjective.apply(_class_shares).unstack(level=0)

In [41]:
# Among the common adjectives, the 10 whose occurrences come most often from
# class-1 posts (one row per adjective after the transpose).
common_ratio = ratio_df[common_adjs].T
common_ratio.sort_values(by=1, ascending=False).head(10)

Unnamed: 0_level_0,0,1
adjectives,Unnamed: 1_level_1,Unnamed: 2_level_1
honest,0.433584,0.566416
fair,0.453321,0.546679
clear,0.471669,0.528331
serious,0.474308,0.525692
interested,0.478972,0.521028
real,0.479927,0.520073
least,0.481714,0.518286
good,0.486218,0.513782
cool,0.486581,0.513419
important,0.487342,0.512658


In [33]:
# Among the common adjectives, the 10 whose occurrences come most often from
# class-0 posts (one row per adjective after the transpose).
common_ratio = ratio_df[common_adjs].T
common_ratio.sort_values(by=0, ascending=False).head(10)

Unnamed: 0_level_0,0,1
adjectives,Unnamed: 1_level_1,Unnamed: 2_level_1
uncomfortable,0.583553,0.416447
comfortable,0.583408,0.416592
alone,0.582049,0.417951
selfish,0.57602,0.42398
tired,0.575569,0.424431
worse,0.571263,0.428737
mental,0.569106,0.430894
older,0.55803,0.44197
multiple,0.557171,0.442829
terrible,0.554785,0.445215


In [39]:
# Wide overview: one row per `is_asshole` class, one column per adjective.
# `ratio_df` already has this orientation after `.unstack(level=0)`.
# Transposing it would instead give one row per adjective (thousands of rows),
# which is not what the saved output of this cell shows.
ratio_df

adjectives,"""agriculture",-,-as,/,1$.,1-sided,10/20/30,100cc,100th,100yo,...,zany,zero,zippy,zombified,zoned,~1000$a,~£100,​,•her,😡
0,1.0,0.509271,1.0,0.545455,1.0,1.0,1.0,,0.4,,...,,1.0,1.0,1.0,1.0,1.0,1.0,0.4,1.0,1.0
1,,0.490729,,0.454545,,,,1.0,0.6,1.0,...,1.0,,,,,,,0.6,,
