# NLP - POS & NER

```
%conda install spacy  spacy-model-en_core_web_sm -c conda-forge
```


~~~
import spacy
# The "en" shortcut name is no longer accepted by spaCy v3+; download the
# pipeline package by its full name, which is also the name passed to
# spacy.load() in this notebook.
spacy.cli.download("en_core_web_sm")
~~~

In [1]:
# Import the English language class
from spacy.lang.en import English

# Create the nlp object from the English language class.
# NOTE: `nlp` is rebound further down to the pipeline returned by
# spacy.load("en_core_web_sm"), which supersedes this instance.
nlp = English()

In [1]:
import spacy
from spacy import displacy

In [6]:
import spacy
from spacy import displacy
# Load the "en_core_web_sm" pipeline package; this rebinds `nlp`, replacing
# the English() instance created in an earlier cell.
nlp = spacy.load("en_core_web_sm")

In [7]:
# Sample news sentence, assembled from adjacent string literals, that is
# used as input text for the pipeline.
wp = ('President Trump is questioning his administration’s aggressive strategy '
      'in Venezuela following the failure of a US-backed effort to oust '
      'President Nicolás Maduro, complaining he was misled about how easy '
      'it would be to replace the socialist strongman with a young opposition '
      'figure, according to administration officials and White House advisers.')

In [8]:
# Run the pipeline over the sample news sentence.
# NOTE: `doc` is reassigned to other example sentences by several later cells.
doc = nlp(wp)

In [10]:
# Tokenize a short greeting by running it through the nlp object
doc = nlp("Hello world!")

# Print one token per line
for tok in doc:
    print(tok.text)

Hello
world
!


In [11]:
# Run the pipeline on a short sentence
doc = nlp("She ate the pizza")

# Show every token next to its predicted part-of-speech tag
for tok in doc:
    print(tok.text, tok.pos_)

She PRON
ate VERB
the DET
pizza NOUN


Source: <https://course.spacy.io/en/chapter1>

In addition to the part-of-speech tags, we can also predict how the words are related. For example, whether a word is the subject of the sentence or an object.

The `.dep_` attribute returns the predicted dependency label.

The `.head` attribute returns the syntactic head token. You can also think of it as the parent token this word is attached to.

In [12]:
# For each token: its text, POS tag, dependency label and the text of its head
for tok in doc:
    print(tok.text, tok.pos_, tok.dep_, tok.head.text)

She PRON nsubj ate
ate VERB ROOT ate
the DET det pizza
pizza NOUN dobj ate


In [13]:
# Run the pipeline on a sentence to demonstrate entity prediction
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# List each predicted entity span together with its label
for entity in doc.ents:
    print(entity.text, entity.label_)

Apple ORG
U.K. GPE
$1 billion MONEY


In [9]:
# Parse the news sentence explicitly instead of relying on `doc`: by this
# point in a top-to-bottom run `doc` has been rebound to other example
# sentences, so iterating it would list the proper nouns of the wrong text.
wp_doc = nlp(wp)

# For every proper noun, show its syntactic head, the token itself and its tag
for token in wp_doc:
    if token.pos_ == 'PROPN':
        print(token.head.text, '\t', token.text, '\t', token.pos_)

Trump 	 President 	 PROPN
questioning 	 Trump 	 PROPN
in 	 Venezuela 	 PROPN
backed 	 US 	 PROPN
Maduro 	 President 	 PROPN
Maduro 	 Nicolás 	 PROPN
oust 	 Maduro 	 PROPN
House 	 White 	 PROPN
advisers 	 House 	 PROPN


In [5]:
# Print each entity span of whichever text `doc` currently holds
for entity in doc.ents:
    print(entity)

In [8]:
import pandas as pd

# Load the wine-review dataset from a path relative to the working directory.
# NOTE(review): the preview of the sampled frame shows 'Unnamed: 0.1' and
# 'Unnamed: 0' columns, which looks like a saved index being read back as
# data — confirm whether those columns are needed.
df_full = pd.read_csv('data/wine_reviews.csv')



In [9]:
# Work on a random subset of 10,000 reviews. A fixed random_state makes the
# sample, and every result derived from it, reproducible across kernel
# restarts.
df = df_full.sample(10000, random_state=42)
# To use every review instead: df = df_full.copy()

In [10]:
df.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,rating
62378,62378,Spain,Dusty white-fruit aromas are grassy and smell ...,Brut,84,14.0,Catalonia,Cava,,Michael Schachner,@wineschach,Conde de Subirats NV Brut Sparkling (Cava),Sparkling Blend,Conde de Subirats,Low
77849,77849,US,"A blend of Cabernet Sauvignon, Cabernet Franc ...",Anne Martin McCool,91,40.0,Washington,Columbia Valley (WA),Columbia Valley,Sean P. Sullivan,@wawinereport,àMaurice 2013 Anne Martin McCool Red (Columbia...,Bordeaux-style Red Blend,àMaurice,High
69730,69730,Israel,"Always a good value, Recanati's Yasmin line is...",Yasmin,85,11.0,Shomron,,,Lauren Buzzeo,@laurbuzz,Recanati 2011 Yasmin White (Shomron),White Blend,Recanati,Low
16704,16704,Portugal,"Lightly fizzy, this young fruity wine is an ea...",Rotas de Portugal Branco,85,9.0,Vinho Verde,,,Roger Voss,@vossroger,Santos & Seixo NV Rotas de Portugal Branco Whi...,Portuguese White,Santos & Seixo,Low
63143,63143,US,"One of the “12 Zins of Lodi 2014,” it was chos...",Old Vine,92,15.0,California,Lodi,Central Valley,Jim Gordon,@gordone_cellars,Za Zin 2010 Old Vine Zinfandel (Lodi),Zinfandel,Za Zin,High


In [11]:
# Parse every description with spaCy. nlp.pipe() streams the texts through
# the pipeline in batches, which is much faster for many texts than calling
# nlp() once per row; the resulting Docs are assigned positionally.
df['spacy_doc'] = list(nlp.pipe(df['description']))

In [12]:
def extract_adjectives(doc):
    """Return the adjectives found in ``doc`` as one comma-separated string.

    Every token whose ``pos_`` attribute equals ``'ADJ'`` contributes its
    ``text``; the texts are joined with ``', '`` in document order. The
    result is an empty string when no token matches.
    """
    return ', '.join(tok.text for tok in doc if tok.pos_ == 'ADJ')

In [13]:
# Store each description's adjectives as a single comma-separated string
df['adj'] = df['spacy_doc'].apply(extract_adjectives)

In [14]:
df['adj']

62378         Dusty, white, grassy, plump, little, Generic
77849                 reserved, assorted, black, immediate
69730                      good, easy, green, short, clean
16704    fizzy, young, easygoing, dry, soft, lively, fl...
63143    finest, deep, red, black, Deep, evocative, rip...
21155                  its, fresh, crisp, lively, fragrant
28127         which, salva, imminent, wonderful, asparagus
176      local, tight, crisp, green, Its, fresh, good, ...
40279    plush, straightforward, that, upfront, medium,...
1495      lean, green, grippy, spicy, herbal, green, basic
11610                                  dark, molten, hefty
18241    good, pure, excellent, muscular, aromatic, den...
84964                     black, bright, full, great, good
71386    standard, rich, opulent, ripe, black, fragrant...
84904    shy, closed, all, firm, bright, dense, firm, j...
56809    New, ripe, complex, baked, complete, melted, c...
12421              rich, rewarding, black, pleasing, bla

In [15]:
def extract_pos(doc, pos):
    """Return the tokens of ``doc`` tagged ``pos`` as one comma-separated string.

    Parameters:
        doc: iterable of tokens exposing ``text`` and ``pos_`` attributes.
        pos: tag value compared against each token's ``pos_``.

    Returns:
        The matching token texts joined with ``', '`` in document order, or
        an empty string when nothing matches.
    """
    matching = (token.text for token in doc if token.pos_ == pos)
    return ', '.join(matching)

In [16]:
def extract_people(doc):
    """Collect the entities of ``doc`` whose label is ``"PERSON"``.

    Returns a list holding the entity objects themselves (not their text),
    in the order they appear in ``doc.ents``; the list is empty when there
    is no such entity.
    """
    people = []
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            people.append(ent)
    return people

In [17]:
# Store each description's verbs as a comma-separated string; `pos` is
# forwarded to extract_pos as a keyword argument.
df['verb'] = df['spacy_doc'].apply(extract_pos, pos='VERB')

In [18]:
%matplotlib inline

import pandas as pd
import seaborn as sns

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from afinn import Afinn

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import LatentDirichletAllocation




In [19]:
# Logistic-regression classifier using the lbfgs solver with up to 5000 iterations.
# NOTE(review): the identical constructor call appears again in a later cell,
# which rebinds `lr_classifier` before it is fitted — this one looks redundant.
lr_classifier = LogisticRegression(solver = 'lbfgs', max_iter= 5000)



In [20]:
# Bag-of-words vectorizer that lower-cases its input and keeps at most 100 features
vectorizer = CountVectorizer(max_features=100, lowercase=True)

In [30]:
# Learn the vocabulary from the comma-separated adjective strings
vectorizer.fit(df['adj'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=100, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [31]:
# Count the fitted vocabulary's words in the full review text.
# NOTE(review): the vectorizer was fitted on df['adj'] but is applied to
# df['description'] here, so a vocabulary word is counted regardless of the
# part of speech it has in the review — confirm this is intended rather than
# transforming df['adj'].
word_counts = vectorizer.transform(df['description'])

In [32]:
# Fresh, unfitted logistic-regression classifier (lbfgs solver, up to 5000 iterations)
lr_classifier = LogisticRegression(solver = 'lbfgs', max_iter= 5000)



In [33]:
# NOTE(review): this only displays the element-wise sum; the result is not
# assigned or used by any later cell — looks like leftover scratch work.
word_counts + word_counts

<10000x100 sparse matrix of type '<class 'numpy.int64'>'
	with 48288 stored elements in Compressed Sparse Row format>

In [34]:
# Fit the classifier on the word-count matrix with the 'rating' column as target
lr_classifier.fit(word_counts, df['rating'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=5000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [35]:
# Predict on the same matrix the classifier was fitted on
prediction = lr_classifier.predict(word_counts)

In [36]:
# NOTE(review): `prediction` comes from the same rows the model was fitted
# on, so this is training accuracy, not a held-out estimate.
print(accuracy_score(df['rating'], prediction))

0.8048


In [37]:
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2 in
# favour of get_feature_names_out(); prefer the new method and fall back to
# the old one so the cell runs on either side of that change.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# One coefficient per vocabulary word, taken from the first row of coef_
coeficients = pd.Series(lr_classifier.coef_[0],
                        index = feature_names)

In [38]:
coeficients.sort_values()

beautiful      -3.010947
lovely         -2.445937
impressive     -2.301826
complex        -2.135415
powerful       -1.755412
polished       -1.748744
long           -1.748496
elegant        -1.508430
deep           -1.281375
pure           -1.280541
great          -1.182156
lush           -1.134629
delicious      -1.112672
generous       -1.076086
concentrated   -1.038677
silky          -1.002609
new            -1.000521
structured     -0.995696
best           -0.983826
vibrant        -0.943164
aromatic       -0.915793
exotic         -0.880467
rich           -0.832525
fine           -0.757293
dark           -0.754106
dense          -0.746692
balanced       -0.720734
french         -0.695804
intense        -0.666481
yellow         -0.587220
                  ...   
warm            0.161018
oaky            0.197185
light           0.237068
much            0.240855
green           0.291665
hard            0.292264
dry             0.302795
sharp           0.306992
solid           0.307573
