In [1]:
import spacy

In [2]:
nlp = spacy.load('en_core_web_md')

In [5]:
nlp(u'The quick brown fox jumped').vector.shape # shape of the vector for the whole five-word Doc, not a single word

# Doc and Span objects expose a .vector just like individual tokens do

(300,)

In [6]:
nlp(u'fox').vector.shape

(300,)

In [7]:
tokens = nlp(u'lion cat pet') # notice some similarities

In [8]:
# Print the similarity of every ordered pair of tokens (self-pairs included)
token_pairs = ((first, second) for first in tokens for second in tokens)
for first, second in token_pairs:
    print(first.text, second.text, first.similarity(second))

lion lion 1.0
lion cat 0.5265438
lion pet 0.39923766
cat lion 0.5265438
cat cat 1.0
cat pet 0.7505457
pet lion 0.39923766
pet cat 0.7505457
pet pet 1.0


In [None]:
# cosine similarity can range from -1 to 1; the scores here all fall between 0 and 1
# lion and cat are fairly similar (0.53)
# lion and pet are less similar (0.40), while cat and pet score highest (0.75)

In [9]:
tokens = nlp(u'like love hate') # opposite words but similar context

In [10]:
# Print the similarity of every ordered pair of tokens (self-pairs included)
token_pairs = ((first, second) for first in tokens for second in tokens)
for first, second in token_pairs:
    print(first.text, second.text, first.similarity(second))

like like 1.0
like love 0.657904
like hate 0.65746516
love like 0.657904
love love 1.0
love hate 0.63930994
hate like 0.65746516
hate love 0.63930994
hate hate 1.0


In [11]:
# words used in similar contexts get similar vectors, even when their meanings are opposite

# it is sometimes helpful to aggregate the 300 dimensions into a single Euclidean (L2) norm:
# the square root of the sum of the squared vector components

In [12]:
# current vocab
len(nlp.vocab.vectors)

20000

In [13]:
tokens = nlp(u'dog cat nargle')

In [14]:
# For each token: its text, whether it has a vector, the vector norm, and whether it is out-of-vocabulary
for tok in tokens:
    print(tok.text, tok.has_vector, tok.vector_norm, tok.is_oov) # the made-up word 'nargle' is the out-of-vocabulary case

dog True 7.0336733 False
cat True 6.6808186 False
nargle False 0.0 True


#### vector arithmetic

In [15]:
# calculate a new vector by adding and subtracting word vectors
# king - man + woman should land near queen

In [17]:
# need to calculate cosine similarity ourselves for this

from scipy import spatial

def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two 1-D vectors.

    scipy's ``spatial.distance.cosine`` returns the cosine *distance*
    (1 - similarity), so it is subtracted from 1 to get the similarity back.

    Parameters:
        vec1, vec2: 1-D array-likes of equal length.

    Returns:
        A float: 1.0 for vectors pointing the same way, 0.0 for orthogonal
        vectors, -1.0 for opposite vectors.
    """
    return 1 - spatial.distance.cosine(vec1, vec2)

In [18]:
# Look up the vocabulary vectors for the three words used in the analogy
king, man, woman = (nlp.vocab[entry].vector for entry in ('king', 'man', 'woman'))

In [19]:
# king - man + woman ----> new should be similar to queen, princess, highness, etc

new_vector = king-man+woman

In [20]:
# Score every vocab entry against new_vector, keeping only entries that
# have a vector, are lowercase, and are purely alphabetic (so no numbers).
computed_similarities = [
    (lexeme, cosine_similarity(new_vector, lexeme.vector))
    for lexeme in nlp.vocab
    if lexeme.has_vector and lexeme.is_lower and lexeme.is_alpha
]

In [21]:
# order from most to least similar (negating the score makes the ascending sort descend)
computed_similarities = sorted(computed_similarities, key=lambda pair: -pair[1])

In [22]:
print([t[0].text for t in computed_similarities[:10]])

['king', 'queen', 'commoner', 'highness', 'prince', 'sultan', 'maharajas', 'princes', 'kumbia', 'kings']


#### VADER Sentiment Analysis

In [23]:
# discern sentiment - positive/negative/neutral - on raw text with no labels
# VADER: Valence Aware Dictionary and sEntiment Reasoner
# it captures both polarity (pos/neg) and intensity

In [24]:
# maps lexical features (words) to emotion intensities - sentiment scores
# the score of a text comes from summing the intensity of each word in it
# sarcasm cannot be detected
# text mixing positive and negative sentiment (e.g. a nuanced movie review) is handled poorly

In [25]:
import nltk

In [26]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/miguelsantana/nltk_data...


True

In [27]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()

In [28]:
# polarity_scores takes a string and returns a dict with four scores: neg, neu, pos and compound

a = 'This is a good movie'

In [29]:
sid.polarity_scores(a) # neg/neu/pos are the proportions of the text falling in each category
# compound is the summed word valence normalized to the range -1.0 (most negative) .. 1.0 (most positive)

{'neg': 0.0, 'neu': 0.508, 'pos': 0.492, 'compound': 0.4404}

In [30]:
a = 'This was the best, most awesome movie EVER MADE!!!'

In [31]:
sid.polarity_scores(a)

{'neg': 0.0, 'neu': 0.425, 'pos': 0.575, 'compound': 0.8877}

In [33]:
a = 'This was the WORST movie that has ever disgraced the screen.'

In [34]:
sid.polarity_scores(a) # compound 0 neutral, above 0 is positive, below 0 negative

{'neg': 0.465, 'neu': 0.535, 'pos': 0.0, 'compound': -0.8331}

In [35]:
import pandas as pd

In [36]:
df = pd.read_csv('amazonreviews.tsv',sep='\t')

In [37]:
df.head()

Unnamed: 0,label,review
0,pos,Stuning even for the non-gamer: This sound tra...
1,pos,The best soundtrack ever to anything.: I'm rea...
2,pos,Amazing!: This soundtrack is my favorite music...
3,pos,Excellent Soundtrack: I truly like this soundt...
4,pos,"Remember, Pull Your Jaw Off The Floor After He..."


In [38]:
df['label'].value_counts() # pretty balanced

neg    5097
pos    4903
Name: label, dtype: int64

In [39]:
df.dropna(inplace=True)

In [40]:
# Collect the index labels of reviews that are whitespace-only strings.
# Iterating the 'review' column directly (rather than unpacking whole rows
# into a fixed number of names) keeps this cell working when it is re-run
# after more columns have been added to df.
blanks = [
    idx
    for idx, text in df['review'].items()
    if isinstance(text, str) and text.isspace()
]

In [41]:
# df.drop(blanks,inplace=True) # if had blanks

In [42]:
df.iloc[0]['review']

'Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^'

In [43]:
sid.polarity_scores(df.iloc[0]['review'])

{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'compound': 0.9454}

In [44]:
# score every review with VADER; each entry is the full polarity dict
df['scores'] = df['review'].apply(sid.polarity_scores)

In [45]:
df.head()

Unnamed: 0,label,review,scores
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co..."
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co..."
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com..."
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com..."
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp..."


In [46]:
# keep only the overall 'compound' value from each scores dict
df['compound'] = df['scores'].apply(lambda scores: scores['compound'])

In [47]:
df.head()

Unnamed: 0,label,review,scores,compound
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co...",0.9454
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co...",0.8957
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com...",0.9858
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com...",0.9814
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp...",0.9781


In [48]:
def label_from_compound(score):
    """Return 'pos' for a compound score >= 0 (so exactly 0 counts as positive), otherwise 'neg'."""
    if score >= 0:
        return 'pos'
    return 'neg'

df['comp_score'] = df['compound'].apply(label_from_compound)

In [49]:
df.head()

Unnamed: 0,label,review,scores,compound,comp_score
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co...",0.9454,pos
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co...",0.8957,pos
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com...",0.9858,pos
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com...",0.9814,pos
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp...",0.9781,pos


In [50]:
# overall report of the accuracy
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [51]:
accuracy_score(df['label'],df['comp_score']) # actual vs computed score

0.7091

In [52]:
print(classification_report(df['label'],df['comp_score']))

              precision    recall  f1-score   support

         neg       0.86      0.51      0.64      5097
         pos       0.64      0.91      0.75      4903

    accuracy                           0.71     10000
   macro avg       0.75      0.71      0.70     10000
weighted avg       0.75      0.71      0.70     10000



In [53]:
# rows are the actual labels (neg, pos); columns are the predicted labels (neg, pos)
# 2474 of the 5097 negative reviews were scored positive vs. only 435 of the 4903 positives scored negative
# sarcasm is one possible cause of the missed negatives (not verified here)

print(confusion_matrix(df['label'],df['comp_score']))

[[2623 2474]
 [ 435 4468]]


In [None]:
# overall not bad, but could be a lot better