In [None]:
#for semantics and sentiment analysis, you need to install the medium or lage "en_core_web_md/lg"
#it uses Word2Vec which is a 2 layer neural net and is able to make highly accurate guesses
# eg: 'man' is to 'boy' as 'woman' is to 'girl'
# we used sklearn feature extraction to extract and classify text
#but for Semantics and Sentiment analysis we will use spacy

In [1]:
#we have to use spacy embedded word vectors, the small model doesn't contain word vectors
import spacy
nlp= spacy.load('en_core_web_lg')
nlp(u'lion').vector #will print a vectors for the word lion

array([  1.2746  ,   0.46242 ,  -1.1829  ,  -5.2661  ,  -2.7128  ,
         1.8521  ,  -0.94273 ,   2.1865  ,   6.503   ,   0.6704  ,
         1.5361  ,   2.5992  ,  -0.36233 ,   4.3965  ,  -6.5644  ,
         1.6141  ,  -1.2897  ,   2.1184  ,  -0.63654 ,  -3.4572  ,
        -4.3771  ,   4.2074  ,  -3.6411  ,  -0.97214 ,   1.3253  ,
        -2.3125  ,  -3.6531  ,  -2.8398  ,   2.7913  ,  -1.53    ,
        -2.9984  ,  -2.6357  ,   0.50615 ,  -2.6925  ,   4.3401  ,
        -5.6017  ,   0.045691,   4.3832  ,  -0.19535 ,  -1.0751  ,
         0.32172 ,   2.4395  ,   4.6638  ,   3.4471  ,  -3.3847  ,
        -1.8238  ,   0.70212 ,   0.58557 ,   5.0032  ,  -3.1072  ,
         1.2364  ,   7.4595  ,   0.057368,   1.0111  ,  -1.0827  ,
         0.69113 ,   2.8009  ,  -3.4383  ,  -1.0599  ,  -2.2627  ,
        -5.149   ,  -5.0636  ,   3.1405  ,   1.0793  ,  -0.72892 ,
        -3.9939  ,  -0.69551 ,  -0.55767 ,   3.2555  ,  -2.9449  ,
         4.7114  ,   1.6388  ,   1.3828  ,   1.4255  ,  -3.233

In [2]:
nlp(u'a quick brown fox jumps over a lazy dog').vector
#will also print word vectors for this sentence but they're mean of the word vectors of individual words

array([-2.0963445 ,  3.1478603 , -4.026431  ,  0.18799962,  3.0217922 ,
       -0.81836665, -0.23268168,  3.379449  ,  1.8528315 ,  0.98227364,
        6.8238606 ,  2.5775054 , -1.932785  , -0.83819884,  3.0662131 ,
       -0.28882104,  2.365316  ,  0.16479887,  0.7192789 ,  0.23712094,
        0.47869888,  1.1155534 , -0.31480443, -1.7490444 ,  1.3065993 ,
       -2.0616498 , -4.0919814 , -1.70963   ,  0.04291116,  0.11381115,
       -1.3997625 , -0.41237673,  1.4471135 , -0.8094456 , -1.1143432 ,
       -2.4969068 ,  1.4495867 , -0.6378867 ,  5.0790224 , -3.0803456 ,
       -0.14456667, -0.9488199 ,  3.2369456 , -0.8474011 ,  0.45137227,
       -0.31906378,  0.58768225, -1.9120687 , -1.8271345 ,  1.1791145 ,
        1.1146156 ,  2.7400522 ,  0.95190006, -0.73420775, -1.379021  ,
       -0.25003672,  2.4629102 ,  0.95856667,  0.47657108, -0.81721866,
        0.47450364, -1.5174489 ,  0.3691578 ,  0.05139445, -3.3208878 ,
       -0.63805556, -3.1304102 , -3.5538244 ,  0.7286822 ,  0.67

In [3]:
#to identify similar vectors we can use .similarity() function
tokens=nlp('lion cat pet')
for token1 in tokens:
    for token2 in tokens:
        print(token1,token2, token1.similarity(token2))

lion lion 1.0
lion cat 0.3854507803916931
lion pet 0.20031584799289703
cat lion 0.3854507803916931
cat cat 1.0
cat pet 0.732966423034668
pet lion 0.20031584799289703
pet cat 0.732966423034668
pet pet 1.0


In [26]:
# we have .has_vector, .vector_norm, and .is_oov methods available for finding out if a word has vectors, what is its normal and if its out of vocabulary

#vector arithmetic: you can create a new vector by adding/subtracting other vectors
# eg: king-man + woman --> should give us queen

from scipy import spatial

cosine_similarity = lambda vec1, vec2: 1- spatial.distance.cosine(vec1, vec2)
king= nlp.vocab['king'].vector
man= nlp.vocab['man'].vector
woman= nlp.vocab['woman'].vector

new_vector = king-man +woman
computed_similarities =[]

for word in nlp.vocab:
    if word.has_vector:
        if word.is_lower:
            if not word.is_stop:
                if word.is_alpha:
                    similarity= cosine_similarity(new_vector, word.vector)
                    computed_similarities.append((word.text, similarity))


computed_similarities= sorted(computed_similarities, key=lambda x: -x[1])
print([t[0] for t in computed_similarities[:10]])

['king', 'woman', 'cause', 'ought', 'somethin', 'lion', 'space', 'nothin', 'brown', 'need']


In [27]:
#sentiment analysis: to discern sentiment imprompto , with unlabelled data
#we'll use VADER(Valence Aware Deictonary for sEntiment Reasoning) for such unlabelled sentiment analysis
#VADER is available in NLTK, and it's smart enough to understand 'didn't love' and 'LOVE!!!' meaning and intensity

import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid= SentimentIntensityAnalyzer()

a= 'This is an awesome movie. Best so far I have watched'
sid.polarity_scores(a)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...


{'neg': 0.0, 'neu': 0.491, 'pos': 0.509, 'compound': 0.8519}

In [28]:
b='this was one the worst movies I have ever watched'
sid.polarity_scores(b)

{'neg': 0.339, 'neu': 0.661, 'pos': 0.0, 'compound': -0.6249}

In [29]:
#now we are going to use VADER for analyzing the sentiment in the amazon reviews
import pandas as pd
df= pd.read_csv('./amazonreviews.tsv', sep='\t')
df.head()

Unnamed: 0,label,review
0,pos,Stuning even for the non-gamer: This sound tra...
1,pos,The best soundtrack ever to anything.: I'm rea...
2,pos,Amazing!: This soundtrack is my favorite music...
3,pos,Excellent Soundtrack: I truly like this soundt...
4,pos,"Remember, Pull Your Jaw Off The Floor After He..."


In [30]:
df.isna().sum()

label     0
review    0
dtype: int64

In [31]:
#cleaning data:
blanks =[]
for i,lb, rv in df.itertuples():
    if type(rv)== str:
        if rv.isspace():
                blanks.append(i)


df.drop(blanks, inplace=True)

In [32]:
df['scores']= df['review'].apply(lambda x: sid.polarity_scores(x))
df['compound']= df['review'].apply(lambda x:sid.polarity_scores(x)['compound'])
df.head()

Unnamed: 0,label,review,scores,compound
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co...",0.9454
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co...",0.8957
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com...",0.9858
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com...",0.9814
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp...",0.9781


In [34]:
df['comp_score']= df['compound'].apply(lambda x:'pos' if x>=0 else 'neg')

In [35]:
df.head()

Unnamed: 0,label,review,scores,compound,comp_score
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co...",0.9454,pos
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co...",0.8957,pos
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com...",0.9858,pos
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com...",0.9814,pos
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp...",0.9781,pos


In [36]:
#to check the accuracy:
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score
print(confusion_matrix(df['label'], df['comp_score']))
print(classification_report(df['label'], df['comp_score']))
print(accuracy_score(df['label'], df['comp_score']))

[[2629 2468]
 [ 435 4468]]
              precision    recall  f1-score   support

         neg       0.86      0.52      0.64      5097
         pos       0.64      0.91      0.75      4903

    accuracy                           0.71     10000
   macro avg       0.75      0.71      0.70     10000
weighted avg       0.75      0.71      0.70     10000

0.7097
