In [1]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics


In [2]:
# read yelp.csv into a DataFrame
yelp = pd.read_csv('yelp.csv')

# create a new DataFrame that only contains the 5-star and 1-star reviews
yelp["sentiment"] = yelp["stars"]


In [3]:
yelp.head()

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny,sentiment
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,5,0,5
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0,5
2,6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate. Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,0,1,0,4
3,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,1,2,0,5
4,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,0,0,0,5


In [4]:
yelp["sentiment"] = yelp["sentiment"].replace([1,2,3,4,5],["bad","bad","neutral","good","good"])

In [5]:
yelp.stars.value_counts()

4    3526
5    3337
3    1461
2     927
1     749
Name: stars, dtype: int64

In [6]:
yelp["sentiment"].value_counts()

good       6863
bad        1676
neutral    1461
Name: sentiment, dtype: int64

In [7]:
# Create training data
X = yelp[["text","sentiment"]]

In [8]:
X.head()

Unnamed: 0,text,sentiment
0,My wife took me here on my birthday for breakf...,good
1,I have no idea why some people give bad review...,good
2,love the gyro plate. Rice is so good and I als...,good
3,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",good
4,General Manager Scott Petello is a good egg!!!...,good


### Vectorize Text

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
# initialize vectorizer
vect = CountVectorizer(ngram_range=(1,2),stop_words='english', min_df = 0.01)
#min_df is the percentage of occurance of a word in the entire set. 

In [11]:
vect.fit(X["text"])
X_features = vect.transform(X["text"])

In [12]:
X_features.shape

(10000, 1010)

#### In class assignment

What happens when we change the value of min_df value? What is this argument controlling?

In [13]:
### Get the list of features
vect.get_feature_names()

['00',
 '10',
 '100',
 '11',
 '12',
 '15',
 '20',
 '25',
 '30',
 '40',
 '50',
 '99',
 'able',
 'absolutely',
 'actually',
 'add',
 'added',
 'addition',
 'admit',
 'afternoon',
 'ago',
 'agree',
 'ahead',
 'amazing',
 'ambiance',
 'american',
 'apparently',
 'appetizer',
 'appetizers',
 'appreciate',
 'area',
 'aren',
 'arizona',
 'arrived',
 'art',
 'asian',
 'ask',
 'asked',
 'asking',
 'ass',
 'ate',
 'atmosphere',
 'attention',
 'attentive',
 'authentic',
 'available',
 'average',
 'avoid',
 'away',
 'awesome',
 'awful',
 'az',
 'baby',
 'bacon',
 'bad',
 'bag',
 'baked',
 'bar',
 'barely',
 'bars',
 'bartender',
 'bartenders',
 'based',
 'basically',
 'basil',
 'bathroom',
 'bbq',
 'bean',
 'beans',
 'beat',
 'beautiful',
 'beef',
 'beer',
 'beers',
 'believe',
 'best',
 'better',
 'big',
 'birthday',
 'bit',
 'bite',
 'black',
 'bland',
 'blue',
 'book',
 'bottle',
 'bought',
 'bowl',
 'box',
 'boyfriend',
 'bread',
 'break',
 'breakfast',
 'bring',
 'brought',
 'brown',
 'brunch

In [14]:
vect = CountVectorizer(ngram_range=(1,2),stop_words='english', max_features=500)

In [15]:
vect.fit(X["text"])
X_features = vect.transform(X["text"])
X_features.shape

(10000, 500)

#### Q. how many bi-grams are included in the feature list?

In [16]:
features = vect.get_feature_names()

In [17]:
feature_len = [len(x.split(" ")) for x in features]


In [18]:
for i in features:
    if(len(i.split(" ")) == 2):
        print(i)

customer service
don know
feel like
food good
good food
great food
great place
happy hour
ice cream
love place
pretty good
really good
service great


In [19]:
index = np.where(np.array(feature_len)==2)
print(index)

(array([ 92, 119, 153, 162, 185, 188, 189, 199, 219, 259, 332, 346, 388],
      dtype=int64),)


Q: find the words correspodning to length 2 (bigrams)

In [20]:
np.array(features)[index]

array(['customer service', 'don know', 'feel like', 'food good',
       'good food', 'great food', 'great place', 'happy hour',
       'ice cream', 'love place', 'pretty good', 'really good',
       'service great'], dtype='<U16')

#### Q. how many reviews mention customer service?

In [21]:
dtm = X_features.toarray()

In [22]:
features.index('customer service')

92

In [23]:
dtm[:,92]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [24]:
np.where(dtm[:,92]>0) 

(array([  13,   19,   55,   61,  113,  148,  155,  184,  348,  355,  380,
         559,  567,  662,  687,  714,  720,  857,  928,  973, 1032, 1085,
        1090, 1115, 1212, 1252, 1266, 1271, 1292, 1378, 1379, 1394, 1404,
        1405, 1494, 1520, 1548, 1558, 1592, 1641, 1666, 1703, 1723, 1777,
        1802, 1901, 2002, 2018, 2044, 2069, 2081, 2101, 2212, 2222, 2330,
        2444, 2486, 2492, 2510, 2551, 2658, 2665, 2677, 2755, 2758, 2810,
        2874, 2882, 2902, 2927, 2947, 2967, 2977, 2979, 3010, 3316, 3357,
        3405, 3406, 3426, 3448, 3469, 3545, 3557, 3570, 3581, 3639, 3705,
        3716, 3737, 3740, 3777, 3953, 4032, 4057, 4075, 4090, 4137, 4226,
        4250, 4265, 4298, 4327, 4416, 4424, 4480, 4540, 4565, 4590, 4605,
        4615, 4646, 4743, 4858, 4893, 4923, 4956, 5039, 5072, 5086, 5116,
        5214, 5215, 5226, 5281, 5314, 5328, 5491, 5610, 5637, 5699, 5701,
        5833, 5869, 5882, 5889, 5896, 5937, 5944, 6017, 6078, 6096, 6102,
        6156, 6211, 6318, 6344, 6347, 

In [25]:
yelp.text.iloc[19]

"They've gotten better and better for me in the time since this review was written. \n\nMy last contact with them was a few days ago when I was having trouble redeeming some Groupons on their website. I called customer service and after waiting a few minutes I spoke with a rep who cheerfully booked four separate flights for me, patiently and manually entering my Groupon info for each one.\n\nI think the acquisition by Republic has helped them overall. After Republic took over the in-flight cookies started. It still tends to suck if you aren't Ascent club (like just about any budget-centric airline does), but once you get there it's a good value. When I've had to fly Southwest or USAir I've been disappointed in comparison."

In [26]:
len(np.where(dtm[:,92]>0)[0])
#228 reviews talk about customer service.

228

#### Punctuation handling while tokenization, removing stopwords etc

In [27]:
text = ["applicant should be proficient in Go","need experience in node.js","expert in C & C#"]

In [28]:
vect = CountVectorizer(stop_words="english")

In [29]:
dtm = pd.DataFrame(vect.fit_transform(text).toarray())
dtm.columns = vect.get_feature_names()

In [30]:
dtm

Unnamed: 0,applicant,experience,expert,js,need,node,proficient
0,1,0,0,0,0,0,1
1,0,1,0,1,1,1,0
2,0,0,1,0,0,0,0


In [31]:
from sklearn.feature_extraction import stop_words
 
print(stop_words.ENGLISH_STOP_WORDS)

frozenset({'be', 'during', 'herself', 'out', 'across', 'sincere', 'made', 'along', 'above', 'wherein', 'been', 'nothing', 'mostly', 'none', 'serious', 'although', 'fifty', 'for', 'how', 'former', 'full', 'since', 'down', 'too', 'noone', 'two', 'may', 'us', 'hasnt', 'through', 'we', 'toward', 'further', 'itself', 'found', 'both', 'of', 'by', 'side', 'via', 'however', 'was', 'where', 'those', 'became', 'twenty', 'whenever', 'before', 'enough', 'hundred', 'me', 'per', 'she', 'ten', 'go', 'third', 'very', 'i', 'forty', 'formerly', 'would', 'keep', 'her', 'latter', 'fill', 'describe', 'are', 'every', 'mine', 'within', 'cry', 'anything', 'their', 'together', 'onto', 'everyone', 'hence', 'nobody', 'whose', 'but', 'even', 'can', 'between', 'elsewhere', 'first', 'towards', 'give', 'whereafter', 'three', 'call', 'wherever', 'seemed', 'anyone', 'a', 'move', 'each', 'seems', 'sixty', 'something', 'some', 'well', 'because', 'anyway', 'neither', 'still', 'yours', 'becomes', 'less', 'in', 'whereby', 

In [32]:
# Cutsom tokenization
vect = CountVectorizer(tokenizer=lambda x: x.split(" "))

In [33]:
dtm = pd.DataFrame(vect.fit_transform(text).toarray())
dtm.columns = vect.get_feature_names()
dtm

Unnamed: 0,&,applicant,be,c,c#,experience,expert,go,in,need,node.js,proficient,should
0,0,1,1,0,0,0,0,1,1,0,0,1,1
1,0,0,0,0,0,1,0,0,1,1,1,0,0
2,1,0,0,1,1,0,1,0,1,0,0,0,0


In [34]:
import re


In [35]:
re.sub("C#","CSharp","proficiency in c#",flags=re.IGNORECASE)

'proficiency in CSharp'

#### Q: Create the DTM again after replacing C# with CSharp and Go with GoLang

In [36]:
text[0]=re.sub("Go","GoLang",text[0],flags=re.IGNORECASE)
text[2]=re.sub("C#","CSharp",text[2],flags=re.IGNORECASE)

In [37]:
dtm = pd.DataFrame(vect.fit_transform(text).toarray())
dtm.columns = vect.get_feature_names()

In [38]:
dtm

Unnamed: 0,&,applicant,be,c,csharp,experience,expert,golang,in,need,node.js,proficient,should
0,0,1,1,0,0,0,0,1,1,0,0,1,1
1,0,0,0,0,0,1,0,0,1,1,1,0,0
2,1,0,0,1,1,0,1,0,1,0,0,0,0


#### Reducing inflectional forms via Stemming & Lemmatization

In [39]:
text = ["loved the movie","love the acting and the cast","i am loving it","lovely movie"]

In [40]:
dtm = pd.DataFrame(vect.fit_transform(text).toarray())
dtm.columns = vect.get_feature_names()

In [41]:
dtm

Unnamed: 0,acting,am,and,cast,i,it,love,loved,lovely,loving,movie,the
0,0,0,0,0,0,0,0,1,0,0,1,1
1,1,0,1,1,0,0,1,0,0,0,0,2
2,0,1,0,0,1,1,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,1,0,1,0


In [1]:
import nltk
# nltk.download()

In [11]:
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import WordNetLemmatizer

In [12]:
lp = LancasterStemmer()
sb = WordNetLemmatizer()

In [18]:
sb.lemmatize("residential address")

'residential address'

In [43]:
from nltk import word_tokenize          
from nltk.stem import PorterStemmer 
from nltk.stem import WordNetLemmatizer

In [44]:
stem = PorterStemmer()
stem.stem("loving")

'love'

In [45]:
stem.stem("cognizant")

'cogniz'

In [46]:
text_stem = []
for review in text:
    text_stem.append(" ".join([stem.stem(x) for x in word_tokenize(review)]))

In [47]:
text_stem

['love the movi', 'love the act and the cast', 'i am love it', 'love movi']

#### Q: Lemmatization

In [48]:
# Part of speech may change the meaning of words

from nltk import pos_tag
print(pos_tag(word_tokenize("he works in banking")))
print(pos_tag(word_tokenize("i am banking on you to do this")))

[('he', 'PRP'), ('works', 'VBZ'), ('in', 'IN'), ('banking', 'NN')]
[('i', 'NN'), ('am', 'VBP'), ('banking', 'VBG'), ('on', 'IN'), ('you', 'PRP'), ('to', 'TO'), ('do', 'VB'), ('this', 'DT')]


In [49]:
lemma = WordNetLemmatizer()


In [50]:
lemma.lemmatize("banking","v")

'bank'

In [51]:
from nltk.corpus import wordnet
lemma.lemmatize("better",wordnet.ADJ)

'good'

In [52]:

def get_wordnet_pos_tag(tag):
    if tag.startswith("J"):
        return wordnet.ADJ
    if tag.startswith("V"):
        return wordnet.VERB
    if tag.startswith("N"):
        return wordnet.NOUN
    if tag.startswith("R"):
        return wordnet.ADVERB
    else:
        return wordnet.NOUN

In [53]:
lemma.lemmatize("loved",get_wordnet_pos_tag("VBD"))

'love'

In [54]:
[lemma.lemmatize(x[0],get_wordnet_pos_tag(x[1])) for x in pos_tag(word_tokenize(text[0]))]

['love', 'the', 'movie']

In [55]:
" ".join([lemma.lemmatize(x[0],get_wordnet_pos_tag(x[1])) for x in pos_tag(word_tokenize(text[0]))])

'love the movie'

#### Q: Using lemmatization normalize all the 3 movie reviews

### Building a classification model

In [56]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test, X_train_orig, X_test_orig = train_test_split(X_features, X["sentiment"], X["text"],
                                                                             random_state=2)


In [57]:
# import and instantiate a Multinomial Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [58]:
# train the model using X_train_dtm
nb.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [59]:
# make class predictions 
y_pred_class = nb.predict(X_test)

In [60]:
y_pred_class

array(['good', 'good', 'good', ..., 'good', 'good', 'bad'], dtype='<U7')

In [61]:
pred_df = pd.DataFrame({"actual":y_test, "pred":y_pred_class})

In [62]:
# calculate accuracy of class predictions
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

0.7364

In [63]:
pd.crosstab(pred_df["actual"],pred_df["pred"])

pred,bad,good,neutral
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bad,235,139,56
good,112,1489,120
neutral,59,173,117


#### Q: what does the argument alpha control? How does the model performance change as we change the value of alpha?


### Get probabilities

In [64]:
# make class predictions for X_test_dtm
y_pred_proba = nb.predict_proba(X_test)

In [65]:
nb.classes_

array(['bad', 'good', 'neutral'], dtype='<U7')

In [66]:
y_pred_proba

array([[2.09759947e-02, 9.12384877e-01, 6.66391284e-02],
       [6.75070771e-05, 9.94569752e-01, 5.36274103e-03],
       [1.66133333e-01, 6.85600000e-01, 1.48266667e-01],
       ...,
       [3.46361230e-02, 9.02208028e-01, 6.31558486e-02],
       [4.90033506e-02, 5.13875805e-01, 4.37120844e-01],
       [9.99972344e-01, 6.89062534e-06, 2.07657840e-05]])

In [67]:
y_pred_proba[:,0]

array([2.09759947e-02, 6.75070771e-05, 1.66133333e-01, ...,
       3.46361230e-02, 4.90033506e-02, 9.99972344e-01])

In [68]:
### Find top 5 reviews with the highest probability of being "bad" reviews

In [69]:
result = pd.DataFrame({"text":X_test_orig, "probability":y_pred_proba[:,0]})
result.head()

Unnamed: 0,text,probability
7878,Great prices and fresh food ...,0.020976
3224,Delicious kosher food in Scottsdale? Vegetari...,6.8e-05
1919,D-scust-ing.,0.166133
4432,"All around great. Great food, great atmospher...",0.000135
4835,"Well it was off to Brio last night for ""date"" ...",1.0


In [70]:
result = result.sort_values(by="probability", ascending=False)

In [71]:
print(result.text.iloc[0]) 
#Printing 1st positionand not the 0th index

A couple Saturdays ago, went with my mom to get my name added to her accounts should the unthinkable happen. We first went to Chase which took all of ten minutes. The personal banker was great - in-out-done.  Then we headed to her Credit Union at approximately 10:00 AM. They were quite busy, apparently. When we checked in, I heard grumbling from other customers and asked the wait time.  I was told there were two people ahead of us in line and it "shouldn't be long."

30 minutes later, my mom went to ask how much longer it would take. Jennfer, who was manning the reception desk, dismissed her rudely in front of a lobby of customers. My mom is as sweet as tea in Tennesee, so to hear her called out in front of a lobby of people as a disruption was upsetting.  45 minutes later, I went up to ask if it would be much longer. Jennifer said curtly, "We're busy." Apparently, on Saturdays, they're busy and one should avoid transacting business there on the weekend. I don't recall being spoken to 

In [72]:
print(result.text.iloc[5]) 
#Printing 1st positionand not the 0th index

This 2 stars is extremely generous. Wanna know why? Read on.

My party of 7 and a baby got to Pappadeux's around 6:30. We were told that it would be a 30-minute wait. It ended up being more than an hour. We were standing right in front of the hostess station, yet no one ever came to give us updates on how long it would be for our table. We would go check about every 20 or so minutes. One time we were told that our table was ready. When we went to check on it after about 20 minutes we were told that they were waiting on a busboy to clean it. A few minutes later they were still waiting on a busboy. In what universe does it take a restaurant 30 minutes to find a busboy to clean a table? We watched group after group after group get seated. A group of 7 that came after us was seated before us. We ended up being sat next to them; by the time we were getting our menus, they were finishing up their meals.

Once we were finally seated, we waited almost 10 minutes before someone came to our tabl

### Using SVD

In [73]:
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline

In [74]:

vectorizer = CountVectorizer(ngram_range=(1,2),stop_words='english', max_features=2000)

# Build the tfidf vectorizer from the training data ("fit"), and apply it 
# ("transform").
X_features = vectorizer.fit_transform(X["text"])


In [75]:
# Apply SVD
svd_mod = TruncatedSVD(1000)

# Run SVD on the training data, then project the training data.
X_features = svd_mod.fit_transform(X_features)

In [76]:
X_features.shape

(10000, 1000)

In [77]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test, X_train_orig, X_test_orig = train_test_split(X_features, X["sentiment"], X["text"],
                                                                             random_state=2)


In [78]:
X_train.shape

(7500, 1000)

In [79]:
from sklearn.ensemble import RandomForestClassifier

In [80]:
rf = RandomForestClassifier(n_estimators=500, criterion='entropy')

In [81]:
rf.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [82]:
# make class predictions for X_test_dtm
y_pred_class = rf.predict(X_test)

In [83]:
# calculate accuracy of class predictions
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

0.6884

### Using word2vec model

In [89]:
import os
os.getcwd()

'E:\\AIML\\GLAIML\\Resource Materials\\Statistical NLP\\Day1_snlp'

In [91]:
from gensim.models.keyedvectors import KeyedVectors
glove_model = KeyedVectors.load_word2vec_format("gensim_glove.6B.50d.txt", binary=False)

In [94]:
a = glove_model.word_vec("cat")
b = glove_model.word_vec("bat")

In [95]:
from scipy.stats.stats import pearsonr
pearsonr(a,b)

(0.50941926, 0.00015790963965300008)

In [96]:
import re
X["text_clean"] = [re.sub("[^a-zA-Z ]","",x).lower() for x in X["text"]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [97]:
words = X["text_clean"].iloc[0].split(" ")
words

['my',
 'wife',
 'took',
 'me',
 'here',
 'on',
 'my',
 'birthday',
 'for',
 'breakfast',
 'and',
 'it',
 'was',
 'excellent',
 '',
 'the',
 'weather',
 'was',
 'perfect',
 'which',
 'made',
 'sitting',
 'outside',
 'overlooking',
 'their',
 'grounds',
 'an',
 'absolute',
 'pleasure',
 '',
 'our',
 'waitress',
 'was',
 'excellent',
 'and',
 'our',
 'food',
 'arrived',
 'quickly',
 'on',
 'the',
 'semibusy',
 'saturday',
 'morning',
 '',
 'it',
 'looked',
 'like',
 'the',
 'place',
 'fills',
 'up',
 'pretty',
 'quickly',
 'so',
 'the',
 'earlier',
 'you',
 'get',
 'here',
 'the',
 'betterdo',
 'yourself',
 'a',
 'favor',
 'and',
 'get',
 'their',
 'bloody',
 'mary',
 '',
 'it',
 'was',
 'phenomenal',
 'and',
 'simply',
 'the',
 'best',
 'ive',
 'ever',
 'had',
 '',
 'im',
 'pretty',
 'sure',
 'they',
 'only',
 'use',
 'ingredients',
 'from',
 'their',
 'garden',
 'and',
 'blend',
 'them',
 'fresh',
 'when',
 'you',
 'order',
 'it',
 '',
 'it',
 'was',
 'amazingwhile',
 'everything',
 'o

In [98]:
chk = [glove_model.word_vec(x) for x in words if x in glove_model.vocab]
np.array(chk).shape

(136, 50)

In [99]:
np.array(chk).mean(axis=0).shape

(50,)

In [100]:
from tqdm import tqdm
review_vec = np.zeros((X.shape[0],50))
for i in tqdm(range(0,X.shape[0])):
    words = X["text_clean"].iloc[i].split(" ")
    words = [x.strip() for x in words]
    ind_word_vecs = [glove_model.word_vec(x) for x in words if x in glove_model.vocab]
    review_vec[i] = np.array(ind_word_vecs).mean(axis=0)

  import sys
  ret = ret.dtype.type(ret / rcount)
100%|██████████████████████████████████████████████████████████████████████████| 10000/10000 [00:02<00:00, 4461.80it/s]


In [101]:
review_vec.shape

(10000, 50)

In [102]:
review_vec = np.nan_to_num(review_vec)

In [103]:
X_features = review_vec