In [1]:
import pandas as pd
import string
import numpy as np
import sklearn.metrics as metrics
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from nltk.tokenize import word_tokenize
import nltk
import gensim
from gensim.models import Word2Vec
from sklearn.ensemble import AdaBoostClassifier

In [2]:
# Load the complete review table from a CSV file in the user's home directory.
# NOTE(review): the location is hardcoded; anyone re-running this needs the file at that path.
df = pd.read_csv('~/yelp_review.csv')

In [3]:
# Work on a 30,000-row random subset of the reviews. The fixed seed makes
# every rerun draw the same rows, so downstream numbers are reproducible.
df1 = df.sample(n=30000, random_state=42)

In [4]:
# English stopwords held in a set: the cells below test membership once per
# token, which is a hash lookup here instead of a scan of the whole list.
stop = set(stopwords.words('english'))

In [5]:
# Number of whitespace-separated tokens in each review that are stopwords.
# NOTE(review): the text has not been lower-cased at this point, so the match
# is case-sensitive — confirm that capitalised stopwords should be excluded.
df1['stopwords'] = df1['text'].apply(
    lambda review: sum(1 for token in review.split() if token in stop)
)
# Number of whitespace-separated tokens in each review made up only of digits.
df1['numerics'] = df1['text'].apply(
    lambda review: sum(1 for token in review.split() if token.isdigit())
)

In [6]:
# Character count of every review, stored next to the other per-review stats.
df1['text length'] = df1['text'].map(len)
df1.head()

Unnamed: 0,review_id,user_id,business_id,stars,date,text,useful,funny,cool,stopwords,numerics,text length
5232093,DjO13mZAyE3i_IMpOb9BsQ,e5mmLYgJusshnfV1ThxY1A,LQZ2yBIAFmXYbDD0Z2Nhug,1,2017-10-09,Terrible staff and customer service. They act ...,0,0,0,15,0,175
1187587,dfLNPlgWb0BjftNd3fc0CQ,U44SEf8RPptWRJQiMJXd8g,lSsuhdM4-Awp3m6-PSrMmg,5,2016-01-06,Brought some friends here from out of town thi...,0,0,0,20,0,233
3859661,KiPZVaXdifldY4kt3eADSQ,ymwSwK2aUp0LKpEElvl5lg,e0CTLPxTnFEQSqQ1FJUqog,1,2015-03-24,"Whatever you do, do not rent a car from this c...",0,0,0,188,4,2141
5095277,PmA2FBTfjPBYzMsR4lzWtA,zhT5MLTV4lyekxSQbqbT-w,BCSofuQwu1VE8wjzDaQ3qQ,5,2017-10-20,What a great find! Terrific cocktails. Superb ...,0,0,0,24,0,368
4871188,LHHf1fC3mkBYcor3zrf2yQ,t5vmjQB0ZqhzvO9oRE2hDg,mW97sJ9JVjs_REfp6XjRhQ,3,2016-01-24,As a lot of the previous reviews states althou...,1,0,0,50,0,537


In [7]:
# Independent copy holding only the review text and its star rating, so the
# preprocessing below does not touch df1.
df2 = df1.loc[:, ['text', 'stars']].copy()
df2.head()

Unnamed: 0,text,stars
5232093,Terrible staff and customer service. They act ...,1
1187587,Brought some friends here from out of town thi...,5
3859661,"Whatever you do, do not rent a car from this c...",1
5095277,What a great find! Terrific cocktails. Superb ...,5
4871188,As a lot of the previous reviews states althou...,3


In [8]:
# Lower-case every review in place (overwrites the 'text' column of df2).
df2['text'] = df2['text'].str.lower()

In [9]:
%%time
df2['token_text'] = df2.text.apply(lambda x: word_tokenize(x))

CPU times: user 37.5 s, sys: 513 ms, total: 38 s
Wall time: 39.2 s


In [10]:
def clean(text, stop_words=None):
    """Drop stopwords and punctuation-only tokens, then re-join the rest.

    Parameters
    ----------
    text : iterable of str
        Tokens of a single document.
    stop_words : iterable of str, optional
        Tokens to discard. When omitted, the notebook-level ``stop``
        collection is used.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ('' if none survive).
    """
    # Build sets once per call so each token costs a hash lookup.
    blocked = set(stop if stop_words is None else stop_words)
    punct_chars = set(string.punctuation)
    # `w in string.punctuation` is a substring test on one long string, so
    # multi-character tokens such as "..." or "''" were never removed.
    # Checking every character drops any token made purely of punctuation
    # (and empty tokens, as before).
    kept = [
        w for w in text
        if w not in blocked and not all(ch in punct_chars for ch in w)
    ]
    return ' '.join(kept)

In [11]:
%%time
df2['clean_text'] = df2['token_text'].apply(clean)
df2['text'] = df2['clean_text']
df2.drop(['clean_text','token_text'],1,inplace=True)

CPU times: user 8.42 s, sys: 84.1 ms, total: 8.5 s
Wall time: 8.77 s


In [12]:
# Preview the cleaned text next to the star ratings.
df2.head()

Unnamed: 0,text,stars
5232093,terrible staff customer service act like favor...,1
1187587,brought friends town week tell best wings 've ...,5
3859661,whatever rent car company may look like gettin...,1
5095277,great find terrific cocktails superb music pla...,5
4871188,lot previous reviews states although chicken g...,3


In [13]:
%%time
# TF-IDF representation of the cleaned reviews; min_df=5 restricts the
# vocabulary to terms found in at least 5 documents.
# NOTE(review): the vectoriser is fitted on every sampled review before the
# train/test split below, so the held-out rows influence the vocabulary and
# IDF weights — confirm whether that is acceptable for the reported accuracy.
cv = TfidfVectorizer(min_df=5)
X = cv.fit_transform(df2.text)

CPU times: user 2.1 s, sys: 59.8 ms, total: 2.16 s
Wall time: 2.21 s


In [14]:
%%time
features = pd.DataFrame(X.toarray(),columns=cv.get_feature_names())

CPU times: user 833 ms, sys: 1.23 s, total: 2.07 s
Wall time: 2.94 s


In [15]:
%%time
xtr1,xts1,ytr1,yts1 = train_test_split(features,df2.stars)
lr = LogisticRegression()
lr.fit(xtr1,ytr1)

CPU times: user 5min 23s, sys: 8.14 s, total: 5min 31s
Wall time: 2min 56s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [16]:
# Star predictions of the TF-IDF logistic regression on the held-out rows.
preds_lr1 = lr.predict(xts1)

In [17]:
# Share of held-out reviews whose star rating was predicted exactly.
accuracy_score(yts1,preds_lr1)

0.6237333333333334

In [18]:
# Load pretrained word2vec vectors from a binary file in the working directory.
wv = gensim.models.KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin", binary=True)
# replace=True presumably overwrites the stored vectors with their normalised form.
# NOTE(review): init_sims appears to be deprecated in gensim 4.x — confirm
# against the installed version before upgrading.
wv.init_sims(replace=True)

In [19]:
# English stopword list used by the embedding loop in the next cell.
# NOTE(review): this rebinds `stopwords`, which the import cell bound to the
# nltk.corpus.stopwords module; after this line, re-running the earlier
# `stop = stopwords.words('english')` cell fails because a list has no `.words`.
stopwords = nltk.corpus.stopwords.words('english')

In [20]:
%%time
# 30000 sample size
docs_vectors = pd.DataFrame()

for doc in df2['text'].str.replace('\d+', ''):
    temp = pd.DataFrame()   
    for word in doc.split(' '): 
        if word not in stopwords: 
            try:
                word_vec = wv[word]  
                temp = temp.append(pd.Series(word_vec), ignore_index = True)  
            except:
                pass
    doc_vector = temp.mean()
    docs_vectors = docs_vectors.append(doc_vector, ignore_index = True) 
docs_vectors.shape

CPU times: user 38min 44s, sys: 3min 29s, total: 42min 13s
Wall time: 42min 41s


In [21]:
# Sanity check: one row per sampled review, one column per embedding dimension.
docs_vectors.shape

(30000, 300)

In [22]:
# Attach the ratings by position: docs_vectors rows follow df2's row order but
# do not share its index, so the values are passed as a plain list.
docs_vectors['stars'] = df2['stars'].tolist()

In [23]:
# Discard rows containing any missing value — presumably the documents for
# which no word vector was found; verify against the embedding cell.
docs_vectors = docs_vectors.dropna()

In [24]:
# Total number of missing cells left in the frame.
docs_vectors.isnull().to_numpy().sum()

0

In [25]:
# Separate the embedding columns (model inputs) from the rating (target).
# `columns=` replaces the positional axis argument, which pandas 2.0 no
# longer accepts.
vectors = docs_vectors.drop(columns=['stars'])
stars = docs_vectors['stars']

In [26]:
# Seeded split so both embedding-based models below are evaluated on the same
# held-out rows on every run.
xtr,xts,ytr,yts = train_test_split(vectors, stars, random_state=42)

In [27]:
%%time
lr = LogisticRegression()
lr.fit(xtr,ytr)

CPU times: user 7.42 s, sys: 300 ms, total: 7.72 s
Wall time: 4.44 s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [28]:
# Star predictions of the embedding-based logistic regression on the held-out rows.
preds_lr = lr.predict(xts)

In [29]:
# Share of held-out reviews whose star rating was predicted exactly.
accuracy_score(yts,preds_lr)

0.5828777170289372

In [30]:
%%time
ad = AdaBoostClassifier()
ad.fit(xtr,ytr)

CPU times: user 50.3 s, sys: 302 ms, total: 50.6 s
Wall time: 52.3 s


In [31]:
# Star predictions of the AdaBoost model on the held-out rows.
preds_ad = ad.predict(xts)

In [32]:
# Share of held-out reviews whose star rating AdaBoost predicted exactly.
accuracy_score(yts,preds_ad)

0.5351380184024537