In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

In [0]:
import pandas as pd
import numpy as np
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score,f1_score
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_rows', 15000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 20000)

In [0]:
classifier = PassiveAggressiveClassifier()
vectorizer = HashingVectorizer(analyzer='word',lowercase=False)
negative = ["awful","irritated","harassed", "harassing", "neglect" , "neglected" , "failed" , "appalled" , 
"unwelcome" , "horrible" , "avoid" , "irritation" , "irritate" , "overpriced" , "bland" , "wait" , "waiting" ,
"expensive" , "bad" , "slow" , "inexperienced" , "dirty" , "smugness" , "disappointed" , "waited" , 
"understaffed" , "poor" , "confusing" , "nothing" , "confused" , "don't" , "disappoints" , "couldn't" ,
"not","few","small"]

positive = [ "pleased" , "adore" , "fan" , "great" , "flawless" , "perfectly" , "incredible" , "relax" ,
"relaxing" , "beautiful" , "organized" , "friendly", "great" , "professional" , "amazing" , "comfortable" ,
"recommend" , "nice" , "delicious" , "perfect" , "accommodating" , "happy" , "well" , "enjoyment" , 
"relaxed" , "excited" , "loved" , "highly" , "fancy" , "fun" , "wow" , "pleasant" , "knowledgeable" ,
"comfortably" , "wonderful" , "beautifully" , "smooth" , "easy" , "best" , "helpful" , "special" , "tasty" ,
"huge" , "fresh" , "good" , "thrilled" , "loves" , "quick"  , "biggest" , "comfort" ]

neutral = ["super" , "reasonable"  , "alright", "decent" , "limited" ]

In [0]:
business = pd.read_json('/content/gdrive/My Drive/business.json',lines=True)
business = business[['business_id','name','categories','stars']]
business = business[business['categories'].str.contains("Restaurants", na=False)]
business = business[['business_id','name','stars']]

reviews = pd.DataFrame(columns=['business_id','text'])
for i in pd.read_json('/content/gdrive/My Drive/review.json',lines=True,chunksize=20000):
  i = i[['business_id','text']]
  reviews = reviews.append(pd.merge(business,i,on=['business_id'],how='inner'),ignore_index=True)
  break

In [0]:
def getClassification(text):
    text = text.lower()
    pcount = 0
    ncount = 0
    nucount = 0
    
    for t in positive:
      if(t.lower() in text):
        pcount = pcount + 1
    
    for t in negative:
      if(t.lower() in text):
        ncount = ncount + 1
    
    for t in neutral:
      if(t.lower() in text):
        nucount = nucount + 1
  
    max_count = max(pcount,ncount,nucount)
    
    if(pcount-ncount>0):
      return 'positive'
    elif(ncount-pcount>0):
      return 'negative'
      
    return 'neutral'

In [0]:
reviews['rate'] = reviews['text'].apply(getClassification)
reviews.loc[reviews['rate']=='positive','rate'] = 3
reviews.loc[reviews['rate']=='negative','rate'] = 1
reviews.loc[reviews['rate']=='neutral','rate'] = 2
reviews.sort_values('business_id')
y_data = np.asarray(reviews['rate'],dtype=np.float64)
reviews

In [0]:
features = vectorizer.fit_transform((reviews['text']))
X_train, X_test, y_train, y_test  = train_test_split(
        features,
        y_data,
        train_size=0.80,
        random_state=1,
        shuffle=False)
classifier.fit(X_train,y_train)
y_pred = classifier.predict(X_test)

In [93]:
print('Accuracy Score - ' + str(accuracy_score(y_test,y_pred)))
print('Precision Score - ' + str(precision_score(y_test, y_pred, average='macro')))
print('Recall Score  - ' + str(recall_score(y_test, y_pred, average='macro')))
print('Cross Validation score - ' + str(cross_val_score(classifier, features, np.asarray(reviews['rate'],dtype=np.float64), cv=4)))
print('F1 score - '+ str(f1_score(y_test, y_pred, average="macro")))

Accuracy Score - 0.8315830416180474
Precision Score - 0.6902202391291761
Recall Score  - 0.6634381499320864
Cross Validation score - [0.80678283 0.81549471 0.82072829 0.81911582]
F1 score - 0.6746368252485911


In [83]:
def getPredictions(input):
  return classifier.predict(vectorizer.fit_transform([input]))[0]
  
reviews['pred'] = reviews['text'].apply(getPredictions)
reviews

Unnamed: 0,business_id,name,stars,text,rate,pred
0,QXAEGFB4oINsVuTFxEYKFQ,Emerald Chinese Restaurant,2.5,My girlfriend and I went for dinner at Emerald...,3,3.0
1,QXAEGFB4oINsVuTFxEYKFQ,Emerald Chinese Restaurant,2.5,We've always been there on a Sunday so we were...,1,2.0
2,gnKjwL_1w79qoiV3IC_xQQ,Musashi Japanese Restaurant,4.0,"Husband was craving Chicken Teriyaki & gyoza, ...",3,3.0
3,gnKjwL_1w79qoiV3IC_xQQ,Musashi Japanese Restaurant,4.0,Went there Saturday noon they open at 12pm but...,2,2.0
4,fweCYi8FmbJXHCqLnwuk8w,Marco's Pizza,4.0,"Hands down, this is the best pizza place in Me...",3,3.0
5,PZ-LZzSlhSe9utkQYU8pFg,Carluccio's Tivoli Gardens,4.0,"We went there for dinner the other night, bein...",3,3.0
6,PZ-LZzSlhSe9utkQYU8pFg,Carluccio's Tivoli Gardens,4.0,i had the best Chicken Marcela ever. The spagh...,3,3.0
7,1RHY4K3BD22FK7Cfftn8Mg,Marathon Diner,4.0,"Marathon Diner may not look fancy, but I had t...",3,3.0
8,tstimHoMcYbkSC4eBA1wEg,Maria's Mexican Restaurant & Bakery,4.5,We found out about this gem from The Man's co-...,2,3.0
9,tstimHoMcYbkSC4eBA1wEg,Maria's Mexican Restaurant & Bakery,4.5,"The ""chips"" are a fried corn tortilla which we...",3,3.0


In [0]:
var = reviews.groupby(['business_id','name','stars'])['pred'].mean()
var = pd.DataFrame({'business_id':var.index,'pred':var.values})

var.loc[reviews['pred']>2.3,'predictedValue'] = 'Positive'
var.loc[reviews['pred']<1.7,'predictedValue'] = 'Negative'
var.loc[(reviews['pred']<2.3) & (reviews['pred']>1.7),'predictedValue'] = 'Neutral'
print(var)