<a href="https://colab.research.google.com/github/lupis30puc/yelp_bert_random_forest/blob/main/Yelp_RF_mimic_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

[Yelp Polarity on kaggle](https://www.kaggle.com/yelp-dataset/yelp-dataset)

12,993 samples from the Yelp Dataset Challenge 2020. 
Divided into train, validation and test subsets. 
Their corresponding sizes are: 10,394 train samples, 1,949 validation samples and 650 test samples.


Tutorial this notebook is based on: 
[Sentiment Analysis Yelp with Random Forest](https://www.kaggle.com/omkarsabnis/sentiment-analysis-on-the-yelp-reviews-dataset)

In [None]:
# Mount Google Drive at /content/drive (Colab-only helper); the pickled Yelp
# samples read later in this notebook live under /content/drive/MyDrive/Yelp.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import string
import math
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve
%matplotlib inline
import time

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Load the pre-split train / validation / test samples from Google Drive.
# NOTE: unpickling can execute arbitrary code -- only read files you trust.
train_df, validation_df, test_df = map(
    pd.read_pickle,
    (
        '/content/drive/MyDrive/Yelp/sample_train_10394.pkl',
        '/content/drive/MyDrive/Yelp/sample_validation_1949.pkl',
        '/content/drive/MyDrive/Yelp/sample_test_650.pkl',
    ),
)

In [None]:
# Load the full review sample (all splits together) from Google Drive.
# NOTE: unpickling can execute arbitrary code -- only read files you trust.
sample_df = pd.read_pickle('/content/drive/MyDrive/Yelp/sample_yelp_reviews_12993.pkl')

In [None]:
# Replace whatever index each split was pickled with by a fresh 0..n-1 index,
# discarding the old one, then preview the training split.
for _split_frame in (train_df, validation_df, test_df):
    _split_frame.reset_index(drop=True, inplace=True)
train_df.head()

Unnamed: 0,text,label
0,"Second best pool in Las Vegas! Pay the $20, c...",1
1,We went there on a Tuesday night in Jan. 2010....,0
2,My experience duplicates all other above/below...,0
3,Mmmmmmr. Sushi! \n\nThis was the place I lost ...,1
4,"It is a little off the strip, but they provid...",1


In [None]:
# Preview the first rows of the full sample (columns: text, label).
sample_df.head()

Unnamed: 0,text,label
0,This actually used to be one of my favorite ho...,0
1,Decent food. Fishermen lobster or even congee ...,0
2,"Pros: Fun atmosphere, great for people watchin...",0
3,"I love bookstores, and I love to spend some ti...",0
4,"I passed Five Guys, In-N-Out, Carl's Jr, and S...",0


## Cleaning

In [None]:
# Pull the review texts and their labels out of each split via `.values`.
train_x, train_y = train_df['text'].values, train_df['label'].values
val_x, val_y = validation_df['text'].values, validation_df['label'].values
test_x, test_y = test_df['text'].values, test_df['label'].values

In [None]:
# Same extraction for the full (unsplit) sample: texts and labels via `.values`.
sample_x, sample_y = sample_df['text'].values, sample_df['label'].values

In [None]:
# CLEANING THE REVIEWS - REMOVAL OF STOPWORDS AND PUNCTUATION
# Translation table that deletes every character in string.punctuation.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)
# Lazily filled cache so the NLTK stop-word list is read only once per kernel.
_STOPWORD_CACHE = {}


def text_process(text, stop_words=None):
    """Tokenise a review: strip punctuation, split on whitespace, drop stop words.

    Parameters
    ----------
    text : str
        Raw review text.
    stop_words : collection of str, optional
        Lower-case words to discard. When omitted, NLTK's English stop-word
        list is used; it is loaded once and cached as a frozenset.

    Returns
    -------
    list of str
        The remaining tokens in their original casing and order.
    """
    if stop_words is None:
        # The original re-read stopwords.words('english') and scanned the list
        # for every single token; build the set once and reuse it.
        if 'english' not in _STOPWORD_CACHE:
            _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
        stop_words = _STOPWORD_CACHE['english']
    nopunc = text.translate(_PUNCT_TABLE)
    # Membership is tested on the lower-cased word, but the original token is kept.
    return [word for word in nopunc.split() if word.lower() not in stop_words]

### Tr

In [None]:
# CLASSIFICATION
# This cell came from the referenced tutorial, which filtered a `data` frame on a
# `stars` column (1/3/5 stars). Neither `data` nor `stars` exists in this notebook,
# so the cell raised NameError. The reviews here live in `sample_df` with a
# `label` column, so the whole sample is used as-is.
data_classes = sample_df
print(data_classes.shape)

# Separate the dataset into X and Y for prediction
x = data_classes['text']
y = data_classes['label']
print(x.head())
print(y.head())

In [None]:
%%time
# CONVERTING THE WORDS INTO A VECTOR
vocab = CountVectorizer(analyzer=text_process).fit(train_x)
print(len(vocab.vocabulary_))
r0 = train_x[0]
print(r0)
vocab0 = vocab.transform([r0])
print(vocab0)


39269
Second best pool in Las Vegas!  Pay the $20, commit to being here all day, and drink the overpriced drinks while 80's and 90's hits are blast from the speakers.  They won't let you bring in your own alcohol (they are nice about it, but they do check), and the pools are fun and creative.  There's one for kids especially, and there's one for adults as well as a water slide.

We did the massage at the Spa in the Flamingo!  Pretty outstanding.
  (0, 428)	1
  (0, 1209)	1
  (0, 1316)	1
  (0, 5383)	1
  (0, 7541)	1
  (0, 9642)	1
  (0, 10038)	1
  (0, 11183)	1
  (0, 11671)	1
  (0, 12392)	1
  (0, 12971)	1
  (0, 14038)	1
  (0, 14231)	1
  (0, 15729)	1
  (0, 15949)	1
  (0, 16414)	1
  (0, 17364)	1
  (0, 18118)	1
  (0, 18903)	1
  (0, 19337)	1
  (0, 20508)	1
  (0, 20524)	1
  (0, 21284)	1
  (0, 22843)	1
  (0, 24237)	1
  (0, 25804)	1
  (0, 26286)	1
  (0, 27128)	1
  (0, 28332)	1
  (0, 28925)	2
  (0, 29224)	1
  (0, 29314)	1
  (0, 30501)	1
  (0, 30506)	1
  (0, 34206)	1
  (0, 34731)	1
  (0, 36425)	1
  

In [None]:
# The first training review (train_x[0]) was converted into a sparse count
# vector in the previous cell. Each column index of that vector maps back to a
# vocabulary token, so looking up the feature name recovers the word.
#
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases. The
# lookup is done once instead of rebuilding the name list for every print.
if hasattr(vocab, "get_feature_names_out"):
    vocab_tokens = vocab.get_feature_names_out()
else:
    vocab_tokens = vocab.get_feature_names()

print("Getting the words back:")
# 428 and 16414 are two of the column indices printed for that review above.
print(vocab_tokens[428])
print(vocab_tokens[16414])

Getting the words back:
20
bring


In [None]:
# Encode every training review with the fitted vocabulary.
x = vocab.transform(train_x)
n_rows, n_cols = x.shape

# Matrix dimensions and number of stored (non-zero) entries.
print("Shape of the sparse matrix: ", x.shape)
print("Non-Zero occurences: ",x.nnz)

# Percentage of cells in the matrix that are non-zero.
density = (x.nnz/(n_rows*n_cols))*100
print("Density of the matrix = ",density)

Shape of the sparse matrix:  (10394, 39269)
Non-Zero occurences:  520479
Density of the matrix =  0.1275177546788985


In [None]:
%%time
# CONVERTING THE WORDS INTO A VECTOR
vocab2 = CountVectorizer(analyzer=text_process).fit(test_x)
x2 = vocab.transform(test_x)
#Shape of the matrix:
print("Shape of the sparse matrix: ", x.shape)
#Non-zero occurences:
print("Non-Zero occurences: ",x.nnz)

# DENSITY OF THE MATRIX
density = (x.nnz/(x.shape[0]*x.shape[1]))*100
print("Density of the matrix = ",density)

Shape of the sparse matrix:  (10394, 39269)
Non-Zero occurences:  520479
Density of the matrix =  0.1275177546788985


## Training

In [None]:
%%time
# CONVERTING THE WORDS INTO A VECTOR
vocab = CountVectorizer(analyzer=text_process).fit(sample_x)
x = vocab.transform(sample_x)
#Shape of the matrix:
print("Shape of the sparse matrix: ", x.shape)
#Non-zero occurences:
print("Non-Zero occurences: ",x.nnz)

# DENSITY OF THE MATRIX
density = (x.nnz/(x.shape[0]*x.shape[1]))*100
print("Density of the matrix = ",density)

Shape of the sparse matrix:  (12993, 44314)
Non-Zero occurences:  650798
Density of the matrix =  0.11303054400013844
CPU times: user 5min 27s, sys: 40.9 s, total: 6min 8s
Wall time: 6min 8s


In [None]:
# SPLITTING THE DATASET INTO TRAINING SET AND TESTING SET
# SAME RANDOM STATE AS IN BERT....
# 80% of the vectorised sample goes to training, 20% is held out for testing.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    sample_y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Number of labels in the training part of the split.
len(y_train)

10394

In [None]:
%%time
# Random Forest
from sklearn.ensemble import RandomForestClassifier
rmfr = RandomForestClassifier()
rmfr.fit(x_train,y_train)

CPU times: user 26.6 s, sys: 45.2 ms, total: 26.6 s
Wall time: 26.6 s


In [None]:
%%time
predrmfr = rmfr.predict(x_test)
print("Confusion Matrix for Random Forest Classifier:")
print(confusion_matrix(y_test,predrmfr))
print("Score:",round(accuracy_score(y_test,predrmfr)*100,2))
print("Classification Report:")
print(classification_report(y_test,predrmfr))

Confusion Matrix for Random Forest Classifier:
[[1165  137]
 [ 210 1087]]
Score: 86.65
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.89      0.87      1302
           1       0.89      0.84      0.86      1297

    accuracy                           0.87      2599
   macro avg       0.87      0.87      0.87      2599
weighted avg       0.87      0.87      0.87      2599

CPU times: user 218 ms, sys: 3.79 ms, total: 222 ms
Wall time: 225 ms


In [None]:
# Per-feature importance scores exposed by the fitted random forest.
importance = rmfr.feature_importances_

In [None]:
# Number of importance scores (one per feature the forest was trained on).
len(importance)

44314

In [None]:
# Vocabulary tokens in column order, to pair with the importance scores.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available (it returns an array rather than
# a list, which pandas accepts equally) and fall back on older releases.
feature_names = (
    vocab.get_feature_names_out()
    if hasattr(vocab, "get_feature_names_out")
    else vocab.get_feature_names()
)

In [None]:
# Pair every vocabulary token with its importance score, one row per token.
feature_importance = pd.DataFrame(
    {
        'keys': feature_names,
        'imp': importance,
    }
)

In [None]:
# Preview the first rows of the token / importance table.
feature_importance.head()

Unnamed: 0,keys,imp
0,0,0.0
1,0,0.0
2,0,0.0
3,7,0.0
4,1,0.0


In [None]:
# Rank tokens from most to least important (returns a new frame; the
# original `feature_importance` is left unsorted).
feature_importance.sort_values(by='imp', ascending=False)

Unnamed: 0,keys,imp
44313,～＾＾～The,0.015123
44312,，buy,0.007866
44311,附近来讲算ok大间的tim,0.007128
44310,豆沙鍋餅,0.006616
44309,蒙古羊肉面,0.006184
...,...,...
13162,Smash,0.000000
13163,Smashburger,0.000000
13164,Smashed,0.000000
13165,Smear,0.000000


In [None]:
# Bare string expression: it evaluates to this Drive path but is not assigned
# to a name or passed to any call in this cell.
# NOTE(review): looks like a leftover from an unfinished save/load step -- confirm.
'/content/drive/MyDrive/Yelp/sample_yelp_tensors'