In [1]:
import numpy as np
import pandas as pd
from scipy import sparse

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, SpatialDropout1D, GRU
from keras.layers import Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate
from keras.models import Model, Sequential
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping, ModelCheckpoint
%matplotlib inline

Using TensorFlow backend.


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
read_path = "/content/drive/My Drive/cs182final/yelp_review_training_dataset.jsonl"
data = pd.read_json(read_path, lines = True)

In [0]:
data = data.drop(["review_id"], axis =1)

In [5]:
data.head()

Unnamed: 0,text,stars
0,Total bill for this horrible service? Over $8G...,1
1,I *adore* Travis at the Hard Rock's new Kelly ...,5
2,I have to say that this office really has it t...,5
3,Went in for a lunch. Steak sandwich was delici...,5
4,Today was my second out of three sessions I ha...,1


In [6]:
data["text"] = data['text'].str.replace('[^\w\s]','').str.lower()
data["text"] = data['text'].str.replace('\d+', '')
data.head()

Unnamed: 0,text,stars
0,total bill for this horrible service over gs t...,1
1,i adore travis at the hard rocks new kelly car...,5
2,i have to say that this office really has it t...,5
3,went in for a lunch steak sandwich was delicio...,5
4,today was my second out of three sessions i ha...,1


In [7]:
data.shape

(533581, 2)

In [0]:
shuffled = data.sample(frac=1)
shuffled_sample = data.iloc[:10000]
train, validation = train_test_split(shuffled_sample, test_size=0.2)

In [9]:
train.shape, validation.shape

((8000, 2), (2000, 2))

In [10]:
train.head()

Unnamed: 0,text,stars
9973,great product lots of fun flavors but it is v...,3
3869,first of all i would like to say the guys who ...,2
1681,although some may be swayed by the interesting...,1
5926,this place did my nails perfectly they took th...,5
1284,obsessed thats pretty much all i can say i abs...,5


In [11]:
train = pd.get_dummies(train, columns = ['stars'])
train.head()

Unnamed: 0,text,stars_1,stars_2,stars_3,stars_4,stars_5
9973,great product lots of fun flavors but it is v...,0,0,1,0,0
3869,first of all i would like to say the guys who ...,0,1,0,0,0
1681,although some may be swayed by the interesting...,1,0,0,0,0
5926,this place did my nails perfectly they took th...,0,0,0,0,1
1284,obsessed thats pretty much all i can say i abs...,0,0,0,0,1


In [12]:
validation = pd.get_dummies(validation, columns = ['stars'])
train.shape, validation.shape

((8000, 6), (2000, 6))

In [13]:
train.head()

Unnamed: 0,text,stars_1,stars_2,stars_3,stars_4,stars_5
9973,great product lots of fun flavors but it is v...,0,0,1,0,0
3869,first of all i would like to say the guys who ...,0,1,0,0,0
1681,although some may be swayed by the interesting...,1,0,0,0,0
5926,this place did my nails perfectly they took th...,0,0,0,0,1
1284,obsessed thats pretty much all i can say i abs...,0,0,0,0,1


In [14]:
class_names = ['stars_1', 'stars_2', 'stars_3', 'stars_4', 'stars_5']
# Splitting off y variable
y = train[class_names].values
y

array([[0, 0, 1, 0, 0],
       [0, 1, 0, 0, 0],
       [1, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 1],
       [0, 0, 1, 0, 0],
       [0, 0, 0, 0, 1]], dtype=uint8)

In [0]:
# max number of unique words 
max_features = 20000
# max number of words from review to use
maxlen = 200
# batch size
batchsize = 32

In [0]:
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train['text'].values))

In [17]:
X_train = tokenizer.texts_to_sequences(train['text'].values)
X_validation = tokenizer.texts_to_sequences(validation['text'].values)
x_train = pad_sequences(X_train, maxlen = maxlen)
x_validation = pad_sequences(X_validation, maxlen = maxlen)
x_train.shape

(8000, 200)

In [0]:
model = Sequential()
model.add(Dense(5, activation='softmax'))
model.compile(optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy'])

In [21]:
model.fit(x_train, y, batch_size = batchsize, epochs = 20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.callbacks.History at 0x7fdd28ea7f28>

In [22]:
y_validation = model.predict([x_validation], batch_size=batchsize, verbose = 1)



In [23]:
model.evaluate(x_validation, validation[class_names].values, verbose = 1, batch_size=batchsize)



[118050.03225, 0.3434999883174896]