In [1]:
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, SpatialDropout1D, GRU
from keras.layers import Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate
from keras.models import Model
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn import metrics
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from scipy import sparse

Using TensorFlow backend.


ModuleNotFoundError: No module named 'tensorflow'

In [None]:
%%time
review_all = pd.read_csv("C:/AIT 590/Sree-Final Project/yelp_review.csv/yelp_review.csv")

In [None]:
%%time
rev_samp = review_all.sample(n = 1000000, random_state = 4)

In [None]:
rev_samp = rev_samp[['text','stars']].copy()

In [None]:
rev_samp['text'] = rev_samp['text'].str.lower()

In [None]:
%%time
rev_samp['token_text'] = rev_samp.text.apply(lambda x: word_tokenize(x))

In [None]:
stop = stopwords.words('english')

In [None]:
def clean(text):
    cleaned = [w for w in text if w not in stop]
    cleaned = [w for w in cleaned if w not in string.punctuation]
    return ' '.join(cleaned)

In [None]:
%%time
rev_samp['clean_text'] = rev_samp['token_text'].apply(clean)
rev_samp['text'] = rev_samp['clean_text']
rev_samp.drop(['clean_text','token_text'],1,inplace=True)

In [None]:
X = rev_samp.text
Y = rev_samp.stars

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
xtr,xts,ytr,yts = train_test_split(X,Y, test_size = 0.15)

In [None]:
train = pd.DataFrame({'text':xtr,'stars':ytr})
test = pd.DataFrame({'text':xts,'stars':yts})
train_samp = pd.get_dummies(train, columns = ['stars'])
test_samp = pd.get_dummies(test, columns = ['stars'])

In [None]:
%%time
#embedding_size = 200 
no_of_features = 8000
input_shape = 400

file = 'glove.twitter.27B.200d.txt'

def get_coefs(word,*arr): 
    return word, np.asarray(arr, dtype='float32')
glove_embeddings = dict(get_coefs(*o.strip().split()) for o in open(file))

In [None]:
y = train_samp[class_names].values
train_features = train_samp['text'].values
test_features = test_samp['text'].values
tok = Tokenizer(num_words=no_of_features)
tok.fit_on_texts(list(train_features))
train_seq = tok.texts_to_sequences(train_features)
test_seq = tok.texts_to_sequences(test_features)
x_train = pad_sequences(train_seq, maxlen = input_shape)
x_test = pad_sequences(test_seq, maxlen = input_shape)

In [None]:
word_index = tok.word_index
no_of_words = min(no_of_features, len(word_index))
embedding_matrix = np.zeros((no_of_words, embedding_size))

In [None]:
words_missed = []
for word, i in word_index.items():
    if i >= no_of_features: 
        break
    emb_vec = glove_embeddings.get(word)
    if emb_vec is not None:
        embedding_matrix[i] = emb_vec
    else:
        words_missed.append(word)

In [None]:
#def create_model():
    input_layer = Input(shape = (input_shape,))
    x1 = Embedding(no_of_features, embedding_size, weights = [embedding_matrix], trainable = True)(input_layer)
    x2 = SpatialDropout1D(0.5)(x1)
    x3 = Bidirectional(LSTM(70, return_sequences=True))(x2)
    x4 = Bidirectional(LSTM(50, return_sequences=True))(x3)
    x5 = Bidirectional(GRU(60, return_sequences=True))(x4)
    max_pooling = GlobalMaxPooling1D()(x5)
    avg_pooling = GlobalAveragePooling1D()(x5)
    merged_layer = concatenate([avg_pooling, max_pooling])
    output_layer = Dense(5, activation = 'sigmoid')(merged_layer)
    model = Model(inputs = input_layer, outputs = output_layer)
    return model

In [None]:
earlystop = EarlyStopping(monitor = 'val_loss', min_delta = 0, patience = 5)  
# patience --> how many epochs to wait to see if val_loss will improve again.
checkpoint = ModelCheckpoint(monitor = 'val_loss', save_best_only = True, filepath = 'yelp_bi_lstm_gru_weights.hdf5')

model = create_model()
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [20]:
model.fit(x_train, y, batch_size = 512, epochs = 3, validation_split = .3,
          callbacks=[earlystop, checkpoint])

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 595000 samples, validate on 255000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1b52cb2be0>

In [21]:
y_test = model.predict([x_test], batch_size=1024, verbose = 1)



In [22]:
model.evaluate(x_test, test_samp[class_names].values, verbose = 1, batch_size=1024)



[0.2523356653499603, 0.8838220902061462]

In [23]:
v = metrics.classification_report(np.argmax(test_samp[class_names].values, axis = 1),np.argmax(y_test, axis = 1))
print(v)

             precision    recall  f1-score   support

          0       0.79      0.82      0.80     20896
          1       0.52      0.39      0.44     12615
          2       0.54      0.46      0.50     17676
          3       0.56      0.54      0.55     34536
          4       0.79      0.87      0.82     64277

avg / total       0.68      0.70      0.69    150000



In [24]:
metrics.accuracy_score(np.argmax(test_samp[class_names].values, axis = 1),np.argmax(y_test, axis = 1))

0.6971333333333334