In [1]:
import tensorflow as tf

In [3]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [49]:
from tensorflow.python.keras.models import Sequential, Model
from tensorflow.python.keras.layers import Embedding, Reshape, Activation, Input, Flatten, Dropout, Dense
from tensorflow.python.keras.layers.merge import Dot
from tensorflow.python.keras.utils import np_utils
from tensorflow.python.keras.utils.data_utils import get_file
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import skipgrams, pad_sequences
from tensorflow.python.keras.callbacks import EarlyStopping, ModelCheckpoint

In [5]:
import numpy as np
import os
import pandas as pd
import gensim
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.manifold import TSNE

In [12]:
import re

In [6]:
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)

reading dataframe

In [7]:
df = pd.read_csv('yelp.csv')
df.head()

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,5,0
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0
2,6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate. Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,0,1,0
3,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,1,2,0
4,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,0,0,0


In [8]:
df = df.dropna()
df=df[['text','stars']]
df.head()

Unnamed: 0,text,stars
0,My wife took me here on my birthday for breakf...,5
1,I have no idea why some people give bad review...,5
2,love the gyro plate. Rice is so good and I als...,4
3,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",5
4,General Manager Scott Petello is a good egg!!!...,5


In [9]:
labels = df['stars'].map(lambda x: 1 if int(x) > 3 else 0)

In [11]:
labels.value_counts()

1    6863
0    3137
Name: stars, dtype: int64

### cleaning the text:

In [13]:
def clean_text(text):
    
    ## Remove puncuation
    text = text.translate(string.punctuation)
    
    ## Convert words to lower case and split them
    text = text.lower().split()
    
    ## Remove stop words
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops and len(w) >= 3]
    
    text = " ".join(text)

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    text = text.split()
    stemmer = SnowballStemmer('english') # retrieve the root of the word
    stemmed_words = [stemmer.stem(word) for word in text]
    text = " ".join(stemmed_words)

    return text

In [14]:
df['text']

0       My wife took me here on my birthday for breakf...
1       I have no idea why some people give bad review...
2       love the gyro plate. Rice is so good and I als...
3       Rosie, Dakota, and I LOVE Chaparral Dog Park!!...
4       General Manager Scott Petello is a good egg!!!...
                              ...                        
9995    First visit...Had lunch here today - used my G...
9996    Should be called house of deliciousness!\n\nI ...
9997    I recently visited Olive and Ivy for business ...
9998    My nephew just moved to Scottsdale recently so...
9999    4-5 locations.. all 4.5 star average.. I think...
Name: text, Length: 10000, dtype: object

In [16]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vorme\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [17]:
df['text'] = df['text'].map(lambda x: clean_text(x))

In [18]:
df['text']

0       wife took birthday breakfast excel weather per...
1       idea peopl give bad review place goe show you ...
2         love gyro plate rice good also dig candi select
3       rosi dakota love chaparr dog park ! ! ! conven...
4       general manag scott petello good egg ! ! ! det...
                              ...                        
9995    first visit had lunch today use groupon + + we...
9996    call hous delici ! + + i could item item blah ...
9997    recent visit oliv ivi busi last week visit con...
9998    nephew move scottsdal recent bunch friend brou...
9999    4 - 5 locat 4 5 star averag think arizona real...
Name: text, Length: 10000, dtype: object

In [19]:
maxlen = 50
embed_dim = 100
max_words = 20000

convert text in tokens

In [22]:
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df['text'])

In [23]:
sequences = tokenizer.texts_to_sequences(df['text'])

In [24]:
data = pad_sequences(sequences, maxlen=maxlen, padding='post')
data[0] # 50 values in a sequence

array([  35,    1,   20,  116,  398,    1,   24,   39,  100,   92,  476,
       1081, 1442,   82,   11,   13,  104,  908,  120,   49,   29,  192,
        379, 1543, 1722,  297,  727, 2401,  159,   75,   85,  411, 4791,
        156,  104,  408,   80,   90,  436,   35,  545,    1,   20,  116,
        398,  458,  117,   40,   42,   19])

In [25]:
labels = np.asarray(labels)

In [26]:
labels.shape

(10000,)

In [27]:
vocab_size = len(tokenizer.word_index) + 1
vocab_size

20182

In [28]:
validation_split = 0.2
indices = np.arange(data.shape[0])
np.random.shuffle(indices)

In [29]:
data = data[indices]
labels = labels[indices]

In [30]:
val_sample = int(validation_split * data.shape[0])

In [31]:
val_sample

2000

In [33]:
X_train = data[:-val_sample]
y_train = labels[:-val_sample]
x_val = data[-val_sample:]
y_val = labels[-val_sample:]

Load Glove Embeddings

We will use a transfer learning approach

In [35]:
embed_index = dict()
f = open(os.path.join('glove.6B.100d.txt'), encoding="utf8")

In [36]:
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embed_index[word] = coefs
f.close()

In [37]:
print('%s Word vectors' % len(embed_index))

400000 Word vectors


## Create a weight matrix:

In [38]:
embed_matrix = np.zeros((max_words, embed_dim))

# aqui eu pego todo o meu corpus encontrado no texto e construo uma matriz apenas com ela.
for word, i in tokenizer.word_index.items():
    if i < max_words:
        embed_vector = embed_index.get(word)
        if embed_vector is not None:
            embed_matrix[i] = embed_vector

In [60]:
embed_matrix.shape

(20000, 100)

In [58]:
embed_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.046539  ,  0.61966002,  0.56647003, ..., -0.37616   ,
        -0.032502  ,  0.80620003],
       [-0.24506   ,  0.04481   ,  0.32789999, ...,  0.16635001,
         0.40259001,  0.3766    ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.45337999, -0.53738999,  0.016752  , ..., -0.20511   ,
         0.20451   ,  0.32938001],
       [ 0.24127001, -0.38940999, -0.43963   , ..., -0.29967999,
        -0.23554   ,  0.051041  ]])

## Creating the model:

In [83]:
model = Sequential()

In [84]:
model.add(Embedding(max_words,
                    embed_dim,
                    weights=[embed_matrix],
                    input_length=maxlen))

In [85]:
model.add(Flatten())

In [86]:
model.add(Dropout(0.5))

In [87]:
model.add(Dense(32, activation='relu'))

In [88]:
model.add(Dropout(0.5))

In [89]:
model.add(Dense(1, activation='sigmoid'))

In [90]:
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 50, 100)           2000000   
_________________________________________________________________
flatten_3 (Flatten)          (None, 5000)              0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 5000)              0         
_________________________________________________________________
dense_4 (Dense)              (None, 32)                160032    
_________________________________________________________________
dropout_5 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 33        
Total params: 2,160,065
Trainable params: 2,160,065
Non-trainable params: 0
____________________________________________

In [92]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [93]:
early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min')

In [94]:
save_best = ModelCheckpoint('./yelp_comments.h5', save_best_only=True, 
                               monitor='val_loss', mode='min')

In [95]:
%%time
model.fit(X_train, y_train,
          epochs=20,
          validation_data=(x_val, y_val),
          batch_size=128,
          verbose=1,
          callbacks=[early_stopping, save_best])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Wall time: 19.5 s


<tensorflow.python.keras.callbacks.History at 0x22da73ed668>

## Making predictions:

In [96]:
model.load_weights('yelp_comments.h5')

In [97]:
pred = model.predict(x_val)

In [100]:
pred

array([[0.91064286],
       [0.7417292 ],
       [0.8993537 ],
       ...,
       [0.958954  ],
       [0.999754  ],
       [0.36097008]], dtype=float32)

## Word embeddings visualization:

In [98]:
glove_embds = model.layers[0].get_weights()[0]

In [99]:
words = []
for word, i in tokenizer.word_index.items():
    words.append(word)

## Visualizing words:

In [101]:
def plot_words(data, start, stop, step):
    trace = go.Scatter(
        x = data[start:stop:step,0], 
        y = data[start:stop:step, 1],
        mode = 'markers',
        text= words[start:stop:step]
    )
    layout = dict(title= 't-SNE_factor1 vs t-SNE_factor2',
                  yaxis = dict(title='t-SNE_factor2'),
                  xaxis = dict(title='t-SNE_factor1'),
                  hovermode= 'closest')
    fig = dict(data = [trace], layout= layout)
    py.iplot(fig)

In [102]:
%%time
glove_tsne_embds = TSNE(n_components=2).fit_transform(glove_embds)

Wall time: 2min 42s


In [104]:
%matplotlib inline

In [106]:
plot_words(glove_tsne_embds, 0, 100, 1)