In [1]:
#DATA MANIPULATION
import pandas as pd
import numpy as np

#EMBEDDING AND PREPROCESSING
import tensorflow as tf
import re
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding
import gc

#TIME CONTROLS
import time

#PLOT
import matplotlib.pyplot as plt
# Apply the "ggplot" style sheet to every figure and enable the inline
# matplotlib backend through the IPython line magic.
plt.style.use("ggplot")
get_ipython().run_line_magic('matplotlib', 'inline')

#TENSORFLOW AND KERAS
from tensorflow import keras
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout,SpatialDropout1D, Bidirectional, GlobalAveragePooling1D, MaxPooling1D, Flatten, Masking
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import BatchNormalization


import random
# Report how many physical GPU devices TensorFlow lists.
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
# Uncomment to log which device each op is placed on (debugging aid).
#tf.debugging.set_log_device_placement(True)

Num GPUs Available:  1


In [2]:
def load_embedding(filename, encoding='utf-8'):
    """Load a text-format word-embedding file into a ``{word: vector}`` dict.

    The first line of the file is skipped (it is treated as a header). Each
    remaining line is split on whitespace: the first field is the word and
    the remaining fields are parsed into a ``float32`` numpy array.

    Lines that cannot be parsed are skipped instead of aborting the load:
    blank lines, and lines whose trailing fields are not all numeric.

    Args:
        filename: Path of the embedding text file.
        encoding: Text encoding used to open the file.

    Returns:
        dict mapping each word (str) to its vector (1-D float32 ndarray).
        When a word occurs more than once, the last occurrence wins.
    """
    embedding = {}
    # The context manager closes the file even if decoding fails midway.
    with open(filename, 'r', encoding=encoding) as file:
        next(file, None)  # skip the header line (no-op on an empty file)
        # Stream line by line instead of materialising the whole file.
        for line in file:
            parts = line.split()
            if not parts:
                continue  # blank line: nothing to store
            try:
                # key is string word, value is numpy array for vector
                embedding[parts[0]] = np.asarray(parts[1:], dtype='float32')
            except ValueError:
                # A field was not numeric; skip this line.
                continue
    return embedding


def get_weight_matrix(embedding, vocab, seq_len):
    """Build the initial weight matrix for an embedding layer.

    Args:
        embedding: dict mapping word -> vector (as returned by
            ``load_embedding``).
        vocab: dict mapping word -> integer row index (the Tokenizer's
            ``word_index``).
        seq_len: Number of columns of the matrix, i.e. the length of each
            word vector (despite the name, this is not a sequence length).

    Returns:
        ndarray of shape ``(len(vocab) + 1, seq_len)``. Row ``i`` holds the
        vector of the word mapped to ``i``. Rows for words that have no entry
        in ``embedding`` are left as zeros, and so is any row that no word
        in ``vocab`` maps to.
    """
    # total vocabulary size plus one extra row for index 0
    vocab_size = len(vocab) + 1
    # define weight matrix dimensions with all 0
    weight_matrix = np.zeros((vocab_size, seq_len))
    # step vocab, store vectors using the Tokenizer's integer mapping
    for word, i in vocab.items():
        vector = embedding.get(word)
        # Words without a pre-trained vector keep their zero row rather than
        # writing the ``None`` returned by ``dict.get`` into the matrix.
        if vector is not None:
            weight_matrix[i] = vector
    return weight_matrix

def cleaning(doc):
    """Join the ``.text`` of every token in ``doc`` into one tidy string.

    Tokens are joined with single spaces, runs of spaces are collapsed to one
    and surrounding whitespace is stripped. When ``doc`` yields two tokens or
    fewer, ``None`` is returned.
    """
    pieces = []
    for token in doc:
        pieces.append(token.text)

    if len(pieces) <= 2:
        return None

    joined = ' '.join(pieces)
    collapsed = re.sub(' +', ' ', joined)
    return collapsed.strip()

def preprocess_string(string, word_vectors):
    """Normalise a text and replace out-of-vocabulary words with ``<unk>``.

    Steps:
      1. ``string`` is converted with ``str()``; the characters
         ``( , ) . / - _ + " : “`` and the digits 0-9 are replaced by spaces;
         the result is lower-cased.
      2. The text is split on whitespace. Each word is looked up in
         ``word_vectors``; words whose lookup fails are replaced by the
         literal ``'<unk>'``.
      3. The resulting tokens are joined with single spaces.

    Args:
        string: Text to clean (any object; it is passed through ``str()``).
        word_vectors: Mapping-like object supporting ``word_vectors[word]``
            and raising ``KeyError`` (or another ``LookupError``) for
            unknown words.

    Returns:
        The cleaned, space-separated string. An input that contains no
        tokens after cleaning yields ``''``.
    """
    unk_string = '<unk>'
    # Raw string: same pattern as before, without invalid escape sequences
    # such as '\-' inside a normal string literal.
    cleaned = re.sub(r'[(,).\/\-_\+":“0-9]', ' ', str(string)).lower()

    tokens = []
    for word in cleaned.split():
        try:
            word_vectors[word]
        except LookupError:
            # No vector for this word: substitute the unknown marker.
            tokens.append(unk_string)
        else:
            tokens.append(word)

    # Join once at the end instead of growing a string inside the loop.
    return ' '.join(tokens)

def pad_sequence(string, tokenizer, maxlen=None):
    """Encode text with ``tokenizer`` and pad the result at the end ('post').

    Args:
        string: A single text (str) or an iterable of texts. A bare string
            is wrapped in a list so it is encoded as ONE document; passing
            it straight to ``texts_to_sequences`` would iterate over it
            character by character.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences``.
        maxlen: Target sequence length. Defaults to the notebook constant
            ``MAX_SEQ_LEN``, resolved at call time.

    Returns:
        The array produced by ``pad_sequences`` for the encoded texts.
    """
    if maxlen is None:
        # The previous code read an undefined global ``max_length``; the
        # notebook-level constant is named MAX_SEQ_LEN.
        maxlen = MAX_SEQ_LEN
    texts = [string] if isinstance(string, str) else string
    encoded_string = tokenizer.texts_to_sequences(texts)
    return pad_sequences(encoded_string, maxlen=maxlen, padding='post')

def preprocess_to_predict(string, word_vectors, tokenizer):
    """Prepare one raw text for prediction.

    The text is cleaned with ``preprocess_string`` (using ``word_vectors``)
    and the cleaned text is then encoded and padded by ``pad_sequence``
    (using ``tokenizer``). The padded result is returned.
    """
    return pad_sequence(preprocess_string(string, word_vectors), tokenizer)

In [3]:
#CONSTANTS
MAX_SEQ_LEN = 200 #length every encoded review is padded to (also the Embedding input_length)
INPUT_DIMS = 50 #number of dimensions in each GloVe word vector (Embedding output size)

In [4]:
# Read the reviews as strings (first CSV column is the index) and drop every
# row that has a missing value.
df = pd.read_csv(
    'imdb-reviews-pt-br.csv',
    index_col=0,
    sep=',',
    encoding='utf-8',
    dtype=str,
    quotechar='"',
)
df = df.dropna()

# Pre-trained word vectors, used both for cleaning and for the weight matrix.
raw_embedding = load_embedding('glove_s50.txt')

# Clean each Portuguese review; words without a vector become '<unk>'.
txt = []
for doc in df['text_pt']:
    txt.append(preprocess_string(doc, raw_embedding))
df['clean'] = txt

# Binary target: 0 for 'neg', 1 for every other sentiment value.
df['sentiment_code'] = np.where(df['sentiment'] == 'neg', 0, 1)
df.head()

Unnamed: 0_level_0,text_en,text_pt,sentiment,clean,sentiment_code
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Once again Mr. Costner has dragged out a movie...,"Mais uma vez, o Sr. Costner arrumou um filme p...",neg,mais uma vez o sr costner arrumou um filme por...,0
2,This is an example of why the majority of acti...,Este é um exemplo do motivo pelo qual a maiori...,neg,este é um exemplo do motivo pelo qual a maiori...,0
3,"First of all I hate those moronic rappers, who...","Primeiro de tudo eu odeio esses raps imbecis, ...",neg,primeiro de tudo eu odeio esses raps imbecis q...,0
4,Not even the Beatles could write songs everyon...,Nem mesmo os Beatles puderam escrever músicas ...,neg,nem mesmo os beatles puderam escrever músicas ...,0
5,Brass pictures movies is not a fitting word fo...,Filmes de fotos de latão não é uma palavra apr...,neg,filmes de fotos de latão não é uma palavra apr...,0


In [5]:
# Targets as a numpy array, cleaned review texts as a plain Python list.
labels = df['sentiment_code'].values
x_input = df['clean'].values.tolist()

In [6]:
# Fit the tokenizer on the cleaned reviews.
# The filter string contains no '<', '>', '(' or ')' characters --
# presumably so the '<unk>' placeholder written by preprocess_string survives
# tokenisation as a single token (TODO confirm).
t = Tokenizer(filters='!"#$%&*+,-./:;=?@[\\]^_`{|}~\t\n')
t.fit_on_texts(x_input)
# One more than the number of distinct words, so index 0 gets a row as well.
vocab_size = len(t.word_index) + 1

In [7]:
# Convert every cleaned review into its list of integer word ids.
encoded_docs = t.texts_to_sequences(x_input)

In [8]:
# Bring every encoded review to MAX_SEQ_LEN ids, padding at the end ('post').
padded_docs = pad_sequences(
    encoded_docs,
    maxlen=MAX_SEQ_LEN,
    padding='post',
)

In [9]:
# Inspect the padded id sequence of the first review.
padded_docs[0]

array([   25,    12,    75,     4,   549,  9361, 51120,     6,     8,
          21,    27,    25,    72,    15,     2,     4,  1370,   154,
          69,   997,  1492,     1,  3326,    22,  1695,    69,   655,
          56,    27,  1217,    14,   167,     9,    55,  5942,    16,
         203,    34,    78,     5,   200,     1,   173,    46,  2091,
          22,  4411,     3,     4,   108, 70823,     7,  3810,   580,
          22,   360,     3,    77,  2021,    79,    27,    25,   388,
          43,    14,     9,    55,  3087,     4,   108,    16,     4,
         263,  5158,    84,  1815,     7,    27,  2609,     3, 51121,
       11472, 12074,     4,   355,     7,     2,    23,   851,    18,
           6,   594,     2,  1531,     2,     7,    66,    15,     2,
          73,   208,   277,    45,    33,   907,     3,     9,   340,
        3811,     1,     6,  4411, 16288,    33,   301,  9926,    85,
         223, 16289,  9361,   349,    43,  1391,    53,   154,    15,
         247,    15,

In [10]:
# Weight matrix with one row per tokenizer index and INPUT_DIMS columns.
# The width comes from the shared constant (instead of a literal 50) so it
# always matches the output dimension given to the Embedding layer.
embedding_vectors = get_weight_matrix(raw_embedding, t.word_index, seq_len=INPUT_DIMS)

In [11]:
# Integer id the tokenizer assigned to the word 'mais'.
t.word_index['mais']

25

In [12]:
# Row of the weight matrix stored for the word 'mais'.
embedding_vectors[t.word_index['mais']]

array([-0.77328902, -0.214645  , -0.94515002, -3.27474403,  0.047547  ,
        0.33738399, -0.36342201, -0.624255  , -0.66911697,  1.02033901,
        0.62575299,  0.731152  , -0.479958  , -0.30400699,  0.19501901,
        0.59655303, -0.299766  , -0.22447699, -0.056082  ,  1.45373595,
        1.52957106, -0.033997  , -0.400985  , -0.716034  , -0.87385201,
       -0.97609699,  0.617194  , -0.52412701, -0.53311199,  2.09159398,
       -0.080944  ,  0.53168398,  0.048488  ,  0.62522602, -0.43868801,
       -0.85810298, -0.830791  , -0.159187  , -1.19930506,  0.297562  ,
        0.74319702, -0.246746  , -0.096066  ,  0.48618099,  0.620085  ,
       -0.40474099,  0.84864199, -0.51519102,  0.103377  , -0.43511599])

In [13]:
# Embedding layer initialised from the pre-trained matrix; its weights are
# frozen (trainable=False) and it emits no mask itself (mask_zero=False).
e = Embedding(
    vocab_size,
    INPUT_DIMS,
    weights=[embedding_vectors],
    mask_zero=False,
    input_length=MAX_SEQ_LEN,
    trainable=False,
)

In [14]:
#just to check if everything is in order
masked_output = e(padded_docs[0])
masked_output.shape

TensorShape([200, 50])

In [15]:
# Show the mask attached to the embedding output (a private Keras attribute).
print(masked_output._keras_mask)

None


In [16]:
# Architecture #3 is disabled: the code below sits inside a string literal,
# so nothing here is executed -- the cell only evaluates to that string.
'''
#ARCHTECTURE #3
try:
    del model
except:
    pass
model = Sequential([
    e,
    Bidirectional(LSTM(64,  return_sequences=True)),
    Bidirectional(LSTM(32)),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(1)

])

model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])
'''

"\n#ARCHTECTURE #3\ntry:\n    del model\nexcept:\n    pass\nmodel = Sequential([\n    e,\n    Bidirectional(LSTM(64,  return_sequences=True)),\n    Bidirectional(LSTM(32)),\n    Dense(64, activation='relu'),\n    Dropout(0.2),\n    Dense(1)\n\n])\n\nmodel.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),\n              optimizer=tf.keras.optimizers.Adam(1e-4),\n              metrics=['accuracy'])\n"

In [17]:
#ARCHITECTURE #4
# Drop the model left over from a previous run of this cell, if there is one.
# Only the "name not defined yet" case is ignored; any other error surfaces.
try:
    del model
except NameError:
    pass

model = Sequential([
    # Frozen pre-trained embedding layer defined earlier in the notebook.
    e,
    # NOTE(review): per the Keras docs, Masking skips timesteps whose feature
    # vector is entirely equal to mask_value -- here that means every id whose
    # embedding row is all zeros, not only the padding id. Confirm intended.
    Masking(mask_value=0),
    #Bidirectional(LSTM(64,  return_sequences=True)),
    Bidirectional(LSTM(32)),
    Dense(64, activation='relu'),
    Dropout(0.2),
    # Single unit with no activation: the output is a raw logit, which is
    # why the loss below is built with from_logits=True.
    Dense(1)

])

model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

In [18]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 200, 50)           4948950   
_________________________________________________________________
masking (Masking)            (None, 200, 50)           0         
_________________________________________________________________
bidirectional (Bidirectional (None, 64)                21248     
_________________________________________________________________
dense (Dense)                (None, 64)                4160      
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 4,974,423
Trainable params: 25,473
Non-trainable params: 4,948,950
_________________________________________

In [19]:
# Force a garbage-collection pass before training; the displayed value is
# the number returned by gc.collect().
gc.collect()

0

In [None]:
# Train the model on the padded reviews inside a GPU:0 device scope, holding
# out 20% of the samples for validation. The returned History object is kept
# in `hist` for the plotting cells.
# NOTE(review): Keras documents validation_split as taking the LAST fraction
# of the samples, selected before shuffling -- confirm the CSV rows are not
# ordered by sentiment, otherwise the validation set may be single-class.
with tf.device('/device:GPU:0'):

    hist = model.fit(padded_docs, 
                     labels, 
                     validation_split=0.2,
                     epochs=50,
                     batch_size=128, 
                     shuffle=True,
                     verbose=1
    )

Train on 39567 samples, validate on 9892 samples
Epoch 1/50

In [None]:
# Training loss (red) against validation loss (blue), one point per epoch.
history = pd.DataFrame(hist.history)

loss_curves = (
    ("loss", 'r', 'loss'),
    ("val_loss", 'b', 'val_loss'),
)
for column, line_format, legend_label in loss_curves:
    plt.plot(history[column], line_format, label=legend_label)

plt.legend()
plt.show()

In [None]:
# Training accuracy (red) against validation accuracy (blue), per epoch.
history = pd.DataFrame(hist.history)

accuracy_curves = (
    ("accuracy", 'r', 'acc'),
    ("val_accuracy", 'b', 'val_acc'),
)
for column, line_format, legend_label in accuracy_curves:
    plt.plot(history[column], line_format, label=legend_label)

plt.legend()
plt.show()

In [None]:
# Save the model's weights under the path prefix 'easy_checkpoint'.
model.save_weights('easy_checkpoint')