In [1]:
import csv
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
import string
from nltk.stem.snowball import SnowballStemmer
import random
import io

In [2]:
vocab_size = 1000
embedding_dim = 16
max_length = 10
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>'
training_portion = 0.95

In [3]:
comments = []
labels = []
stopwords = set(stopwords.words('spanish'))

In [4]:
def Normalize(text):
    #QUITAMOS ACENTOS
        
    
    #QUITAMOS \n y \r. TEXTO EN MINÚSCULA
    text = text.replace('\n', ' ').replace('\r', '').lower()
    
    #QUITAMOS NÚMEROS
    text = ''.join([i for i in text if not i.isdigit()])
    
    #QUITAMOS PUNTUACIÓN
    text = "".join(l for l in text if l not in (string.punctuation,'?','¿',',','.'))
    
    #QUITAMOS DOBLES ESPACIOS
    text = ' '.join(text.split())

    #STEMMING Y QUITAMOS STOPWORDS
    stemmer = SnowballStemmer("spanish")
    words = text.split()
    text = ''
    for word in words:
        if not word in stopwords:
            stemmed_word = stemmer.stem(word)
            text = text + " " + stemmed_word
    
    return text

In [5]:
with io.open(r"C:\Users\meiza\Documents\GitHub\Machine-Learning\Forum Politization\Dataset\dataset.csv", 'r',encoding='latin-1') as csvfile:
    reader = csv.reader(csvfile, delimiter=',', quotechar="'")
    #next(reader)
    for row in reader:
        labels.append(row[0])
        comment = Normalize(row[1])        
        comments.append(comment)


In [6]:
#MEZCLAMOS EL DATASET YA QUE EN EL CSV ORIGINAL ESTÁN ORDENADOS
joint = list(zip(labels, comments))

random.shuffle(joint)

labels, comments = zip(*joint)

#SEPARAMOS ENTRE DATOS PARA ENTRENAR MODELO Y DATOS PARA VALIDAR
train_size = int(len(comments) * training_portion)
train_comments = comments[:train_size]
train_labels = labels[:train_size]
validation_comments = comments[train_size:]
validation_labels = labels[train_size:]

In [7]:
#INICIAMOS TOKENIZER CON NUESTRO DATASET DE ENTRENO
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_comments)
word_index = tokenizer.word_index

#TRANSFORMAMOS NUESTRO TEXTO EN TOKENS
train_sequences = tokenizer.texts_to_sequences(train_comments)

#PADDING A LOS COMENTARIOS QUE NO LLEGUEN A MAX_LENGTH.
train_padded = pad_sequences(train_sequences, padding=padding_type, maxlen=max_length)

#TOKENIZAMOS Y AÑADIMOS PADDING AL DATASET DE VALIDACIÓN
validation_sequences = tokenizer.texts_to_sequences(validation_comments)
validation_padded = pad_sequences(validation_sequences, padding=padding_type, maxlen=max_length)

#TRANSFORMAMOS A NUMPY LOS LABELS
train_labels = np.array(train_labels).astype('int')
validation_labels = np.array(validation_labels).astype('int')


In [8]:
#CREAMOS MODELO
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64,  return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(2, activation='softmax')
])

#COMPILAMOS MODELO
model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),
              optimizer=tf.keras.optimizers.Adam(),
              metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 10, 16)            16000     
_________________________________________________________________
bidirectional (Bidirectional (None, 10, 128)           41472     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 64)                41216     
_________________________________________________________________
dense (Dense)                (None, 64)                4160      
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 130       
Total params: 102,978
Trainable params: 102,978
Non-trainable params: 0
__________________________________________________

In [9]:
#ENTRENAMOS MODELO

num_epochs = 15
history = model.fit(train_padded, train_labels, epochs=num_epochs, validation_data=(validation_padded, validation_labels))

Train on 2135 samples, validate on 113 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [10]:
def Predict(text):
    print(text)
    input_data = Normalize(text)
    print(input_data)
    input_data = tokenizer.texts_to_sequences([input_data])
    print(input_data)
    input_data = pad_sequences(input_data, padding=padding_type, maxlen=max_length)
    pred = model.predict(input_data)
    return pred

In [11]:
x = Predict('gorda me la pone melafo')
print("Predicción:", x)

gorda me la pone melafo
 gord pon melaf
[[601, 45, 1]]
Predicción: [[0.9030627  0.09693731]]


In [12]:
x = Predict('vete a por una paguita')
print("Predicción:", x)

vete a por una paguita
 vet paguit
[[1, 290]]
Predicción: [[4.5741981e-06 9.9999547e-01]]
