In [247]:
import pandas as pd
df = pd.read_csv('./temp/misoginos Revisados.csv', index_col = 0)

In [248]:
df.shape

(223, 14)

In [249]:
df = df.dropna(subset = ['texto', 'label'])

In [250]:
df.label.value_counts()

0.0    117
1.0    104
Name: label, dtype: int64

In [251]:
from sklearn.utils import shuffle
df = shuffle(df)

In [252]:
X = df.texto
y = df.label

In [348]:
# keras module for building LSTM 
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 

# set seeds for reproducability
from tensorflow import set_random_seed
from numpy.random import seed
set_random_seed(2)
seed(1)

import pandas as pd
import numpy as np
import string, os 

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

In [351]:
def clean_text(txt):
    txt = "".join(v for v in txt if v not in string.punctuation).lower()
    #txt = txt.encode("utf8").decode("ascii",'ignore')
    return txt 

corpus = [clean_text(x) for x in X]
corpus[:10]

['solyema hay tanta fea en el foro que tu post pasa a ser tema de la hermenéutica',
 'la gorda sabe q si se sube el helicóptero no despega así q está preparando su huida anticipada',
 'la gorda alcides con el agua al cuello en san fernando y aplaude a los moyano que paran en el ritz de madrid',
 'simplemente se trató de una pedorreta de la gorda',
 ' ¡¡¡¡¡¡ gorda ',
 'y es bien fea',
 'pinta de gorda con la bombacha amarronada jajaaa',
 'bimai y gorda no se olvide',
 'que falta le hacía aparecer así en estas fotos más fea imposible',
 'mrce44 en cualquier momento aparece la fea']

In [353]:
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
    ## tokenization
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1
    
    ## convert data to sequence of tokens 
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
    return input_sequences, total_words

inp_sequences, total_words = get_sequence_of_tokens(corpus)
inp_sequences[:10]

[[136, 166],
 [136, 166, 404],
 [136, 166, 404, 21],
 [136, 166, 404, 21, 11],
 [136, 166, 404, 21, 11, 9],
 [136, 166, 404, 21, 11, 9, 137],
 [136, 166, 404, 21, 11, 9, 137, 3],
 [136, 166, 404, 21, 11, 9, 137, 3, 92],
 [136, 166, 404, 21, 11, 9, 137, 3, 92, 218],
 [136, 166, 404, 21, 11, 9, 137, 3, 92, 218, 113]]

In [354]:
def generate_padded_sequences(input_sequences):
    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
    
    predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
    label = ku.to_categorical(label, num_classes=total_words)
    return predictors, label, max_sequence_len

predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

In [359]:
def create_model(max_sequence_len, total_words):
    input_len = max_sequence_len - 1
    model = Sequential()
    
    # Add Input Embedding Layer
    model.add(Embedding(total_words, 10, input_length=input_len))
    
    # Add Hidden Layer 1 - LSTM Layer
    model.add(LSTM(100))
    
    # Add Output Layer
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')
    
    return model

model = create_model(max_sequence_len, total_words)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_90 (Embedding)     (None, 95, 10)            14960     
_________________________________________________________________
lstm_9 (LSTM)                (None, 100)               44400     
_________________________________________________________________
dense_146 (Dense)            (None, 1496)              151096    
Total params: 210,456
Trainable params: 210,456
Non-trainable params: 0
_________________________________________________________________


In [360]:
model.fit(predictors, label, epochs=100, verbose=2)

Epoch 1/100
 - 23s - loss: 6.4707
Epoch 2/100
 - 19s - loss: 6.0177
Epoch 3/100
 - 18s - loss: 5.9145
Epoch 4/100
 - 18s - loss: 5.8290
Epoch 5/100
 - 18s - loss: 5.7361
Epoch 6/100
 - 19s - loss: 5.6303
Epoch 7/100


KeyboardInterrupt: 

In [None]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = model.predict_classes(token_list, verbose=0)
        
        output_word = ""
        for word,index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " "+output_word
    return seed_text.title()

In [None]:
print (generate_text("united states", 5, model, max_sequence_len))