# Universidad Jorge Tadeo Lozano

### Taller de Keras

- Extraccion de Datos desde MongoDB
- Limpieza de Datos con Python
- Analisis de Sentimiento con Python

### Carga de Datos 

In [1]:
import pymongo
from pymongo import MongoClient

In [2]:
#conexión al servidor local
client = MongoClient('localhost', 27017)

#conexión a la base de datos
db = client.twitterdb

#conexion a la colección hurto para extraer los tweets
collection = db.twitter_hurto

In [3]:
collection.create_index([('text', 'text')])

'text_text'

In [4]:
datos = collection.find({"lang": "es",  "$text": {"$search": "robo"}})

In [5]:
datos_extraccion = []

In [6]:
try:
    for dato in datos:
        for key in dato:
            if(key == "text"):
                datos_extraccion.append(dato[key])
            if (key == "extended_tweet"):
                for d in key:
                    if (d == "full_text"):
                        printdatos_extraccion(d)
                        datos_extraccion[len(datos_extraccion):] = [datos_extraccion[key]]
except Exception as err:
    print("Error al imprimir: " + str(err))
    

### Transformación de Datos

In [7]:
import nltk
from nltk.corpus import stopwords
import string
import unicodedata
import re
import string

In [8]:
nltk.download('stopwords')
stops = stopwords.words('spanish')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
stops

['de',
 'la',
 'que',
 'el',
 'en',
 'y',
 'a',
 'los',
 'del',
 'se',
 'las',
 'por',
 'un',
 'para',
 'con',
 'no',
 'una',
 'su',
 'al',
 'lo',
 'como',
 'más',
 'pero',
 'sus',
 'le',
 'ya',
 'o',
 'este',
 'sí',
 'porque',
 'esta',
 'entre',
 'cuando',
 'muy',
 'sin',
 'sobre',
 'también',
 'me',
 'hasta',
 'hay',
 'donde',
 'quien',
 'desde',
 'todo',
 'nos',
 'durante',
 'todos',
 'uno',
 'les',
 'ni',
 'contra',
 'otros',
 'ese',
 'eso',
 'ante',
 'ellos',
 'e',
 'esto',
 'mí',
 'antes',
 'algunos',
 'qué',
 'unos',
 'yo',
 'otro',
 'otras',
 'otra',
 'él',
 'tanto',
 'esa',
 'estos',
 'mucho',
 'quienes',
 'nada',
 'muchos',
 'cual',
 'poco',
 'ella',
 'estar',
 'estas',
 'algunas',
 'algo',
 'nosotros',
 'mi',
 'mis',
 'tú',
 'te',
 'ti',
 'tu',
 'tus',
 'ellas',
 'nosotras',
 'vosostros',
 'vosostras',
 'os',
 'mío',
 'mía',
 'míos',
 'mías',
 'tuyo',
 'tuya',
 'tuyos',
 'tuyas',
 'suyo',
 'suya',
 'suyos',
 'suyas',
 'nuestro',
 'nuestra',
 'nuestros',
 'nuestras',
 'vuestr

In [10]:
def remove_accent(text):
    result = [''.join((c for c in unicodedata.normalize('NFD', text) if unicodedata.category(c) != 'Mn'))]
    return result

In [11]:
#stop words sin acentos
stop_limp = [''.join(remove_accent(x)) for x in stops]

In [12]:
#tamaño original de la extracción en mongodb
len(datos_extraccion)

28449

In [13]:
#Eliminamos los retweets
datos_limp = [x.lower() for x in datos_extraccion if x[:2].lower() != "rt"]

In [14]:
#Array con usuarios
usuarios = [c.lower() for x in datos_extraccion for c in x.split() if c[:1] == "@"]

In [15]:
#Array con hashtag
hashtag = [c.lower() for x in datos_extraccion for c in x.split() if c[:1] == "#"]

In [16]:
#tamaño sin retweets
len(datos_limp)

10770

In [17]:
delete_words = [word for x in datos_limp for word in x.split() if len(word) < 3 and word not in ['no','si','da','el','lo','me','es','rt','fe','te','se','ir','ay','va']]

In [18]:
datos_limp = [x.lower() for x in datos_limp] #tweets en minuscula
datos_limp = [' '.join(word for word in x.split() if word not in (stops) and (stop_limp)) for x in datos_limp] #quitamos las stop words
datos_limp = [' '.join(word for word in x.split() if word not in (usuarios)) for x in datos_limp] #quitamos usuarios
datos_limp = [' '.join(word for word in x.split() if word not in (delete_words)) for x in datos_limp] #quitamos palabras sin sentido do as yu
datos_limp = [' '.join(word for word in x.split() if word not in (hashtag)) for x in datos_limp] #quitamos hashtags

In [19]:
datos_limp

['robo, robo gran robo',
 'robo robo...',
 'robo robo robo asco da real madrid',
 'robo plantel educativo, robo transeúnte, robo cuentahabiente, robo repartidor 19… https://t.co/kjgozcstsq',
 'robo robo robo familia ladrones',
 'robo robo!!! ratero!!!!!!',
 'robo robo',
 'robo robo',
 'si robo racing comio robo mas grande bernabeu, mas robo baile',
 'no, excepto opciones, robo energía eléctrica robo robo si codeca ladrones,… https://t.co/uqddgwnijw',
 'menos foros seguridad, robos trabajo,robo casas hijas, robo teléfono cel… https://t.co/lhq1osrj2q',
 'robe rena, robo forest, robo haru vez robo ranito. https://t.co/yi7ov9doyd',
 'robe rena, robo forest, robo haru vez robo ranito. https://t.co/sxq15xaoxr',
 'robo descarado visca robo farca',
 'saqueo robo robo delito.',
 'así sol, robo robo.',
 'así sol!!robo robo.',
 'robo robos https://t.co/0rngvhftef',
 'robo "pena robo" jijiji',
 'robó jajajajajajaja poesía robo amiga',
 'saqueo robo, justifica robo.',
 'así sol, robo robo',
 'basta

In [20]:
datos_limp = [re.sub(r'http\S+', '', x) for x in datos_limp] # Eliminamos las URL
datos_limp = [''.join(c for c in x if c not in string.punctuation) for x in datos_limp] # Eliminamos signos de puntuación
datos_limp = [''.join(c for c in x if c not in '0123456789') for x in datos_limp] #Eliminamos los numeros
datos_limp = [' '.join(x.split()) for x in datos_limp] # Eliminar espacios en blanco y separadores extras

In [21]:
#datos_limp = datos_limp.encode('ascii', 'ignore').decode('ascii')
datos_limp = [' '.join(a.encode('ascii', 'ignore').decode('ascii') for a in x.split()) for x in datos_limp]
datos_limp = [x for x in datos_limp if len(x) > 5]
datos_limp

['robo robo gran robo',
 'robo robo',
 'robo robo robo asco da real madrid',
 'robo plantel educativo robo transente robo cuentahabiente robo repartidor ',
 'robo robo robo familia ladrones',
 'robo robo ratero',
 'robo robo',
 'robo robo',
 'si robo racing comio robo mas grande bernabeu mas robo baile',
 'no excepto opciones robo energa elctrica robo robo si codeca ladrones',
 'menos foros seguridad robos trabajorobo casas hijas robo telfono cel',
 'robe rena robo forest robo haru vez robo ranito',
 'robe rena robo forest robo haru vez robo ranito',
 'robo descarado visca robo farca',
 'saqueo robo robo delito',
 'as sol robo robo',
 'as solrobo robo',
 'robo robos',
 'robo pena robo jijiji',
 'rob jajajajajajaja poesa robo amiga',
 'saqueo robo justifica robo',
 'as sol robo robo',
 'basta robos llama robo',
 'robo mas robo',
 'miren robo robo',
 'temporadas robo robo amiga',
 'chavismo rt saqueo robo robo delito',
 'veo venir hoy va ser robo tras robo',
 'da mano mano hombro hombro 

In [22]:
len(datos_limp)

10651

### Text frecuency - Inverse Document Frecuency

In [256]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import matplotlib.pyplot as plt

In [257]:
def tokenizer(text):
    words = nltk.word_tokenize(text)
    return words

In [259]:
texts = datos_limp
max_features = 1000

In [266]:
tfidf = TfidfVectorizer(tokenizer=tokenizer, stop_words=stops, max_features=max_features)

In [268]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [269]:
sparse_tfidf_texts = tfidf.fit_transform(texts)

In [273]:
print(sparse_tfidf_texts.shape)

(10663, 1000)


In [274]:
train_idx = np.random.choice(sparse_tfidf_texts.shape[0], round(0.8*sparse_tfidf_texts.shape[0]), replace=False)
test_idx = np.array(list(set(range(sparse_tfidf_texts.shape[0]))-set(train_idx)))

### Skip grams

In [23]:
import collections

In [24]:
batch_size = 50
num_sampled = int(batch_size/2)
embedding_size = 200
generations = 50000
vocabulary_size = 10000
print_loss_every = 500
window_size = 2
print_valid_every = 2000
#valid_words = ['robo', 'policia', 'modalidad', 'familia', 'ladron', 'atraco']
valid_words = ['policia', 'modalidad', 'atraco']
#valid_words_littte = ['policia', 'modalidad', 'atraco']

In [31]:
def build_dictionary(sentences, vocabulary_size):
    split_sentences = [s.split() for s in sentences]
    words = [x for sublist in split_sentences for x in sublist] #if len(x) < 3]
    count = [['RARE', -1]]
    count.extend(collections.Counter(words).most_common(vocabulary_size-1))
    word_dict = {}
    for word, word_count in count:
        word_dict[word] = len(word_dict)
    return word_dict

In [32]:
word_dict = build_dictionary(datos_limp, vocabulary_size)

In [33]:
word_dict

{'RARE': 0,
 'robo': 1,
 'si': 2,
 'millones': 3,
 'dos': 4,
 'va': 5,
 'aos': 6,
 'reporte': 7,
 'dlares': 8,
 'mas': 9,
 'combustible': 10,
 'da': 11,
 'daddy': 12,
 'yankee': 13,
 'amiga': 14,
 'joyas': 15,
 'polica': 16,
 'impactante': 17,
 'sufre': 18,
 'ser': 19,
 'ahora': 20,
 'casa': 21,
 'valuadas': 22,
 'solo': 23,
 'as': 24,
 'dinero': 25,
 'hoy': 26,
 'amigo': 27,
 'rob': 28,
 'delito': 29,
 'vehculo': 30,
 'mil': 31,
 'corrupcin': 32,
 'ver': 33,
 'hace': 34,
 'no': 35,
 'vehculos': 36,
 'el': 37,
 'mano': 38,
 'gente': 39,
 'de': 40,
 'saqueo': 41,
 'tan': 42,
 'pas': 43,
 'puede': 44,
 'intento': 45,
 'detienen': 46,
 'mismo': 47,
 'parte': 48,
 'seguro': 49,
 'falta': 50,
 'gran': 51,
 'hacer': 52,
 'bien': 53,
 'vez': 54,
 'pueblo': 55,
 'siempre': 56,
 'despus': 57,
 'descarado': 58,
 'menos': 59,
 'tras': 60,
 'auto': 61,
 'san': 62,
 'detiene': 63,
 'decir': 64,
 'plata': 65,
 'suerte': 66,
 'gobierno': 67,
 'video': 68,
 'robos': 69,
 'tres': 70,
 'hombre': 71,
 'l

In [34]:
def text_to_numbers(sentences, word_dict):
    data = []
    for sentence in sentences:
        sentence_data = []
        for word in sentence:
            if word in word_dict:
                word_ix = word_dict[word] #posición/ID de la palabra en el word dict
            else:
                word_ix = 0 ##posición/ID de la palabra RARE
            sentence_data.append(word_ix)
        data.append(sentence_data)
    return data

In [35]:
word_dict_rev = dict(zip(word_dict.values(), word_dict.keys()))
word_dict_rev

{0: 'RARE',
 1: 'robo',
 2: 'si',
 3: 'millones',
 4: 'dos',
 5: 'va',
 6: 'aos',
 7: 'reporte',
 8: 'dlares',
 9: 'mas',
 10: 'combustible',
 11: 'da',
 12: 'daddy',
 13: 'yankee',
 14: 'amiga',
 15: 'joyas',
 16: 'polica',
 17: 'impactante',
 18: 'sufre',
 19: 'ser',
 20: 'ahora',
 21: 'casa',
 22: 'valuadas',
 23: 'solo',
 24: 'as',
 25: 'dinero',
 26: 'hoy',
 27: 'amigo',
 28: 'rob',
 29: 'delito',
 30: 'vehculo',
 31: 'mil',
 32: 'corrupcin',
 33: 'ver',
 34: 'hace',
 35: 'no',
 36: 'vehculos',
 37: 'el',
 38: 'mano',
 39: 'gente',
 40: 'de',
 41: 'saqueo',
 42: 'tan',
 43: 'pas',
 44: 'puede',
 45: 'intento',
 46: 'detienen',
 47: 'mismo',
 48: 'parte',
 49: 'seguro',
 50: 'falta',
 51: 'gran',
 52: 'hacer',
 53: 'bien',
 54: 'vez',
 55: 'pueblo',
 56: 'siempre',
 57: 'despus',
 58: 'descarado',
 59: 'menos',
 60: 'tras',
 61: 'auto',
 62: 'san',
 63: 'detiene',
 64: 'decir',
 65: 'plata',
 66: 'suerte',
 67: 'gobierno',
 68: 'video',
 69: 'robos',
 70: 'tres',
 71: 'hombre',
 72

In [36]:
text_data = text_to_numbers(datos_limp, word_dict)
text_data[0]

[4152,
 5002,
 0,
 5002,
 0,
 4152,
 5002,
 0,
 5002,
 0,
 0,
 4152,
 4301,
 1655,
 0,
 4152,
 5002,
 0,
 5002]

In [37]:
valid_examples = [word_dict[x] for x in valid_words]

In [38]:
valid_examples

[1546, 1492, 1207]

In [39]:
#validar 1 negativo
target = [1 for ix, x in enumerate(datos_limp)]
len(target)

10651

In [40]:
def generate_batch_data(sentences, batch_size, window_size, method = 'skip_gram'):
    '''
        Mi perro come su comida -> (Mi, come), (perro, come), (su, come), (comida, come)
    '''
    batch_data = []
    label_data = []
    
    while len(batch_data) < batch_size:
        rand_sentences = np.random.choice(sentences)
        window_seq = [rand_sentences[max((ix-window_size),0):(ix+window_size+1)] 
                      for ix, x in enumerate(rand_sentences)]
        label_idx = [ix if ix < window_size else window_size for ix, x in enumerate(window_seq)]
        
        if method == 'skip_gram':
            batch_and_labels = [(x[y], x[:y]+x[(y+1):]) for x,y in zip(window_seq, label_idx)]
            tuple_data = [(x,y_) for x, y in batch_and_labels for y_ in y]
        else:
            raise ValueError("Método {} no implementado".format(method))
        
        batch, labels = [list(x) for x in zip(*tuple_data)]
        batch_data.extend(batch[:batch_size])
        label_data.extend(labels[:batch_size])
        
    batch_data = batch_data[:batch_size]
    label_data = label_data[:batch_size]
    
    batch_data = np.array(batch_data)
    label_data = np.transpose(np.array([label_data]))
    
    return (batch_data, label_data)

In [46]:
import tensorflow as tf
import numpy as np

In [42]:
embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1, 1))
x_inputs = tf.placeholder(tf.int32, shape =[batch_size])
y_target = tf.placeholder(tf.int32, shape = [batch_size,1])
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

Instructions for updating:
Colocations handled automatically by placer.


In [43]:
embed = tf.nn.embedding_lookup(embeddings, x_inputs)

In [48]:
nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size], 
                                              stddev=1.0/np.sqrt(embedding_size)))
nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

In [49]:
loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights, biases=nce_biases,
                                    inputs=embed, labels=y_target, 
                                     num_sampled = num_sampled, num_classes=vocabulary_size))

In [50]:
norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings),1, keepdims=True))
normalized_embeddings = embeddings/norm
valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)

In [51]:
optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0).minimize(loss)

Instructions for updating:
Use tf.cast instead.


In [52]:
session = tf.Session()
init = tf.global_variables_initializer()
session.run(init)

In [53]:
loss_vect = []
loss_x_vect = []
for i in range(generations):
    batch_inputs, batch_labels = generate_batch_data(text_data, batch_size, window_size)
    feed_dict = {x_inputs: batch_inputs, y_target: batch_labels}
    session.run(optimizer, feed_dict=feed_dict)
    
    if (i+1) % print_loss_every == 0:
        loss_val = session.run(loss, feed_dict=feed_dict)
        loss_vect.append(loss_val)
        loss_x_vect.append(i+1)
        print("Iteración {}, Pérdida: {}".format(i+1, loss_val))
    
    ## Validación de palabras más cercanas a las 5 seleccionadas
    if (i+1) % print_valid_every == 0:
        sim = session.run(similarity, feed_dict=feed_dict)
        for j in range(len(valid_words)):
            valid_word = word_dict_rev[valid_examples[j]]
            top_k = 10
            nearest = (-sim[j,:]).argsort()[1:top_k+1]
            log_string = "Palabras cercanas a {}:".format(valid_word)
            for k in range(top_k):
                close_word = word_dict_rev[nearest[k]]
                log_string = "%s %s, "%(log_string, close_word)
            print(log_string)

Iteración 500, Pérdida: 8.039772033691406
Iteración 1000, Pérdida: 16.55388641357422
Iteración 1500, Pérdida: 36.76530838012695
Iteración 2000, Pérdida: 24.588407516479492
Palabras cercanas a policia: curules,  contame,  argentinos,  drogadicto,  alcalde,  empezando,  disposici,  azul,  capaz,  redondo, 
Palabras cercanas a modalidad: capturada,  robono,  migratorio,  camisa,  chest,  mito,  flagrante,  novias,  ua,  decime, 
Palabras cercanas a atraco: colectivo,  revisan,  anteto,  conserva,  cllate,  atiende,  mejorar,  coordinadora,  orejas,  lave, 
Iteración 2500, Pérdida: 8.57458782196045
Iteración 3000, Pérdida: 7.421785354614258
Iteración 3500, Pérdida: 15.8325834274292
Iteración 4000, Pérdida: 1.8386590480804443
Palabras cercanas a policia: curules,  contame,  argentinos,  drogadicto,  alcalde,  empezando,  disposici,  azul,  capaz,  redondo, 
Palabras cercanas a modalidad: capturada,  robono,  migratorio,  camisa,  chest,  mito,  flagrante,  novias,  ua,  decime, 
Palabras ce

Iteración 30500, Pérdida: 0.6789681315422058
Iteración 31000, Pérdida: 1.0902553796768188
Iteración 31500, Pérdida: 1.6249897480010986
Iteración 32000, Pérdida: 0.6617197394371033
Palabras cercanas a policia: curules,  contame,  argentinos,  drogadicto,  alcalde,  empezando,  disposici,  azul,  capaz,  redondo, 
Palabras cercanas a modalidad: capturada,  robono,  migratorio,  camisa,  chest,  mito,  flagrante,  novias,  ua,  decime, 
Palabras cercanas a atraco: colectivo,  revisan,  anteto,  conserva,  cllate,  atiende,  mejorar,  coordinadora,  orejas,  lave, 
Iteración 32500, Pérdida: 0.9430350661277771
Iteración 33000, Pérdida: 0.5359370708465576
Iteración 33500, Pérdida: 4.100840091705322
Iteración 34000, Pérdida: 2.3507533073425293
Palabras cercanas a policia: curules,  contame,  argentinos,  drogadicto,  alcalde,  empezando,  disposici,  azul,  capaz,  redondo, 
Palabras cercanas a modalidad: capturada,  robono,  migratorio,  camisa,  chest,  mito,  flagrante,  novias,  ua,  deci

### Keras 

In [54]:
from keras_preprocessing import text

In [56]:
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(datos_limp)

In [57]:
word2id = tokenizer.word_index
id2word = {v:k for k, v in word2id.items()}

In [58]:
vocab_size = len(word2id) + 1 
embed_size = 100

In [61]:
wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in datos_limp]
print('Vocabulary Size:', vocab_size)
print('Vocabulary Sample:', list(word2id.items())[:100])

Vocabulary Size: 17106
Vocabulary Sample: [('robo', 1), ('si', 2), ('millones', 3), ('dos', 4), ('va', 5), ('aos', 6), ('reporte', 7), ('dlares', 8), ('mas', 9), ('combustible', 10), ('da', 11), ('daddy', 12), ('yankee', 13), ('amiga', 14), ('joyas', 15), ('polica', 16), ('impactante', 17), ('sufre', 18), ('ser', 19), ('ahora', 20), ('casa', 21), ('valuadas', 22), ('solo', 23), ('as', 24), ('dinero', 25), ('hoy', 26), ('amigo', 27), ('rob', 28), ('delito', 29), ('vehculo', 30), ('mil', 31), ('corrupcin', 32), ('ver', 33), ('hace', 34), ('no', 35), ('vehculos', 36), ('el', 37), ('mano', 38), ('gente', 39), ('de', 40), ('saqueo', 41), ('tan', 42), ('pas', 43), ('puede', 44), ('intento', 45), ('detienen', 46), ('mismo', 47), ('parte', 48), ('seguro', 49), ('falta', 50), ('gran', 51), ('hacer', 52), ('bien', 53), ('vez', 54), ('pueblo', 55), ('siempre', 56), ('despus', 57), ('descarado', 58), ('menos', 59), ('tras', 60), ('auto', 61), ('san', 62), ('detiene', 63), ('decir', 64), ('plata', 

### Keras

In [86]:
import numpy as np
np.random.seed(13)

from keras.models import Sequential, Model
from keras.layers import Embedding, Reshape, Activation, Input
from keras.layers.merge import Dot
from keras.utils import np_utils
from keras.utils.data_utils import get_file
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import skipgrams

import gensim

In [87]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(datos_limp)

In [88]:
V = len(tokenizer.word_index) + 1
V

17106

In [89]:
dim_embedddings = 128

# inputs
w_inputs = Input(shape=(1, ), dtype='int32')
w = Embedding(V, dim_embedddings)(w_inputs)

# context
c_inputs = Input(shape=(1, ), dtype='int32')
c  = Embedding(V, dim_embedddings)(c_inputs)
o = Dot(axes=2)([w, c])
o = Reshape((1,), input_shape=(1, 1))(o)
o = Activation('sigmoid')(o)

SkipGram = Model(inputs=[w_inputs, c_inputs], outputs=o)
SkipGram.summary()
SkipGram.compile(loss='binary_crossentropy', optimizer='adam')

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1, 128)       2189568     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 1, 128)       2189568     input_2[0][0]                    
__________________________________________________________________________________________________
dot_1 (Dot

In [91]:
for _ in range(5):
    loss = 0.
    for i, doc in enumerate(tokenizer.texts_to_sequences(datos_limp)):
        data, labels = skipgrams(sequence=doc, vocabulary_size=V, window_size=5, negative_samples=5.)
        x = [np.array(x) for x in zip(*data)]
        y = np.array(labels, dtype=np.int32)
        if x:
            loss += SkipGram.train_on_batch(x, y)

    print(loss)

5144.289180833846
3417.9137711841613
3139.319450482726
2929.7750292615965
2933.212652293965


In [92]:
f = open('vectors.txt' ,'w')
f.write('{} {}\n'.format(V-1, dim_embedddings))
vectors = SkipGram.get_weights()[0]
for word, i in tokenizer.word_index.items():
    f.write('{} {}\n'.format(word, ' '.join(map(str, list(vectors[i, :])))))
f.close()

In [93]:
w2v = gensim.models.KeyedVectors.load_word2vec_format('./vectors.txt', binary=False)

In [94]:
w2v.most_similar(positive=['policia'])

[('conduca', 0.630648672580719),
 ('francisco', 0.61998450756073),
 ('narcomenudeo', 0.6185741424560547),
 ('lmparas', 0.6152634024620056),
 ('sanjuanense', 0.6050634384155273),
 ('drogas', 0.6000219583511353),
 ('implicados', 0.5861939191818237),
 ('romo', 0.5757334232330322),
 ('gaviotas', 0.5730146169662476),
 ('recuperamos', 0.5672511458396912)]

In [95]:
w2v.most_similar(positive=['modalidad'])

[('domicilio', 0.6512517929077148),
 ('rentar', 0.6473425626754761),
 ('vinculan', 0.6375187039375305),
 ('michoacn', 0.6219478845596313),
 ('homofbico', 0.6198636889457703),
 ('posesin', 0.6146078109741211),
 ('n', 0.6133899688720703),
 ('quedaron', 0.6118131875991821),
 ('secretara', 0.6096034646034241),
 ('colonia', 0.6056919097900391)]