In [98]:
#!/usr/bin/python

In [99]:
# Importamos las librerias necesarias desde bash para el desarrollo de este codigo

!git clone https://github.com/st1800eafit/st1800_20211.git
!pip install nltk
!pip install stop-words
!pip install pandas
!pip install gensim

fatal: destination path 'st1800_20211' already exists and is not an empty directory.


In [100]:
# Importamos las librerias necesarias desde Python para el desarrollo de este codigo

import os
import glob
import re 
import nltk 
import pandas
import numpy as np

In [101]:
# Dede la libreria de NLTK necesitaremos estos parametros para realizar la limpieza o preparacion de los documentos

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [102]:
# Abrimos los documentos .txt que en este caso utilizaremos y adicionalmente, abrimos el documento de xml. A este ultimo se le realizara una limpieza antes de tokenizar. Para asi remover las etiquetas

files_location = os.path.join("/","content","st1800_20211","datasets", "papers_sample_pdf/")
output_file_location = os.path.join("/","content/")
output_file = "scikit_model_vectorized.sav"

filenames = glob.glob(files_location+"*.txt")
corpus_per_document = []

for file in filenames:
  doc_corpus = open(file, "r").read()
  corpus_per_document.append(doc_corpus)

# Adicionamos el XML sin tags
filename_xml = glob.glob(files_location+"*.xml")
xml_file = open (filename_xml[0], "r").read()
xml_file = re.sub('<[^>]*>', "", xml_file)
corpus_per_document.append(xml_file)

In [103]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from gensim import corpora, models

# Aplicaremos la tecnica de TF-IDF pero en Gensim. Primero tokenizaremos los documentos con NLTK

Porter_Stemmer = PorterStemmer()

cleaned_corpus_per_document = []
stop_words_nltk = set(stopwords.words('english'))

#Primero preparamos cada elemento del corpus
for document in corpus_per_document:
  clean_document = nltk.word_tokenize(document)
  clean_document = [re.sub(r'[^A-Za-z0-9]+','',token) for token in clean_document]
  clean_document = [token.lower() for token in clean_document if len(token)>1]
  clean_document = [token for token in clean_document if token not in stop_words_nltk]
  clean_document = [Porter_Stemmer.stem(token) for token in clean_document]
  cleaned_corpus_per_document.append(clean_document)

#Luego aplicamos Gensim creando el bag of words
bow = corpora.Dictionary(cleaned_corpus_per_document)
#print(bow.token2id)
corpus = [bow.doc2bow(text) for text in cleaned_corpus_per_document]

#Con este mecanismo podremos guardar el corpus procesado
corpora.MmCorpus.serialize('/content/docs_corpus.mm', corpus)

#Ahora aplicaremos TF-IDF. Podemros obesrvar que el resultado seria una pareja, con el numero de ocurrencias y el valor del IDF
tfidf = models.TfidfModel(corpus)
for document in tfidf[corpus]:
  print(document)


INFO - 20:28:21: adding document #0 to Dictionary(0 unique tokens: [])
INFO - 20:28:21: built Dictionary(18953 unique tokens: ['012', '02', '10', '1030', '11']...) from 9 documents (total 172912 corpus positions)
INFO - 20:28:21: storing corpus in Matrix Market format to /content/docs_corpus.mm
INFO - 20:28:21: saving sparse matrix to /content/docs_corpus.mm
INFO - 20:28:21: PROGRESS: saving document #0
INFO - 20:28:21: saved 9x18953 matrix, density=15.104% (25764/170577)
INFO - 20:28:21: saving MmCorpus index to /content/docs_corpus.mm.index
INFO - 20:28:21: collecting document frequencies
INFO - 20:28:21: PROGRESS: processing document #0
INFO - 20:28:21: calculating IDF weights for 9 documents and 18952 features (25764 matrix non-zeros)


[(0, 0.010450743622335182), (1, 0.0019285292620283823), (2, 0.00336129584950534), (3, 0.005225371811167591), (4, 0.0071720187885617225), (5, 0.005225371811167591), (6, 0.017926911197361815), (7, 0.007153901073195973), (8, 0.014307802146391947), (9, 0.020901487244670364), (10, 0.010450743622335182), (11, 0.017930046971404306), (12, 0.0071720187885617225), (13, 0.010450743622335182), (14, 0.010450743622335182), (15, 0.019125383436164593), (16, 0.015539374041883732), (17, 0.010450743622335182), (18, 0.0071720187885617225), (19, 0.010450743622335182), (20, 0.005976682323801436), (21, 0.010450743622335182), (22, 0.010450743622335182), (23, 0.0057855877860851475), (24, 0.015676115433502773), (25, 0.031352230867005546), (26, 0.020901487244670364), (27, 0.015676115433502773), (28, 0.007153901073195973), (29, 0.007153901073195973), (30, 0.0038570585240567646), (31, 0.0027957122830687842), (32, 0.010450743622335182), (33, 0.020901487244670364), (34, 0.010450743622335182), (35, 0.0209014872446703

In [104]:
# Primeros ubicamos nuestro corpus en un dataframe para nuestra siguiente implementacion

df = pd.DataFrame(cleaned_corpus_per_document)
df_clean = df.dropna().drop_duplicates()
final_bow = df_clean.values.tolist()


In [105]:
# Utilizaremos la libreria de word2Vec de Gensim y adicionalmente, evidenciaremos cuantos nucleos tenemos disponibles en nuestro procesamiento
import multiprocessing

from gensim.models import Word2Vec

cores = multiprocessing.cpu_count()

cores

2

In [106]:
# Creamos el modelo con los siguientes parametros, entre ellos que solo considere las palabras con minimmo 5 ocurrencias, un ritmo de aprendizaje de 0.03, entre otrods.
w2v_model = Word2Vec(min_count=5,
                     window=2,
                     size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)

In [107]:
# Permitimos que el modelo adquiera el vocabulario ingresado
t = time()

w2v_model.build_vocab(final_bow, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 20:28:48: collecting all words and their counts
INFO - 20:28:48: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 20:28:48: collected 16799 word types from a corpus of 128215 raw words and 1 sentences
INFO - 20:28:48: Loading a fresh vocabulary
INFO - 20:28:48: effective_min_count=5 retains 2997 unique words (17% of original 16799, drops 13802)
INFO - 20:28:48: effective_min_count=5 leaves 108267 word corpus (84% of original 128215, drops 19948)
INFO - 20:28:48: deleting the raw counts dictionary of 16799 items
INFO - 20:28:48: sample=6e-05 downsamples 1154 most-common words
INFO - 20:28:48: downsampling leaves estimated 46068 word corpus (42.6% of prior 108267)
INFO - 20:28:48: estimated required memory for 2997 words and 300 dimensions: 8691300 bytes
INFO - 20:28:48: resetting layer weights


Time to build vocab: 0.01 mins


In [108]:
# Entrenamos el modelo y evidenciamos el tiempo que se demoro en entrenarse.
t = time()

w2v_model.train(final_bow, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 20:28:48: training model with 1 workers on 2997 vocabulary and 300 features, using sg=0 hs=0 sample=6e-05 negative=20 window=2
INFO - 20:28:49: worker thread finished; awaiting finish of 0 more threads
INFO - 20:28:49: EPOCH - 1 : training on 128215 raw words (10000 effective words) took 0.1s, 135732 effective words/s
INFO - 20:28:49: worker thread finished; awaiting finish of 0 more threads
INFO - 20:28:49: EPOCH - 2 : training on 128215 raw words (10000 effective words) took 0.1s, 132238 effective words/s
INFO - 20:28:49: worker thread finished; awaiting finish of 0 more threads
INFO - 20:28:49: EPOCH - 3 : training on 128215 raw words (10000 effective words) took 0.1s, 135485 effective words/s
INFO - 20:28:49: worker thread finished; awaiting finish of 0 more threads
INFO - 20:28:49: EPOCH - 4 : training on 128215 raw words (10000 effective words) took 0.1s, 127432 effective words/s
INFO - 20:28:49: worker thread finished; awaiting finish of 0 more threads
INFO - 20:28:49: EP

Time to train the model: 0.04 mins


In [109]:
# Finalmente podremos guardar los parametros del modelo
w2v_model.init_sims(replace=True)

INFO - 20:28:51: precomputing L2-norms of word weight vectors


In [110]:
# Probamos el modelo. Aqui podremos observar que dada la plabra comput, la palabra que mas probabilidad tiene para que le siga es science, lo cual tiene sentido.
w2v_model.wv.most_similar(positive=["comput"])

[('scienc', 0.9999446272850037),
 ('mathemat', 0.9999327659606934),
 ('combinator', 0.9999205470085144),
 ('geometri', 0.9999165534973145),
 ('discret', 0.9999043345451355),
 ('theori', 0.9999015927314758),
 ('michael', 0.9998989105224609),
 ('physic', 0.9998955130577087),
 ('david', 0.9998953342437744),
 ('05c05', 0.9998953342437744)]

In [111]:
# Asi podremos guardar el modelo para no tener que realizarlo nuevamente
w2v_model.save("/content/gensim_model.model")

# Adicionalmente de esta manera podremos cargarlo para nuestro analisis
loaded_w2d_model = Word2Vec.load("/content/gensim_model.model")

INFO - 20:28:51: saving Word2Vec object under /content/gensim_model.model, separately None
INFO - 20:28:51: not storing attribute vectors_norm
INFO - 20:28:51: not storing attribute cum_table
INFO - 20:28:51: saved /content/gensim_model.model
INFO - 20:28:51: loading Word2Vec object from /content/gensim_model.model
INFO - 20:28:51: loading wv recursively from /content/gensim_model.model.wv.* with mmap=None
INFO - 20:28:51: setting ignored attribute vectors_norm to None
INFO - 20:28:51: loading vocabulary recursively from /content/gensim_model.model.vocabulary.* with mmap=None
INFO - 20:28:51: loading trainables recursively from /content/gensim_model.model.trainables.* with mmap=None
INFO - 20:28:51: setting ignored attribute cum_table to None
INFO - 20:28:51: loaded /content/gensim_model.model
