# IIC-3670 NLP UC

- Versiones de librerías, python 3.8.10

- numpy 1.20.3
- nltk 3.7
- gensim 4.1.2
- keras 2.9.0
- tensorflow 2.9.1


### Vamos a usar la librería gensim para obtener los vectores de FastText

In [1]:
import gensim.downloader

# Show the name of every pretrained model listed in the gensim downloader catalogue.
available_models = gensim.downloader.info()['models']
print(list(available_models))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [2]:
# Load the 'fasttext-wiki-news-subwords-300' entry from the catalogue printed above.
# Later cells read ft_300.vectors and ft_300.index_to_key from this object.
ft_300 = gensim.downloader.load('fasttext-wiki-news-subwords-300')

### Un ejemplo clásico de analogías de palabras con estos vectores

In [3]:
# Analogy query with positive terms 'rey' and 'mujer' and negative term 'hombre'.
# NOTE(review): the query words are Spanish, while the model name
# ('...wiki-news...') suggests an English corpus — TODO confirm. The neighbours
# shown in the output look noisy; 'king' / 'woman' / 'man' may be the intended
# classic example.
ft_300.most_similar_cosmul(positive=['rey', 'mujer'], negative=['hombre'])

[('niña', 0.9072602391242981),
 ('reyna', 0.9052786231040955),
 ('baño', 0.8808259963989258),
 ('nena', 0.8808032870292664),
 ('viuda', 0.8759605884552002),
 ('león', 0.868635892868042),
 ('feo', 0.8684535622596741),
 ('mín', 0.868294894695282),
 ('iza', 0.8680763840675354),
 ('niñas', 0.8670858144760132)]

### Reproducimos el experimento de analogías del paper, usando el archivo `questions-words.txt`

In [4]:
# Run the analogy evaluation on the question file; the call returns a pair that is
# unpacked here as (score, results).
# NOTE(review): 'questions-words.txt' is a relative path, so the file must exist in
# the kernel's working directory — TODO confirm it is distributed with the notebook.
score, results = ft_300.evaluate_word_analogies('questions-words.txt')

In [5]:
# Display the first element of the pair returned by evaluate_word_analogies.
score

0.8827876424099353

### OK, usemos los vectores para construir un clasificador de texto. Los vamos a pasar a un array de numpy, lo mismo con las labels (a qué palabra corresponde cada vector)

In [8]:
import numpy as np

# Expose the model's embedding matrix and its keys as NumPy arrays.
# Presumably labels_ft[i] is the word for row i of vectors_ft (this is how the
# embedding-matrix cell further down uses them) — TODO confirm against gensim docs.
vectors_ft = np.asarray(ft_300.vectors)
labels_ft = np.asarray(ft_300.index_to_key)

In [9]:
# Peek at the label array built from ft_300.index_to_key.
labels_ft

array([',', 'the', '.', ..., 'Iseya', 'Bayyah', 'Vilaya'], dtype='<U66')

### Vamos a trabajar con keras y el dataset 20newsgroups

In [10]:
import tensorflow
from tensorflow import keras
from sklearn.datasets import fetch_20newsgroups

# Fetch the train and test splits as (texts, targets) pairs, stripping the same
# message parts from both so the two splits are preprocessed consistently.
strip_parts = ('headers', 'footers', 'quotes')
X_train_text, Y_train = fetch_20newsgroups(subset="train", remove=strip_parts, return_X_y=True)
X_test_text, Y_test = fetch_20newsgroups(subset="test", remove=strip_parts, return_X_y=True)

### Preprocesamos el texto del dataset

In [11]:
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer, sent_tokenize
from nltk.stem import WordNetLemmatizer

# Distinct target ids found in the training split; sizes the model's output layer.
classes = np.unique(Y_train)

# Load stop-words
stop_words = set(stopwords.words('english'))

# Word-level tokenizer: keeps runs of ASCII letters and apostrophes.
# Deliberately NOT named `tokenizer`: a later cell binds `tokenizer` to the Keras
# Tokenizer, and tokenize() looks this global up at call time, so sharing the name
# makes tokenize() fail with an AttributeError if it runs after that cell.
# It's also possible to try with a stemmer or to mix a stemmer and a lemmatizer
word_tokenizer = RegexpTokenizer('[\'a-zA-Z]+')

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

def tokenize(document):
    """Normalize a raw document into a single space-separated string of lemmas.

    The document is split into sentences, each sentence is split into word
    tokens, and a token is kept only if it is longer than two characters and its
    lowercase form is not an English stop-word. Kept tokens are lowercased and
    lemmatized.

    Args:
        document: Raw text of one document.

    Returns:
        The kept lemmas joined by single spaces (empty string if none survive).
    """
    words = []

    for sentence in sent_tokenize(document):
        for token in word_tokenizer.tokenize(sentence):
            lowered = token.lower()  # computed once, reused for filter and lemma
            if len(token) > 2 and lowered not in stop_words:
                words.append(lemmatizer.lemmatize(lowered))

    return ' '.join(words)

### Fíjese que armamos dos listas de textos, una para training y otra para testing

In [12]:
# Preprocess every raw document once; the two splits stay in separate lists.
train_docs = [tokenize(raw_text) for raw_text in X_train_text]
test_docs = [tokenize(raw_text) for raw_text in X_test_text]
    

### Un hiperparámetro del modelo es el largo de la entrada (max_tokens). Como es fijo, se usa padding (relleno) cuando el texto es corto y se trunca el texto cuando se pasa de max_tokens.

In [14]:
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

max_tokens = 50 ## Hyperparameter, input length

# The word index is fitted on the training documents only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_docs)

# Same padding/truncation settings for both splits: pad and cut at the end,
# filling with 0 up to max_tokens ids per sample.
pad_options = dict(maxlen=max_tokens, padding="post", truncating="post", value=0.)

## Vectorizing data to keep 50 words per sample.
train_sequences = tokenizer.texts_to_sequences(train_docs)
test_sequences = tokenizer.texts_to_sequences(test_docs)
X_train_vect = pad_sequences(train_sequences, **pad_options)
X_test_vect = pad_sequences(test_sequences, **pad_options)


X_train_vect.shape, X_test_vect.shape

((11314, 50), (7532, 50))

In [15]:
# Number of entries in the index -> word map of the tokenizer fitted on train_docs.
len(tokenizer.index_word)

72294

### Ahora vamos a crear la matriz de embeddings en base al modelo preentrenado FastText. Tomo el idx y la palabra del tokenizer, busco la palabra en FastText y traigo el vector a la matriz. Las palabras que no están en FastText quedan con el vector cero.

In [16]:
embed_len = 300

# One row per tokenizer index plus one extra row, all initialised to zero.
# Index 0 is the fill value passed to pad_sequences, and words with no FastText
# vector are never overwritten below, so both keep an all-zero embedding.
ft_embeddings = np.zeros((len(tokenizer.index_word)+1, embed_len))

# key_to_index is gensim's word -> row dictionary (the inverse of index_to_key),
# so each lookup is a constant-time dict access. The previous membership test and
# np.where call each scanned the whole FastText label array for every word.
ft_row_of = ft_300.key_to_index

for idx, word in tokenizer.index_word.items():
    row = ft_row_of.get(word)
    if row is not None:
        ft_embeddings[idx] = vectors_ft[row]



### Ahora defino la arquitectura del modelo con Model de keras.

In [17]:
from keras.models import Model
from keras.layers import Dense, Embedding, Input

# Architecture: fixed-length id sequence -> frozen pretrained embeddings ->
# mean over the token axis -> two ReLU dense layers -> softmax over the classes.
inputs = Input(shape=(max_tokens, ))
# trainable=False keeps the embedding rows fixed at the values in ft_embeddings.
embeddings_layer = Embedding(input_dim=len(tokenizer.index_word)+1, output_dim=embed_len,
                             input_length=max_tokens, trainable=False, weights=[ft_embeddings])
dense1 = Dense(128, activation="relu")
dense2 = Dense(64, activation="relu")
# One output unit per distinct target id found in Y_train.
dense3 = Dense(len(classes), activation="softmax")

x = embeddings_layer(inputs)
# NOTE(review): the mean runs over all max_tokens positions, so padded positions
# (id 0, presumably an all-zero embedding row) are averaged in as well and shrink
# the result for short documents — TODO confirm whether masking was intended.
x = tensorflow.reduce_mean(x, axis=1) ### Averaged embeddings of tokens of each example
x = dense1(x)
x = dense2(x)
outputs = dense3(x)

model = Model(inputs=inputs, outputs=outputs)

model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 50)]              0         
                                                                 
 embedding (Embedding)       (None, 50, 300)           21688500  
                                                                 
 tf.math.reduce_mean (TFOpLa  (None, 300)              0         
 mbda)                                                           
                                                                 
 dense (Dense)               (None, 128)               38528     
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dense_2 (Dense)             (None, 20)                1300      
                                                             

In [18]:
# Configure training: Adam optimizer, accuracy as the reported metric.
# The sparse variant of the loss is used with Y_train passed as-is to fit();
# presumably the targets are integer class ids rather than one-hot rows — TODO confirm.
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

### y entrenamos

In [19]:
# Train for 8 epochs in batches of 32. No validation data is passed, so the
# per-epoch metrics describe the training set only.
# NOTE(review): no random seed is set in the visible cells, so results may vary
# between runs — TODO confirm.
model.fit(X_train_vect, Y_train, batch_size=32, epochs=8)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7fb722b76fa0>

In [20]:
# Human-readable class names, passed as target_names to the classification report.
# Position i is presumably the name of target id i — TODO confirm against the
# dataset's own target names.
labels = [
    'alt.atheism',
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
    'comp.windows.x',
    'misc.forsale',
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey',
    'sci.crypt',
    'sci.electronics',
    'sci.med',
    'sci.space',
    'soc.religion.christian',
    'talk.politics.guns',
    'talk.politics.mideast',
    'talk.politics.misc',
    'talk.religion.misc',
]

In [21]:
from sklearn.metrics import accuracy_score, classification_report

# Predicted class for each test document = index of the largest model output.
class_scores = model.predict(X_test_vect)
Y_preds = class_scores.argmax(axis=-1)

# Overall accuracy followed by the per-class precision / recall / F1 table.
test_accuracy = accuracy_score(Y_test, Y_preds)
print(f"Test Accuracy : {test_accuracy}")
print("\nClassification Report : ")
print(classification_report(Y_test, Y_preds, target_names=labels))

Test Accuracy : 0.5471322357939459

Classification Report : 
                          precision    recall  f1-score   support

             alt.atheism       0.33      0.32      0.32       319
           comp.graphics       0.51      0.55      0.53       389
 comp.os.ms-windows.misc       0.37      0.48      0.42       394
comp.sys.ibm.pc.hardware       0.56      0.40      0.47       392
   comp.sys.mac.hardware       0.46      0.36      0.41       385
          comp.windows.x       0.63      0.48      0.54       395
            misc.forsale       0.65      0.60      0.63       390
               rec.autos       0.62      0.64      0.63       396
         rec.motorcycles       0.65      0.47      0.55       398
      rec.sport.baseball       0.42      0.78      0.55       397
        rec.sport.hockey       0.78      0.78      0.78       399
               sci.crypt       0.68      0.58      0.63       396
         sci.electronics       0.45      0.48      0.46       393
              