# IIC-3670 NLP UC

- Versiones de librerías, python 3.8.10

- numpy 1.20.3
- nltk 3.7
- gensim 4.1.2
- keras 2.9.0
- tensorflow 2.9.1

## Actividad en clase

Construya clasificadores de documentos sobre el dataset **20Newsgroups**. Para esto haga lo siguiente:

- Limpie el texto del dataset.
- Cargue los vectores de FastText.
- Construya las pad_sequences en base a max_tokens = 40.
- Cree la matriz de embeddings usando el modelo preentrenado de FastText. 
- Use la misma arquitectura vista en clases y entrene.
- Evalúe en base a classification report. 
- Cargue los vectores de word2vec Google news. 
- Cree la matriz de embeddings usando el modelo preentrenado de Word2vec. 
- Use la misma arquitectura vista en clases y entrene.
- Evalúe en base a classification report.
- Interprete los resultados.
- Cuanto termine, me avisa para entregarle una **L (logrado)**.
- Recuerde que las L otorgan un bono en la nota final de la asignatura.


***Tiene hasta el final de la clase.***

Vea la descripción del dataset en: https://www.kaggle.com/datasets/crawford/20-newsgroups


In [1]:
import gensim.downloader

print(list(gensim.downloader.info()['models'].keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [2]:
ft_300 = gensim.downloader.load('fasttext-wiki-news-subwords-300')

In [3]:
import numpy as np

vectors_ft = np.asarray(ft_300.vectors)
labels_ft = np.asarray(ft_300.index_to_key)

In [4]:
import tensorflow
from tensorflow import keras
from sklearn.datasets import fetch_20newsgroups

X_train_text, Y_train = fetch_20newsgroups(subset="train", remove=('headers', 'footers', 'quotes'), return_X_y=True)
X_test_text, Y_test  = fetch_20newsgroups(subset="test", remove=('headers', 'footers', 'quotes'), return_X_y=True)

In [5]:
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer, sent_tokenize
from nltk.stem import WordNetLemmatizer

classes = np.unique(Y_train)

# Load stop-words
stop_words = set(stopwords.words('english'))

# Initialize tokenizer
# It's also possible to try with a stemmer or to mix a stemmer and a lemmatizer
tokenizer = RegexpTokenizer('[\'a-zA-Z]+')

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

def tokenize(document):
    words = []

    for sentence in sent_tokenize(document):
        tokens = [lemmatizer.lemmatize(t.lower()) for t in tokenizer.tokenize(sentence) if t.lower() not in stop_words and len(t) > 2]
        words += tokens

    text = ' '.join(words)
    return text

In [6]:
train_docs = []
test_docs = []

for raw_text in X_train_text:
    text = tokenize(raw_text)
    train_docs.append(text)
    
for raw_text in X_test_text:
    text = tokenize(raw_text)
    test_docs.append(text)
    

In [7]:
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

max_tokens = 40 ## Hyperparameter, input length

tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_docs)

## Vectorizing data to keep 40 words per sample.
X_train_vect = pad_sequences(tokenizer.texts_to_sequences(train_docs), maxlen=max_tokens, padding="post", truncating="post", value=0.)
X_test_vect  = pad_sequences(tokenizer.texts_to_sequences(test_docs), maxlen=max_tokens, padding="post", truncating="post", value=0.)


X_train_vect.shape, X_test_vect.shape

((11314, 40), (7532, 40))

In [8]:
# Create a dictionary mapping words to their indices in labels_ft
word_to_index = {word: idx for idx, word in enumerate(labels_ft)}

# Initialize the embeddings matrix
embed_len = 300
ft_embeddings = np.zeros((len(tokenizer.index_word)+1, embed_len))

# Iterate over the words in the tokenizer
for idx, word in tokenizer.index_word.items():
    # Check if the word exists in the FastText model's vocabulary
    if word in word_to_index:
        # Get the index of the word in labels_ft
        ft_idx = word_to_index[word]
        # Copy the corresponding embedding to ft_embeddings
        ft_embeddings[idx] = vectors_ft[ft_idx]

In [9]:
from keras.models import Model
from keras.layers import Dense, Embedding, Input

inputs = Input(shape=(max_tokens, ))
embeddings_layer = Embedding(input_dim=len(tokenizer.index_word)+1, output_dim=embed_len,
                             input_length=max_tokens, trainable=False, weights=[ft_embeddings])
dense1 = Dense(128, activation="relu")
dense2 = Dense(64, activation="relu")
dense3 = Dense(len(classes), activation="softmax")

x = embeddings_layer(inputs)
x = tensorflow.reduce_mean(x, axis=1) ### Averaged embeddings of tokens of each example
x = dense1(x)
x = dense2(x)
outputs = dense3(x)

model = Model(inputs=inputs, outputs=outputs)

model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 40)]              0         
                                                                 
 embedding (Embedding)       (None, 40, 300)           21688500  
                                                                 
 tf.math.reduce_mean (TFOpLa  (None, 300)              0         
 mbda)                                                           
                                                                 
 dense (Dense)               (None, 128)               38528     
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dense_2 (Dense)             (None, 20)                1300      
                                                             

In [10]:
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

In [11]:
model.fit(X_train_vect, Y_train, batch_size=32, epochs=8)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7f93707aeee0>

In [12]:
labels = ['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [13]:
from sklearn.metrics import accuracy_score, classification_report

Y_preds = model.predict(X_test_vect).argmax(axis=-1)

print("Test Accuracy : {}".format(accuracy_score(Y_test, Y_preds)))
print("\nClassification Report : ")
print(classification_report(Y_test, Y_preds, target_names=labels))

Test Accuracy : 0.5331917153478491

Classification Report : 
                          precision    recall  f1-score   support

             alt.atheism       0.27      0.32      0.29       319
           comp.graphics       0.58      0.56      0.57       389
 comp.os.ms-windows.misc       0.58      0.31      0.41       394
comp.sys.ibm.pc.hardware       0.42      0.59      0.49       392
   comp.sys.mac.hardware       0.38      0.40      0.39       385
          comp.windows.x       0.62      0.56      0.59       395
            misc.forsale       0.68      0.52      0.59       390
               rec.autos       0.57      0.69      0.62       396
         rec.motorcycles       0.71      0.46      0.56       398
      rec.sport.baseball       0.36      0.79      0.50       397
        rec.sport.hockey       0.83      0.62      0.71       399
               sci.crypt       0.61      0.65      0.63       396
         sci.electronics       0.65      0.28      0.39       393
              

In [14]:
gv_300 = gensim.downloader.load('word2vec-google-news-300')

In [15]:
vectors_gv = np.asarray(gv_300.vectors)
labels_gv = np.asarray(gv_300.index_to_key)

In [16]:
# Create a dictionary mapping words to their indices in labels_ft
word_to_index = {word: idx for idx, word in enumerate(labels_gv)}

gv_embeddings = np.zeros((len(tokenizer.index_word)+1, embed_len))

# Iterate over the words in the tokenizer
for idx, word in tokenizer.index_word.items():
    # Check if the word exists in the FastText model's vocabulary
    if word in word_to_index:
        # Get the index of the word in labels_ft
        gv_idx = word_to_index[word]
        # Copy the corresponding embedding to ft_embeddings
        gv_embeddings[idx] = vectors_gv[gv_idx]

In [17]:
inputs = Input(shape=(max_tokens, ))
embeddings_layer = Embedding(input_dim=len(tokenizer.index_word)+1, output_dim=embed_len,
                             input_length=max_tokens, trainable=False, weights=[gv_embeddings])
dense1 = Dense(128, activation="relu")
dense2 = Dense(64, activation="relu")
dense3 = Dense(len(classes), activation="softmax")

x = embeddings_layer(inputs)
x = tensorflow.reduce_mean(x, axis=1) ### Averaged embeddings of tokens of each example
x = dense1(x)
x = dense2(x)
outputs = dense3(x)

model = Model(inputs=inputs, outputs=outputs)

model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 40)]              0         
                                                                 
 embedding_1 (Embedding)     (None, 40, 300)           21688500  
                                                                 
 tf.math.reduce_mean_1 (TFOp  (None, 300)              0         
 Lambda)                                                         
                                                                 
 dense_3 (Dense)             (None, 128)               38528     
                                                                 
 dense_4 (Dense)             (None, 64)                8256      
                                                                 
 dense_5 (Dense)             (None, 20)                1300      
                                                           

In [18]:
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

In [19]:
model.fit(X_train_vect, Y_train, batch_size=32, epochs=8)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7f9393c59100>

In [20]:
from sklearn.metrics import accuracy_score, classification_report

Y_preds = model.predict(X_test_vect).argmax(axis=-1)

print("Test Accuracy : {}".format(accuracy_score(Y_test, Y_preds)))
print("\nClassification Report : ")
print(classification_report(Y_test, Y_preds, target_names=labels))

Test Accuracy : 0.5553637812002125

Classification Report : 
                          precision    recall  f1-score   support

             alt.atheism       0.31      0.42      0.35       319
           comp.graphics       0.57      0.55      0.56       389
 comp.os.ms-windows.misc       0.43      0.53      0.48       394
comp.sys.ibm.pc.hardware       0.47      0.48      0.48       392
   comp.sys.mac.hardware       0.49      0.42      0.45       385
          comp.windows.x       0.73      0.43      0.54       395
            misc.forsale       0.84      0.53      0.65       390
               rec.autos       0.66      0.62      0.64       396
         rec.motorcycles       0.37      0.67      0.48       398
      rec.sport.baseball       0.74      0.66      0.70       397
        rec.sport.hockey       0.74      0.83      0.78       399
               sci.crypt       0.54      0.63      0.59       396
         sci.electronics       0.42      0.52      0.47       393
              