###  Luis Ricardo Cruz García
#### Procesamiento de Lenguaje Natural

#### Tarea 5

In [1]:
import os
import time
import shutil
import random
from typing import Tuple
from argparse import Namespace
import matplotlib.pyplot as plt
from itertools import permutations
from random import shuffle

import nltk
from nltk.corpus import stopwords
from nltk import ngrams, FreqDist
from nltk.tokenize import TweetTokenizer

import pandas as pd
import numpy as np
from numpy import array

import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.nn.functional as F

from sklearn.metrics import accuracy_score

from decimal import Decimal

In [2]:
torch.backends.cudnn.benchmark = False

In [5]:
x_train = pd.read_csv("mex_train.txt", sep="\r\n", engine="python", header=None).loc[:, 0].values.tolist()
x_val   = pd.read_csv("mex_val.txt"  , sep="\r\n", engine="python", header=None).loc[:, 0].values.tolist()

In [6]:
# esta class es escencialmente idéntica la vista en clase, sólo le agregué unos pequeños cambios para que 
# pudiera ser utilizada en todos los casos que se pide en la tarea
class NgramData():
	def __init__(self, 
				N: int,                          # número del N-grama
				max_vocabulary_size: int = 5000, # tamaño máximo del vocabulario
				tokenizer = None,                # tokenizador
				numeric = False,                 # acepta o no tokens numéricos
				embeddings_model = None,         # array de word-representations
				embedding_words = None,          # palabras de las cuales tenemos un embedding
				word_to_emb_id = None):          # diccionario de la palabra al índice en el array "embeddings_model"

		self.tokenizer = tokenizer if tokenizer else self.default_tokenizer()
		self.punct = set([".", ",", ";", ":", "^", "!", "¡", "¿", "?", "\'", "*", "<url>", "@usuario"])
		self.embeddings_model = embeddings_model
		self.embedding_words = embedding_words
		self.word_to_emb_id = word_to_emb_id
		self.max_vocabulary_size = max_vocabulary_size
		self.numeric = numeric
		self.N = N
		self.UNK = "<unk>"
		self.SOS = "<s>"
		self.EOS = "</s>"

	def remove_word(self, word:str) -> bool:
		lo_word = word.lower()
		# se pueden aceptar o no tokens numéricos
		if not self.numeric:
			return lo_word in self.punct or lo_word.isnumeric()
		else:
			return lo_word in self.punct

	def get_vocabulary(self, corpus: list) -> set:
		freq_dist = FreqDist([word.lower() for sentence in corpus for word in self.tokenizer(sentence) if not self.remove_word(word)])
		sorted_words = self.sortFreqDict(freq_dist)[:self.max_vocabulary_size - 3]
		return set(sorted_words)

	def build_embedding_matrix(self):
		embedding_dimension = self.embeddings_model.shape[1]
		self.embedding_matrix = np.zeros((len(self.vocabulary), embedding_dimension))
		
		for i, word in enumerate(self.vocabulary):
			if word in embedding_words:
				self.embedding_matrix[i] = self.embeddings_model[self.word_to_emb_id[word]]
			else:
				self.embedding_matrix[i] = np.random.normal(scale=0.6, size=(embedding_dimension, ))

	def fit(self, corpus:list) -> None:
		self.vocabulary = self.get_vocabulary(corpus)
		self.vocabulary.add(self.UNK)
		self.vocabulary.add(self.SOS)
		self.vocabulary.add(self.EOS)

		self.w2id = {}
		self.id2w = {}

		id_word = 0
		for doc in corpus:
			for word in self.tokenizer(doc):
				word_lower = word.lower()
				if word_lower in self.vocabulary and word_lower not in self.w2id:
					self.w2id[word_lower] = id_word
					self.id2w[id_word]    = word_lower
					id_word += 1

		# agregar tokens especiales
		self.w2id.update(
			{
				self.UNK: id_word,
				self.SOS: id_word + 1,
				self.EOS: id_word + 2

			}
		)

		self.id2w.update(
			{
				id_word: self.UNK,
				id_word + 1: self.SOS,
				id_word + 2: self.EOS 

			}
		)

		# si nos pasan un embeddings_model, creamos la matriz de embeddings
		if self.embeddings_model is not None:
			self.build_embedding_matrix()

	def transform(self, corpus : list) -> Tuple[np.ndarray, np.ndarray]:
		x_ngrams = []
		y = []
		for doc in corpus:
			doc_ngram = self.get_ngram_doc(doc)
			for words_window in doc_ngram:
				words_window_ids = [self.w2id[word] for word in words_window]
				x_ngrams.append(list(words_window_ids[:-1]))
				y.append(words_window_ids[-1])

		return array(x_ngrams), array(y)

	def get_ngram_doc(self, doc : str) -> list:
		doc_tokens = self.tokenizer(doc)
		doc_tokens = [word.lower() for word in doc_tokens]
		doc_tokens = self.replace_unk(doc_tokens)
		doc_tokens = [self.SOS] * (self.N - 1) + doc_tokens + [self.EOS]
		return list(ngrams(doc_tokens, self.N))

	def replace_unk(self, doc_tokens):
		for i, token in enumerate(doc_tokens):
			if token.lower() not in self.vocabulary:
				doc_tokens[i] = self.UNK

		return doc_tokens

	def sortFreqDict(self, fdist_dict):
		aux = list(fdist_dict.keys())
		aux.sort(key=lambda char : fdist_dict[char], reverse=True)
		return aux

	def default_tokenizer(self, doc : str) -> list:
		return doc.split(" ")

	def get_vocabulary_size(self) -> int:
		return len(self.vocabulary)

In [8]:
# se agrega la opción de que se tome la conexión directa entre la capa de embeddings y la capa de salida
# y se agrega la opción de usar una matriz de embeddings pre-entrenados
class NeuralLM(nn.Module):
	def __init__(self, args, embedding_matrix=None, direct_emb_to_output=False):
		super(NeuralLM, self).__init__()

		self.window_size = args.N - 1
		self.embedding_size = args.d
		self.direct_emb_to_output = direct_emb_to_output

		self.emb = nn.Embedding(args.vocabulary_size, args.d)

		if embedding_matrix is not None:
			self.emb.load_state_dict({'weight': torch.Tensor(embedding_matrix)})
			self.emb.weight.requires_grad = False

		self.fc1 = nn.Linear(args.d * (args.N - 1), args.d_h)
		self.drop1 = nn.Dropout(p=args.dropout)

		if self.direct_emb_to_output:
			self.fc2 = nn.Linear(args.d_h + (args.d * (args.N - 1)), args.vocabulary_size, bias=False)
		else:
			self.fc2 = nn.Linear(args.d_h, args.vocabulary_size, bias=False)

	# x = lista de ids de las palabras en un n-grama
	def forward(self, x):
		x = self.emb(x)
		x = x.view(-1, self.window_size * self.embedding_size)

		h = F.relu(self.fc1(x))
		h = self.drop1(h)

		# si se quiere que exista la conexión directa entre la capa de embeddings y la capa de salida
		# entonces se agrega a los argumentos que opera la capa de salida (fc2)
		if self.direct_emb_to_output:
			h = torch.cat((x, h), dim=1)
		return self.fc2(h)

In [9]:
def get_preds(raw_logits):
	probs = F.softmax(raw_logits.detach(), dim=1)
	y_pred = torch.argmax(probs, dim=1).cpu().numpy()
	return y_pred

In [10]:
def model_eval(data, model, gpu=False):
	with torch.no_grad():
		preds, tgts = [], []
		for window_words, labels in data:
			if gpu:
				window_words = window_words.cuda()

			outputs = model(window_words)

			y_pred = get_preds(outputs)

			tgt = labels.numpy()
			tgts.append(tgt)
			preds.append(y_pred)

	tgts  = [e for l in tgts  for e in l]
	preds = [e for l in preds for e in l]

	return accuracy_score(tgts, preds)

In [11]:
def save_checkpoint(state, is_best, checkpoint_path, filename="checkpoint.pt"): 
	filename = os.path.join(checkpoint_path, filename)
	torch.save(state, filename)
	if is_best:
		shutil.copyfile(filename, os.path.join(checkpoint_path, "model_best.pt"))

### 1. Con base en la implementación mostrada en clase, construya un modelo de lenguaje neuronal a nivel de carácter. Tomé en cuenta secuencias de tamaño 6 o más para el modelo, es decir hasta 5 caracteres o más en el contexto. Ponga al modelo a generar texto 3 veces, con un máximo de 300 caracteres. Escriba 5 ejemplos de oraciones y mídales el likelihood. Escriba un ejemplo de estructura morfológica (permutaciones con caracteres) similar al de estructura sintáctica del profesor con 5 o más caracteres de su gusto (e.g., "ando "). Calcule la perplejidad del modelo sobre los datos val.

In [24]:
args = Namespace()
args.N = 6

# el "tokenizer" es la función list, así hacemos que tokenize por caracter a los strings
ngram_data_chars = NgramData(args.N, 5000, list, numeric=False)
ngram_data_chars.fit(x_train)

x_ngram_train, y_ngram_train = ngram_data_chars.transform(x_train)
x_ngram_val  , y_ngram_val   = ngram_data_chars.transform(x_val)

args.batch_size = 64
args.num_workers = 2

In [25]:
train_dataset = TensorDataset(torch.tensor(x_ngram_train, dtype=torch.int64), torch.tensor(y_ngram_train, dtype=torch.int64))
train_loader = DataLoader(train_dataset, batch_size=args.batch_size, num_workers=args.num_workers, shuffle=True)

In [26]:
val_dataset = TensorDataset(torch.tensor(x_ngram_val, dtype=torch.int64), torch.tensor(y_ngram_val, dtype=torch.int64))
val_loader = DataLoader(val_dataset, batch_size=args.batch_size, num_workers=args.num_workers, shuffle=True)

In [27]:
batch = next(iter(train_loader))

# Model hyperparameters 

args.vocabulary_size = ngram_data_chars.get_vocabulary_size()
args.d = 100
args.d_h = 200
args.dropout = 0.1

# Train hyperparameters

args.lr = 2.3e-1
args.num_epochs = 100
args.patience = 20

args.lr_patience = 10
args.lr_factor = 0.5

In [28]:
args.savedir = "model_chars"
os.makedirs(args.savedir, exist_ok=True)

In [29]:
model_chars = NeuralLM(args)

args.use_gpu = torch.cuda.is_available()
if args.use_gpu:
	model_chars.cuda()

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model_chars.parameters(), lr=args.lr)

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, "min", patience=args.lr_patience, verbose=True, factor=args.lr_factor)

In [None]:
# esta parté sí la corrí, pero en un colab (porque toma muucho tiempo en mi computadora), sólo copié el archivo 
# "best_model.pt" (que generaba colab) a la carpeta correspondiente y seguí con lo demás
start_time = time.time()
best_metric = 0
metric_history = []
train_metric_history = []

for epoch in range(args.num_epochs):
	epoch_start_time =  time.time()
	loss_epoch = []
	training_metric = []
	model_chars.train()

	for window_words, labels in train_loader:

		# if gpu available
		if args.use_gpu:
			window_words = window_words.cuda()
			labels = labels.cuda()

		# forward pass
		outputs = model_chars(window_words)
		loss = criterion(outputs, labels)
		loss_epoch.append(loss.item())

		# get_training metrics
		y_pred = get_preds(outputs)
		tgt = labels.cpu().numpy()
		training_metric.append(accuracy_score(tgt, y_pred))

		# posteriormente, hacemos el backward y optimizamos
		optimizer.zero_grad()
		loss.backward()
		optimizer.step()

	# get metric n training dataset
	mean_epoch_metric = np.mean(training_metric)
	train_metric_history.append(mean_epoch_metric)

	# get metric in validation dataset
	model_chars.eval()
	tuning_metric = model_eval(val_loader, model_chars, gpu=args.use_gpu)
	metric_history.append(mean_epoch_metric)

	# update scheduler
	scheduler.step(tuning_metric)

	# chech for metric improvement
	is_improvement = tuning_metric > best_metric
	if is_improvement:
		vest_metric = tuning_metric
		n_no_improve = 0
	else:
		n_no_improve += 1

	save_checkpoint(
		{
		"epoch" : epoch + 1, 
		"state_dict" : model_chars.state_dict(), 
		"optimizer" : optimizer.state_dict(),
		"scheduler" : scheduler.state_dict(), 
		"best_metric" : best_metric
		}, 
		is_improvement, 
		args.savedir
	)

	# detener el modelo si no hay mejora
	if n_no_improve >= args.patience:
		print("No improvement. Breaking out of loop")
		break

	print("Train acc: {}".format(mean_epoch_metric))
	print("Epoch [{}/{}], Loss: {:.4f} - Val accuracy: {:.4f} - Epoch time : {:.2f}".format(epoch + 1, args.num_epochs, np.mean(loss_epoch), tuning_metric, (time.time() - epoch_start_time)))

print("--- %s seconds" % (time.time() - start_time))

In [30]:
def parse_text(text, tokenizer, ngram_data):
	all_tokens = [w.lower() if w in ngram_data.w2id else "<unk>" for w in tokenizer(text)]
	token_ids = [ngram_data.w2id[word.lower()] for word in all_tokens]
	return all_tokens, token_ids

In [31]:
def sample_next_word(logits, temperature=1.0):
	logits = np.asarray(logits).astype("float64")
	preds = logits / temperature
	exp_preds = np.exp(preds)
	preds = exp_preds / np.sum(exp_preds)
	probas = np.random.multinomial(1, preds)
	return np.argmax(probas)

In [32]:
def predict_next_token(model, token_ids):
	word_ids_tensor = torch.LongTensor(token_ids).unsqueeze(0)
	y_raw_pred = model(word_ids_tensor).squeeze(0).detach().numpy()
	
	y_pred = sample_next_word(y_raw_pred, 1.0)
	return y_pred

In [33]:
def generate_sentence(model, initial_text, tokenizer, ngram_data, max_gen_tokens : int = 100, join_char : str = " "):
	all_tokens, window_word_ids = parse_text(initial_text, tokenizer, ngram_data)
	for i in range(max_gen_tokens):
		y_pred = predict_next_token(model, window_word_ids)
		next_word = ngram_data.id2w[y_pred]
		all_tokens.append(next_word)

		if next_word == "</s>":
			break
		else:
			window_word_ids.pop(0)
			window_word_ids.append(y_pred)

	return join_char.join(all_tokens)

In [34]:
best_model_chars = NeuralLM(args)
best_model_chars.load_state_dict(torch.load("model_chars/model_best.pt", map_location=torch.device("cpu"))["state_dict"])
best_model_chars.train(False)

NeuralLM(
  (emb): Embedding(348, 100)
  (fc1): Linear(in_features=500, out_features=200, bias=True)
  (drop1): Dropout(p=0.1, inplace=False)
  (fc2): Linear(in_features=200, out_features=348, bias=False)
)

In [39]:
initial_tokens = "ando "

# el tokenizer es la función "list"
print(generate_sentence(best_model_chars, initial_tokens, list, ngram_data_chars, max_gen_tokens=300, join_char=""))

ando en la verga moso pero se quieren exá mi bendiendo sus una interischino de inundo tu plocura que le dos de inventy está soy <unk> (su vas a la madre de esté como una no mamar misótarde la pela y no haya pinche recercial empezar la cera fojos de bendilda<unk>  😐</s>


In [54]:
initial_tokens = "hola "

# el tokenizer es la función "list" pues seguimos usando chars
print(generate_sentence(best_model_chars, initial_tokens, list, ngram_data_chars, max_gen_tokens=300, join_char=""))

hola del niños y una hdp<unk><unk></s>


In [27]:
initial_tokens = "lleva"

# el tokenizer es la función "list" pues seguimos usando chars
print(generate_sentence(best_model_chars, initial_tokens, list, ngram_data_chars, max_gen_tokens=300, join_char=""))

llevando para daba comería wala</s>


In [55]:
def log_likelihood(model, text, ngram_data):
	# generate n-gram window from input text and the respective label y
	x, y = ngram_data.transform([text])

	# no tomar en cuenta los dos primeros n-gram windows pues son "<s>"
	x, y = x[2:], y[2:]

	x = torch.LongTensor(x).unsqueeze(0)

	logits = model(x).detach()
	probs = F.softmax(logits, dim=1).numpy()

	return np.sum([np.log(probs[i][w]) for i, w in enumerate(y)])

In [70]:
def perplexity(model, text, ngram_data):
	len_text = len(text)
	x, y = ngram_data.transform([text])
	x, y = x[2:], y[2:]

	x = torch.LongTensor(x).unsqueeze(0)

	logits = model(x).detach()

	probs = F.softmax(logits, dim=1).numpy()

	partial_prod = 1

	for i, w in enumerate(y):
		partial_prod *= probs[i][w]

	return Decimal(partial_prod) ** Decimal(1 / len_text)

In [57]:
print("log likelihood: ", log_likelihood(best_model_chars, "clase de lenguage natural", ngram_data_chars))

log likelihood:  -45.579407


In [58]:
print("log likelihood: ", log_likelihood(best_model_chars, "messi es el mejor jugador", ngram_data_chars))

log likelihood:  -38.200916


In [59]:
print("log likelihood: ", log_likelihood(best_model_chars, "ronaldo es el mejor jugador", ngram_data_chars))

log likelihood:  -46.30466


In [60]:
print("log likelihood: ", log_likelihood(best_model_chars, "amlo es un mal presidente", ngram_data_chars))

log likelihood:  -37.17421


In [61]:
print("log likelihood: ", log_likelihood(best_model_chars, "mexico le va a ganar a argentina", ngram_data_chars))

log likelihood:  -43.010994


In [62]:
char_list = "arresto"
perms = ["".join(perm) for perm in list(set(permutations(list(char_list))))]

likelihood_word = [(log_likelihood(best_model_chars, text, ngram_data_chars), text) for text in perms]
likelihood_word = sorted(likelihood_word, reverse=True)

for likelihood, permutation in likelihood_word[:5]:
	print(likelihood, " -> ", permutation)

print("-" * 50)

for likelihood, permutation in likelihood_word[-5:]:
	print(likelihood, " -> ", permutation)

-8.891865  ->  rateros
-11.700338  ->  ratores
-11.899188  ->  rreatos
-11.925676  ->  rroetas
-11.952572  ->  sratero
--------------------------------------------------
-53.344097  ->  seoatrr
-53.891838  ->  trsroea
-54.5099  ->  trsreoa
-55.720245  ->  trsroae
-56.395157  ->  trsraoe


In [72]:
# Tengo un problema de underflow con la perplejidad, no lo puede solucionar, pero creo que lo hace bien
print("P = {}".format(perplexity(best_model_chars, " ".join(x_val), ngram_data_chars)))

P = 0


### 2. Con base en la implementación mostrada en clase, construya un modelo de lenguaje neuronal a nivel de palabra, pero preinicializado con los embeddings proporcionados. Tomé en cuenta secuencias de tamaño 4 para el modelo, es decir hasta 3 palabras en el contexto. Después de haber entrenado el modelo, recupere las 10 palabras más similares a tres palabras de su gusto dadas. Ponga al modelo a generar texto a partir de tres secuencias de inicio de su gusto. Escriba 5 ejemplos de oraciones y mídales el likelihood. Proponga un ejemplo para ver estructuras sintácticas (permutaciones de palabras de alguna oración) buenas usando el likelihood a partir de una oración que usted proponga. Calcule la perplejidad del modelo sobre los datos val. Compárelo con la perplejidad del modelo de lenguaje sin embeddings preentrenados

In [74]:
tokenizer = TweetTokenizer()

In [75]:
# obtenemos los embeddings del archivo
embedding_words  = set()
word_to_emb_id   = {} 
vector_embeddings = []

with open('word2vec_col.txt', 'r') as f:
	next(f) # ignoramos la primer línea 
	for index, line in enumerate(f):
		tokens = line.split()
		word = tokens[0]
		embedding_words.add(word)
		word_to_emb_id[word] = index
		word_embedding = np.array(tokens[1:]).astype(float)
		vector_embeddings.append(word_embedding)

vector_embeddings = np.array(vector_embeddings)

In [76]:
args2 = Namespace()
args2.N = 4

In [77]:
ngram_data_w2v = NgramData(args2.N, 
							5000, 
							tokenizer.tokenize, 
							embeddings_model=vector_embeddings, 
							embedding_words=embedding_words,
							word_to_emb_id=word_to_emb_id)

In [78]:
ngram_data_w2v.fit(x_train)

In [79]:
x_ngram_train, y_ngram_train = ngram_data_w2v.transform(x_train)
x_ngram_val  , y_ngram_val   = ngram_data_w2v.transform(x_val)

In [80]:
args2.batch_size = 64
args2.num_workers = 2

In [81]:
train_dataset_w2v = TensorDataset(torch.tensor(x_ngram_train, dtype=torch.int64), torch.tensor(y_ngram_train, dtype=torch.int64))
train_loader_w2v = DataLoader(train_dataset_w2v, batch_size=args2.batch_size, num_workers=args2.num_workers, shuffle=True)

In [82]:
val_dataset_w2v = TensorDataset(torch.tensor(x_ngram_val, dtype=torch.int64), torch.tensor(y_ngram_val, dtype=torch.int64))
val_loader_w2v = DataLoader(val_dataset_w2v, batch_size=args2.batch_size, num_workers=args2.num_workers, shuffle=True)

In [83]:
batch = next(iter(train_loader_w2v))

In [84]:
# Model hyperparameters 

args2.vocabulary_size = ngram_data_w2v.get_vocabulary_size()
args2.d = 100
args2.d_h = 200
args2.dropout = 0.1

# Train hyperparameters

args2.lr = 2.3e-1
args2.num_epochs = 100
args2.patience = 20

args2.lr_patience = 10
args2.lr_factor = 0.5

In [85]:
args2.savedir = "model_w2v"
os.makedirs(args2.savedir, exist_ok=True)

In [86]:
model_w2v = NeuralLM(args2, embedding_matrix=ngram_data_w2v.embedding_matrix)

In [87]:
args2.use_gpu = torch.cuda.is_available()
if args2.use_gpu:
	model_w2v.cuda()

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model_w2v.parameters(), lr=args2.lr)

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, "min", patience=args2.lr_patience, verbose=True, factor=args2.lr_factor)

In [49]:
start_time = time.time()
best_metric = 0
metric_history = []
train_metric_history = []

for epoch in range(args2.num_epochs):
	epoch_start_time =  time.time()
	loss_epoch = []
	training_metric = []
	model_w2v.train()

	for window_words, labels in train_loader_w2v:

		# if gpu available
		if args2.use_gpu:
			window_words = window_words.cuda()
			labels = labels.cuda()

		# forward pass
		outputs = model_w2v(window_words)
		loss = criterion(outputs, labels)
		loss_epoch.append(loss.item())

		# get_training metrics
		y_pred = get_preds(outputs)
		tgt = labels.cpu().numpy()
		training_metric.append(accuracy_score(tgt, y_pred))

		# posteriormente, hacemos el backward y optimizamos
		optimizer.zero_grad()
		loss.backward()
		optimizer.step()

	# get metric n training dataset
	mean_epoch_metric = np.mean(training_metric)
	train_metric_history.append(mean_epoch_metric)

	# get metric in validation dataset
	model_w2v.eval()
	tuning_metric = model_eval(val_loader_w2v, model_w2v, gpu=args2.use_gpu)
	metric_history.append(mean_epoch_metric)

	# update scheduler
	scheduler.step(tuning_metric)

	# chech for metric improvement
	is_improvement = tuning_metric > best_metric
	if is_improvement:
		vest_metric = tuning_metric
		n_no_improve = 0
	else:
		n_no_improve += 1

	save_checkpoint(
		{
		"epoch" : epoch + 1, 
		"state_dict" : model_w2v.state_dict(), 
		"optimizer" : optimizer.state_dict(),
		"scheduler" : scheduler.state_dict(), 
		"best_metric" : best_metric
		}, 
		is_improvement, 
		args2.savedir
	)

	# detener el modelo si no hay mejora
	if n_no_improve >= args2.patience:
		print("No improvement. Breaking out of loop")
		break

	print("Train acc: {}".format(mean_epoch_metric))
	print("Epoch [{}/{}], Loss: {:.4f} - Val accuracy: {:.4f} - Epoch time : {:.2f}".format(epoch + 1, args2.num_epochs, np.mean(loss_epoch), tuning_metric, (time.time() - epoch_start_time)))

print("--- %s seconds" % (time.time() - start_time))

Train acc: 0.11865841806220095
Epoch [1/100], Loss: 5.9706 - Val accuracy: 0.1691 - Epoch time : 13.97
Train acc: 0.13365916566985647
Epoch [2/100], Loss: 5.5267 - Val accuracy: 0.1762 - Epoch time : 13.89
Train acc: 0.14068667763157894
Epoch [3/100], Loss: 5.2930 - Val accuracy: 0.1792 - Epoch time : 13.79
Train acc: 0.1450956937799043
Epoch [4/100], Loss: 5.1120 - Val accuracy: 0.1571 - Epoch time : 14.04
Train acc: 0.14803566088516745
Epoch [5/100], Loss: 4.9449 - Val accuracy: 0.1317 - Epoch time : 13.96
Train acc: 0.15063733552631578
Epoch [6/100], Loss: 4.8028 - Val accuracy: 0.1528 - Epoch time : 13.93
Train acc: 0.15492673444976077
Epoch [7/100], Loss: 4.6760 - Val accuracy: 0.1408 - Epoch time : 13.72
Train acc: 0.1594011662679426
Epoch [8/100], Loss: 4.5592 - Val accuracy: 0.1764 - Epoch time : 14.24
Train acc: 0.16508298444976077
Epoch [9/100], Loss: 4.4591 - Val accuracy: 0.1240 - Epoch time : 13.97
Train acc: 0.1712301883971292
Epoch [10/100], Loss: 4.3756 - Val accuracy: 

Train acc: 0.3948938397129187
Epoch [78/100], Loss: 2.5832 - Val accuracy: 0.1887 - Epoch time : 12.76
Train acc: 0.39453872607655505
Epoch [79/100], Loss: 2.5830 - Val accuracy: 0.1880 - Epoch time : 12.65
Train acc: 0.39366589413875597
Epoch [80/100], Loss: 2.5854 - Val accuracy: 0.1838 - Epoch time : 12.72
Epoch 00081: reducing learning rate of group 0 to 3.5938e-03.
Train acc: 0.3936434659090909
Epoch [81/100], Loss: 2.5821 - Val accuracy: 0.1870 - Epoch time : 12.83
Train acc: 0.3968376196172249
Epoch [82/100], Loss: 2.5698 - Val accuracy: 0.1825 - Epoch time : 12.70
Train acc: 0.397517942583732
Epoch [83/100], Loss: 2.5692 - Val accuracy: 0.1843 - Epoch time : 12.76
Train acc: 0.3983216208133971
Epoch [84/100], Loss: 2.5663 - Val accuracy: 0.1839 - Epoch time : 12.81
Train acc: 0.3977515699760765
Epoch [85/100], Loss: 2.5689 - Val accuracy: 0.1891 - Epoch time : 12.70
Train acc: 0.39917576255980863
Epoch [86/100], Loss: 2.5668 - Val accuracy: 0.1856 - Epoch time : 12.89
Train acc

In [88]:
best_model_w2v = NeuralLM(args2)
best_model_w2v.load_state_dict(torch.load("model_w2v/model_best.pt", map_location=torch.device("cpu"))["state_dict"])
best_model_w2v.train(False)

NeuralLM(
  (emb): Embedding(5000, 100)
  (fc1): Linear(in_features=300, out_features=200, bias=True)
  (drop1): Dropout(p=0.1, inplace=False)
  (fc2): Linear(in_features=200, out_features=5000, bias=False)
)

In [56]:
initial_tokens = "<s> <s> <s>"

# el tokenizer es la función "list"
print(generate_sentence(best_model_w2v, initial_tokens, tokenizer.tokenize, ngram_data_w2v, max_gen_tokens=100, join_char=" "))

<s> <s> <s> porque <unk> <unk> no valgo de <unk> los <unk> <unk> <unk> lo vuelvo a subir pinche <unk> acá </s>


In [96]:
initial_tokens = "<s> <s> hola"

print(generate_sentence(best_model_w2v, initial_tokens, tokenizer.tokenize, ngram_data_w2v, max_gen_tokens=100, join_char=" "))

<s> <s> hola cuando <unk> <unk> pero como hoy un <unk> <unk> ” <unk> que holanda no le <unk> no quiero pedo con <unk> un <unk> </s>


In [97]:
print("log likelihood: ", log_likelihood(best_model_w2v, "clase de lenguage natural", ngram_data_w2v))

log likelihood:  -5.535452


In [98]:
print("log likelihood: ", log_likelihood(best_model_w2v, "messi es el mejor jugador", ngram_data_w2v))

log likelihood:  -29.040985


In [99]:
print("log likelihood: ", log_likelihood(best_model_w2v, "ronaldo es el mejor jugador", ngram_data_w2v))

log likelihood:  -30.505259


In [100]:
print("log likelihood: ", log_likelihood(best_model_w2v, "amlo es un mal presidente", ngram_data_w2v))

log likelihood:  -31.871273


In [101]:
print("log likelihood: ", log_likelihood(best_model_w2v, "mexico le va a ganar a argentina", ngram_data_w2v))

log likelihood:  -52.662636


In [105]:
word_list = "mexico le va a ganar a argentina"
perms = [" ".join(perm) for perm in permutations(word_list.split(" "))]

for p, t in sorted([(log_likelihood(best_model_w2v, text, ngram_data_w2v), text) for text in perms], reverse=True)[:5]:
	print(p, t)

    
print("-" * 50)

for p, t in sorted([(log_likelihood(best_model_w2v, text, ngram_data_w2v), text) for text in perms], reverse=True)[-5:]:
	print(p, t)

-29.469181 ganar le va a argentina a mexico
-29.469181 ganar le va a argentina a mexico
-29.807985 argentina ganar le va a a mexico
-29.807985 argentina ganar le va a a mexico
-33.405617 le ganar va a argentina a mexico
--------------------------------------------------
-124.671486 le a va mexico ganar a argentina
-126.85434 le a va mexico argentina ganar a
-126.85434 le a va mexico argentina ganar a
-128.51643 le a va mexico ganar argentina a
-128.51643 le a va mexico ganar argentina a


In [112]:
# Tengo un problema de underflow con la perplejidad, no lo puede solucionar, pero creo que lo hace bien
print("P = {}".format(perplexity(best_model_w2v, " ".join(x_val), ngram_data_w2v)))

P = 0


### 3 A partir del modelo anterior haga un modelo de lenguaje que integre una conexión directa de la capa de embeddings hacía la salida, justo como lo proponía Bengio. Discuta sobre las diferencias en el proceso de entrenamiento y la perplejidad respecto al modelo anterior y el visto en clase.

In [106]:
args3 = Namespace()

# (Hyper)parámetros

args3.N               = args2.N
args3.batch_size      = args2.batch_size
args3.num_workers     = args2.num_workers
args3.vocabulary_size = args2.vocabulary_size
args3.d               = args2.d
args3.d_h             = args2.d_h
args3.dropout         = args2.dropout
args3.lr              = args2.lr
args3.num_epochs      = args2.num_epochs
args3.patience        = args2.patience
args3.lr_patience     = args2.lr_patience
args3.lr_factor       = args2.lr_factor

In [107]:
args3.savedir = "model_w2v_with_direct_conxn"
os.makedirs(args3.savedir, exist_ok=True)

In [108]:
model_w2v_2 = NeuralLM(args3, embedding_matrix=ngram_data_w2v.embedding_matrix, direct_emb_to_output=True)

In [109]:
args3.use_gpu = torch.cuda.is_available()
if args3.use_gpu:
	model_w2v_2.cuda()


criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model_w2v_2.parameters(), lr=args3.lr)

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, "min", patience=args3.lr_patience, verbose=True, factor=args3.lr_factor)

In [67]:
start_time = time.time()
best_metric = 0
metric_history = []
train_metric_history = []

for epoch in range(args3.num_epochs):
	epoch_start_time = time.time()
	loss_epoch = []
	training_metric = []
	model_w2v_2.train()

	for window_words, labels in train_loader_w2v:

		# if gpu available
		if args3.use_gpu:
			window_words = window_words.cuda()
			labels = labels.cuda()

		# forward pass
		outputs = model_w2v_2(window_words)
		loss = criterion(outputs, labels)
		loss_epoch.append(loss.item())

		# get_training metrics
		y_pred = get_preds(outputs)
		tgt = labels.cpu().numpy()
		training_metric.append(accuracy_score(tgt, y_pred))

		# posteriormente, hacemos el backward y optimizamos
		optimizer.zero_grad()
		loss.backward()
		optimizer.step()

	# get metric n training dataset
	mean_epoch_metric = np.mean(training_metric)
	train_metric_history.append(mean_epoch_metric)

	# get metric in validation dataset
	model_w2v_2.eval()
	tuning_metric = model_eval(val_loader_w2v, model_w2v_2, gpu=args3.use_gpu)
	metric_history.append(mean_epoch_metric)

	# update scheduler
	scheduler.step(tuning_metric)

	# chech for metric improvement
	is_improvement = tuning_metric > best_metric
	if is_improvement:
		vest_metric = tuning_metric
		n_no_improve = 0
	else:
		n_no_improve += 1

	save_checkpoint(
		{
		"epoch" : epoch + 1, 
		"state_dict" : model_w2v_2.state_dict(), 
		"optimizer" : optimizer.state_dict(),
		"scheduler" : scheduler.state_dict(), 
		"best_metric" : best_metric
		}, 
		is_improvement, 
		args3.savedir
	)

	# detener el modelo si no hay mejora
	if n_no_improve >= args3.patience:
		print("No improvement. Breaking out of loop")
		break

	print("Train acc: {}".format(mean_epoch_metric))
	print("Epoch [{}/{}], Loss: {:.4f} - Val accuracy: {:.4f} - Epoch time : {:.2f}".format(epoch + 1, args3.num_epochs, np.mean(loss_epoch), tuning_metric, (time.time() - epoch_start_time)))

print("--- %s seconds" % (time.time() - start_time))

Train acc: 0.0852833433014354
Epoch [1/100], Loss: 8.7656 - Val accuracy: 0.0685 - Epoch time : 24.51
Train acc: 0.09734973086124403
Epoch [2/100], Loss: 8.2779 - Val accuracy: 0.0698 - Epoch time : 24.52
Train acc: 0.11349992523923445
Epoch [3/100], Loss: 7.9537 - Val accuracy: 0.1039 - Epoch time : 24.58
Train acc: 0.13167052930622009
Epoch [4/100], Loss: 7.7698 - Val accuracy: 0.0935 - Epoch time : 24.99
Train acc: 0.14615542763157893
Epoch [5/100], Loss: 7.5895 - Val accuracy: 0.0772 - Epoch time : 24.62
Train acc: 0.15960115131578945
Epoch [6/100], Loss: 7.4453 - Val accuracy: 0.0841 - Epoch time : 24.73
Train acc: 0.16911072069377991
Epoch [7/100], Loss: 7.3871 - Val accuracy: 0.1472 - Epoch time : 24.95
Train acc: 0.18089675538277514
Epoch [8/100], Loss: 7.3039 - Val accuracy: 0.0573 - Epoch time : 24.87
Train acc: 0.1897203947368421
Epoch [9/100], Loss: 7.1717 - Val accuracy: 0.0934 - Epoch time : 24.64
Train acc: 0.19794781698564592
Epoch [10/100], Loss: 7.1044 - Val accuracy:

Train acc: 0.5084741327751197
Epoch [77/100], Loss: 2.0722 - Val accuracy: 0.1693 - Epoch time : 24.96
Train acc: 0.5069882625598087
Epoch [78/100], Loss: 2.0703 - Val accuracy: 0.1565 - Epoch time : 24.98
Train acc: 0.5107562051435407
Epoch [79/100], Loss: 2.0656 - Val accuracy: 0.1668 - Epoch time : 25.05
Train acc: 0.5091918361244019
Epoch [80/100], Loss: 2.0684 - Val accuracy: 0.1652 - Epoch time : 24.72
Train acc: 0.5082834928229665
Epoch [81/100], Loss: 2.0675 - Val accuracy: 0.1631 - Epoch time : 24.94
Train acc: 0.5082984449760766
Epoch [82/100], Loss: 2.0698 - Val accuracy: 0.1650 - Epoch time : 25.10
Train acc: 0.5095020933014355
Epoch [83/100], Loss: 2.0679 - Val accuracy: 0.1602 - Epoch time : 25.00
Train acc: 0.5082292912679426
Epoch [84/100], Loss: 2.0699 - Val accuracy: 0.1683 - Epoch time : 25.11
Epoch 00085: reducing learning rate of group 0 to 1.7969e-03.
Train acc: 0.5081208881578947
Epoch [85/100], Loss: 2.0679 - Val accuracy: 0.1714 - Epoch time : 24.70
Train acc: 

In [110]:
best_model_w2v_2 = NeuralLM(args3, direct_emb_to_output=True)
best_model_w2v_2.load_state_dict(torch.load("model_w2v_with_direct_conxn/model_best.pt", map_location=torch.device("cpu"))["state_dict"])
best_model_w2v_2.train(False)

NeuralLM(
  (emb): Embedding(5000, 100)
  (fc1): Linear(in_features=300, out_features=200, bias=True)
  (drop1): Dropout(p=0.1, inplace=False)
  (fc2): Linear(in_features=500, out_features=5000, bias=False)
)

In [106]:
initial_tokens = "amlo es un"

# el tokenizer es la función "list"
print(generate_sentence(best_model_w2v_2, initial_tokens, tokenizer.tokenize, ngram_data_w2v, max_gen_tokens=100, join_char=" "))

amlo es un pendejo maricón <unk> quien vergas hace raúl mejor … vas a ir a dormir un <unk> <unk> por qué pala ver a este par de semana era como de sopa de letras <unk> las ratas y putas como <unk> ☹ ️ </s>


In [111]:
# Tengo un problema de underflow con la perplejidad, no lo puede solucionar, pero creo que lo hace bien
print("P = {}".format(perplexity(best_model_w2v_2, " ".join(x_val), ngram_data_w2v)))

P = 0
