###  Luis Ricardo Cruz García
#### Procesamiento de Lenguaje Natural (NLP)

#### Tarea 2

In [1]:
import math
import numpy as np

import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk import bigrams

from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import (
	accuracy_score, 
	confusion_matrix, 
	f1_score, 
	precision_recall_fscore_support, 
	roc_auc_score)

In [2]:
def get_texts_from_file(path_corpus: str, path_label: str) -> tuple[list, list]:
	"""Given the corpus and label paths, returns the list of docs and labels."""
	docs, labels = [], []

	with open(path_corpus, "r") as f_corpus:
		for doc in f_corpus:
			docs.append(doc)

	with open(path_label, "r") as f_labels:
		for label in f_labels:
			labels.append(label)

	return docs, labels

In [3]:
class Vocabulary:
	"""Vocabulary class to store the frequencies and ranking of the words in the given corpus."""
	def __init__(self, corpus_words: list[str], n_words : int = 5000):
		corpus_freqdist = nltk.FreqDist(corpus_words)  # frequency distribution
		
		# the n_words most occurring words
		self.vocabulary_word_freq = self._sort_FreqDist(corpus_freqdist)[:n_words]

		self.vocabulary = [word for word, freq in self.vocabulary_word_freq]
		
		# dictionary of the rank (frequency) of words in the vocabulary, word: freq_ranking
		self.word_to_index = {word: rank for rank, word in enumerate(self.vocabulary)}
	
	@staticmethod
	def _sort_FreqDist(fd: nltk.FreqDist) -> list:
		"""Return the list of items (pairs of <word, freq>) sorted by frequency (desc)."""
		aux = list(fd.items())
		aux.sort(key=lambda x: x[1], reverse=True)
		return aux

	def __len__(self):
		return len(self.vocabulary)
	
	def __getitem__(self, key: str) -> int:
		return self.word_to_index[key]

	def __contains__(self, key: str) -> bool:
		return key in self.word_to_index

In [4]:
tokenizer = TweetTokenizer()

In [5]:
# get training docs and labels
train_docs, train_labels = get_texts_from_file("../../Data/mex_train.txt", "../../Data/mex_train_labels.txt")
train_labels = list(map(int, train_labels))  # cast to integer

In [6]:
# get validation docs and labels
val_docs, val_labels = get_texts_from_file("../../Data/mex_val.txt", "../../Data/mex_val_labels.txt")
val_labels = list(map(int, val_labels))

In [7]:
corpus_words = []
for doc in train_docs:
	corpus_words += tokenizer.tokenize(doc)

# remove stopwords and set to lowercase
set_stopwords = set(stopwords.words("spanish"))
corpus_words = [w.lower() for w in corpus_words if w not in set_stopwords]

In [8]:
# create a vocabulary object for the 5000 most occurring words
vocabulary_5k = Vocabulary(corpus_words)

## 2. Bolsas de Palabras, Bigramas y Emociones**

Representa los documentos y clasifica con SVM similar a la Práctica 3, pero con diferentes pesados de términos.

In [10]:
parameters = {'C' : [0.05, 0.12, 0.25, 0.5, 1, 2, 4]}

In [11]:
def prediction_svm_model(train_BoW, validation_BoW, train_labels, max_iter=2000):
	"""Creates a linear SVM classification model and returns the prediction 
	of labels for the validation BoW.
	"""
	svm_lin_class = svm.LinearSVC(class_weight="balanced", 
								  max_iter=max_iter)
	
	grid = GridSearchCV(
				estimator=svm_lin_class, 
				param_grid=parameters, 
				n_jobs=8, 
				scoring="f1_macro", 
				cv=5)
	
	grid.fit(train_BoW, train_labels)
	
	y_prediction = grid.predict(validation_BoW)
	
	return y_prediction

### 1. Evalue BoW (Bag of Words) con pesado binario

In [12]:
def build_BoW_bin(docs: list[str], 
				  vocabulary: Vocabulary) -> np.ndarray:
	BoW = np.zeros((len(docs), len(vocabulary)), dtype=float)

	for i, doc in enumerate(docs):
		doc_words = set(tokenizer.tokenize(doc))
		for word in doc_words:
			if word in vocabulary:
				BoW[i, vocabulary[word]] = 1.

	return BoW

In [13]:
train_BoW_bin = build_BoW_bin(train_docs, vocabulary_5k)
val_BoW_bin = build_BoW_bin(val_docs, vocabulary_5k)

In [14]:
labels_pred_bin = prediction_svm_model(train_BoW_bin, val_BoW_bin, train_labels)

In [15]:
print(f"confusion matrix:\n{confusion_matrix(val_labels, labels_pred_bin)}")
print("-" * 70)
print(metrics.classification_report(val_labels, labels_pred_bin))

confusion matrix:
[[326  71]
 [ 64 155]]
----------------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.84      0.82      0.83       397
           1       0.69      0.71      0.70       219

    accuracy                           0.78       616
   macro avg       0.76      0.76      0.76       616
weighted avg       0.78      0.78      0.78       616



### 2. Evalue BoW con pesado de frecuencia

In [16]:
def build_BoW_freq(docs: list[str], 
				   vocabulary: Vocabulary) -> np.ndarray:
	BoW = np.zeros((len(docs), len(vocabulary)), dtype=float)

	for i, doc in enumerate(docs):
		fdist_docs = nltk.FreqDist(tokenizer.tokenize(doc))
		for w in vocabulary.vocabulary:
			BoW[i, vocabulary[w]] = fdist_docs[w]

	return BoW

In [17]:
train_BoW_freq = build_BoW_freq(train_docs, vocabulary_5k)
val_BoW_freq = build_BoW_freq(val_docs, vocabulary_5k)

In [18]:
labels_pred_freq = prediction_svm_model(train_BoW_freq, val_BoW_freq, train_labels, max_iter=3000)

In [19]:
print(f"confusion matrix:\n{confusion_matrix(val_labels, labels_pred_freq)}")
print("-" * 70)
print(metrics.classification_report(val_labels, labels_pred_freq))

confusion matrix:
[[328  69]
 [ 63 156]]
----------------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.84      0.83      0.83       397
           1       0.69      0.71      0.70       219

    accuracy                           0.79       616
   macro avg       0.77      0.77      0.77       616
weighted avg       0.79      0.79      0.79       616



### 3. Evalue BoW con pesado de tf-idf

$BoW = (w_{i,j})$, where
$$w_{i,j} = \operatorname{tf}(t_j, d_i) \times \log\left(\frac{N}{\operatorname{df}(t_j)} \right)$$
where
- $\operatorname{tf}(t_j, d_i) =$ number of occurrences of term $t_j$ in document $d_j$. 
- $N =$ number documents. 
- $\operatorname{df}(t_j) =$ number documents s.t. $t_j$ term is contained in.


In [20]:
def build_BoW_tfidf(docs: list[str], 
					vocabulary: Vocabulary) -> np.ndarray:
	N = len(docs)
	BoW = np.zeros((len(docs), len(vocabulary)), dtype=float)

	tokenized_tweets = [tokenizer.tokenize(tweet) for tweet in docs]
	df = [0] * len(vocabulary)
	for i, w in enumerate(vocabulary.vocabulary):
		df[i] = 1 + sum(1 for j in range(len(docs)) if w in tokenized_tweets[j])

	for i, tweet in enumerate(docs):
		fdist_tweet = nltk.FreqDist(tokenized_tweets[i])
		for j, w in enumerate(vocabulary.vocabulary):
			BoW[i, vocabulary[w]] = float(fdist_tweet[w]) * math.log(N / df[j])

	return BoW

In [21]:
train_BoW_tfidf = build_BoW_tfidf(train_docs, vocabulary_5k)
val_BoW_tfidf = build_BoW_tfidf(val_docs, vocabulary_5k)

In [22]:
labels_pred_tfidf = prediction_svm_model(train_BoW_tfidf, val_BoW_tfidf, train_labels, max_iter=20000)



In [23]:
print(f"confusion matrix:\n{confusion_matrix(val_labels, labels_pred_tfidf)}")
print("-" * 70)
print(metrics.classification_report(val_labels, labels_pred_tfidf))

confusion matrix:
[[328  69]
 [ 75 144]]
----------------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.81      0.83      0.82       397
           1       0.68      0.66      0.67       219

    accuracy                           0.77       616
   macro avg       0.74      0.74      0.74       616
weighted avg       0.76      0.77      0.77       616



### 4. Evalue BoW con pesado binario normalizado $l_2$ (no use sklearn).

In [24]:
def l2_normalize(M: np.ndarray) -> np.ndarray:
	"""Normalize (using the l_2 norm) the rows of a rank-2 numpy array (matrix). 
	Returns the matrix normalized.
	"""
	return M * (1 / np.sqrt(np.square(M).sum(axis=1)[:, np.newaxis]))

In [25]:
train_BoW_bin_normalized = l2_normalize(train_BoW_bin)
val_BoW_bin_normalized = l2_normalize(val_BoW_bin)

In [26]:
labels_pred_bin_normalized = prediction_svm_model(train_BoW_bin_normalized, val_BoW_bin_normalized, train_labels, max_iter=2000)

In [27]:
print(f"confusion matrix:\n{confusion_matrix(val_labels, labels_pred_bin_normalized)}")
print("-" * 70)
print(metrics.classification_report(val_labels, labels_pred_bin_normalized))

confusion matrix:
[[328  69]
 [ 61 158]]
----------------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.84      0.83      0.83       397
           1       0.70      0.72      0.71       219

    accuracy                           0.79       616
   macro avg       0.77      0.77      0.77       616
weighted avg       0.79      0.79      0.79       616



### 5. Evalue BoW con pesado frecuencia normalizado l2 (no use sklearn).

In [28]:
train_BoW_freq_normalized = l2_normalize(train_BoW_freq)
val_BoW_freq_normalized = l2_normalize(val_BoW_freq)

In [29]:
labels_pred_freq_normalized = prediction_svm_model(train_BoW_freq_normalized, val_BoW_freq_normalized, train_labels, max_iter=3000)

In [30]:
print(f"confusion matrix:\n{confusion_matrix(val_labels, labels_pred_freq_normalized)}")
print("-" * 70)
print(metrics.classification_report(val_labels, labels_pred_freq_normalized))

confusion matrix:
[[326  71]
 [ 61 158]]
----------------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.84      0.82      0.83       397
           1       0.69      0.72      0.71       219

    accuracy                           0.79       616
   macro avg       0.77      0.77      0.77       616
weighted avg       0.79      0.79      0.79       616



### 6. Evalué BoW con pesado tfidf normalizado l2 (no use sklearn).

In [31]:
train_BoW_tfidf_normalized = l2_normalize(train_BoW_tfidf)
val_BoW_tfidf_normalized = l2_normalize(val_BoW_tfidf)

In [32]:
labels_pred_tfidf_normalized = prediction_svm_model(train_BoW_tfidf_normalized, val_BoW_tfidf_normalized, train_labels, max_iter=20000)

In [33]:
print(f"confusion matrix:\n{confusion_matrix(val_labels, labels_pred_tfidf_normalized)}")
print("-" * 70)
print(metrics.classification_report(val_labels, labels_pred_tfidf_normalized))

confusion matrix:
[[320  77]
 [ 64 155]]
----------------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.83      0.81      0.82       397
           1       0.67      0.71      0.69       219

    accuracy                           0.77       616
   macro avg       0.75      0.76      0.75       616
weighted avg       0.77      0.77      0.77       616



### 7. Ponga una tabla comparativa a modo de resumen con las seis entradas anteriores.

| | Binario | Frecuencia | tf-idf | Binario normalizado | Frecuencia normalizado | tf-idf normalizado |
|--|---------------|----------------------|---------------|--|--|--|
|Accuracy| 0.78 | 0.79 | 0.77 | 0.79 | 0.79 | 0.77|

### 8. De las configuraciones anteriores elija la mejor y evalúela con más y menos términos (1000 y 7000 términos)

La mejor configuración fue la de BoW con pesado de frecuencia, lo probaré con 7000 y 1000 términos.

In [34]:
vocabulary_7k = Vocabulary(corpus_words, n_words=7000)
vocabulary_1k = Vocabulary(corpus_words, n_words=1000)

Modelo con 7000 términos

In [97]:
train_BoW_freq_7k = build_BoW_freq(train_docs, vocabulary_7k)
val_BoW_freq_7k = build_BoW_freq(val_docs, vocabulary_7k)

labels_pred_freq_7k = prediction_svm_model(train_BoW_freq_7k, val_BoW_freq_7k, train_labels, max_iter=3000)

print(f"confusion matrix:\n{confusion_matrix(val_labels, labels_pred_freq_7k)}")
print("-" * 70)
print(metrics.classification_report(val_labels, labels_pred_freq_7k))

confusion matrix:
[[333  64]
 [ 65 154]]
----------------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.84      0.84      0.84       397
           1       0.71      0.70      0.70       219

    accuracy                           0.79       616
   macro avg       0.77      0.77      0.77       616
weighted avg       0.79      0.79      0.79       616



Modelo con 1000 términos.

In [101]:
train_BoW_freq_1k = build_BoW_freq(train_docs, vocabulary_1k)
val_BoW_freq_1k = build_BoW_freq(val_docs, vocabulary_1k)

labels_pred_freq_1k = prediction_svm_model(train_BoW_freq_1k, val_BoW_freq_1k, train_labels, max_iter=10000)

print(f"confusion matrix:\n{confusion_matrix(val_labels, labels_pred_freq_1k)}")
print("-" * 70)
print(metrics.classification_report(val_labels, labels_pred_freq_1k))

confusion matrix:
[[329  68]
 [ 58 161]]
----------------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.85      0.83      0.84       397
           1       0.70      0.74      0.72       219

    accuracy                           0.80       616
   macro avg       0.78      0.78      0.78       616
weighted avg       0.80      0.80      0.80       616



### 9. Utilice el recurso léxico del Consejo Nacional de Investigación de Canadá llamado "EmoLex" (https://www.saifmohammad.com/WebPages/NRC-Emotion-Lexicon.htm) para construir una "Bolsa de Emociones" de los Tweets de agresividad (Debe usar EmoLex en Español). Para esto, una estrategia sencilla sería enmascarar cada palabra con su emoción, y después construir la Bolsa de Emociones (BoE).

In [37]:
# get NRC data
text_emotions = []
with open("../../Data/Spanish-es-NRC-Emotion-Intensity-Lexicon-v1.txt", "r") as file:
	for line in file:
		text_emotions += [line]

# remove the first line (has no words)
text_emotions = text_emotions[1:]

word_emotion = {}
for line in text_emotions:
	tmp = line.split("\t")
	word_emotion[tmp[1]] = tmp[2]

emotion_index = {'anger': 0, 
				 'anticipation': 1, 
				 'disgust': 2, 
				 'fear': 3, 
				 'joy' : 4, 
				 'sadness' : 5, 
				 'surprise' : 6, 
				 'trust' : 7}

In [38]:
def build_BoE(docs: list[str], 
			  emotion_index: dict[str: int], 
			  word_emotion: dict[str, str]):
	BoE = np.zeros((len(docs), len(emotion_index)))

	for i, tweet in enumerate(docs):
		tweet_words = tokenizer.tokenize(tweet)
		for word in tweet_words:
			if word in word_emotion:
				# frequency weight
				BoE[i, emotion_index[word_emotion[word]]] += 1

	return BoE

### 10. Evalúa tú BoE clasificando con SVM. Ponga una tabla comparativa a modo de resumen con los tres pesados, normalize cada uno si lo cree conveniente.

In [39]:
train_BoE = build_BoE(train_docs, emotion_index, word_emotion)
val_BoE = build_BoE(val_docs, emotion_index, word_emotion)

In [41]:
labels_pred_emotions = prediction_svm_model(train_BoE, val_BoE, train_labels, max_iter=10000)

print(f"confusion matrix:\n{confusion_matrix(val_labels, labels_pred_emotions)}")
print("-" * 70)
print(metrics.classification_report(val_labels, labels_pred_emotions))

confusion matrix:
[[202 195]
 [ 98 121]]
----------------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.67      0.51      0.58       397
           1       0.38      0.55      0.45       219

    accuracy                           0.52       616
   macro avg       0.53      0.53      0.52       616
weighted avg       0.57      0.52      0.53       616



# 3 Recurso Lingüístico de Emociones Mexicano

### 1. Utilice el recurso léxico llamado "Spanish Emotion Lexicon (SEL)" del Dr. Grigori Sidorov, profesor del Centro de Investigación en Computación (CIC) del IPN (http://www.cic.ipn.mx/∼sidorov/), para enmascarar cada palabra con su emoción, y después construir la Bolsa de Emociones con algún pesado (e.g., binario, tf, tfidf). Proponga alguna estrategia para incorporar el "valor" del "Probability Factor of Affective use" en su representación vectorial del documento. Evalúa y escribe una tabla comparativa a modo de resumen con al menos tres pesados: binario, frecuencia, tfidf. Normalize cada pesado según lo crea conveniente de acuerdo al experimento (1).

In [54]:
text_emotions_SEL = []
with open("../../Data/SEL.txt", "r", encoding='latin-1') as file:
	for line in file:
		text_emotions_SEL += [line]

# remove the first line (has no words)
text_emotions_SEL = text_emotions_SEL[1:]

word_emotion_SEL = {}
for line in text_emotions_SEL:
	tmp = line.split("\t")
	word_emotion_SEL[tmp[0]] = [tmp[2], tmp[1]]

emotion_index_SEL = {'Alegría\n': 0, 
					 'Enojo\n': 1, 
					 'Miedo\n': 2, 
					 'Repulsión\n': 3, 
					 'Sorpresa\n': 4, 
					 'Tristeza\n': 5}

In [58]:
def build_BoE_PFA(docs: list[str], 
				  emotion_index: dict[str, int], 
				  word_emotion: dict[str, str]) -> np.ndarray:
	BoE = np.zeros((len(docs), len(emotion_index)), dtype=float)

	for i, tweet in enumerate(docs):
		tweet_words = tokenizer.tokenize(tweet)
		for word in tweet_words:
			if word in word_emotion:
				BoE[i, emotion_index[word_emotion[word][0]]] += float(word_emotion[word][1])

	return BoE

In [61]:
train_BoE_SEL = build_BoE_PFA(train_docs, emotion_index_SEL, word_emotion_SEL)
val_BoE_SEL = build_BoE_PFA(val_docs, emotion_index_SEL, word_emotion_SEL)

In [62]:
labels_pred_SEL = prediction_svm_model(train_BoE_SEL, val_BoE_SEL, train_labels, max_iter=2000)

print(f"confusion matrix:\n{confusion_matrix(val_labels, labels_pred_SEL)}")
print("-" * 70)
print(metrics.classification_report(val_labels, labels_pred_SEL))

confusion matrix:
[[358  39]
 [178  41]]
----------------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.67      0.90      0.77       397
           1       0.51      0.19      0.27       219

    accuracy                           0.65       616
   macro avg       0.59      0.54      0.52       616
weighted avg       0.61      0.65      0.59       616



### 2. En un comentario aparte, discuta sobre la estrategía que utilizó para incorporar el "Probability Factor of Affective use". No más de 5 renglones.

Agregé la parte de PFA haciendo un tipo pesado de frecuencia en el cual le sumaba el PFA de cada palabra a la emoción que correspondía, creo es la forma más natural.

# 4 ¿Podemos mejorar con Bigramas?

#### 1. Hacer un experimento dónde concatene una buena BoW según sus experimentos anteriores con otra BoW construida a partir de los 1000 bigramas más frecuentes.

In [63]:
bigrams_corpus = list(bigrams(corpus_words))
vocabulary_bigrams = Vocabulary(bigrams_corpus, n_words=1000)

In [106]:
def build_BoW_with_bigrams_freq(docs: list[str], 
								vocabulary: Vocabulary, 
								vocabulary_bigrams: Vocabulary) -> np.ndarray:
	BoW_freq = build_BoW_freq(docs, vocabulary)
	BoW_bigrams_freq = build_BoW_freq(docs, vocabulary_bigrams)
	return np.concatenate((BoW_freq, BoW_bigrams_freq), axis=1)

In [107]:
train_BoW_with_bigrams_freq = build_BoW_with_bigrams_freq(train_docs, vocabulary_1k, vocabulary_bigrams)
val_BoW_with_bigrams_freq = build_BoW_with_bigrams_freq(val_docs, vocabulary_1k, vocabulary_bigrams)

In [110]:
labels_pred_BoW_with_bigrams_freq = prediction_svm_model(train_BoW_with_bigrams_freq, val_BoW_with_bigrams_freq, train_labels, max_iter=10000)

print(f"confusion matrix:\n{confusion_matrix(val_labels, labels_pred_BoW_with_bigrams_freq)}")
print("-" * 70)
print(metrics.classification_report(val_labels, labels_pred_BoW_with_bigrams_freq))

confusion matrix:
[[329  68]
 [ 58 161]]
----------------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.84      0.83      0.84       397
           1       0.71      0.72      0.71       219

    accuracy                           0.79       616
   macro avg       0.77      0.78      0.78       616
weighted avg       0.79      0.79      0.79       616



### 2. Hacer un experimento con las Bolsas de Emociones, Bolsa de Palabras y Bolsa de Bigramas; usted elige las dimensionalidades. Para construir la representación final del documento utilice la concatenación de las representaciones según sus observaciones (e.g., Bolsa de Palabras + Bolsa de Bigramas + Bolsa de Sentimientos de Canadá + Bolsa de Sentimientos de Grigori), y aliméntelas a un SVM.

In [111]:
def build_BoW_with_bigrams_and_emotions_freq(docs: list[str], 
											 vocabulary: Vocabulary, 
											 vocabulary_bigrams: Vocabulary, 
											 emotion_index: dict[str, int], 
											 word_emotion: dict[str, str]) -> np.ndarray:
	BoW_freq = build_BoW_freq(docs, vocabulary)
	BoW_bigrams_freq = build_BoW_freq(docs, vocabulary_bigrams)
	BoE_PFA = build_BoE_PFA(docs, emotion_index, word_emotion)
	return np.concatenate((BoW_freq, BoW_bigrams_freq, BoE_PFA), axis=1)

In [112]:
train_BoW_with_bigrams_and_emotions_freq = build_BoW_with_bigrams_and_emotions_freq(train_docs, vocabulary_1k, vocabulary_bigrams, emotion_index_SEL, word_emotion_SEL)
val_BoW_with_bigrams_and_emotions_freq = build_BoW_with_bigrams_and_emotions_freq(val_docs, vocabulary_1k, vocabulary_bigrams, emotion_index_SEL, word_emotion_SEL)

In [114]:
labels_pred_BoW_with_bigrams_and_emotions_freq = prediction_svm_model(train_BoW_with_bigrams_and_emotions_freq, val_BoW_with_bigrams_and_emotions_freq, train_labels, max_iter=10000)

print(f"confusion matrix:\n{confusion_matrix(val_labels, labels_pred_BoW_with_bigrams_and_emotions_freq)}")
print("-" * 70)
print(metrics.classification_report(val_labels, labels_pred_BoW_with_bigrams_and_emotions_freq))

confusion matrix:
[[328  69]
 [ 57 162]]
----------------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.85      0.83      0.84       397
           1       0.70      0.74      0.72       219

    accuracy                           0.80       616
   macro avg       0.78      0.78      0.78       616
weighted avg       0.80      0.80      0.80       616



### 3. Elabore conclusiones sobre toda esta Tarea, incluyendo observaciones, comentarios y posibles mejoras futuras. Discuta el comportamiento de la BoW de usar solo palabras a integrar bigramas, y luego a integrar todo ¿ayudó? o ¿empeoró?. Discuta también brevemente el costo computacional de los experimentos ¿Valió la Pena tener todo?. Sea breve: todo en NO más de dos párrafos.

Resulta que, a pesar de que hay formas más complejas de representar a un documento (en este caso, un tweet) y que uno pensaría que estas capturan mejor el "significado" de un texto, en este caso, una de las formas más simples (una BoW con pesado de frecuencia y sólo 1000 términos) resultó ser la mejor (mayor accuracy) para clasificar los tweets agresivos.

En cuanto a si valía la pena tener todo, depende, agregar los bigramas (a la BoW) resulto ser ligeramente menos efectivo (en accuracy) que una BoW con pesado de frecuencia (usando 1000 términos), así que no valió la pena, pero, al combinar BoW, BoE y bigramas el accuracy mejoró ligeramente (a sólo tener BoW y bigramas), pero esto sólo equiparó al mejor modelo obtenido previamente, el cual es más simple, así que es discutible el si valió la pena. 

Al final esta tarea deja ver que una idea simple puede ser la que mejor desempeño tiene, incluso superando ideas mucho más complejas que, en principio, uno podría pensar tienen mayor chance de capturar la complejidad de unos tweets.