In [30]:
import csv
import re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk import pos_tag
import itertools
import pickle
import gensim
from gensim import corpora, models
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_validate
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, cross_validate, RepeatedStratifiedKFold
from sklearn.metrics import recall_score, f1_score, accuracy_score, precision_score
from sklearn.utils import class_weight
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.decomposition import PCA

In [None]:
# Fetch the NLTK "book" data collection. The cells below call word_tokenize,
# stopwords.words, pos_tag and WordNetLemmatizer, all of which need NLTK data
# to be present locally.
# NOTE(review): 'book' is a broad bundle; presumably only the tokenizer,
# stopwords, tagger and WordNet resources are actually required — confirm
# before narrowing the download.
nltk.download('book')

In [32]:
def sub_hashtags(tweet):
	"""Replace every hashtag in ``tweet`` with the placeholder token ``htag``.

	A hashtag is a ``#`` followed by one or more word characters. ``\\w`` is
	used instead of an ASCII-only class so that accented letters (``ñ``,
	``á``, ...) are treated as part of the tag; with an ASCII class a tag such
	as ``#España`` was only partially replaced, leaving ``htagña`` behind.

	Args:
		tweet: Text of a single tweet (``str``).

	Returns:
		The tweet with each hashtag substituted by ``htag``.
	"""
	return re.sub(r'#\w+', 'htag', tweet)

In [33]:
def sub_cashtags(tweet):
	"""Replace every cashtag in ``tweet`` with the placeholder token ``ctag``.

	A cashtag is a ``$`` followed by one or more ASCII letters, digits or
	underscores. The pattern is written as a raw string so that ``\\$`` reaches
	the regex engine as an escaped dollar sign instead of being an invalid
	string escape sequence.

	Args:
		tweet: Text of a single tweet (``str``).

	Returns:
		The tweet with each cashtag substituted by ``ctag``.
	"""
	return re.sub(r'\$[A-Za-z0-9_]+', 'ctag', tweet)

In [34]:
def sub_user(tweet):
	"""Swap each ``@mention`` found in ``tweet`` for the literal token ``user``.

	A mention is ``@`` followed by one or more ASCII letters, digits or
	underscores.
	"""
	mention_pattern = re.compile('@[A-Za-z0-9_]+')
	return mention_pattern.sub('user', tweet)

In [35]:
def sub_urls(tweet):
	"""Replace every ``http://`` / ``https://`` URL in ``tweet`` with ``url``.

	The regular expression itself is unchanged; it is now a raw string so the
	``\\(`` and ``\\)`` escapes are passed to the regex engine verbatim instead
	of being invalid string escape sequences.

	Args:
		tweet: Text of a single tweet (``str``).

	Returns:
		The tweet with each matched URL substituted by ``url``.
	"""
	url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
	return re.sub(url_pattern, 'url', tweet)

In [36]:
def delete_punctuation(tweet):
	"""Strip every character listed in ``string.punctuation`` from ``tweet``.

	Characters outside that ASCII set (for example ``¡`` or ``¿``) are kept.
	"""
	# A translation table that maps each punctuation character to "deleted".
	removal_table = str.maketrans('', '', string.punctuation)
	return tweet.translate(removal_table)

In [37]:
def sub_numbers(tweet):
	"""Collapse each run of ASCII digits and/or underscores into ``xyz``."""
	digit_runs = re.compile('[0-9_]+')
	return digit_runs.sub('xyz', tweet)

In [38]:
def sub_repeated_chars(tweet):
	"""Shrink any character repeated three or more times in a row to one copy.

	Runs of exactly two identical characters are left as they are.
	"""
	triple_or_more = re.compile(r'(.)\1{2,}')
	# Keep only the captured character of each run.
	return triple_or_more.sub(lambda run: run.group(1), tweet)

In [39]:
def pre_process_pipeline(tweets):
	"""Run the text-cleaning steps over every tweet and return the results.

	Each tweet goes through, in this order: hashtag, cashtag, user-mention and
	URL substitution, punctuation removal, number substitution and
	repeated-character collapsing.

	Args:
		tweets: Iterable of raw tweet strings.

	Returns:
		A list with one cleaned string per input tweet, in the same order.
	"""
	def _clean(text):
		# Apply the cleaning steps one after another; the order matters
		# because later steps see the placeholders written by earlier ones.
		for step in (
			sub_hashtags,
			sub_cashtags,
			sub_user,
			sub_urls,
			delete_punctuation,
			sub_numbers,
			sub_repeated_chars,
		):
			text = step(text)
		return text

	return [_clean(raw_tweet) for raw_tweet in tweets]

In [40]:
def tokenize_tweets(tweets):
	"""Tokenize each tweet with ``word_tokenize``.

	Args:
		tweets: Iterable of tweet strings.

	Returns:
		A list holding, for every tweet, the list of tokens produced by
		``word_tokenize``.
	"""
	return [word_tokenize(text) for text in tweets]

In [41]:
def lower_case(tweets):
	"""Lower-case every token of every tokenized tweet.

	The loop variable no longer reuses the name of the ``tweets`` parameter,
	which previously shadowed the argument while iterating over it.

	Args:
		tweets: Iterable of tokenized tweets (each one an iterable of ``str``).

	Returns:
		A list of token lists with every token converted to lower case.
	"""
	return [[token.lower() for token in tweet] for tweet in tweets]

In [42]:
def filter_stopwords(tweets, additional_stp = None):
	"""Remove stop words and pipeline placeholder tokens from tokenized tweets.

	The stop-word collection is the NLTK Spanish list plus the placeholders
	written by the cleaning functions (``user``, ``htag``, ``ctag``, ``xyz``,
	``url``), the tokens ``mail``, ``rt`` and ``qt``, and the empty string.
	It is kept in a ``set`` so each membership test is constant time rather
	than a scan over the whole list for every token.

	Args:
		tweets: Iterable of tokenized tweets (each one an iterable of
			hashable tokens, normally ``str``).
		additional_stp: Optional iterable of extra tokens to drop.

	Returns:
		A list of token lists containing only the tokens that are not stop
		words, in their original order.
	"""
	stop_words = set(stopwords.words('spanish'))
	stop_words.update(('user', 'htag', 'ctag', 'xyz', 'mail', 'url', '', 'rt', 'qt'))

	if additional_stp is not None:
		stop_words.update(additional_stp)

	return [[w for w in tweet if w not in stop_words] for tweet in tweets]

In [43]:
def stemming(tweets, stemmer=None):
	"""Stem every token of every tokenized tweet.

	Args:
		tweets: Iterable of tokenized tweets (each one an iterable of ``str``).
		stemmer: Optional object exposing a ``stem(word)`` method. When
			omitted, a ``PorterStemmer`` is created, which keeps the previous
			behaviour of this function.

	Returns:
		A list of token lists with each token replaced by its stem.

	Note:
		The Porter algorithm targets English, whereas ``filter_stopwords`` in
		this notebook uses the Spanish stop-word list. Callers working with
		Spanish text can pass a language-appropriate stemmer (for example
		NLTK's Snowball stemmer for Spanish) through ``stemmer``.
	"""
	if stemmer is None:
		stemmer = PorterStemmer()

	return [[stemmer.stem(token) for token in tweet] for tweet in tweets]

In [44]:
def get_wordnet_pos(treebank_tag):
	"""Translate a Treebank POS tag into the matching WordNet POS constant.

	Tags starting with ``J`` map to adjective, ``V`` to verb and ``R`` to
	adverb. Everything else, including tags starting with ``N``, maps to noun.
	"""
	if treebank_tag.startswith('J'):
		return wn.ADJ
	if treebank_tag.startswith('V'):
		return wn.VERB
	if treebank_tag.startswith('R'):
		return wn.ADV
	# Nouns and any tag not recognised above share the noun constant.
	return wn.NOUN

In [45]:
def eng_lemmatizer(tweets):
	"""Lemmatize tokenized tweets with WordNet, guided by POS tags.

	Every tweet is tagged with ``pos_tag``; each (token, tag) pair is then
	lemmatized using the WordNet POS returned by ``get_wordnet_pos`` for
	that tag.

	Args:
		tweets: Iterable of tokenized tweets (lists of ``str``).

	Returns:
		A list of token lists holding the lemmas, in the original order.
	"""
	lemmatizer = WordNetLemmatizer()

	def _lemmatize_tokens(tokens):
		return [
			lemmatizer.lemmatize(word, get_wordnet_pos(tag))
			for word, tag in pos_tag(tokens)
		]

	return [_lemmatize_tokens(tokens) for tokens in tweets]

In [56]:
def process_tweets(file_name, pickle_path='canelo.pickle', encoding=None):
	"""Load labelled tweets from a CSV file and run the full text pipeline.

	The CSV must have a header row with the columns ``text`` and
	``text_sentiment``. Tweets are cleaned, tokenized, lower-cased, filtered
	for stop words and stemmed; the processed tokens are also pickled to
	``pickle_path`` for later use.

	Args:
		file_name: Path of the comma-delimited CSV file to read.
		pickle_path: Destination of the pickle holding the processed tweets.
			Defaults to ``'canelo.pickle'``, the path that was previously
			hard-coded.
		encoding: Text encoding used to read the CSV. ``None`` (the default)
			keeps the platform default, as before; pass e.g. ``'utf-8'`` to
			make reading independent of the machine's locale.

	Returns:
		A tuple ``(stm_tweets, sentiment)``: the list of stemmed token lists
		and the list of integer sentiment labels, aligned by position.

	Raises:
		KeyError: If a row lacks the ``text`` or ``text_sentiment`` column.
		ValueError: If a ``text_sentiment`` value cannot be parsed as ``int``.
	"""
	tweets = []
	sentiment = []

	# newline='' is what the csv module expects: it lets the reader handle
	# line endings itself, so newlines embedded in quoted fields survive.
	with open(file_name, 'r', newline='', encoding=encoding) as csv_file:
		csv_reader = csv.DictReader(csv_file, delimiter = ',')
		for row in csv_reader:
			tweets.append(row['text'])
			sentiment.append(int(row['text_sentiment']))

	# tweet pre-processing (placeholder substitution and character clean-up)
	prep_tweets = pre_process_pipeline(tweets)

	# tokenization
	tok_tweets = tokenize_tweets(prep_tweets)

	# lower-casing
	low_tweets = lower_case(tok_tweets)

	# stop-word filtering
	stop_tweets = filter_stopwords(low_tweets)

	# stemming
	stm_tweets = stemming(stop_tweets)

	# store the processed tweets in a pickle file for later use
	with open(pickle_path, 'wb') as f:
		pickle.dump(stm_tweets, f, pickle.HIGHEST_PROTOCOL)

	return stm_tweets, sentiment

In [47]:
def plot2_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
	"""
	Print a confusion matrix and draw it as an annotated heat map with pyplot.

	When ``normalize`` is true each row of ``cm`` is divided by its row sum
	before printing and plotting, and cells are labelled with one decimal;
	otherwise the raw counts are shown as integers. The function draws through
	the ``plt`` state-machine interface and does not call ``plt.show()``.

	``cm`` must support ``astype``, ``sum(axis=1)``, ``max`` and 2-D indexing
	(a NumPy array); ``classes`` supplies the tick labels for both axes.
	"""
	if not normalize:
		print('Confusion matrix, without normalization')
	else:
		# Scale every row by its own total.
		row_totals = cm.sum(axis=1)[:, np.newaxis]
		cm = cm.astype('float') / row_totals
		print("Normalized confusion matrix")

	print(cm)

	plt.imshow(cm, interpolation='nearest', cmap=cmap, aspect='auto')
	plt.title(title)
	plt.colorbar()

	tick_positions = np.arange(len(classes))
	plt.xticks(tick_positions, classes, rotation=45)
	plt.yticks(tick_positions, classes)

	cell_format = '.1f' if normalize else 'd'
	# Cells above half of the maximum get white text, the rest black.
	half_max = cm.max() / 2.
	row_indices = range(cm.shape[0])
	col_indices = range(cm.shape[1])
	for row, col in itertools.product(row_indices, col_indices):
		cell_value = cm[row, col]
		if cell_value > half_max:
			text_color = "white"
		else:
			text_color = "black"
		plt.text(col, row, format(cell_value, cell_format), horizontalalignment="center", color=text_color)

	plt.ylabel('Clase correcta')
	plt.xlabel('Clase predicha')

In [48]:
def bow_to_vector(tweet, dict_len):
	"""Expand a sparse bag-of-words into a dense list of length ``dict_len``.

	Args:
		tweet: Iterable of entries whose element 0 is a vocabulary index and
			element 1 is the value to store at that index.
		dict_len: Size of the vocabulary, i.e. the length of the result.

	Returns:
		A list of ``dict_len`` values, zero everywhere except at the indices
		present in ``tweet``.
	"""
	# start from an all-zero vector the size of the vocabulary
	dense = [0 for _ in range(dict_len)]

	# copy each bag-of-words value into its slot
	for entry in tweet:
		token_id, count = entry[0], entry[1]
		dense[token_id] = count

	return dense

In [49]:
def make_bow(tweets, no_below=15, no_above=1, keep_n=100000):
	"""Build a gensim dictionary from tokenized tweets and their bag-of-words.

	The vocabulary-pruning thresholds, previously hard-coded, are exposed as
	keyword arguments whose defaults reproduce the earlier behaviour.

	Args:
		tweets: Sequence of tokenized tweets (lists of ``str``). It is
			traversed twice, so it must be re-iterable (not a one-shot
			generator).
		no_below: Forwarded to ``Dictionary.filter_extremes``; tokens that
			appear in fewer than this many documents are dropped.
		no_above: Forwarded to ``Dictionary.filter_extremes``; tokens that
			appear in more than this fraction of documents are dropped.
		keep_n: Forwarded to ``Dictionary.filter_extremes``; upper bound on
			the number of tokens kept after filtering.

	Returns:
		A tuple ``(bow_corpus, vocabulary_size)`` where ``bow_corpus`` holds
		the ``doc2bow`` output for each tweet and ``vocabulary_size`` is the
		length of the filtered dictionary.
	"""
	# build the vocabulary and prune it
	dictionary = gensim.corpora.Dictionary(tweets)
	dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)

	# encode every tweet against the pruned vocabulary
	bow_corpus = [dictionary.doc2bow(doc) for doc in tweets]

	return bow_corpus, len(dictionary)