In [3]:
import csv
import re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk import pos_tag
import itertools
import pickle
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Download the NLTK 'book' data collection (network access on first run).
# NOTE(review): presumably this supplies the tokenizer, stop-word, WordNet and
# POS-tagger data the functions below rely on - confirm against the NLTK data index.
nltk.download('book')

In [5]:
def sub_hashtags(tweet):
	"""Replace every hashtag in ``tweet`` with the placeholder token ``htag``.

	A hashtag is a ``#`` followed by one or more word characters (letters,
	digits or underscore). Letters are matched Unicode-aware, so accented
	hashtags such as ``#España`` are replaced as a whole instead of being
	cut at the first non-ASCII letter.

	Args:
		tweet: text of a single tweet (str).

	Returns:
		The text with each hashtag replaced by ``htag``.
	"""
	# \w on a str pattern matches Unicode word characters, not only [A-Za-z0-9_].
	return re.sub(r'#\w+', 'htag', tweet)

In [6]:
def sub_cashtags(tweet):
	"""Replace every cashtag in ``tweet`` with the placeholder token ``ctag``.

	A cashtag is a dollar sign followed by one or more ASCII letters, digits
	or underscores, so amounts such as a dollar sign directly followed by
	digits are replaced as well.

	Args:
		tweet: text of a single tweet (str).

	Returns:
		The text with each cashtag replaced by ``ctag``.
	"""
	# Raw string: the backslash-dollar escape is meant for the regex engine,
	# and is not a valid escape sequence in a normal Python string literal.
	return re.sub(r'\$[A-Za-z0-9_]+', 'ctag', tweet)

In [7]:
def sub_user(tweet):
	"""Replace every @mention in ``tweet`` with the placeholder token ``user``.

	A mention is an ``@`` followed by one or more ASCII letters, digits or
	underscores.

	Args:
		tweet: text of a single tweet (str).

	Returns:
		The text with each mention replaced by ``user``.
	"""
	mention_pattern = re.compile('@[A-Za-z0-9_]+')
	return mention_pattern.sub('user', tweet)

In [8]:
def sub_urls(tweet):
	"""Replace every http/https URL in ``tweet`` with the placeholder ``url``.

	Args:
		tweet: text of a single tweet (str).

	Returns:
		The text with each URL replaced by ``url``.
	"""
	# Raw string so the escaped parentheses reach the regex engine untouched
	# (they are not valid escape sequences in a normal string literal).
	# Note that `$-_` inside the class is a character RANGE from '$' to '_',
	# which is what lets '/', ':', '?' and '=' match inside a URL.
	url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
	return re.sub(url_pattern, 'url', tweet)

In [9]:
def delete_punctuation(tweet):
	"""Remove every character listed in ``string.punctuation`` from ``tweet``.

	Characters are deleted, not replaced by a space, so words separated only
	by punctuation end up joined. Characters outside ``string.punctuation``
	are left in place.

	Args:
		tweet: text of a single tweet (str).

	Returns:
		The text without those punctuation characters.
	"""
	escaped_chars = re.escape(string.punctuation)
	punctuation_class = re.compile('[%s]' % escaped_chars)
	return punctuation_class.sub('', tweet)

In [10]:
def sub_numbers(tweet):
	"""Replace each run of digits and/or underscores with the token ``xyz``.

	Args:
		tweet: text of a single tweet (str).

	Returns:
		The text with every such run replaced by ``xyz``.
	"""
	digit_run = re.compile('[0-9_]+')
	return digit_run.sub('xyz', tweet)

In [11]:
def sub_repeated_chars(tweet):
	"""Collapse any character repeated three or more times in a row to one.

	Runs of exactly two identical characters are left untouched.

	Args:
		tweet: text of a single tweet (str).

	Returns:
		The text with long character runs collapsed.
	"""
	long_run = re.compile(r'(.)\1{2,}')
	return long_run.sub(r'\1', tweet)

In [12]:
def pre_process_pipeline(tweets):
	"""Run every text-cleaning step, in order, over each tweet.

	The steps are: hashtags, cashtags, user mentions, URLs, punctuation
	removal, numbers and repeated characters.

	Args:
		tweets: iterable of raw tweet strings.

	Returns:
		A list with one cleaned string per input tweet, in the same order.
	"""

	def _clean(text):
		# Each step receives the output of the previous one.
		for step in (
			sub_hashtags,
			sub_cashtags,
			sub_user,
			sub_urls,
			delete_punctuation,
			sub_numbers,
			sub_repeated_chars,
		):
			text = step(text)
		return text

	# clean all the tweets
	return [_clean(raw_tweet) for raw_tweet in tweets]

In [13]:
def tokenize_tweets(tweets):
	"""Split each tweet into tokens with NLTK's ``word_tokenize``.

	Args:
		tweets: iterable of tweet strings.

	Returns:
		A list holding, for every tweet, whatever ``word_tokenize`` returns
		for it, in input order.
	"""
	return [word_tokenize(text) for text in tweets]

In [14]:
def lower_case(tweets):
	"""Lower-case every token of every tokenized tweet.

	Args:
		tweets: iterable of tweets, each an iterable of string tokens.

	Returns:
		A list of lists with the same layout and every token lower-cased.
	"""
	# Distinct names for the collection, one tweet and one token: the loop
	# variable must not shadow the ``tweets`` parameter.
	return [[token.lower() for token in tweet] for tweet in tweets]

In [15]:
def filter_stopwords(tweets, additional_stp = None):
	"""Drop stop words and pipeline placeholder tokens from tokenized tweets.

	The stop list is NLTK's Spanish stop words plus the placeholder tokens
	written by the cleaning steps (``user``, ``htag``, ``ctag``, ``xyz``,
	``url``), the tokens ``mail``, ``rt``, ``qt`` and the empty string.

	Args:
		tweets: iterable of tweets, each an iterable of string tokens.
		additional_stp: optional iterable of extra tokens to filter out.

	Returns:
		A list of token lists with every stop word removed; token order
		inside each tweet is preserved.
	"""
	# A set makes each membership test constant-time instead of a scan over
	# the whole stop list for every token.
	stop_words = set(stopwords.words('spanish'))
	stop_words.update(('user', 'htag', 'ctag', 'xyz', 'mail', 'url', '', 'rt', 'qt'))

	if additional_stp is not None:
		stop_words.update(additional_stp)

	return [[w for w in tweet if w not in stop_words] for tweet in tweets]

In [16]:
def stemming(tweets, stemmer = None):
	"""Stem every token of every tokenized tweet.

	Args:
		tweets: iterable of tweets, each an iterable of string tokens.
		stemmer: optional object with a ``stem(word)`` method. When omitted
			a ``PorterStemmer`` is used, as before.
			NOTE(review): Porter is an English stemmer while the stop-word
			filter in this notebook is Spanish - pass a Spanish stemmer
			here if the tweets are Spanish; confirm the intended language.

	Returns:
		A list of lists with the same layout and every token stemmed.
	"""
	if stemmer is None:
		stemmer = PorterStemmer()

	return [[stemmer.stem(token) for token in tweet] for tweet in tweets]

In [17]:
def get_wordnet_pos(treebank_tag):
	"""Translate a POS tag into the matching WordNet part-of-speech constant.

	Only the first letter of the tag is inspected: ``J`` gives ``wn.ADJ``,
	``V`` gives ``wn.VERB``, ``R`` gives ``wn.ADV``; every other tag,
	including those starting with ``N``, gives ``wn.NOUN``.

	Args:
		treebank_tag: tag string (by its name, presumably a Penn Treebank
			tag as produced by ``pos_tag`` - confirm against the caller).

	Returns:
		One of ``wn.ADJ``, ``wn.VERB``, ``wn.ADV`` or ``wn.NOUN``.
	"""
	begins_with = treebank_tag.startswith

	# Nouns and unknown tags share the same fallback.
	return (
		wn.ADJ if begins_with('J')
		else wn.VERB if begins_with('V')
		else wn.ADV if begins_with('R')
		else wn.NOUN
	)

In [18]:
def eng_lemmatizer(tweets):
	"""Lemmatize every token of every tokenized tweet with WordNet.

	Each tweet is tagged with ``pos_tag``; each token is then lemmatized
	using the WordNet part of speech that ``get_wordnet_pos`` derives from
	its tag.

	Args:
		tweets: iterable of tweets, each a list of string tokens.

	Returns:
		A list of lists with the same layout and every token lemmatized.
	"""
	lemmatizer = WordNetLemmatizer()

	def _lemmatize_tweet(tokens):
		# pos_tag yields (token, tag) pairs.
		return [
			lemmatizer.lemmatize(word, get_wordnet_pos(tag))
			for word, tag in pos_tag(tokens)
		]

	return [_lemmatize_tweet(tokens) for tokens in tweets]

In [24]:
def process_tweets(file_name, output_path = 'canelo.pickle', encoding = None):
	"""Read tweets from a CSV file, run the full pipeline and pickle the result.

	The ``text`` column of a ``;``-delimited CSV is cleaned, tokenized,
	lower-cased, stop-word filtered and stemmed, in that order.

	Args:
		file_name: path of the CSV file; it must have a ``text`` column.
		output_path: where the processed tweets are pickled. Defaults to
			``'canelo.pickle'``, the previously hard-coded location.
		encoding: text encoding used to read the CSV. ``None`` keeps the
			platform default, as before.

	Returns:
		The stemmed tweets: a list with one token list per CSV row.

	Raises:
		KeyError: if a row has no ``text`` column.
	"""

	# newline='' lets the csv module handle line endings itself, so quoted
	# fields containing line breaks are read intact.
	with open(file_name, 'r', newline = '', encoding = encoding) as csv_file:
		csv_reader = csv.DictReader(csv_file, delimiter = ';')
		tweets = [row['text'] for row in csv_reader]

	# tweet pre-processing (text clean-up)
	prep_tweets = pre_process_pipeline(tweets)

	# tweet processing - tokenize
	tok_tweets = tokenize_tweets(prep_tweets)

	# tweet processing - lower case
	low_tweets = lower_case(tok_tweets)

	# tweet processing - stop-word filtering
	stop_tweets = filter_stopwords(low_tweets)

	# tweet processing - stemming
	stm_tweets = stemming(stop_tweets)

	# the processed tweets are saved to a pickle file for later use
	with open(output_path, 'wb') as f:
		pickle.dump(stm_tweets, f, pickle.HIGHEST_PROTOCOL)

	return stm_tweets

In [25]:
# Run the whole pipeline on 'ukraine.csv'. Besides returning the stemmed tweets,
# process_tweets also pickles them to 'canelo.pickle' in the working directory.
stm_tweets = process_tweets('ukraine.csv')