In [12]:
# 75.06 - 1C202 - TP2 
# Competencia de Machine learning
#
# https://www.kaggle.com/c/nlp-getting-started/overview
#
# Fuentes:
# https://realpython.com/python-keras-text-classification/
# https://towardsdatascience.com/data-augmentation-in-nlp-2801a34dfc28

# Importacion de librerias necesarias
import datetime
import os
import random
import re
import string
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# NLTK
# https://www.nltk.org
import nltk

# Download the NLTK resources listed below, in this order.
for nltk_resource in ('punkt', 'wordnet', 'stopwords', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)

from nltk.tokenize import TweetTokenizer
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer 

# NLPAUG
# https://github.com/makcedward/nlpaug
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc

from nlpaug.util import Action

# Split y K-Fold 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

# Importacion de Keras
# https://keras.io
from keras.models import Sequential, save_model, load_model
from keras import layers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# General configuration
# NOTE: this silences every warning category for the rest of the session.
warnings.simplefilter('ignore')
# Plot style used by matplotlib figures
plt.style.use('ggplot')

[nltk_data] Downloading package punkt to /Users/juan0511/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/juan0511/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/juan0511/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/juan0511/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [13]:
# CONFIGURATION section

# Input paths (CSV files read by the loading cell)
train_path = 'data/train.csv'
test_path = 'data/test.csv'

# Control flags: True enables the corresponding preprocessing step
lowercase_text = True          # lowercase the keyword and text columns
remove_noise = True            # regex-based cleanup of the text column
append_keywords = True         # append the keyword to the end of the text
remove_stopwords = True        # drop English stop-words
perform_lemmatization = True   # lemmatize every token as a verb
augument_dataset = False       # enlarge the train set with nlpaug

In [14]:
# Funciones AUXILIARES
# Convert a value to lowercase text
def f_lowercase_text(text):
    """Return the string form of ``text`` converted to lowercase.

    The argument is passed through ``str()`` first, so non-string values
    (numbers, ``None``, NaN, ...) are lowercased by their string representation.
    """
    as_text = str(text)
    return as_text.lower()

# Clean a text by means of regular expressions
def f_remove_noise(text):
    """Remove noisy fragments from ``text`` and return the cleaned string.

    The substitutions run in this order (the order matters, e.g. URLs must be
    removed before punctuation is stripped):

    1. anything enclosed in square brackets,
    2. URLs starting with ``http://``, ``https://`` or ``www.``,
    3. angle-bracket tags such as ``<b>``,
    4. every character in ``string.punctuation``,
    5. newlines, which are replaced by a single space so the words on both
       sides of a line break are not glued together,
    6. any token that contains a digit,
    7. every non-ASCII character.

    Parameters:
        text (str): the text to clean.

    Returns:
        str: the cleaned text. Whitespace is not collapsed, so removed
        fragments may leave consecutive spaces behind.
    """
    # Raw strings keep the regex escapes (\[, \S, \w, \d) from being parsed as
    # invalid Python string escapes.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = text.replace('\n', ' ')
    text = re.sub(r'\w*\d\w*', '', text)
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    return text

In [15]:
# Train and test sets: load, drop unused columns and clean the texts
# Load the input files
df_twitter_train = pd.read_csv(train_path, sep=',')
df_twitter_test = pd.read_csv(test_path, sep=',')

# Both sets go through exactly the same steps, so they are processed in one loop.
# Columns are reassigned on the frame itself, so the two names above keep
# pointing at the cleaned data.
for df_twitter in (df_twitter_train, df_twitter_test):
    # Fill the NaN keywords. The result is assigned back instead of calling
    # fillna(inplace=True) on the selected column: that chained in-place call
    # may act on a temporary object and leave the frame unchanged under pandas
    # Copy-on-Write, which would later turn NaN into the literal 'nan'.
    df_twitter['keyword'] = df_twitter['keyword'].fillna(' ')

    # Drop the location column
    df_twitter.drop(columns='location', inplace=True)

    # Basic text cleaning
    # Lowercase
    if lowercase_text:
        df_twitter['keyword'] = df_twitter['keyword'].apply(f_lowercase_text)
        df_twitter['text'] = df_twitter['text'].apply(f_lowercase_text)

    # Filter unwanted characters
    if remove_noise:
        # Keywords come URL-encoded: '%20' stands for a space
        df_twitter['keyword'] = df_twitter['keyword'].str.replace('%20', ' ', regex=False)
        df_twitter['text'] = df_twitter['text'].apply(f_remove_noise)

    # Append the keyword to the text
    if append_keywords:
        df_twitter['text'] = df_twitter['text'] + ' ' + df_twitter['keyword']

In [16]:
# NLTK processing
# Remove stop-words
if remove_stopwords:
    stop_words = set(stopwords.words('english'))

    def _drop_stopwords(sentence):
        """Return ``sentence`` without the whitespace-separated tokens found in ``stop_words``."""
        kept_tokens = [token for token in sentence.split() if token not in stop_words]
        return ' '.join(kept_tokens)

    df_twitter_train['text'] = df_twitter_train['text'].apply(_drop_stopwords)
    df_twitter_test['text'] = df_twitter_test['text'].apply(_drop_stopwords)

# Lemmatization
if perform_lemmatization:
    lem = WordNetLemmatizer()

    def _lemmatize_as_verbs(sentence):
        """Return ``sentence`` with every token lemmatized using the verb ("v") part of speech."""
        lemmas = [lem.lemmatize(token, "v") for token in sentence.split()]
        return ' '.join(lemmas)

    df_twitter_train['text'] = df_twitter_train['text'].apply(_lemmatize_as_verbs)
    df_twitter_test['text'] = df_twitter_test['text'].apply(_lemmatize_as_verbs)

In [17]:
# Processing of the train set with nlpaug
# Enlarge the train set with augmented copies of the tweets
if augument_dataset:
    # Work on an independent copy. pd.DataFrame(df) does not guarantee a copy
    # of the data, so writing to the new frame could also overwrite the
    # original tweets; .copy() makes the separation explicit.
    df_twitter_train_aug = df_twitter_train.copy()

    # Replace a word of the tweet with a synonym taken from WordNet
    aug_syn = naw.SynonymAug(aug_src='wordnet')
    # Swap a random word with another one in the tweet
    aug_swp = naw.RandomWordAug(action="swap")

    def _augment_text(augmenter, text):
        """Run ``augmenter.augment(text)`` and always return a single value.

        NOTE(review): newer nlpaug releases appear to return a list of results
        instead of a plain string - confirm against the installed version.
        Both shapes are accepted here: a list/tuple is unwrapped to its first
        element, and an empty one falls back to the untouched ``text``.
        """
        augmented = augmenter.augment(text)
        if isinstance(augmented, (list, tuple)):
            return augmented[0] if augmented else text
        return augmented

    # Apply both augmenters, one full pass each
    df_twitter_train_aug['text'] = df_twitter_train_aug['text'].apply(
        lambda text: _augment_text(aug_syn, text))
    df_twitter_train_aug['text'] = df_twitter_train_aug['text'].apply(
        lambda text: _augment_text(aug_swp, text))

    # Join the two dataframes; ignore_index avoids duplicated index labels
    df_twitter_train = pd.concat([df_twitter_train, df_twitter_train_aug], ignore_index=True)

In [18]:
# Write the output files
# A single timestamp is taken once and shared by both files, so the train and
# test outputs of one run carry the same suffix and can be paired.
# Format: YYYY.MM.DDTHH.MM.SS.ffffff (only dots, safe for file names).
output_dir = 'data/processed'
run_stamp = datetime.datetime.now().strftime('%Y.%m.%dT%H.%M.%S.%f')

# Output paths for the train and test files
train_clean_path = output_dir + '/train.' + run_stamp + '.csv'
test_clean_path = output_dir + '/test.' + run_stamp + '.csv'

# Create the output directory when missing; to_csv does not create it
os.makedirs(output_dir, exist_ok=True)

df_twitter_train.to_csv(train_clean_path, index=False)
df_twitter_test.to_csv(test_clean_path, index=False)

# Print a summary of the run
print('Operación finalizada!\n')
print('Pasaje a minúsculas:   ' + str(lowercase_text))
print('Limpieza básica:       ' + str(remove_noise))
print('Agregado de keywords:  ' + str(append_keywords))
print('Remoción de stopwords: ' + str(remove_stopwords))
print('Lematización:          ' + str(perform_lemmatization))
print('Aumento del set:       ' + str(augument_dataset) + '\n')
print('Generado train: \'' + train_clean_path + '\' - (' + str(len(df_twitter_train)) + ') registros.')
print('Generado test:  \'' + test_clean_path + '\' - (' + str(len(df_twitter_test)) + ') registros.')

Operación finalizada!

Pasaje a minúsculas:   True
Limpieza básica:       True
Agregado de keywords:  True
Remoción de stopwords: True
Lematización:          True
Aumento del set:       False

Generado train: 'data/processed/train.2020.08.02T17.55.54.541125.csv' - (7613) registros.
Generado test:  'data/processed/test.2020.08.02T17.55.54.541165.csv' - (3263) registros.
