In [90]:
# 75.06 - 1C202 - TP2 
# Competencia de Machine learning
#
# https://www.kaggle.com/c/nlp-getting-started/overview
#
# Fuentes:
# https://realpython.com/python-keras-text-classification/
# https://towardsdatascience.com/data-augmentation-in-nlp-2801a34dfc28

# Importacion de librerias necesarias
import os
import re, string, random, datetime
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# NLTK
# https://www.nltk.org
import nltk
# Fetch every NLTK resource used by this notebook, in a fixed order
for nltk_resource in ('punkt', 'wordnet', 'stopwords', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)

from nltk.tokenize import TweetTokenizer
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer 

# NLPAUG
# https://github.com/makcedward/nlpaug
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc

from nlpaug.util import Action

# Split y K-Fold 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

# Importacion de Keras
# https://keras.io
from keras.models import Sequential, save_model, load_model
from keras import layers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# General configuration
# Use the 'ggplot' matplotlib style for every figure
plt.style.use('ggplot')
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation and syntax warnings raised by the cells below.
warnings.filterwarnings('ignore')

[nltk_data] Downloading package punkt to /home/andres/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/andres/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/andres/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/andres/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [91]:
# CONFIGURATION section

# Input paths (CSV files read by the loading cell)
train_path = 'data/train.csv'
test_path = 'data/test.csv'

# Control flags: each one switches a step of the cleaning pipeline on or off.
# append_keywords       -> append the keyword to the tweet text
# lowercase_text        -> lower-case text, keyword and location
# remove_noise          -> run f_remove_noise on the text and decode '%20' in keywords
# remove_stopwords      -> drop NLTK English stop-words from the text
# perform_lemmatization -> lemmatize every word as a verb with WordNet
# augument_dataset      -> extend the train set with nlpaug-generated tweets
# final_check           -> replace texts left empty after cleaning with '0'
append_keywords = False
lowercase_text = True
remove_noise = True
remove_stopwords = False
perform_lemmatization = False
augument_dataset = False
final_check = True

# Flags for the columns meant to feed the MLP.
# sin_keywords / sin_location -> when True the corresponding column is dropped
# con_longitud                -> add a 'length' column (text length)
# con_total_palabras          -> add a 'totalwords' column (word count)
sin_keywords = False
sin_location = False
con_longitud = True
con_total_palabras = True

In [92]:
# Funciones AUXILIARES
# Every ASCII punctuation character except the apostrophe
punc = [symbol for symbol in string.punctuation if symbol != "'"]

# Lower-case helper
def f_lowercase_text(text):
    """Return ``text`` converted to ``str`` and lower-cased.

    Non-string values are stringified first, so ``None`` becomes ``'none'``.
    """
    as_string = str(text)
    return as_string.lower()

# Cleans a text with a fixed sequence of regular-expression substitutions
def f_remove_noise(text):
    """Strip noise from ``text`` and return the cleaned string.

    The substitutions run in this order, each replacing its match with '':
      1. square-bracketed fragments such as ``[video]``
      2. URLs starting with ``http://``, ``https://`` or ``www.``
      3. angle-bracketed tags such as ``<b>``
      4. every character in ``string.punctuation``
      5. newline characters
      6. any word that contains a digit
      7. any run of non-ASCII characters

    All patterns are raw strings: sequences like ``\\S`` or ``\\d`` inside a
    normal string literal are invalid escapes that Python only tolerates with
    a warning.

    Args:
        text: the string to clean (must be ``str``; ``re.sub`` rejects others).

    Returns:
        The cleaned string. Whitespace is not collapsed, so a removed fragment
        can leave consecutive spaces behind.
    """
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    # Newlines are deleted, not replaced by a space, so the words on either side get joined
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    return text

In [93]:
# Train and test sets: read both CSV files and fill the gaps.
# fillna('empty') applies to every column, so a missing keyword or location
# ends up holding the placeholder string 'empty'.
df_twitter_train = pd.read_csv(train_path, sep=',').fillna('empty')
df_twitter_test = pd.read_csv(test_path, sep=',').fillna('empty')

In [94]:
# Basic text clean-up. Every step is applied identically to the train and the
# test frame, so each one loops over both instead of repeating the statement.

# Lower-case the free-text columns
if lowercase_text:
    for frame in (df_twitter_train, df_twitter_test):
        for column in ('text', 'keyword', 'location'):
            frame[column] = frame[column].apply(f_lowercase_text)

# Strip unwanted characters
if remove_noise:
    for frame in (df_twitter_train, df_twitter_test):
        # Decode the URL-encoded space in keywords. The 'empty' placeholder
        # contains no '%20', so it needs no special case.
        frame['keyword'] = frame['keyword'].apply(lambda kw: kw.replace('%20', ' '))
        frame['text'] = frame['text'].apply(f_remove_noise)

# Append the keyword to the text
if append_keywords:
    for frame in (df_twitter_train, df_twitter_test):
        # Compare by value: `is 'empty'` tests object identity, which is False
        # for the strings rebuilt by the lower-casing step, so the placeholder
        # itself used to be appended to tweets that have no keyword.
        no_keyword = frame['keyword'] == 'empty'
        frame['text'] = frame['text'].where(no_keyword, frame['text'] + ' ' + frame['keyword'])

# NLTK processing
# Drop English stop-words
if remove_stopwords:
    stop_words = set(stopwords.words('english'))
    for frame in (df_twitter_train, df_twitter_test):
        frame['text'] = frame['text'].apply(
            lambda x: ' '.join(word for word in x.split() if word not in stop_words))

# Lemmatization: every word is lemmatized as a verb ("v")
if perform_lemmatization:
    lem = WordNetLemmatizer()
    for frame in (df_twitter_train, df_twitter_test):
        frame['text'] = frame['text'].apply(
            lambda x: ' '.join(lem.lemmatize(word, "v") for word in x.split()))

In [95]:
# Train-set augmentation with nlpaug: add one synthetic tweet per original
# tweet to enlarge the training set.
if augument_dataset:
    # Work on a real copy. pd.DataFrame(df) does not copy the underlying data,
    # so overwriting 'text' on such a wrapper can overwrite the original
    # tweets as well, leaving two augmented copies and no original.
    df_twitter_train_aug = df_twitter_train.copy()

    # Replaces a word of the tweet with a WordNet synonym
    aug_syn = naw.SynonymAug(aug_src='wordnet')
    # Swaps random words inside the tweet
    aug_swp = naw.RandomWordAug(action="swap")

    def _augment_text(text):
        """Apply the synonym and then the swap augmenter; return one string."""
        for augmenter in (aug_syn, aug_swp):
            augmented = augmenter.augment(text)
            # Newer nlpaug releases return a list even for a single input
            if isinstance(augmented, list):
                augmented = augmented[0] if augmented else text
            text = augmented
        return text

    # Use both augmenters, one after the other, on every tweet
    df_twitter_train_aug['text'] = df_twitter_train_aug['text'].apply(_augment_text)

    # Stack originals and augmented rows; ignore_index avoids duplicate labels
    df_twitter_train = pd.concat([df_twitter_train, df_twitter_train_aug], ignore_index=True)

In [96]:
# Final check: give a '0' placeholder to any text left blank by the cleaning.
# Whitespace-only texts count as blank too; removing a URL or a word with
# digits leaves the surrounding spaces behind, so such a text is never ''.
if final_check:
    for frame in (df_twitter_train, df_twitter_test):
        has_content = frame['text'].str.strip() != ''
        frame['text'] = frame['text'].where(has_content, '0')

In [97]:
#=================================================================================================
# COLUMNAS NUMÉRICAS Y CATEGÓRICAS A SER AGREGADAS PARA USAR EN LA RED NEURONAL.
#=================================================================================================

In [98]:
# Drop the categorical columns that were switched off: location first, then keyword.
for skip_column, column_name in ((sin_location, 'location'), (sin_keywords, 'keyword')):
    if skip_column:
        for frame in (df_twitter_train, df_twitter_test):
            frame.drop(columns=column_name, inplace=True)

# Numeric column with the length of the processed text.
if con_longitud:
    for frame in (df_twitter_train, df_twitter_test):
        frame['length'] = frame['text'].str.len()

# Numeric column with the number of whitespace-separated words.
if con_total_palabras:
    for frame in (df_twitter_train, df_twitter_test):
        frame['totalwords'] = frame['text'].str.split().str.len()

In [99]:
#=================================================================================================
# RESGUARDO FINAL DE LOS DATOS LUEGO DE LA LIMPIEZA Y PROCESAMIENTO DEL TEXTO.
#=================================================================================================

In [100]:
# Write the processed train and test sets to timestamped CSV files.
output_dir = 'data/processed'
# to_csv does not create missing directories, so make sure the target exists
os.makedirs(output_dir, exist_ok=True)

# Only the timestamp is sanitised ('-' and ':' become '.'); the directory part
# of the path is left untouched.
now = datetime.datetime.now().isoformat()
timestamp = now.replace('-', '.').replace(':', '.')
train_clean_path = output_dir + '/train.' + timestamp + '.csv'
test_clean_path = output_dir + '/test.' + timestamp + '.csv'

df_twitter_train.to_csv(train_clean_path, index=False)
df_twitter_test.to_csv(test_clean_path, index=False)

# Print a summary of the run: the value of each flag and the files written
print('Operación finalizada!\n')
print('Pasaje a minúsculas:       ' + str(lowercase_text))
print('Limpieza básica:           ' + str(remove_noise))
print('Agregado de keywords:      ' + str(append_keywords))
print('Remoción de stopwords:     ' + str(remove_stopwords))
print('Lematización:              ' + str(perform_lemmatization))
print('Aumento del set:           ' + str(augument_dataset))
print('SIN columnas location:     ' + str(sin_location))
print('SIN columnas keyword:      ' + str(sin_keywords))
print('CON longitudt:             ' + str(con_longitud))
print('CON cantidad de palabras:  ' + str(con_total_palabras))
print('Chequeo final:         ' + str(final_check) + '\n')
print('Generado train: \'' + train_clean_path + '\' - (' + str(len(df_twitter_train)) + ') registros.')
print('Generado test:  \'' + test_clean_path + '\' - (' + str(len(df_twitter_test)) + ') registros.')

Operación finalizada!

Pasaje a minúsculas:       True
Limpieza básica:           True
Agregado de keywords:      False
Remoción de stopwords:     False
Lematización:              False
Aumento del set:           False
SIN columnas location:     False
SIN columnas keyword:      False
CON longitudt:             True
CON cantidad de palabras:  True
Chequeo final:         True

Generado train: 'data/processed/train.2020.08.03T16.57.04.289197.csv' - (7613) registros.
Generado test:  'data/processed/test.2020.08.03T16.57.04.289197.csv' - (3263) registros.


In [101]:
# Show the columns, non-null counts and dtypes of the processed train frame
df_twitter_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          7613 non-null   int64 
 1   keyword     7613 non-null   object
 2   location    7613 non-null   object
 3   text        7613 non-null   object
 4   target      7613 non-null   int64 
 5   length      7613 non-null   int64 
 6   totalwords  7613 non-null   int64 
dtypes: int64(4), object(3)
memory usage: 416.5+ KB


In [102]:
# Show the columns, non-null counts and dtypes of the processed test frame
df_twitter_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          3263 non-null   int64 
 1   keyword     3263 non-null   object
 2   location    3263 non-null   object
 3   text        3263 non-null   object
 4   length      3263 non-null   int64 
 5   totalwords  3263 non-null   int64 
dtypes: int64(3), object(3)
memory usage: 153.1+ KB
