In [1]:
# 75.06 - 1C202 - TP2 
# Competencia de Machine learning
#
# https://www.kaggle.com/c/nlp-getting-started/overview
#
# Fuentes:
# https://realpython.com/python-keras-text-classification/
# https://towardsdatascience.com/data-augmentation-in-nlp-2801a34dfc28

# Importacion de librerias necesarias
import os
import re, string, random, datetime
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# NLTK
# https://www.nltk.org
import nltk

# Fetch, in order, every NLTK resource this notebook downloads up front.
for _nltk_resource in ('punkt', 'wordnet', 'stopwords', 'averaged_perceptron_tagger'):
    nltk.download(_nltk_resource)

from nltk.tokenize import TweetTokenizer
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer 

# NLPAUG
# https://github.com/makcedward/nlpaug
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc

from nlpaug.util import Action

# Split y K-Fold 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

# Importacion de Keras
# https://keras.io
from keras.models import Sequential, save_model, load_model
from keras import layers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# General configuration
# Apply the 'ggplot' matplotlib style to the figures drawn from here on.
plt.style.use('ggplot')
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation/FutureWarning
# messages emitted by the libraries imported above — consider a narrower filter.
warnings.filterwarnings('ignore')

[nltk_data] Downloading package punkt to /Users/juan0511/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/juan0511/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/juan0511/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/juan0511/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [20]:
# CONFIGURATION section

# Input paths
train_path = 'data/train.csv'
test_path = 'data/test.csv'

# Output paths for the processed train/test files.
# The timestamp is taken ONCE so both files of a run share the same suffix
# (two separate now() calls differ by a few microseconds). '-' and ':' are
# replaced with '.' to keep the file names portable across file systems.
run_stamp = datetime.datetime.now().isoformat().replace('-', '.').replace(':', '.')
train_clean_path = 'data/processed/train.' + run_stamp + '.csv'
test_clean_path = 'data/processed/test.' + run_stamp + '.csv'

data/processed/train.2020.08.02T16.31.16.421462.csv


In [3]:
# Funciones AUXILIARES
# Convert a text to lowercase
def lowercase_text(text):
    """Return ``text`` with its characters converted to lowercase."""
    lowered = text.lower()
    return lowered

# Clean the data using regular expressions
def remove_noise(text):
    """Strip bracketed spans, links, tags, punctuation and digit tokens.

    The substitutions run in a fixed order, which matters: links and tags
    must be removed before punctuation is stripped, otherwise their
    delimiters are gone and the patterns no longer match.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Removed pieces are deleted, not replaced by a
        space, so the whitespace around them is left untouched.
    """
    # Raw strings keep the regex escapes (\[, \S, \w, \d) intact and avoid the
    # "invalid escape sequence" warning raised for them in normal literals.
    text = re.sub(r'\[.*?\]', '', text)                 # [bracketed] spans
    text = re.sub(r'https?://\S+|www\.\S+', '', text)   # URLs
    text = re.sub(r'<.*?>+', '', text)                  # <tags>
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)  # punctuation
    text = text.replace('\n', '')                       # line breaks
    text = re.sub(r'\w*\d\w*', '', text)                # tokens containing a digit
    return text

In [4]:
# Train and test sets: load, drop unused columns and normalize the texts
def _load_tweets(csv_path):
    """Read a tweets CSV and apply the preprocessing shared by train and test.

    Args:
        csv_path: Path of a comma-separated file with at least the
            'keyword', 'location' and 'text' columns.

    Returns:
        A DataFrame without the 'location' column, with missing keywords
        replaced by a single space and 'text' lowercased.

    Raises:
        KeyError: If one of the expected columns is missing.
    """
    tweets = pd.read_csv(csv_path, sep=',')
    # Assign the result back instead of calling fillna(inplace=True) on the
    # column selection: the chained in-place form is not guaranteed to update
    # the frame (it is a no-op under pandas Copy-on-Write).
    tweets['keyword'] = tweets['keyword'].fillna(' ')
    # Disabled step: append the keyword to the text
    #tweets['text'] = (tweets['text'] + ' ' + tweets['keyword'])
    tweets['text'] = tweets['text'].apply(lowercase_text)
    # Disabled step: regex-based cleaning
    #tweets['text'] = tweets['text'].apply(remove_noise)
    return tweets.drop('location', axis=1)


df_twitter_train = _load_tweets(train_path)
df_twitter_test = _load_tweets(test_path)

In [5]:
# Processing with NLTK and nlpaug
stop_words = set(stopwords.words('english'))
# Stop-word removal is currently disabled:
#df_twitter_train['text'] =  df_twitter_train['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
#df_twitter_test['text'] =  df_twitter_test['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

# NOTE(review): pd.DataFrame(df) does not copy the data by default, so whether
# this frame sees the lemmatized text assigned below depends on the pandas
# version. If an independent snapshot is intended, df_twitter_train.copy()
# would make that explicit — TODO confirm.
df_twitter_train_aug = pd.DataFrame(df_twitter_train)

# Augmenters: WordNet-based synonym augmenter and random word "swap" augmenter.
aug_syn = naw.SynonymAug(aug_src='wordnet')
aug_swp = naw.RandomWordAug(action="swap")
lem = WordNetLemmatizer()


def _lemmatize_verbs(text):
    """Lemmatize each whitespace-separated token of ``text`` with POS tag "v"."""
    return ' '.join(lem.lemmatize(token, "v") for token in text.split())


df_twitter_train['text'] = df_twitter_train['text'].apply(_lemmatize_verbs)
df_twitter_test['text'] = df_twitter_test['text'].apply(_lemmatize_verbs)

In [6]:
# Write the output files
# DataFrame.to_csv does not create missing parent folders, so make sure the
# destination directory exists before writing (a fresh checkout may not
# contain it yet).
for _out_path in (train_clean_path, test_clean_path):
    _out_dir = os.path.dirname(_out_path)
    if _out_dir:
        os.makedirs(_out_dir, exist_ok=True)

# index=False: do not write the DataFrame index as an extra column.
df_twitter_train.to_csv(train_clean_path, index=False)
df_twitter_test.to_csv(test_clean_path, index=False)

print('Generado train: ' + train_clean_path)
print('Generado test:  ' + test_clean_path)