In [1]:
# 75.06 - 1C202 - TP2 
# Competencia de Machine learning
#
# https://www.kaggle.com/c/nlp-getting-started/overview
#
# Fuentes:
# https://realpython.com/python-keras-text-classification/
# https://towardsdatascience.com/data-augmentation-in-nlp-2801a34dfc28

# Importacion de librerias necesarias
import re, string, random
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# NLTK
# https://www.nltk.org
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

from nltk.tokenize import TweetTokenizer
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer 

# NLPAUG
# https://github.com/makcedward/nlpaug
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc

from nlpaug.util import Action

# Split y K-Fold 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

# Importacion de Keras
# https://keras.io
from keras.models import Sequential, save_model, load_model
from keras import layers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

#Configuracion general
plt.style.use('ggplot')
warnings.filterwarnings('ignore')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Pato\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Pato\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Pato\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Pato\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
# CONFIGURATION section

# Input data sets
train_path, test_path = 'data/train.csv', 'data/test.csv'

# Pre-trained embeddings file and the dimension used for the embedding layer.
# Embeddings available at:
# https://worksheets.codalab.org/bundles/0x4090ba96b8a444c2a44b2c47884c25f2
embeddings_path = 'embeddings/glove.twitter.27B.200d.txt'
embedding_dim = 200

# Model configuration
maxlen = 100        # length every token sequence is padded to
num_filters = 32    # number of Conv1D filters
kernel_size = 8     # Conv1D kernel size

# Training configuration
num_folds = 3       # number of K-fold splits
epochs = 1
batch_size = 4
verbose = False

# Path prefix under which each fold's model is saved
model_path = 'models/TP2.Keras.Conv1D.Iter.'

# Output path for the submission (disabled here; assigned later in the training cell)
#submission_path = 'data/submits/submission_01_20200730.csv'

In [3]:
# Funciones AUXILIARES

def lowercase_text(text):
    """Return ``text`` converted to lowercase."""
    lowered = text.lower()
    return lowered

def remove_noise(text):
    """Clean a text by removing noisy fragments with regular expressions.

    Removed, in this order: square-bracketed fragments, URLs (``http(s)://``
    and ``www.``), angle-bracketed tags, punctuation characters, newlines and
    every word that contains a digit.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string. Surrounding whitespace is left untouched.
    """
    # Raw strings are used so that regex escapes such as \[ \S \w \d reach the
    # regex engine verbatim instead of being parsed as (invalid) string escapes.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

def plot_history(history):
    """Plot the training results of a model.

    Draws one figure with two side-by-side panels: accuracy (left) and loss
    (right), each showing the training curve in blue and the validation curve
    in red.

    Args:
        history: object whose ``history`` attribute maps the keys
            ``'accuracy'``, ``'val_accuracy'``, ``'loss'`` and ``'val_loss'``
            to one value per epoch.
    """
    # Read every series up front, before any figure is created.
    series = {key: history.history[key]
              for key in ('accuracy', 'val_accuracy', 'loss', 'val_loss')}
    x = range(1, len(series['accuracy']) + 1)

    panels = (
        ('accuracy', 'val_accuracy', 'Accuracy: validación y entrenamiento'),
        ('loss', 'val_loss', 'Loss: validación y entrenamiento'),
    )

    plt.figure(figsize=(12, 5))
    for position, (train_key, val_key, title) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(x, series[train_key], 'b', label='Entrenamiento')
        plt.plot(x, series[val_key], 'r', label='Validación')
        plt.title(title)
        plt.legend()
    plt.show()

def create_embedding_matrix(filepath, word_index, embedding_dim):
    """Build the embedding matrix for a vocabulary from a text embeddings file.

    Each line of the file is expected to hold a word followed by its vector
    components, separated by whitespace. Only the first ``embedding_dim``
    components of each vector are used.

    Args:
        filepath: path of the UTF-8 encoded embeddings file.
        word_index: mapping from word to its integer row index.
        embedding_dim: number of components kept per word.

    Returns:
        A ``(len(word_index) + 1, embedding_dim)`` array. Rows of words that
        are absent from the file (and row 0) are left as zeros.
    """
    vocab_size = len(word_index) + 1  # 0 is a reserved index, so add 1
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    with open(filepath, encoding='utf-8') as f:
        for line in f:
            fields = line.split()
            # Skip blank lines and lines that do not carry at least
            # `embedding_dim` components (e.g. headers or truncated rows);
            # they would otherwise crash the unpacking / row assignment.
            if len(fields) <= embedding_dim:
                continue
            idx = word_index.get(fields[0])
            if idx is None:
                continue
            # Slice before converting so only the needed values are parsed.
            embedding_matrix[idx] = np.asarray(
                fields[1:embedding_dim + 1], dtype=np.float32)

    return embedding_matrix

def create_model(num_filters, kernel_size, vocab_size, embedding_dim, maxlen,
                 embedding_matrix=None, trainable=True):
    """Build and compile the Conv1D binary text classifier.

    Architecture: Embedding -> Conv1D (relu) -> GlobalMaxPooling1D ->
    Dense(10, relu) -> Dense(1, sigmoid), compiled with the Adam optimizer,
    binary cross-entropy loss and the accuracy metric.

    Args:
        num_filters: number of Conv1D filters.
        kernel_size: Conv1D kernel size.
        vocab_size: number of rows of the embedding layer.
        embedding_dim: number of columns of the embedding layer.
        maxlen: length of the input sequences.
        embedding_matrix: optional ``(vocab_size, embedding_dim)`` array used
            as the initial weights of the embedding layer. When ``None``
            (default) the layer keeps its default initialisation, exactly as
            before this parameter existed.
        trainable: whether the embedding layer is updated during training.

    Returns:
        The compiled model.

    Raises:
        ValueError: if ``embedding_matrix`` does not have shape
            ``(vocab_size, embedding_dim)``.
    """
    embedding_kwargs = {'input_length': maxlen, 'trainable': trainable}
    if embedding_matrix is not None:
        embedding_matrix = np.asarray(embedding_matrix)
        expected_shape = (vocab_size, embedding_dim)
        if embedding_matrix.shape != expected_shape:
            raise ValueError(
                'embedding_matrix has shape %s, expected %s'
                % (embedding_matrix.shape, expected_shape))
        # Start from the pre-trained vectors instead of a random initialisation.
        embedding_kwargs['weights'] = [embedding_matrix]

    model = Sequential()
    model.add(layers.Embedding(vocab_size, embedding_dim, **embedding_kwargs))
    model.add(layers.Conv1D(num_filters, kernel_size, activation='relu'))
    model.add(layers.GlobalMaxPooling1D())
    model.add(layers.Dense(10, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

In [19]:
# Load the data set files

def load_tweets(csv_path):
    """Load a tweets CSV and prepare its ``text`` column.

    Missing keywords are replaced by a single space, the keyword is appended
    to the tweet text, the unused ``location`` column is dropped and the text
    is lowercased.

    Args:
        csv_path: path of the comma-separated file to read.

    Returns:
        A new DataFrame; nothing is mutated in place, so re-running the cell
        always starts again from the file contents.
    """
    raw = pd.read_csv(csv_path, sep=',')
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # a column selection, which does not update the frame under Copy-on-Write.
    keyword = raw['keyword'].fillna(' ')
    text = (raw['text'] + ' ' + keyword).apply(lowercase_text)
    # NOTE: remove_noise() cleaning is currently not applied to the text.
    return raw.assign(keyword=keyword, text=text).drop('location', axis=1)


# Train and test sets go through exactly the same preparation
df_twitter_train = load_tweets(train_path)
df_twitter_test = load_tweets(test_path)

# Show the size of both sets
print(df_twitter_train.shape)
print(df_twitter_test.shape)

(7613, 4)
(3263, 3)


In [20]:
# Preview the first rows of the training set
df_twitter_train.head()

Unnamed: 0,id,keyword,text,target
0,1,,our deeds are the reason of this #earthquake m...,1
1,4,,forest fire near la ronge sask. canada,1
2,5,,all residents asked to 'shelter in place' are ...,1
3,6,,"13,000 people receive #wildfires evacuation or...",1
4,7,,just got sent this photo from ruby #alaska as ...,1


In [21]:
# Show the columns, non-null counts and dtypes of the training set
df_twitter_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       7613 non-null   int64 
 1   keyword  7613 non-null   object
 2   text     7613 non-null   object
 3   target   7613 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 238.0+ KB


In [22]:
# Text processing with NLTK and (optional) augmentation with nlpaug

# NLTK
# All tweets are in English: drop the English stop words and lemmatize the
# remaining words as verbs.
stop_words = set(stopwords.words('english'))
lem = WordNetLemmatizer()


def normalize_text(text):
    """Remove stop words from ``text`` and lemmatize the remaining words as verbs.

    Words are split on whitespace and joined back with single spaces.
    """
    return ' '.join(lem.lemmatize(word, "v")
                    for word in text.split()
                    if word not in stop_words)


df_twitter_train['text'] = df_twitter_train['text'].apply(normalize_text)
df_twitter_test['text'] = df_twitter_test['text'].apply(normalize_text)
# Alternative kept for reference (disabled): PorterStemmer stemming instead of
# lemmatization.

# NLPAUG
# Working frame for the (augmented) training set. It is an explicit copy taken
# AFTER the normalisation above: pd.DataFrame(df) does not copy the data, so
# the previous code only saw the lemmatized text through aliasing.
df_twitter_train_aug = df_twitter_train.copy()

# Replaces a word of the tweet with a synonym taken from WordNet
aug_syn = naw.SynonymAug(aug_src='wordnet')
# Swaps random words inside the tweet
aug_swp = naw.RandomWordAug(action="swap")

# Augmentation is currently disabled. To enlarge the training set, apply the
# augmenters and concatenate the result with the original frame:
#df_twitter_train_aug['text'] = df_twitter_train_aug['text'].apply(lambda x: aug_syn.augment(x))
#df_twitter_train_aug['text'] = df_twitter_train_aug['text'].apply(lambda x: aug_swp.augment(x))
#df_twitter_train_aug = pd.concat([df_twitter_train, df_twitter_train_aug])

# Show the final size of the training set
print(df_twitter_train_aug.shape)

(7613, 4)


In [23]:
# Inspect the resulting training set
df_twitter_train_aug.head()

Unnamed: 0,id,keyword,text,target
0,1,,deeds reason #earthquake may allah forgive us,1
1,4,,forest fire near la ronge sask. canada,1
2,5,,residents ask 'shelter place' notify officers....,1
3,6,,"13,000 people receive #wildfires evacuation or...",1
4,7,,get send photo ruby #alaska smoke #wildfires p...,1


In [24]:
# Inspect the resulting test set
df_twitter_test.head()

Unnamed: 0,id,keyword,text
0,0,,happen terrible car crash
1,2,,"hear #earthquake different cities, stay safe e..."
2,3,,"forest fire spot pond, geese flee across stree..."
3,9,,apocalypse lighting. #spokane #wildfires
4,11,,typhoon soudelor kill 28 china taiwan


In [25]:
# Training inputs (tweet texts) and labels (targets)
tweets, y = (df_twitter_train_aug[column].values
             for column in ('text', 'target'))

# Test-set inputs used for prediction
tweets_predict, keyword_predict = (df_twitter_test[column].values
                                   for column in ('text', 'keyword'))

In [26]:
# Fit the tokenizer (configured with num_words=5000) on the training tweets
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(tweets)

# Vocabulary size: index 0 is reserved, hence the extra 1
vocab_size = 1 + len(tokenizer.word_index)

# Convert the training and test texts to integer sequences
X, X_predict = (tokenizer.texts_to_sequences(texts)
                for texts in (tweets, tweets_predict))

# TODO: try NLTK's Twitter tokenizer

In [27]:
# Pad both sets of sequences (at the end) to maxlen
X, X_predict = (pad_sequences(sequences, padding='post', maxlen=maxlen)
                for sequences in (X, X_predict))

In [28]:
# Build the embedding matrix for the tokenizer vocabulary
embedding_matrix = create_embedding_matrix(
    embeddings_path, tokenizer.word_index, embedding_dim)

# Coverage: share of vocabulary rows that hold at least one non-zero value
rows_with_vector = (embedding_matrix != 0).any(axis=1)
nonzero_elements = int(rows_with_vector.sum())
print(f'Cobertura de vocabulario: {nonzero_elements / vocab_size}')

Cobertura de vocabulario: 0.5232057640123514


In [None]:
submission_path = 'data/submits/submission_02_20200730.csv'
# Define and train the model
# K-Fold Cross Validation

# Lists that collect the per-fold results
# Train
acc_per_fold_train = []
loss_per_fold_train = []
# Validation
acc_per_fold_validation = []
loss_per_fold_validation = []

# K-fold cross validator. random_state is fixed so the shuffled folds (and
# therefore the per-fold scores) are the same on every run of the cell.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Loop over the configured number of folds
fold_no = 1
for train, validation in kfold.split(X, y):

    # Report progress
    print('------------------------------------------------------------------------')
    print(f'Entrenando, fold {fold_no}...')

    # Build a fresh model for this fold
    model = create_model(num_filters = num_filters,
                         kernel_size = kernel_size,
                         vocab_size = vocab_size,
                         embedding_dim = embedding_dim,
                         maxlen = maxlen)

    # Initialise the embedding layer (first layer of the model) with the
    # pre-trained vectors; previously embedding_matrix was computed but never
    # used, so training always started from random embeddings.
    model.layers[0].set_weights([embedding_matrix])

    # Fit on the training split, validating on the held-out split
    history = model.fit(X[train], y[train],
                        epochs = epochs,
                        verbose = verbose,
                        validation_data = (X[validation], y[validation]),
                        batch_size = batch_size)

    # Training metrics (index 0 is the loss, index 1 the accuracy)
    scores_train = model.evaluate(X[train], y[train], verbose=0)
    print(f'Puntaje de entrenamiento para el fold {fold_no}: {model.metrics_names[0]} de {scores_train[0]}; {model.metrics_names[1]} de {scores_train[1]*100}%')
    acc_per_fold_train.append(scores_train[1] * 100)
    loss_per_fold_train.append(scores_train[0])

    # Validation metrics
    scores_validation = model.evaluate(X[validation], y[validation], verbose=0)
    print(f'Puntaje de validación para el fold {fold_no}: {model.metrics_names[0]} de {scores_validation[0]}; {model.metrics_names[1]} de {scores_validation[1]*100}%')
    acc_per_fold_validation.append(scores_validation[1] * 100)
    loss_per_fold_validation.append(scores_validation[0])

    # Save this fold's model; the fold number is part of the file name
    save_model(model, model_path + str(fold_no) + '.h5', save_format='h5')

    # Plot the training curves of this fold
    plot_history(history)

    # Move on to the next fold
    fold_no = fold_no + 1


print('\n')
print('Puntaje promedio de entrenamiendo, para todos los folds:')
print(f'> Accuracy: {np.mean(acc_per_fold_train)} (+- {np.std(acc_per_fold_train)})')
print(f'> Loss: {np.mean(loss_per_fold_train)}')
print('\n')
print('Puntaje promedio de validación, para todos los folds:')
print(f'> Accuracy: {np.mean(acc_per_fold_validation)} (+- {np.std(acc_per_fold_validation)})')
print(f'> Loss: {np.mean(loss_per_fold_validation)}')
print('------------------------------------------------------------------------')

------------------------------------------------------------------------
Entrenando, fold 1...


In [15]:
# Load the model of the fold with the highest validation accuracy
# (folds are numbered from 1; on ties the first best fold wins)
best_fold = max(range(len(acc_per_fold_validation)),
                key=acc_per_fold_validation.__getitem__) + 1
filepath = f'{model_path}{best_fold}.h5'
loaded_model = load_model(filepath, custom_objects=None, compile=True)

# Print the details of the loaded model
loaded_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 200)          2927400   
_________________________________________________________________
conv1d (Conv1D)              (None, 93, 32)            51232     
_________________________________________________________________
global_max_pooling1d (Global (None, 32)                0         
_________________________________________________________________
dense (Dense)                (None, 10)                330       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 11        
Total params: 2,978,973
Trainable params: 2,978,973
Non-trainable params: 0
_________________________________________________________________


In [16]:
# Predict with the loaded model and threshold the outputs at 0.5
probabilities = loaded_model.predict(X_predict)
y_predict = (probabilities > 0.5).astype('int32')

In [17]:
# Build the competition submission frame: test ids plus predicted target
kaggle_submission = df_twitter_test[['id']].copy()
kaggle_submission['target'] = y_predict
kaggle_submission.head()

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,1
3,9,0
4,11,1


In [18]:
# Write the submission as a CSV file (without the index column)
kaggle_submission.to_csv(submission_path, index=False)

In [33]:
# TODO:
# Grid search to find the best hyper-parameters