In [1]:
# 75.06 - 1C202 - TP2 
# Competencia de Machine learning
#
# https://www.kaggle.com/c/nlp-getting-started/overview
#
# Fuentes:
# https://realpython.com/python-keras-text-classification/
# https://towardsdatascience.com/data-augmentation-in-nlp-2801a34dfc28

# Importacion de librerias necesarias
import re, string, random, datetime
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Split y K-Fold 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

# Importacion de Keras
# https://keras.io
from keras.models import Model, Sequential, save_model, load_model
from keras.layers import Embedding, Conv1D, Dropout, Input, GlobalMaxPooling1D, Dense, concatenate, Activation
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub
from bert import bert_tokenization


#Configuracion general
plt.style.use('ggplot')
warnings.filterwarnings('ignore')

import tokenization

In [2]:
# Input paths -- point these at the pre-processed files you want to use
train_path = 'data/processed/train.2020.08.03T11.00.50.547958.csv'
test_path = 'data/processed/test.2020.08.03T11.00.50.547958.csv'

# Configuration
maxlen = 100

# Load the train and test sets (comma is already read_csv's default separator)
df_twitter_train = pd.read_csv(train_path)
df_twitter_test = pd.read_csv(test_path)

# Report the shape of each set
print(f'Shape train: {df_twitter_train.shape}')
print(f'Shape test: {df_twitter_test.shape}')

Shape train: (7613, 4)
Shape test: (3263, 3)


In [3]:
# Preview the first rows of the training set
df_twitter_train.head()

Unnamed: 0,id,keyword,text,target
0,1,,our deeds are the reason of this earthquake ma...,1
1,4,,forest fire near la ronge sask canada,1
2,5,,all residents asked to shelter in place are be...,1
3,6,,people receive wildfires evacuation orders in...,1
4,7,,just got sent this photo from ruby alaska as s...,1


In [4]:
# Preview the first rows of the test set
df_twitter_test.head()

Unnamed: 0,id,keyword,text
0,0,,just happened a terrible car crash
1,2,,heard about earthquake is different cities sta...
2,3,,there is a forest fire at spot pond geese are ...
3,9,,apocalypse lighting spokane wildfires
4,11,,typhoon soudelor kills in china and taiwan


In [5]:
#Link de BERT: https://www.kaggle.com/friskycodeur/nlp-with-disaster-tweets-bert-explained

#defino funcion bert_encode

def bert_encode(texts, tokenizer, max_len=512):
    """Encode texts into the three fixed-length arrays consumed by the BERT layer.

    Each text is tokenized, truncated to ``max_len - 2`` tokens, wrapped in
    ``[CLS]`` / ``[SEP]`` markers and right-padded with zeros up to ``max_len``.

    Args:
        texts: Iterable of texts. ``None`` and float NaN entries (what pandas
            produces for empty CSV cells) are encoded as the empty string
            instead of crashing inside the tokenizer.
        tokenizer: Object exposing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_len: Length of every encoded row. Must be at least 2 so that the
            ``[CLS]`` and ``[SEP]`` markers always fit.

    Returns:
        Tuple ``(token_ids, masks, segment_ids)`` of NumPy arrays with one row
        per text. ``masks`` holds 1 for real tokens and 0 for padding;
        ``segment_ids`` is all zeros.

    Raises:
        ValueError: If ``max_len`` is smaller than 2.
    """
    if max_len < 2:
        # With fewer than two slots the slice below would use a negative bound
        # and the rows would come out with inconsistent lengths.
        raise ValueError("max_len must be >= 2 to fit [CLS] and [SEP], got %r" % (max_len,))

    all_tokens = []
    all_masks = []
    all_segments = []

    for text in texts:
        # NaN is the only float that differs from itself
        if text is None or (isinstance(text, float) and text != text):
            text = ""

        # Reserve two positions for the [CLS] and [SEP] markers
        pieces = tokenizer.tokenize(text)[:max_len - 2]
        input_sequence = ["[CLS]"] + pieces + ["[SEP]"]
        pad_len = max_len - len(input_sequence)

        tokens = tokenizer.convert_tokens_to_ids(input_sequence) + [0] * pad_len
        pad_masks = [1] * len(input_sequence) + [0] * pad_len
        segment_ids = [0] * max_len

        all_tokens.append(tokens)
        all_masks.append(pad_masks)
        all_segments.append(segment_ids)

    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

In [6]:
def build_model(bert_layer, max_len=512):
    """Build and compile a binary classifier on top of a BERT layer.

    Args:
        bert_layer: Callable Keras layer that takes
            ``[input_word_ids, input_mask, segment_ids]`` and returns two
            outputs, the second being the per-token sequence output.
        max_len: Length of each of the three integer input sequences.

    Returns:
        A compiled Keras ``Model`` with a single sigmoid output unit, trained
        with binary cross-entropy and reporting accuracy.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    # The first value returned by the layer is discarded; only the per-token output is used
    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    # Position 0 of every sequence is the [CLS] marker that bert_encode prepends
    clf_output = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(clf_output)

    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    # `learning_rate` is the supported argument name; `lr` is a deprecated alias
    # that newer Keras optimizers reject.
    model.compile(Adam(learning_rate=1e-5), loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [7]:
# Load BERT from TensorFlow Hub.
# The tokenizer is loaded from this same layer afterwards, and the data is then
# encoded into BERT's input format.

module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
# trainable=True is passed to the hub layer -- presumably so the BERT weights are fine-tuned during fit
bert_layer = hub.KerasLayer(module_url, trainable=True)

In [9]:
# Build the tokenizer from the assets attached to the loaded BERT layer.
 

# Vocabulary file path and lower-casing flag are read from the hub module itself
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = bert_tokenization.FullTokenizer(vocab_file, do_lower_case)

In [10]:
# Training inputs (tweet text) and labels
tweets = df_twitter_train['text'].values
y = df_twitter_train['target'].values

# Tweets of the test set, to be predicted later
tweets_predict = df_twitter_test['text'].values

# Encode train first, then test, with the same tokenizer and sequence length
train_encode_bert, test_encode_bert = (
    bert_encode(corpus, tokenizer, max_len=160)
    for corpus in (tweets, tweets_predict)
)

# Labels passed to model.fit
values_to_predict = df_twitter_train['target'].values


In [11]:
# Build the classifier on top of the loaded BERT layer; max_len=160 matches the
# sequence length used when the tweets were encoded with bert_encode.
model = build_model(bert_layer, max_len=160)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 160)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 160)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 160)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 1024), (None 335141889   input_word_ids[0][0]             
                                                                 input_mask[0][0]             

In [None]:
# Write 'model.h5' only when the monitored validation loss improves
checkpoint = ModelCheckpoint(
    filepath='model.h5',
    monitor='val_loss',
    save_best_only=True,
)

# Fit on the encoded tweets, holding out 20% of the data for validation
train_history = model.fit(
    x=train_encode_bert,
    y=values_to_predict,
    batch_size=2,
    epochs=2,
    validation_split=0.2,
    callbacks=[checkpoint],
)
# TODO: try batch_size = 15

Epoch 1/2
 151/3045 [>.............................] - ETA: 4:58:59 - loss: 0.5458 - accuracy: 0.7119

In [None]:
# Tabulate the training history stored on the model (one column per tracked metric)
metrics=pd.DataFrame(model.history.history)
metrics

In [None]:
# Restore the weights saved in 'model.h5' and predict on the encoded test tweets
model.load_weights('model.h5')
test_pred = model.predict(test_encode_bert)

In [None]:
# Restore the checkpointed weights and score the encoded test tweets
model.load_weights('model.h5')
y_predict = model.predict(test_encode_bert)

# The model ends in a single sigmoid unit, so y_predict holds probabilities with
# shape (n, 1). The competition expects a binary 0/1 label per id, so flatten
# the column and threshold at 0.5 instead of writing the raw floats.
kaggle_submission = pd.DataFrame(df_twitter_test,columns = ['id'])
kaggle_submission['target'] = (y_predict.ravel() > 0.5).astype(int)


# Timestamped output path; '-' and ':' are replaced so the file name only uses dots
submission_path = 'data/submits/submission.' + datetime.datetime.now().isoformat() + '.csv'
submission_path = submission_path.replace('-','.').replace(':','.')

kaggle_submission.to_csv(submission_path, index=False)

# Print a summary of the operation
print('Operación finalizada!\n')
print('Generado submit: \'' + submission_path + '\' - (' + str(len(kaggle_submission['target'].index)) + ') registros.')