## <center>Inicio

In [1]:
# Importación de librerías

import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import sparse
import random
import math

In [2]:
import re
from string import punctuation
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
from textblob import TextBlob

In [3]:
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, StratifiedKFold, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, HashingVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import f1_score, confusion_matrix, classification_report, accuracy_score
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer

In [4]:
from sklearn.utils import shuffle

In [5]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import kerastuner as kt
from kerastuner.tuners import BayesianOptimization
import tensorflow_hub as hub
import bert

In [None]:
# Importación de los documentos

tw_train = pd.read_csv('./train.csv')
tw_test = pd.read_csv('./test.csv')
pd.set_option('display.max_columns', None)
pd.options.display.max_colwidth = 150

## <center>Preprocesamiento y Feature Extraction a partir de 'text'

In [6]:
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have" }

regexes = {'hashtags':r'(#)+', 'mentions':r'(@)\w*', 'URLs':r'(http://)[a-zA-Z0-9./]*'}

referenceDF = pd.read_csv('./headlines_reference_values.csv').drop(columns = 'Unnamed: 0')
ref = referenceDF.loc[0]

basicStopwords = stopwords.words('english')
myStopwords = set(basicStopwords + list(punctuation))

In [7]:
def isInText(DF, feature):
    DF[feature[0]+'_in_t'] = 0   # Inicialización de la nueva columna
    for i in range(len(DF)):
        try:
            if (DF.loc[i, feature]).lower() in (DF.loc[i, 'text']).lower(): DF.loc[i, feature[0]+'_in_t'] = 1
        except:
            continue
        
def replace_contractions(text, mapping):
    noContText = []
    counter = 0
    for t in text.split(" "):
        if t.lower() in mapping:
            noContText.append(mapping[t.lower()])
            counter += 1
        else:
            noContText.append(t) 
    return [' '.join(noContText), counter]

def Qy(regex, text):   
    try:
        return len(re.compile(regex).findall(text))
    except:
        return 0

def newFeatures(DF, regexDict):
    for k, v in regexDict.items():
        DF[['text', k]] = pd.DataFrame(DF['text'].map(lambda x: [re.compile(v).sub(r' ', x), Qy(v, x)]).tolist())

def wilsonScore(num, den):
    if num == 0:
        return round(0, 2)
    phat = num/den
    z = 1.96
    return round(((phat + z*z/(2*den) - z * np.sqrt((phat*(1-phat)+z*z/(4*den))/den))/(1+z*z/den)), 2)

def tagging(text):
    tagsCount = {'ADJ':0, 'ADP':0, 'ADV':0, 'CONJ':0, 'DET':0, 'NOUN':0, 'NUM':0, 'PRT':0, 'PRON':0, 'VERB':0, '.':0, 'X':0}
    tags = []
    sentences = sent_tokenize(text)
    for sentence in sentences:
        tags += nltk.pos_tag(word_tokenize(sentence), tagset = 'universal')
    for word, tag in tags:
        tagsCount[tag] += 1    
    POS_ratio = [round((12 - list(tagsCount.values()).count(0))/12, 2)]
#     mostCommonPOS_ratio = [round(list(tagsCount.values()).count(max(tagsCount.values()))/(12-list(tagsCount.values()).count(0)), 2)]
    mostCommonPOS_ratio = [wilsonScore(list(tagsCount.values()).count(max(tagsCount.values())), 12-list(tagsCount.values()).count(0))]
    noun_ratio =  [wilsonScore(tagsCount['NOUN'], sum(tagsCount.values()))]
    return list(tagsCount.values()) + POS_ratio + mostCommonPOS_ratio + noun_ratio

def headlinesDistance(DF, ref):
    DF['dist'] = 0
    for i in range(len(DF)):
        x = DF.loc[i, ['textLenght','ADJ','ADP','ADV','CONJ','DET','NOUN','NUM','PRT','PRON','VERB','.','X','POS_ratio','topPOS_ratio','NOUN/TOT', 'polarity', 'subjectivity']]
        DF.loc[i, 'dist'] =  np.dot(x, ref)/(np.linalg.norm(x)*np.linalg.norm(ref))

def sentimentAnalysis(text):
    blob = TextBlob(text)
    return blob.sentiment

def stemmizer(text, stopwords):
    words = word_tokenize(text)
    return ' '.join([PorterStemmer().stem(word) for word in words if word not in stopwords])

def SWRemoval(text, stopwords):
    words = word_tokenize(text)
    return ' '.join([word for word in words if word not in stopwords])
    
def preprocessing(DF, contraction_mapping, regexes, ref, myStopwords):
#     originalTextLength = tw_train['text'].map(lambda x: len(x))
    isInText(DF, 'keyword')
    isInText(DF, 'location')
    DF[['text', 'conts']] = pd.DataFrame(DF['text'].map(lambda x: replace_contractions(x, contraction_mapping)).tolist())
    originalTextLength = DF['text'].map(lambda x: len(x))
    newFeatures(DF, regexes) 
    DF['textLenght'] = DF['text'].map(lambda x: len(x))
#     DF['textTrash'] = originalTextLength - DF['textLenght']
    DF[['ADJ','ADP','ADV','CONJ','DET','NOUN','NUM','PRT','PRON','VERB','.','X','POS_ratio','topPOS_ratio','NOUN/TOT']] = pd.DataFrame(DF['text'].map(lambda x: tagging(x)).tolist())
    DF[['polarity', 'subjectivity']] = pd.DataFrame(DF['text'].map(lambda x: sentimentAnalysis(x)).tolist())
    headlinesDistance(DF, ref)
    DF['K'] = DF['textLenght']*DF['NOUN/TOT']*DF['polarity']
    DF['text'] = DF['text'].map(lambda x: x.lower())
    DF['text'] = DF['text'].map(lambda x: re.compile(r'\d').sub(r' ', x))
#     DF['text'] = DF['text'].map(lambda x: stemmizer(x, myStopwords))
    DF['text'] = DF['text'].map(lambda x: SWRemoval(x, myStopwords))
    DF['textTrash'] = originalTextLength - DF['textLenght']    

In [10]:
# Preprocesamiento y feature extraction del text

preprocessing(tw_train, contraction_mapping, regexes, ref, myStopwords)
preprocessing(tw_test, contraction_mapping, regexes, ref, myStopwords)

## <center>Vocabulario - Embeddings

In [11]:
# Vocabulary
# las Stopwords ya quedaron afuera en el paso anterior

textList = tw_train['text'].tolist()
text = ' '.join(textList)
tokens = word_tokenize(text)
tokensFreq = nltk.FreqDist(tokens)
tokensFreq = {k:v for k,v in tokensFreq.items() if (v>5) & (v<150)}
myVocabulary = list(tokensFreq.keys())
print('El vocabulario consta de {} palabras'.format(len(myVocabulary)))

El vocabulario consta de 2173 palabras


In [None]:
# Embeddings

embeddingsFilePath = './glove.twitter.27B.100d.txt'
embeddingsFile = open(embeddingsFilePath)
embeddingsIndex = {}
for line in embeddingsFile:
    lineList = line.split()
    word = lineList[0]
    vector = np.array(lineList[1:])
    embeddingsIndex[word] = vector
embeddingsFile.close()

In [13]:
def embeddingsInVoc(voc):
    noEmbeddingTokens = []
    for token in voc:
        if token.lower() not in embeddingsIndex:
            noEmbeddingTokens.append(token)
    print('{}% de los tokens tienen embedding'.format( 100 - round((len(noEmbeddingTokens)/len(voc))*100, 2) ))
    return noEmbeddingTokens

noEmbeddingTokens = embeddingsInVoc(myVocabulary)

96.82% de los tokens tienen embedding


In [14]:
tokensReplacement = {'icemoon':'ice moon','bioterror':'bio terror','bioterrorism':'bio terrorism','microlight':'micro light',
                     'w/heavenly':'heavenly','typhoon-devastated':'typhoon devastated',"'save":'save','animalrescue':'animal rescue',
                     'mediterran':'mediterranean','meat-loving':'meat loving',"'suicide":'suicide'}
tokensDeletion = set(tokensReplacement.keys()) ^ set(noEmbeddingTokens)

In [15]:
# Últimos detalles sobre la limpieza del texto
def replace_noEmbeddingTokens(text, replacement_mapping, deletion_list):
    newText = []
    for word in word_tokenize(text):
        if word.lower() in replacement_mapping:
            newText.append(replacement_mapping[word.lower()])
        elif word.lower() not in deletion_list:
            newText.append(word) 
    return ' '.join(newText)

tw_train['text'] = tw_train['text'].map(lambda x: replace_noEmbeddingTokens(x, tokensReplacement, tokensDeletion))
tw_test['text'] = tw_test['text'].map(lambda x: replace_noEmbeddingTokens(x, tokensReplacement, tokensDeletion))

In [16]:
# Redefino vocabulario
myVocabulary = [w for w in myVocabulary if w not in tokensDeletion]
myVocabulary = [tokensReplacement[w] if (w in tokensReplacement) else w for w in myVocabulary]
myVocabulary = list(set(myVocabulary))
print('El nuevo vocabulario consta de {} palabras'.format(len(myVocabulary)))

El nuevo vocabulario consta de 2112 palabras


In [17]:
noEmbeddingTokens = embeddingsInVoc(myVocabulary)

99.67% de los tokens tienen embedding


### Resultados

In [18]:
tw_train.to_csv('tw_train.csv', index = False)
tw_test.to_csv('tw_test.csv', index = False)

with open('myVocabulary.pickle', 'wb') as f:
    pickle.dump(myVocabulary, f)

### Carga de los resultados

In [None]:
with open('myVocabulary.pickle', 'rb') as f:
    myVocabulary = pickle.load(f)

### Más de embeddings

In [None]:
# Creación de un Vocabulary Index
# El índice cero y el índice 1 se reservan para padding y "out of index", respectivamente.

vocabularyIndex = {}
for i in range(2, len(myVocabulary)+2):  # myVocabulary es una lista ya creada anteriormente
    vocabularyIndex[myVocabulary[i-2]] = i
    
# Construcción de la embedding matrix

num_tokens = len(vocabularyIndex) + 2
embeddings_dim = 100
embeddingsMatrix = np.zeros((num_tokens, embeddings_dim))
for word, i in vocabularyIndex.items():
    embedding_vector = embeddingsIndex.get(word)
    if embedding_vector is not None:
        embeddingsMatrix[i] = embedding_vector
    else:
        continue
print('La embeddingsMatrix resultante tiene las dimensiones {}'.format(embeddingsMatrix.shape))

# Construcción de los vectores de entrada

def vocMapping(text):
    vector = []
    for word in word_tokenize(text):
        try:
            vector.append(vocabularyIndex[word])
        except:
            vector.append(1)  # Embeddings desconocidos
    vector = vector + [0]*(num_tokens - len(vector))  # Padding
    return np.array(vector).astype(np.float)

## <center>Data Augmentation: Creación de un nuevo set de train basado en ruido

In [20]:
# Data a utilizar: embeddingsIndex, que es el diccionario tal que (k, v) = (word, embedding)
# Lo reduzco a una forma que contenga solo las claves que también se encuentran en myVocabulary

myEmbeddingsIndex = {k:v.astype(np.float) for k,v in embeddingsIndex.items() if k in myVocabulary}

In [21]:
def text_mutation(text, embDict):
    words = text.split()
    for i in range(len(words)):
        if words[i].lower() in embDict:
            minDistance = -1
            newWord = words[i]
            for k, v in embDict.items():
                if k != words[i].lower():
                    distance = np.dot(embDict[words[i].lower()], v)/(np.linalg.norm(embDict[words[i].lower()])*np.linalg.norm(v))
                    if (1 - distance) < (1 - minDistance):
                        minDistance = distance
                        newWord = k
            words[i] = newWord
    words.insert(random.randint(0, len(words)-1) ,list(embDict.keys())[random.randint(0, len(embDict)-1)])
    return ' '.join(words)

In [22]:
def noisy_preprocessing(DF, contraction_mapping, regexes, ref, myStopwords, myEmbeddingsIndex):
    isInText(DF, 'keyword')
    isInText(DF, 'location')
    DF[['text', 'conts']] = pd.DataFrame(DF['text'].map(lambda x: replace_contractions(x, contraction_mapping)).tolist())
    newFeatures(DF, {'hashtags':r'(#)+'})
    DF['text'] = DF['text'].map(lambda x: text_mutation(x, myEmbeddingsIndex))
    originalTextLength = DF['text'].map(lambda x: len(x))
    newFeatures(DF, {'mentions':r'(@)\w*', 'URLs':r'(http://)[a-zA-Z0-9./]*'})
    DF['textLenght'] = DF['text'].map(lambda x: len(x))
    DF[['ADJ','ADP','ADV','CONJ','DET','NOUN','NUM','PRT','PRON','VERB','.','X','POS_ratio','topPOS_ratio','NOUN/TOT']] = pd.DataFrame(DF['text'].map(lambda x: tagging(x)).tolist())
    DF[['polarity', 'subjectivity']] = pd.DataFrame(DF['text'].map(lambda x: sentimentAnalysis(x)).tolist())
    headlinesDistance(DF, ref)
    DF['K'] = DF['textLenght']*DF['NOUN/TOT']*DF['polarity']
    DF['text'] = DF['text'].map(lambda x: x.lower())
    DF['text'] = DF['text'].map(lambda x: re.compile(r'\d').sub(r' ', x))
    DF['text'] = DF['text'].map(lambda x: SWRemoval(x, myStopwords))
    DF['textTrash'] = originalTextLength - DF['textLenght']
    DF['text'] = DF['text'].map(lambda x: replace_noEmbeddingTokens(x, tokensReplacement, tokensDeletion))

In [23]:
tw_train_noise = pd.read_csv('./train.csv')

In [24]:
noisy_preprocessing(tw_train_noise, contraction_mapping, regexes, ref, myStopwords, myEmbeddingsIndex)

In [25]:
print(f'''Para ilustrar la diferencia:
Original: {tw_train.loc[500,'text']}
Mutado: {tw_train_noise.loc[500,'text']}''')

Para ilustrar la diferencia:
Original: christian attacked muslims temple mount waving israeli flag via pamela geller ...
Mutado: jesus murdered terrorists village inch holding palestinian navy via isis pamela abandoned ...


In [26]:
tw_train_noise = tw_train_noise[list(tw_train.columns)]

### Resultados:

### Uso: Para balancear el Train Set original

In [6]:
def balance(clean, noise):
    delta = clean['target'].value_counts().loc[0] - clean['target'].value_counts().loc[1]
    if delta > 0:
        add_balance = noise.loc[noise['target'] == 1, :].sample(n = abs(delta), replace = False, random_state = 60)
    elif delta < 0:
        add_balance = noise.loc[noise['target'] == 0, :].sample(n = abs(delta), replace = False, random_state = 60)
    balanced = pd.concat([clean, add_balance], axis=0, join = 'outer', ignore_index = True)    
    return shuffle(balanced)

### Uso: Para agragar ruido

In [7]:
def add_noise(clean, noise, noise_level,keep_proportion = True, return_residual_noise = False):
    
    if keep_proportion == True:
        stratify = noise['target']
    else:
        stratify = None
        
    residual_noise, noise_required = train_test_split(noise, stratify = stratify, test_size = noise_level)
    noisyDF = pd.concat([clean, noise_required], axis=0, join = 'outer', ignore_index = True)
    
    if return_residual_noise == False:
        return shuffle(noisyDF)
    else:
        return shuffle(noisyDF), shuffle(residual_noise)

## <center>Attention models

## BERT

In [124]:
# SET DE DATOS: CON UN SET DE DATOS EXTENDIDO Y BALANCEADO

tw_train = pd.read_csv('./tw_train.csv')
tw_train['text'] = tw_train['text'].astype('str')
tw_train = tw_train.drop(columns = ['l_in_t', 'k_in_t', 'X'])
tw_train_noise = pd.read_csv('./tw_train_noise.csv')
tw_train_noise['text'] = tw_train_noise['text'].astype('str')
tw_train_noise = tw_train_noise.drop(columns = ['l_in_t', 'k_in_t', 'X'])
tw_train_noisy, residual_noise = add_noise(tw_train, tw_train_noise, 0.50, keep_proportion = True, return_residual_noise = True)
tw_train_noisy_balanced = balance(tw_train_noisy, residual_noise)
tw_test = pd.read_csv('./tw_test.csv')
tw_test['text'] = tw_test['text'].astype('str')
tw_test = tw_test.drop(columns = ['l_in_t', 'k_in_t', 'X'])

In [125]:
# Separación del set de validación

def load_data_holdout(TrainDF, TestDF, validation_fraction, encode_keyword = False):
    # Numeric Features: Encoding de 'keyword' + Resultado de feature engineering
    print('Preparando las Numeric Features...')
    TrainDF_subset = TrainDF.drop(columns = ['id', 'keyword', 'location', 'text', 'target'])
    TestDF_subset = TestDF.drop(columns = ['id', 'keyword', 'location', 'text'])
    if encode_keyword == True:
        ohe = OneHotEncoder(handle_unknown = 'ignore')
        ohe.fit(np.array(TrainDF.loc[TrainDF['keyword'].isna() == False, 'keyword']).reshape((-1,1)))
        TrainDF['keyword'].fillna('NoKeyword', inplace = True)
        TrainDF_keyword_enc = ohe.transform(np.array(TrainDF['keyword']).reshape((-1,1)))
        TestDF['keyword'].fillna('NoKeyword', inplace = True)
        TestDF_keyword_enc = ohe.transform(np.array(TestDF['keyword']).reshape((-1,1)))
        print('El encoding genera {} nuevos features adicionales'.format(TrainDF_keyword_enc.shape[1])) 
        TrainDF_nf = sparse.hstack((TrainDF_keyword_enc, sparse.csr_matrix(TrainDF_subset.values))).toarray()
        X_test_nf = sparse.hstack((TestDF_keyword_enc, sparse.csr_matrix(TestDF_subset.values))).toarray()
    else:
        TrainDF_nf = TrainDF_subset.to_numpy()
        X_test_nf = TestDF_subset.to_numpy()
    X_train_nf, X_val_nf, y_train, y_val = train_test_split(TrainDF_nf, TrainDF['target'], stratify=TrainDF['target'], test_size=validation_fraction, random_state=1)
    nf_norm_layer = layers.experimental.preprocessing.Normalization()
    nf_norm_layer.adapt(X_train_nf)
    X_train_nf = nf_norm_layer(X_train_nf)
    X_val_nf = nf_norm_layer(X_val_nf)
    X_test_nf = nf_norm_layer(X_test_nf)
    
    # Formateo del texto
    print('Preparando el texto... \n')
    X_train_text, X_val_text, y_train_text, y_val_text = train_test_split(TrainDF['text'], TrainDF['target'], stratify=TrainDF['target'], test_size=validation_fraction, random_state=1)
    X_test_text = TestDF['text']
    
    # Formateo del target
    y_train = np.array(y_train)
    y_val = np.array(y_val)

    print('''Las dimensiones de los sets de datos son:
        Set de entrenamiento, features numéricos: {}
        Set de entrenamiento, texto: {}
        Set de validación, features numéricos: {}
        Set de validación, texto: {}
        Set de test, features numéricos: {}
        Set de test, texto: {}'''.format(X_train_nf.shape, X_train_text.shape, X_val_nf.shape, X_val_text.shape, X_test_nf.shape, X_test_text.shape))

    return X_train_text, X_train_nf, X_val_text, X_val_nf, X_test_text, X_test_nf, y_train, y_val

In [126]:
X_train_text, X_train_nf, X_val_text, X_val_nf, X_test_text, X_test_nf, y_train, y_val = load_data_holdout(tw_train_noisy_balanced, 
                                                                                                           tw_test, 
                                                                                                           0.20)

Preparando las Numeric Features...
Preparando el texto... 

Las dimensiones de los sets de datos son:
        Set de entrenamiento, features numéricos: (10420, 24)
        Set de entrenamiento, texto: (10420,)
        Set de validación, features numéricos: (2606, 24)
        Set de validación, texto: (2606,)
        Set de test, features numéricos: (3263, 24)
        Set de test, texto: (3263,)


In [127]:
# PARÁMETROS

max_seq_length = 70

In [128]:
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1", trainable = True)

In [129]:
# TOKENIZADOR: Se usa el vocab_file original

FullTokenizer = bert.bert_tokenization.FullTokenizer  # Inicializo tokenizador de BERT
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

In [130]:
print('''
Verificación del parámetro max_seq_length...
El máximo largo de tokens es {}
'''.format(tw_train_noisy_balanced['text'].map(lambda x: ["[CLS]"] + tokenizer.tokenize(x) + ["[SEP]"]).map(lambda x: len(x)).max()))



Verificación del parámetro max_seq_length...
El máximo largo de tokens es 47



In [131]:
# PREPARACION DE LOS INPUTS QUE REQUIERE BERT

def get_ids(tokens, tokenizer, max_seq_length):
    # Tokenización a partir de la caja negra de BERT...
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = token_ids + [0] * (max_seq_length-len(token_ids))
    return np.array(input_ids)

def get_masks(tokens, max_seq_length):
    # 1 donde haya token, 0 donde no
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    return np.array([1]*len(tokens) + [0] * (max_seq_length - len(tokens)))


def get_segments(tokens, max_seq_length):
    # Segments == 0 para la primer secuencia y == 1 para la segunda
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segments = []
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            current_segment_id = 1
    return np.array(segments + [0] * (max_seq_length - len(tokens)))

In [132]:
# CALCULO DE LOS INPUTS

trainTokens = X_train_text.map(lambda x: ["[CLS]"] + tokenizer.tokenize(x) + ["[SEP]"])

input_ids = np.stack(trainTokens.map(lambda x: get_ids(x, tokenizer, max_seq_length)))
input_masks = np.stack(trainTokens.map(lambda x: get_masks(x, max_seq_length)))
input_segments = np.stack(trainTokens.map(lambda x: get_segments(x, max_seq_length)))

# trainInputs = [input_ids, input_masks, input_segments]

In [133]:
# CALCULO DE LOS INPUTS DE VALIDACION

valTokens = X_val_text.map(lambda x: ["[CLS]"] + tokenizer.tokenize(x) + ["[SEP]"])

input_ids_val = np.stack(valTokens.map(lambda x: get_ids(x, tokenizer, max_seq_length)))
input_masks_val = np.stack(valTokens.map(lambda x: get_masks(x, max_seq_length)))
input_segments_val = np.stack(valTokens.map(lambda x: get_segments(x, max_seq_length)))

# trainInputs = [input_ids_val, input_masks_val, input_segments_val]

In [145]:
class BertLayer(tf.keras.layers.Layer):
    def __init__(
        self,
        n_fine_tune_layers=10,
        pooling="first",
        bert_path="https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1",
        **kwargs,
    ):
        self.n_fine_tune_layers = n_fine_tune_layers
        self.trainable = True
        self.output_size = 768
        self.pooling = pooling
        self.bert_path = bert_path
        if self.pooling not in ["first", "mean"]:
            raise NameError(
                f"Undefined pooling type (must be either first or mean, but is {self.pooling}"
            )

        super(BertLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.bert = bert_layer
#         self.bert = hub.KerasLayer(
#             self.bert_path, trainable=self.trainable
#         )

        # Remove unused layers
        trainable_vars = self.bert.variables
        if self.pooling == "first":
            trainable_vars = [var for var in trainable_vars if not "/cls/" in var.name]
            trainable_layers = ["pooler/dense"]

        elif self.pooling == "mean":
            trainable_vars = [
                var
                for var in trainable_vars
                if not "/cls/" in var.name and not "/pooler/" in var.name
            ]
            trainable_layers = []
        else:
            raise NameError(
                f"Undefined pooling type (must be either first or mean, but is {self.pooling}"
            )

        # Select how many layers to fine tune
        for i in range(self.n_fine_tune_layers):
            trainable_layers.append(f"encoder/layer_{str(11 - i)}")

        # Update trainable vars to contain only the specified layers
        trainable_vars = [
            var
            for var in trainable_vars
            if any([l in var.name for l in trainable_layers])
        ]

        # Add to trainable weights
        for var in trainable_vars:
            self._trainable_weights.append(var)

        for var in self.bert.variables:
            if var not in self._trainable_weights:
                self._non_trainable_weights.append(var)

        super(BertLayer, self).build(input_shape)

    def call(self, inputs):
        inputs = [K.cast(x, dtype="int32") for x in inputs]
        input_ids, input_mask, segment_ids = inputs
        bert_inputs = dict(
            input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids
        )
        if self.pooling == "first":
            pooled = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)[
                "pooled_output"
            ]
        elif self.pooling == "mean":
            result = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)[
                "sequence_output"
            ]

            mul_mask = lambda x, m: x * tf.expand_dims(m, axis=-1)
            masked_reduce_mean = lambda x, m: tf.reduce_sum(mul_mask(x, m), axis=1) / (
                    tf.reduce_sum(m, axis=1, keepdims=True) + 1e-10)
            input_mask = tf.cast(input_mask, tf.float32)
            pooled = masked_reduce_mean(result, input_mask)
        else:
            raise NameError(f"Undefined pooling type (must be either first or mean, but is {self.pooling}")

        return pooled

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.output_size)

In [146]:
# Entradas del modelo
in_id = tf.keras.layers.Input(shape=(max_seq_length,), name="input_ids")
in_mask = tf.keras.layers.Input(shape=(max_seq_length,), name="input_masks")
in_segment = tf.keras.layers.Input(shape=(max_seq_length,), name="segment_ids")
bert_inputs = [in_id, in_mask, in_segment]

# Primer capa: BERT Custom Layer
bert_output = BertLayer(n_fine_tune_layers=10)(bert_inputs)

# Capas densas 
# dense = tf.keras.layers.Dense(256, activation='relu')(bert_output)
pred = tf.keras.layers.Dense(1, activation='sigmoid')(dense)

# El modelo
BERT_model = tf.keras.models.Model(inputs=bert_inputs, outputs=pred)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [None]:
BERT_model.summary()

In [None]:
BERT_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
BERT_model.fit([input_ids, input_masks, input_segments], 
          y_train,
          validation_data=([input_ids_val, input_masks_val, input_segments_val], y_val),
          epochs=1,
          batch_size=32)

## BERT 2

In [147]:
# SET DE DATOS: CON UN SET DE DATOS EXTENDIDO Y BALANCEADO

tw_train = pd.read_csv('./tw_train.csv')
tw_train['text'] = tw_train['text'].astype('str')
tw_train = tw_train.drop(columns = ['l_in_t', 'k_in_t', 'X'])
tw_train_noise = pd.read_csv('./tw_train_noise.csv')
tw_train_noise['text'] = tw_train_noise['text'].astype('str')
tw_train_noise = tw_train_noise.drop(columns = ['l_in_t', 'k_in_t', 'X'])
tw_train_noisy, residual_noise = add_noise(tw_train, tw_train_noise, 0.50, keep_proportion = True, return_residual_noise = True)
tw_train_noisy_balanced = balance(tw_train_noisy, residual_noise)
tw_test = pd.read_csv('./tw_test.csv')
tw_test['text'] = tw_test['text'].astype('str')
tw_test = tw_test.drop(columns = ['l_in_t', 'k_in_t', 'X'])

In [148]:
# Separación del set de validación

def load_data_holdout(TrainDF, TestDF, validation_fraction, encode_keyword = False):
    # Numeric Features: Encoding de 'keyword' + Resultado de feature engineering
    print('Preparando las Numeric Features...')
    TrainDF_subset = TrainDF.drop(columns = ['id', 'keyword', 'location', 'text', 'target'])
    TestDF_subset = TestDF.drop(columns = ['id', 'keyword', 'location', 'text'])
    if encode_keyword == True:
        ohe = OneHotEncoder(handle_unknown = 'ignore')
        ohe.fit(np.array(TrainDF.loc[TrainDF['keyword'].isna() == False, 'keyword']).reshape((-1,1)))
        TrainDF['keyword'].fillna('NoKeyword', inplace = True)
        TrainDF_keyword_enc = ohe.transform(np.array(TrainDF['keyword']).reshape((-1,1)))
        TestDF['keyword'].fillna('NoKeyword', inplace = True)
        TestDF_keyword_enc = ohe.transform(np.array(TestDF['keyword']).reshape((-1,1)))
        print('El encoding genera {} nuevos features adicionales'.format(TrainDF_keyword_enc.shape[1])) 
        TrainDF_nf = sparse.hstack((TrainDF_keyword_enc, sparse.csr_matrix(TrainDF_subset.values))).toarray()
        X_test_nf = sparse.hstack((TestDF_keyword_enc, sparse.csr_matrix(TestDF_subset.values))).toarray()
    else:
        TrainDF_nf = TrainDF_subset.to_numpy()
        X_test_nf = TestDF_subset.to_numpy()
    X_train_nf, X_val_nf, y_train, y_val = train_test_split(TrainDF_nf, TrainDF['target'], stratify=TrainDF['target'], test_size=validation_fraction, random_state=1)
    nf_norm_layer = layers.experimental.preprocessing.Normalization()
    nf_norm_layer.adapt(X_train_nf)
    X_train_nf = nf_norm_layer(X_train_nf)
    X_val_nf = nf_norm_layer(X_val_nf)
    X_test_nf = nf_norm_layer(X_test_nf)
    
    # Formateo del texto
    print('Preparando el texto... \n')
    X_train_text, X_val_text, y_train_text, y_val_text = train_test_split(TrainDF['text'], TrainDF['target'], stratify=TrainDF['target'], test_size=validation_fraction, random_state=1)
    X_test_text = TestDF['text']
    
    # Formateo del target
    y_train = np.array(y_train)
    y_val = np.array(y_val)

    print('''Las dimensiones de los sets de datos son:
        Set de entrenamiento, features numéricos: {}
        Set de entrenamiento, texto: {}
        Set de validación, features numéricos: {}
        Set de validación, texto: {}
        Set de test, features numéricos: {}
        Set de test, texto: {}'''.format(X_train_nf.shape, X_train_text.shape, X_val_nf.shape, X_val_text.shape, X_test_nf.shape, X_test_text.shape))

    return X_train_text, X_train_nf, X_val_text, X_val_nf, X_test_text, X_test_nf, y_train, y_val

In [149]:
X_train_text, X_train_nf, X_val_text, X_val_nf, X_test_text, X_test_nf, y_train, y_val = load_data_holdout(tw_train_noisy_balanced, 
                                                                                                           tw_test, 
                                                                                                           0.20)

Preparando las Numeric Features...
Preparando el texto... 

Las dimensiones de los sets de datos son:
        Set de entrenamiento, features numéricos: (10420, 24)
        Set de entrenamiento, texto: (10420,)
        Set de validación, features numéricos: (2606, 24)
        Set de validación, texto: (2606,)
        Set de test, features numéricos: (3263, 24)
        Set de test, texto: (3263,)


In [163]:
# PARÁMETROS

max_seq_length = 50

In [151]:
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1", trainable = True)

In [152]:
# TOKENIZADOR: Se usa el vocab_file original

FullTokenizer = bert.bert_tokenization.FullTokenizer  # Inicializo tokenizador de BERT
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

In [153]:
print('''
Verificación del parámetro max_seq_length...
El máximo largo de tokens es {}
'''.format(tw_train_noisy_balanced['text'].map(lambda x: ["[CLS]"] + tokenizer.tokenize(x) + ["[SEP]"]).map(lambda x: len(x)).max()))



Verificación del parámetro max_seq_length...
El máximo largo de tokens es 47



In [164]:
# PREPARACION DE LOS INPUTS QUE REQUIERE BERT

def get_ids(tokens, tokenizer, max_seq_length):
    # Tokenización a partir de la caja negra de BERT...
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = token_ids + [0] * (max_seq_length-len(token_ids))
    return np.array(input_ids)

def get_masks(tokens, max_seq_length):
    # 1 donde haya token, 0 donde no
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    return np.array([1]*len(tokens) + [0] * (max_seq_length - len(tokens)))


def get_segments(tokens, max_seq_length):
    # Segments == 0 para la primer secuencia y == 1 para la segunda
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segments = []
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            current_segment_id = 1
    return np.array(segments + [0] * (max_seq_length - len(tokens)))

In [165]:
# CALCULO DE LOS INPUTS

trainTokens = X_train_text.map(lambda x: ["[CLS]"] + tokenizer.tokenize(x) + ["[SEP]"])

input_ids = np.stack(trainTokens.map(lambda x: get_ids(x, tokenizer, max_seq_length)))
input_masks = np.stack(trainTokens.map(lambda x: get_masks(x, max_seq_length)))
input_segments = np.stack(trainTokens.map(lambda x: get_segments(x, max_seq_length)))

In [166]:
# CALCULO DE LOS INPUTS DE VALIDACION

valTokens = X_val_text.map(lambda x: ["[CLS]"] + tokenizer.tokenize(x) + ["[SEP]"])

input_ids_val = np.stack(valTokens.map(lambda x: get_ids(x, tokenizer, max_seq_length)))
input_masks_val = np.stack(valTokens.map(lambda x: get_masks(x, max_seq_length)))
input_segments_val = np.stack(valTokens.map(lambda x: get_segments(x, max_seq_length)))

In [167]:
# ESTRUCTURA DEL MODELO

input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="input_word_ids")
input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="input_mask")
segment_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="segment_ids")
bert_layer.trainable = False
pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])

nf_inputs = keras.Input(batch_size = None, shape = (X_train_nf.shape[1],))  # Entrada de los features numéricos completamente pre-procesados
nf_outputs = nf_inputs

x = layers.Concatenate(axis = 1)([pooled_output, nf_outputs])  # Combinación de las salidas de ambas ramas
x = layers.BatchNormalization(axis=-1, trainable=False)(x)


x = layers.Dense(64, activation="relu")(pooled_output)
outputs = layers.Dense(1, activation="sigmoid")(x)

BERT_model = keras.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=outputs)

In [168]:
BERT_model.summary()

Model: "model_8"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 50)]         0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 50)]         0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 50)]         0                                            
__________________________________________________________________________________________________
keras_layer_9 (KerasLayer)      [(None, 768), (None, 109482241   input_word_ids[0][0]             
                                                                 input_mask[0][0]           

In [169]:
BERT_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [170]:
BERT_model_history = BERT_model.fit([input_ids, input_masks, input_segments], y_train, 
                                    validation_data = ([input_ids_val, input_masks_val, input_segments_val], y_val), 
                                    batch_size = 32, epochs = 5, verbose = 2)

Epoch 1/5
326/326 - 1471s - loss: 0.6119 - accuracy: 0.6718 - val_loss: 0.5598 - val_accuracy: 0.7134
Epoch 2/5
326/326 - 1480s - loss: 0.5544 - accuracy: 0.7199 - val_loss: 0.5917 - val_accuracy: 0.6807
Epoch 3/5
326/326 - 1478s - loss: 0.5471 - accuracy: 0.7255 - val_loss: 0.5038 - val_accuracy: 0.7682
Epoch 4/5
326/326 - 1484s - loss: 0.5294 - accuracy: 0.7376 - val_loss: 0.5295 - val_accuracy: 0.7444
Epoch 5/5
326/326 - 1482s - loss: 0.5362 - accuracy: 0.7308 - val_loss: 0.4952 - val_accuracy: 0.7690
