In [108]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, KFold

### KERAS

#### Character process

In [109]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [110]:
text = pd.read_csv("train.csv", usecols=['text', 'target'])
test_text = pd.read_csv("test.csv", usecols=['text'])

In [111]:
text.drop_duplicates(subset = 'text', keep = False, inplace = True)
text.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7434 entries, 0 to 7612
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    7434 non-null   object
 1   target  7434 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 174.2+ KB


In [112]:
text['text'] = text['text'].apply(lambda x: x.lower())
test_text['text'] = test_text['text'].apply(lambda x: x.lower())

In [113]:
text.head()

Unnamed: 0,text,target
0,our deeds are the reason of this #earthquake m...,1
1,forest fire near la ronge sask. canada,1
2,all residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,just got sent this photo from ruby #alaska as ...,1


In [114]:
test_text.head()

Unnamed: 0,text
0,just happened a terrible car crash
1,"heard about #earthquake is different cities, s..."
2,"there is a forest fire at spot pond, geese are..."
3,apocalypse lighting. #spokane #wildfires
4,typhoon soudelor kills 28 in china and taiwan


In [115]:
tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK')
tk.fit_on_texts(text['text'])

In [116]:
# Caracteres muy horrendos
tk.word_index

{'UNK': 1,
 ' ': 2,
 'e': 3,
 't': 4,
 'a': 5,
 'o': 6,
 'i': 7,
 'n': 8,
 's': 9,
 'r': 10,
 'h': 11,
 'l': 12,
 'c': 13,
 'd': 14,
 'u': 15,
 'p': 16,
 'm': 17,
 '/': 18,
 'g': 19,
 'f': 20,
 'y': 21,
 'w': 22,
 '.': 23,
 'b': 24,
 'k': 25,
 'v': 26,
 ':': 27,
 '#': 28,
 'j': 29,
 "'": 30,
 '?': 31,
 'x': 32,
 '@': 33,
 'z': 34,
 '0': 35,
 '1': 36,
 'q': 37,
 '-': 38,
 '2': 39,
 '5': 40,
 '3': 41,
 '4': 42,
 '7': 43,
 '9': 44,
 '6': 45,
 '!': 46,
 '8': 47,
 '\n': 48,
 '_': 49,
 '\x89': 50,
 'û': 51,
 ';': 52,
 '&': 53,
 ')': 54,
 '(': 55,
 '*': 56,
 'ª': 57,
 '|': 58,
 '[': 59,
 ']': 60,
 'å': 61,
 '+': 62,
 'ï': 63,
 'ê': 64,
 '=': 65,
 '÷': 66,
 '%': 67,
 'ò': 68,
 '$': 69,
 '\x9d': 70,
 '~': 71,
 'ó': 72,
 'ì': 73,
 '©': 74,
 '¢': 75,
 '£': 76,
 '^': 77,
 '¨': 78,
 'è': 79,
 '\\': 80,
 '¼': 81,
 '}': 82,
 'ñ': 83,
 '¤': 84,
 '¡': 85,
 '`': 86,
 '{': 87,
 ',': 88,
 'ã': 89,
 'ü': 90,
 'ç': 91,
 'â': 92,
 '«': 93,
 '>': 94,
 '´': 95,
 '¬': 96}

In [117]:
alphabet="abcdefghijklmnñopqrstuvwxyz0123456789,;.!?:'\"/\\|_@#$£%^&*~`+-=<>()[]{}"
char_dict = {}
for i, char in enumerate(alphabet):
    char_dict[char] = i + 1
    
tk.word_index = char_dict.copy() 
tk.word_index[tk.oov_token] = max(char_dict.values()) + 1 #UNK es el valor mas alto

In [118]:
tk.word_index

{'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'ñ': 15,
 'o': 16,
 'p': 17,
 'q': 18,
 'r': 19,
 's': 20,
 't': 21,
 'u': 22,
 'v': 23,
 'w': 24,
 'x': 25,
 'y': 26,
 'z': 27,
 '0': 28,
 '1': 29,
 '2': 30,
 '3': 31,
 '4': 32,
 '5': 33,
 '6': 34,
 '7': 35,
 '8': 36,
 '9': 37,
 ',': 38,
 ';': 39,
 '.': 40,
 '!': 41,
 '?': 42,
 ':': 43,
 "'": 44,
 '"': 45,
 '/': 46,
 '\\': 47,
 '|': 48,
 '_': 49,
 '@': 50,
 '#': 51,
 '$': 52,
 '£': 53,
 '%': 54,
 '^': 55,
 '&': 56,
 '*': 57,
 '~': 58,
 '`': 59,
 '+': 60,
 '-': 61,
 '=': 62,
 '<': 63,
 '>': 64,
 '(': 65,
 ')': 66,
 '[': 67,
 ']': 68,
 '{': 69,
 '}': 70,
 'UNK': 71}

In [119]:
# Ahora el texto se representa con una secuencia de caracteres
sequences = tk.texts_to_sequences(text['text'])
test_sequences = tk.texts_to_sequences(test_text['text'])

In [120]:
sequences[0]

[16,
 22,
 19,
 71,
 4,
 5,
 5,
 4,
 20,
 71,
 1,
 19,
 5,
 71,
 21,
 8,
 5,
 71,
 19,
 5,
 1,
 20,
 16,
 14,
 71,
 16,
 6,
 71,
 21,
 8,
 9,
 20,
 71,
 51,
 5,
 1,
 19,
 21,
 8,
 18,
 22,
 1,
 11,
 5,
 71,
 13,
 1,
 26,
 71,
 1,
 12,
 12,
 1,
 8,
 71,
 6,
 16,
 19,
 7,
 9,
 23,
 5,
 71,
 22,
 20,
 71,
 1,
 12,
 12]

In [121]:
test_sequences[0]

[10,
 22,
 20,
 21,
 71,
 8,
 1,
 17,
 17,
 5,
 14,
 5,
 4,
 71,
 1,
 71,
 21,
 5,
 19,
 19,
 9,
 2,
 12,
 5,
 71,
 3,
 1,
 19,
 71,
 3,
 19,
 1,
 20,
 8]

In [122]:
# Padding de cada secuencia para que todas tengan el mismo largo
data = pad_sequences(sequences, maxlen=1014, padding='post')
test_data = pad_sequences(test_sequences, maxlen=1014, padding='post')

In [123]:
data = np.array(data)
data.shape

(7434, 1014)

In [124]:
test_data = np.array(test_data)
test_data.shape

(3263, 1014)

In [125]:
train_classes = text['target'].values

#### CNN

In [126]:
size = len(tk.word_index)
size

71

In [127]:
embedding_weights = []
embedding_weights.append(np.zeros(size))

for char, i in tk.word_index.items():
    row = np.zeros(size)
    row[i-1] = 1
    embedding_weights.append(row)
    
embedding_weights = np.array(embedding_weights)

print(embedding_weights.shape)

(72, 71)


In [128]:
from keras.layers import Input, Embedding, Activation, Flatten, Dense
from keras.layers import Conv1D, MaxPooling1D, Dropout
from keras.models import Model

# Parametros
input_size = 1014
embedding_size = 71
fully_connected_layers = [1024, 1024]
num_of_classes = 1
dropout_p = 0.5
optimizer = 'adam'
loss = 'binary_crossentropy'

embedding_layer = Embedding(embedding_size+1, 
                            embedding_size,
                            input_length=input_size,
                            weights=[embedding_weights])

inputs = Input(shape=(input_size,), name='input', dtype='int64')

x = embedding_layer(inputs)

In [137]:
from keras.models import Sequential
from keras import layers

model = Sequential()

# Embedding
model.add(embedding_layer)

# Conv
model.add(layers.Conv1D(256, 3, activation='relu'))
model.add(layers.MaxPool1D(pool_size=4))

# fully connected
model.add(layers.Dense(1024, activation='relu'))

# Output
model.add(layers.Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential_126"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_88 (Embedding)     (None, 1014, 71)          5112      
_________________________________________________________________
conv1d_144 (Conv1D)          (None, 1012, 256)         54784     
_________________________________________________________________
max_pooling1d_50 (MaxPooling (None, 253, 256)          0         
_________________________________________________________________
dense_186 (Dense)            (None, 253, 1024)         263168    
_________________________________________________________________
dense_187 (Dense)            (None, 253, 1)            1025      
Total params: 324,089
Trainable params: 324,089
Non-trainable params: 0
_________________________________________________________________


In [138]:
x_train, x_test, y_train, y_test = \
train_test_split(data, train_classes, test_size = 0.25, random_state = 123)

In [139]:
from keras.callbacks import EarlyStopping
callback = EarlyStopping(monitor = 'val_loss', patience = 1, verbose=1)
callbacks = [callback]

model1.fit(x_train, y_train,
          validation_data=(x_test, y_test),
          batch_size=128,
          epochs=10,
          verbose=2,
          callbacks=callback)

Epoch 1/10
44/44 - 27s - loss: 0.5642 - accuracy: 0.6738 - val_loss: 0.8166 - val_accuracy: 0.5912
Epoch 2/10
44/44 - 28s - loss: 0.5642 - accuracy: 0.6749 - val_loss: 0.8169 - val_accuracy: 0.5906
Epoch 00002: early stopping


<tensorflow.python.keras.callbacks.History at 0x14743fd10>

In [102]:
def create_model(num_filters, kernel_size, vocab_size, embedding_dim, maxlen):
    model = Sequential()
    model.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen, weights=[embedding_weights], trainable=True))
    model.add(layers.Conv1D(num_filters, kernel_size, activation='relu'))
    model.add(layers.GlobalMaxPooling1D())
    model.add(layers.Dense(10, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

In [105]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import RandomizedSearchCV

param_grid = dict(num_filters=[128, 144, 256],
                      kernel_size=[3, 5, 7],
                      vocab_size=[80],
                      embedding_dim=[79],
                      maxlen=[1014],
                      batch_size = [45,65,76,88])

model = KerasClassifier(build_fn=create_model,
                            epochs=15, validation_split=0.1,
                            verbose=1)

grid = RandomizedSearchCV(estimator=model, param_distributions=param_grid,
                              cv=4, verbose=2, n_iter=5, n_jobs=1)

grid_result = grid.fit(x_train, y_train, callbacks=[callback])

# Evaluate testing set
#test_accuracy = grid.score(x_test, y_test)

# Save and evaluate results
s = ('Best Accuracy : {:.4f}\n{}\n\n\n')
output_string = s.format(
            grid_result.best_score_,
            grid_result.best_params_)
            
print(output_string)

Fitting 4 folds for each of 5 candidates, totalling 20 fits
[CV] vocab_size=80, num_filters=128, maxlen=1014, kernel_size=7, embedding_dim=79, batch_size=65 
Epoch 1/15


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 00007: early stopping
[CV]  vocab_size=80, num_filters=128, maxlen=1014, kernel_size=7, embedding_dim=79, batch_size=65, total= 1.5min
[CV] vocab_size=80, num_filters=128, maxlen=1014, kernel_size=7, embedding_dim=79, batch_size=65 
Epoch 1/15


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.5min remaining:    0.0s


Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 00007: early stopping
[CV]  vocab_size=80, num_filters=128, maxlen=1014, kernel_size=7, embedding_dim=79, batch_size=65, total= 1.5min
[CV] vocab_size=80, num_filters=128, maxlen=1014, kernel_size=7, embedding_dim=79, batch_size=65 
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 00007: early stopping
[CV]  vocab_size=80, num_filters=128, maxlen=1014, kernel_size=7, embedding_dim=79, batch_size=65, total= 1.4min
[CV] vocab_size=80, num_filters=128, maxlen=1014, kernel_size=7, embedding_dim=79, batch_size=65 
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 00005: early stopping
[CV]  vocab_size=80, num_filters=128, maxlen=1014, kernel_size=7, embedding_dim=79, batch_size=65, total=  58.2s
[CV] vocab_size=80, num_filters=128, maxlen=1014, kernel_size=5, embedding_dim=79, batch_size=76 
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 00003: early stopping
[CV]  vocab_size=80, num_filters=128, maxlen=1014, kernel_size=5, embedding_dim=79, batch_size=76, total=  27.9s
[CV] vocab_size=80, num_filters=144, maxlen=1014, kernel_size=7, embedding_dim=79, batch_size=45 
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 00003: early stopping
[CV]  vocab_size=80, num_filters=144, maxlen=1014, kernel_size=7, embedding_dim=79, batch_size=45, total=  39.0s
[CV] vocab_size=80, num_filters=144, maxlen=1014, kernel_size=7, embedding_dim=79, batch_size=45 
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 00006: early stopping
[CV]  vocab_size=80, num_filters=144, maxlen=1014, kernel_size=7, embedding_dim=79, batch_size=45, total= 1.3min
[CV] vocab_size=80, num_filters=144, maxlen=1014, kernel_size=7, embedding_dim=79, batch_size=45 
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 00006: early stopping
[CV]  vocab_size=80, num_filters=144, maxlen=1014, kernel

Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 00004: early stopping
[CV]  vocab_size=80, num_filters=128, maxlen=1014, kernel_size=7, embedding_dim=79, batch_size=45, total=54.6min
[CV] vocab_size=80, num_filters=144, maxlen=1014, kernel_size=3, embedding_dim=79, batch_size=88 
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 00008: early stopping
[CV]  vocab_size=80, num_filters=144, maxlen=1014, kernel_size=3, embedding_dim=79, batch_size=88, total=161.0min
[CV] vocab_size=80, num_filters=144, maxlen=1014, kernel_size=3, embedding_dim=79, batch_size=88 
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 00009: early stopping
[CV]  vocab_size=80, num_filters=144, maxlen=1014, kernel_size=3, embedding_dim=79, batch_size=88, total=161.5min
[CV] vocab_size=80, num_filters=144, maxlen=1014, kernel_size=3, embedding_dim=79, batch_size=88 
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed: 653.6min finished


Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 00007: early stopping
Best Accuracy : 0.7761
{'vocab_size': 80, 'num_filters': 128, 'maxlen': 1014, 'kernel_size': 7, 'embedding_dim': 79, 'batch_size': 45}



