# Modelos con Conv1d

In [26]:
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dropout, Flatten, Concatenate, Embedding, Conv1D, GlobalMaxPooling1D, Dense, Lambda, Activation, GaussianNoise, GaussianDropout
from keras import layers, Input, Model
from keras.callbacks import EarlyStopping
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from keras.optimizers import Adam
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score
from tensorflow.keras.constraints import max_norm, unit_norm, min_max_norm
from tensorflow import keras
from keras import layers

### CNN Básico. 1er modelo. -> 1 Conv1d + Maxpool + 1 Densa

In [51]:
# Kaggle test set: only the id and the raw tweet text are kept.
test = pd.read_csv('test.csv')[['id', 'text']]
x_test_kagle = test['text'].values

# Sequence length and embedding width used by the baseline model.
maxlen = 100
embedding_dim = 100

# Hold out 25% of the labelled tweets; the seed keeps the split repeatable.
x, y = tweets_metrics['text'].values, tweets_metrics['target'].values
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=123)

# The vocabulary is fitted on the training split only.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(x_train)
# One extra row beyond the number of distinct words seen by the tokenizer.
vocab_size = 1 + len(tokenizer.word_index)

# Texts -> integer sequences -> fixed-length arrays padded at the end.
x_train = pad_sequences(tokenizer.texts_to_sequences(x_train),
                        padding='post', maxlen=maxlen)
x_test = pad_sequences(tokenizer.texts_to_sequences(x_test),
                       padding='post', maxlen=maxlen)

In [None]:
# Baseline CNN: trainable embedding -> one Conv1D (128 filters, width 5)
# -> global max pooling -> small dense head with a sigmoid output.
model = Sequential([
    layers.Embedding(vocab_size, embedding_dim, input_length=maxlen),
    layers.Conv1D(128, 5, activation='relu'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(10, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.summary()

In [None]:
# Train for two epochs, using the held-out split as validation data.
history = model.fit(x=x_train, y=y_train,
                    batch_size=65,
                    epochs=2,
                    validation_data=(x_test, y_test),
                    verbose=1)

# Report accuracy on the training split, then on the held-out split.
loss, accuracy = model.evaluate(x=x_train, y=y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))

loss, accuracy = model.evaluate(x=x_test, y=y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

### CNN Básico + Word embedding [Glove]. Resultado: 0,80570. -> 2 Conv1d + Maxpool + 1 Densa.

In [None]:
# Sequence length and embedding width for the GloVe-initialised model.
maxlen = 140
embedding_dim = 100

# Hold out 23% of the labelled tweets; the seed keeps the split repeatable.
x, y = tweets_metrics['text'].values, tweets_metrics['target'].values
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.23, random_state=123)

# The vocabulary is fitted on the training split only.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(x_train)
vocab_size = 1 + len(tokenizer.word_index)

# Texts -> integer sequences -> fixed-length arrays padded at the end.
x_train = pad_sequences(tokenizer.texts_to_sequences(x_train),
                        padding='post', maxlen=maxlen)
x_test = pad_sequences(tokenizer.texts_to_sequences(x_test),
                       padding='post', maxlen=maxlen)

# Initial embedding weights built from the GloVe Twitter 100d file for the
# tokenizer's word index (create_embedding_matrix is defined elsewhere).
embedding_matrix = create_embedding_matrix(
    'Embeddings/glove.twitter.27B.100d.txt', tokenizer.word_index, embedding_dim)

In [None]:
# GloVe-initialised (and fine-tuned) embedding followed by two stacked
# Conv1D layers, global max pooling and a small dense head.
model = Sequential([
    layers.Embedding(vocab_size, embedding_dim, input_length=maxlen,
                     weights=[embedding_matrix], trainable=True),
    layers.Conv1D(128, 5, activation='relu'),
    layers.Conv1D(32, 5, activation='relu'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(12, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.summary()

In [None]:
# Stop once validation loss fails to improve for one epoch.
callback = EarlyStopping(patience=1, monitor='val_loss')

# Up to 15 epochs; 10% of the training data is set aside for validation.
history = model.fit(x=x_train, y=y_train,
                    batch_size=88,
                    epochs=15,
                    validation_split=0.1,
                    callbacks=[callback],
                    verbose=1)

In [None]:
# Accuracy on the training split.
loss, accuracy = model.evaluate(x=x_train, y=y_train, verbose=1)
print("Training Accuracy: {:.4f}".format(accuracy))

# Accuracy and loss on the held-out split.
loss, accuracy = model.evaluate(x=x_test, y=y_test, verbose=1)
print("Testing Accuracy:  {:.4f}, Loss  {:.4f}".format(accuracy, loss))

### Random Search sobre CNN básico. -> 1 Conv1d + Maxpool + 1 Densa.

In [27]:
def create_model(num_filters, kernel_size, vocab_size, embedding_dim, maxlen):
    """Build and compile the single-Conv1D text classifier used by the search.

    Args:
        num_filters: number of filters of the Conv1D layer.
        kernel_size: width of the Conv1D kernel.
        vocab_size: number of rows of the embedding table.
        embedding_dim: width of each embedding vector.
        maxlen: length of the padded input sequences.

    Returns:
        A compiled Sequential model (adam, binary cross-entropy, accuracy).

    Note:
        The embedding is initialised from the notebook-level variable
        ``embedding_matrix``, which must exist before this is called.
    """
    embedding_layer = layers.Embedding(vocab_size, embedding_dim,
                                       input_length=maxlen,
                                       weights=[embedding_matrix],
                                       trainable=True)
    network = Sequential([
        embedding_layer,
        layers.Conv1D(num_filters, kernel_size, activation='relu'),
        layers.GlobalMaxPooling1D(),
        layers.Dense(10, activation='relu'),
        layers.Dense(1, activation='sigmoid'),
    ])
    network.compile(loss='binary_crossentropy',
                    optimizer='adam',
                    metrics=['accuracy'])
    return network

In [51]:
# Stop a fit once validation loss fails to improve for one epoch.
callback = EarlyStopping(monitor='val_loss', patience=1, verbose=1)

# The search runs on the full labelled set; cross-validation does the splitting.
x_train = tweets_metrics['text'].values
y_train = tweets_metrics['target'].values

epochs = 15
embedding_dim = 100
maxlen = 140

tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(x_train)
x_train = tokenizer.texts_to_sequences(x_train)

vocab_size = len(tokenizer.word_index) + 1

x_train = pad_sequences(x_train, padding='post', maxlen=maxlen)

# create_model reads this notebook-level matrix to initialise its embedding.
embedding_matrix = create_embedding_matrix('Embeddings/glove.twitter.27B.100d.txt',
                                           tokenizer.word_index, embedding_dim)

# Only num_filters, kernel_size and batch_size actually vary; the remaining
# entries are single-valued and just forward fixed arguments to create_model.
param_grid = dict(num_filters=[32, 128, 144],
                  kernel_size=[3, 5, 7],
                  vocab_size=[vocab_size],
                  embedding_dim=[embedding_dim],
                  maxlen=[maxlen],
                  batch_size=[45, 65, 76, 88])

model = KerasClassifier(build_fn=create_model,
                        epochs=epochs, validation_split=0.1,
                        verbose=1)

# random_state fixes which 5 candidates are sampled, so the search is
# repeatable (same seed as the train/test splits in this notebook).
grid = RandomizedSearchCV(estimator=model, param_distributions=param_grid,
                          cv=4, verbose=2, n_iter=5, n_jobs=1,
                          random_state=123)

# Extra fit kwargs are forwarded to the estimator's fit for every candidate.
grid_result = grid.fit(x_train, y_train, callbacks=[callback])

s = ('Best Accuracy : {:.4f}\n{}\n\n\n')
output_string = s.format(
            grid_result.best_score_,
            grid_result.best_params_)
print(output_string)

Fitting 4 folds for each of 5 candidates, totalling 20 fits
[CV] vocab_size=22811, num_filters=128, maxlen=140, kernel_size=5, embedding_dim=100, batch_size=45 
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Train on 5017 samples, validate on 558 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 00003: early stopping
[CV]  vocab_size=22811, num_filters=128, maxlen=140, kernel_size=5, embedding_dim=100, batch_size=45, total=  29.7s
[CV] vocab_size=22811, num_filters=128, maxlen=140, kernel_size=5, embedding_dim=100, batch_size=45 
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   29.6s remaining:    0.0s
Train on 5017 samples, validate on 558 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 00003: early stopping
[CV]  vocab_size=22811, num_filters=128, maxlen=140, kernel_size=5, embedding_dim=100, batch_size=45, total=  29.8s
[CV] vocab_size=22811, num_filters=128, maxlen=140, kernel_size=5, embedding_dim=100, batch_size=45 
Train on 5018 samples, valid

### Posterior ejecución del modelo a partir de resultados de la Random Search. Resultado: 0,81274.

In [56]:
# Sequence length and embedding width for the tuned model.
maxlen = 140
embedding_dim = 100

# Hold out 25% of the labelled tweets; the seed keeps the split repeatable.
x, y = tweets_metrics['text'].values, tweets_metrics['target'].values
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=123)

# The vocabulary is fitted on the training split only.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(x_train)
vocab_size = 1 + len(tokenizer.word_index)

# Texts -> integer sequences -> fixed-length arrays padded at the end.
x_train = pad_sequences(tokenizer.texts_to_sequences(x_train),
                        padding='post', maxlen=maxlen)
x_test = pad_sequences(tokenizer.texts_to_sequences(x_test),
                       padding='post', maxlen=maxlen)

# Initial embedding weights built from the GloVe Twitter 100d file.
embedding_matrix = create_embedding_matrix(
    'Embeddings/glove.twitter.27B.100d.txt', tokenizer.word_index, embedding_dim)

In [60]:
# Single Conv1D (128 filters, width 7) on top of a GloVe-initialised,
# fine-tuned embedding, then global max pooling and a small dense head.
model = Sequential([
    layers.Embedding(vocab_size, embedding_dim, input_length=maxlen,
                     weights=[embedding_matrix], trainable=True),
    layers.Conv1D(128, 7, activation='relu'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(10, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.summary()

Model: "sequential_156"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_156 (Embedding)    (None, 140, 100)          2281100   
_________________________________________________________________
conv1d_156 (Conv1D)          (None, 134, 128)          89728     
_________________________________________________________________
global_max_pooling1d_156 (Gl (None, 128)               0         
_________________________________________________________________
dense_311 (Dense)            (None, 10)                1290      
_________________________________________________________________
dense_312 (Dense)            (None, 1)                 11        
Total params: 2,372,129
Trainable params: 2,372,129
Non-trainable params: 0
_________________________________________________________________


In [None]:
# EarlyStopping comes from the notebook's import cell; it is not re-imported here.
# Stop once validation loss fails to improve for one epoch.
callback = EarlyStopping(monitor = 'val_loss', patience = 1, verbose=1)

# Up to 3 epochs; 10% of the training data is set aside for validation.
history = model.fit(x_train, y_train,
                    epochs=3,
                    verbose=1,
                    validation_split=0.1,
                    batch_size=88,
                    callbacks = [callback])

In [None]:
# Accuracy on the training split.
loss, accuracy = model.evaluate(x=x_train, y=y_train, verbose=1)
print("Training Accuracy: {:.4f}".format(accuracy))

# Accuracy and loss on the held-out split.
loss, accuracy = model.evaluate(x=x_test, y=y_test, verbose=1)
print("Testing Accuracy:  {:.4f}, Loss  {:.4f}".format(accuracy, loss))

### Posterior ejecución del modelo a partir de resultados de la Grid Search. Resultado: 0,81703. -> 1 Conv1d + 2 Densas. Con Glove y features de texto concatenados.

In [27]:
# Fit the scaler on the hand-crafted features and keep it so that the same
# scaling can be applied to other data later.
features = StandardScaler().fit(basic_features)
x_train_features = features.transform(basic_features)

maxlen = 140
embedding_dim_1 = 100

y_train = tweets_metrics['target'].values

# No num_words cap here: every word seen in the corpus keeps its own index.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tweets_metrics['text'].values)
vocab_size = 1 + len(tokenizer.word_index)

# Texts -> integer sequences -> fixed-length arrays padded at the end.
x_train = pad_sequences(
    tokenizer.texts_to_sequences(tweets_metrics['text'].values),
    padding='post', maxlen=maxlen)

# Initial embedding weights built from the GloVe Twitter 100d file.
embedding_matrix_1 = create_embedding_matrix(
    'Embeddings/glove.twitter.27B.100d.txt', tokenizer.word_index, embedding_dim_1)

In [109]:
def create_conv1d():
    """Build and compile the two-input CNN (token sequence + scaled features).

    The token branch is a GloVe-initialised, trainable embedding followed by
    one Conv1D (128 filters, width 2) and global max pooling. Its output is
    concatenated with a 15-wide feature input and passed through dense layers
    of 100 and 50 units to a single sigmoid unit.

    Reads the notebook-level variables ``vocab_size``, ``embedding_dim_1``,
    ``maxlen`` and ``embedding_matrix_1``.

    Returns:
        A compiled functional Model taking ``[sequences, features]``.
    """
    tokens_in = Input(shape=(maxlen,), name='x_train_input')
    feats_in = Input(shape=(15,), name='x_features_train')

    embedded = Embedding(vocab_size, embedding_dim_1, input_length=maxlen,
                         weights=[embedding_matrix_1], trainable=True)(tokens_in)
    pooled = GlobalMaxPooling1D()(Conv1D(128, 2, activation='relu')(embedded))

    # Text representation and hand-crafted features share the dense head.
    hidden = Concatenate()([pooled, feats_in])
    for units in (100, 50):
        hidden = Dense(units, activation='relu')(hidden)
    prob = Dense(1, activation='sigmoid')(hidden)

    network = Model(inputs=[tokens_in, feats_in], outputs=prob)
    network.compile(optimizer=Adam(),  # default optimizer settings
                    loss='binary_crossentropy',
                    metrics=['accuracy'])
    return network

In [None]:
model = create_conv1d()

# Both inputs are passed in the order the model declares them
# (token sequences first, scaled features second).
history = model.fit(x=[x_train, x_train_features], y=y_train,
                    batch_size=45,
                    epochs=3,
                    validation_split=0.2,
                    verbose=1)

### Multi-Channel CNN + Multi-word embedding. Se utilizan 3 pre-trained embeddings de Glove. -> 2 Conv1d en paralelo para cada word embedding + posterior concatenación (total 6).

In [27]:
# Fit the scaler on the hand-crafted features and keep it so that the same
# scaling can be applied to other data later.
features = StandardScaler().fit(basic_features)
x_train_features = features.transform(basic_features)

maxlen = 50

y_train = tweets_metrics['target'].values

# No num_words cap here: every word seen in the corpus keeps its own index.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tweets_metrics['text'].values)
vocab_size = 1 + len(tokenizer.word_index)

# Texts -> integer sequences -> fixed-length arrays padded at the end.
x_train = pad_sequences(
    tokenizer.texts_to_sequences(tweets_metrics['text'].values),
    padding='post', maxlen=maxlen)

In [29]:
# Each width matches the dimensionality given in the corresponding file name.
embedding_dim_1, embedding_dim_2, embedding_dim_3 = 100, 300, 200

# GloVe Twitter 27B, 100d.
embedding_matrix_1 = create_embedding_matrix(
    'Embeddings/glove.twitter.27B.100d.txt', tokenizer.word_index, embedding_dim_1)

# GloVe 840B, 300d. This file is loaded through its own helper.
embedding_matrix_2 = create_embedding_matrix_840(
    'Embeddings/glove.840B.300d.txt', tokenizer.word_index, embedding_dim_2)

# GloVe 6B, 200d.
embedding_matrix_3 = create_embedding_matrix(
    'Embeddings/glove.6B.200d.txt', tokenizer.word_index, embedding_dim_3)

In [109]:
def create_conv1d():
    """Build and compile the multi-channel, multi-embedding CNN classifier.

    The same token sequence is embedded three times, once per pre-trained
    GloVe matrix. Each embedding feeds two parallel Conv1D layers (kernel
    widths 2 and 3, 128 filters each) followed by global max pooling. The six
    pooled vectors are concatenated with the 15-wide feature input and passed
    through a dense head ending in a single sigmoid unit.

    Reads the notebook-level variables ``vocab_size``, ``maxlen``,
    ``embedding_dim_1..3`` and ``embedding_matrix_1..3``.

    Returns:
        A compiled functional Model taking ``[sequences, features]``.
    """
    # Only the second embedding is fine-tuned; the other two stay frozen.
    embedding = Embedding(vocab_size, embedding_dim_1, input_length=maxlen, weights=[embedding_matrix_1], trainable=False)
    embedding2 = Embedding(vocab_size, embedding_dim_2, input_length=maxlen, weights=[embedding_matrix_2], trainable=True)
    embedding3 = Embedding(vocab_size, embedding_dim_3, input_length=maxlen, weights=[embedding_matrix_3], trainable=False)

    x_train_input = Input(shape = (maxlen,), name = 'x_train_input')
    x_train_features_input = Input(shape = (15, ), name = 'x_features_train')

    emb = embedding(x_train_input)
    emb2 = embedding2(x_train_input)
    emb3 = embedding3(x_train_input)

    # Conv1D already applies ReLU, so no separate Activation layer is needed.

    # Channel 1 (embedding_matrix_1): two parallel convolutions.
    conv_out1_1 = Conv1D(128, 2, activation='relu')(emb)
    max_pool1_1 = GlobalMaxPooling1D()(conv_out1_1)
    conv_out1_2 = Conv1D(128, 3, activation='relu')(emb)
    max_pool1_2 = GlobalMaxPooling1D()(conv_out1_2)

    # Channel 2 (embedding_matrix_2, trainable): two parallel convolutions,
    # both reading emb2, with max-norm constraints on kernel and bias.
    conv_out2_1 = Conv1D(128, 2, activation='relu', kernel_constraint=max_norm(3), bias_constraint=max_norm(3))(emb2)
    max_pool2_1 = GlobalMaxPooling1D()(conv_out2_1)
    conv_out2_2 = Conv1D(128, 3, activation='relu', kernel_constraint=max_norm(3), bias_constraint=max_norm(3))(emb2)
    max_pool2_2 = GlobalMaxPooling1D()(conv_out2_2)

    # Channel 3 (embedding_matrix_3): two parallel convolutions.
    conv_out3_1 = Conv1D(128, 2, activation='relu')(emb3)
    max_pool3_1 = GlobalMaxPooling1D()(conv_out3_1)
    conv_out3_2 = Conv1D(128, 3, activation='relu')(emb3)
    max_pool3_2 = GlobalMaxPooling1D()(conv_out3_2)

    # Merge all six pooled branches with the hand-crafted features, so that
    # every embedding contributes to the prediction.
    conc = Concatenate()([
        max_pool1_1, max_pool1_2,
        max_pool2_1, max_pool2_2,
        max_pool3_1, max_pool3_2,
        x_train_features_input,
    ])

    dense1 = Dense(100, activation='relu', kernel_regularizer=keras.regularizers.l2(0.01))(conc)
    noise1 = GaussianNoise(0.1)(dense1)
    dense2 = Dense(10, activation='relu')(noise1)
    dense3 = Dense(1, activation='sigmoid')(dense2)

    model = Model(inputs = [x_train_input , x_train_features_input], outputs = dense3)
    optimizer = Adam() #default
    model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=['accuracy'])
    return model

In [112]:
model = create_conv1d()

# Both inputs are passed in the order the model declares them
# (token sequences first, scaled features second).
history = model.fit(x=[x_train, x_train_features], y=y_train,
                    batch_size=69,
                    epochs=3,
                    validation_split=0.2,
                    verbose=1)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
# Kaggle test set restricted to the columns needed for the submission.
test_kagle = test[['id','text']]

# Apply the tokenizer and padding already configured for the training data.
x_test_kagle = test_kagle['text'].values
x_test_kagle = tokenizer.texts_to_sequences(x_test_kagle)
x_test_kagle = pad_sequences(x_test_kagle, padding='post', maxlen=maxlen)
# Reuse the already-fitted scaler: transform only, never refit on test data.
x_test_features = features.transform(basic_features_test)

submit_df = pd.DataFrame()
submit_df['id'] = test_kagle['id']
# The model ends in a single sigmoid unit, so predict() returns one column;
# flatten it explicitly instead of relying on pandas to accept a 2-D array.
submit_df['prob'] = model.predict([x_test_kagle, x_test_features]).ravel()
# Vectorised threshold: 0 when prob < 0.5, otherwise 1.
submit_df['target'] = (~submit_df['prob'].lt(0.5)).astype(int)