In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import matplotlib
from matplotlib import pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

* Read csv, remove punctuation and stopwords, apply stemming, and replace missing keywords with 'NULL'

In [None]:
# Load the train/test CSVs and mark missing keywords with the literal
# string 'NULL' in both frames.
tweets = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
for _frame in (tweets, test):
    _frame['keyword'] = _frame['keyword'].fillna('NULL')

In [None]:
duplicates = tweets.duplicated(subset = 'text', keep = False)
duplicates.value_counts()

In [None]:
# keep=False removes every row whose text occurs more than once (no copy of a
# duplicated tweet is kept); inplace=True mutates `tweets` directly, so the
# index keeps gaps until it is reset further down.
tweets.drop_duplicates(subset = 'text', keep = False, inplace = True)
tweets.info()

In [None]:
# Keywords are URL-encoded: turn '%20' back into a space. The pattern is a
# plain literal, so request literal (non-regex) replacement explicitly.
tweets['keyword'] = tweets['keyword'].str.replace('%20', ' ', regex=False)

In [None]:
import string

# Build the punctuation -> space translation table once; the original rebuilt
# the same dict for every single tweet inside the lambda.
punctuation_to_space = str.maketrans({mark: ' ' for mark in string.punctuation})

# Replace every ASCII punctuation character with a space (not removed, so
# words joined by punctuation are split apart).
tweets['text'] = tweets['text'].apply(lambda tweet: tweet.translate(punctuation_to_space))
tweets.head()

In [None]:
# Lower-case every tweet with the vectorised string accessor instead of a
# per-row Python lambda (missing values, if any, pass through instead of raising).
tweets['text'] = tweets['text'].str.lower()

In [None]:
# NOTE(review): `io` is not used anywhere in the visible notebook.
import io
import nltk
# Download the NLTK stop-word corpus so that stopwords.words() can read it.
nltk.download('stopwords')
from nltk.corpus import stopwords
# NOTE: this rebinds the name `stopwords` from the corpus reader imported on the
# previous line to a plain list of English stop words; `remove_stopword` below
# reads this list. The line only works right after the import above has run.
stopwords = stopwords.words('english')

In [None]:
def remove_stopword(text):
    """Drop stop words and non-alphabetic tokens from a tokenised tweet.

    Parameters
    ----------
    text : iterable of str
        The tokens of one tweet (the caller passes the result of ``str.split``).

    Returns
    -------
    str
        The surviving tokens joined with single spaces; an empty string when
        nothing survives.

    Notes
    -----
    Reads the module-level ``stopwords`` collection. A token is kept only when
    it is not in ``stopwords`` and ``str.isalpha()`` is true, so tokens that
    contain digits are discarded as well.
    """
    # The original also rebound `text` to the filtered list right before
    # returning; that assignment had no effect and has been dropped.
    return " ".join(
        token for token in text
        if token not in stopwords and token.isalpha()
    )

In [None]:
# Tokenise each tweet on whitespace, then filter the tokens with
# remove_stopword, which joins the survivors back into one string.
tweets['text'] = tweets['text'].str.split().apply(remove_stopword)
tweets.head()

In [None]:
stemmer = SnowballStemmer('english')

In [None]:
def stemm(text):
    """Stem every whitespace-separated word of ``text``.

    Each word is passed through the module-level ``stemmer`` and the results
    are joined back together with single spaces.
    """
    stemmed_words = map(stemmer.stem, text.split())
    return " ".join(stemmed_words)

In [None]:
tweets['text'] = tweets['text'].apply(stemm)
tweets.head()

In [None]:
tweets.reset_index(drop=True, inplace=True)

In [None]:
tweets.head()

### Clustering graphic

* SVD

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
x_train_tfidf = tfidf_vectorizer.fit_transform(tweets['text'])

In [None]:
from sklearn.decomposition import TruncatedSVD

# Project the TF-IDF matrix onto 2 components for plotting. The solver is
# randomised, so pin the seed (same value the notebook uses for its splits)
# to make the scatter plot reproducible across runs.
svd = TruncatedSVD(n_components=2, random_state=123)
svd.fit(x_train_tfidf)
svd_result = svd.transform(x_train_tfidf)

In [None]:
to_plot = pd.DataFrame({'x': svd_result[:, 0], 'y': svd_result[:, 1], 'target': tweets['target']})

In [None]:
to_plot.reset_index(inplace=True, drop=True)

In [None]:
ax = to_plot[to_plot['target'] == 1].plot.scatter(x='x', y='y', s=8, alpha=0.8, color='blue', label='Real', figsize=(10, 10))
to_plot[to_plot['target'] == 0].plot.scatter(x='x', y='y', s=8, alpha=0.8, color='orange', label='Not real', ax=ax)

### TF-IDF matrix

In [None]:
# Hold out 25% of the tweets. The label frame keeps both 'id' and 'target',
# so later cells select y_train['target'] / y_test['target'].
x_train, x_test, y_train, y_test = \
train_test_split(tweets['text'], tweets.loc[:,['id','target']], test_size = 0.25, random_state = 123)

from sklearn.feature_extraction.text import TfidfVectorizer
# Fit the vocabulary / IDF weights on the training split only and reuse them
# for the held-out split.
tfidf_vectorizer = TfidfVectorizer()
train_vectors = tfidf_vectorizer.fit_transform(x_train)
test_vectors = tfidf_vectorizer.transform(x_test)
# Dense copy of the training matrix, used only to build `tfidf_matrix` for
# inspection; the classifiers below are fit on the sparse `train_vectors`.
array = train_vectors.todense()

tfidf_matrix = pd.DataFrame(array)
tfidf_matrix.head(10)

In [None]:
y_train.reset_index(drop=True, inplace=True)
y_train.head()

#### Basic Naive Bayes 

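En el caso de una predicción binaria se prueba Bernoulli Naive Bayes. Nota: `BernoulliNB` modela *features* binarias (presencia/ausencia de cada término), no un target binario; por defecto (`binarize=0.0`) convierte cada valor TF-IDF positivo en 1.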

In [None]:
from sklearn.naive_bayes import BernoulliNB

bernoulli = BernoulliNB().fit(train_vectors, y_train['target'])

In [None]:
from sklearn.metrics import accuracy_score
predicted = bernoulli.predict(test_vectors)
print(accuracy_score(y_test['target'], predicted))

#### Tuning

In [None]:
tfidf_matrix.head()

In [None]:
features = tfidf_matrix.columns

In [None]:
alpha = np.linspace(0.5, 1.0, 20)
alpha = np.around(alpha, decimals=4)
alpha

In [None]:
grid = [{"alpha":alpha}]

In [None]:
from sklearn.model_selection import GridSearchCV

#classifier = BernoulliNB()
#gridsearch = GridSearchCV(classifier, grid, scoring = 'neg_log_loss', cv = 4)
#gridsearch.fit(df[features], y_train['target'])
#print("Best parameter: ",gridsearch.best_params_)

In [None]:
# alpha=0.9474 is one of the values in the `alpha` grid built above;
# presumably the best value found by the grid search that is now commented
# out — TODO confirm.
bernoulli_t = BernoulliNB(alpha=0.9474).fit(train_vectors, y_train['target'])
predicted_t = bernoulli_t.predict(test_vectors)
print(accuracy_score(y_test['target'], predicted_t))

### LightGBM

* Count vectorization

In [None]:
from sklearn.metrics import classification_report

In [None]:
# New split: features are now the 'text' and 'keyword' columns and the label
# is the 'target' Series (not the id/target frame used in the Naive Bayes section).
x_train, x_test, y_train, y_test = \
train_test_split(tweets[['text', 'keyword']], tweets['target'], test_size = 0.25, random_state = 123)

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# NOTE: despite its name, `tfidf_vectorizer` holds a CountVectorizer here
# (unigram + bigram counts, at most 5000 features), fit on the training text only.
tfidf_vectorizer = CountVectorizer(ngram_range=(1,2), lowercase=True, stop_words='english', max_features=5000)
train_vectors = tfidf_vectorizer.fit_transform(x_train['text'])
test_vectors = tfidf_vectorizer.transform(x_test['text'])

In [None]:
# Separate CountVectorizer for the keyword column (at most 300 features).
# NOTE: this rebinds `tfidf_vectorizer`, so the vectorizer fitted on the tweet
# text in the previous cell is no longer reachable through that name.
tfidf_vectorizer = CountVectorizer(ngram_range=(1,2), lowercase=True, stop_words='english', max_features=300)
key_train_vectors = tfidf_vectorizer.fit_transform(x_train['keyword'])
key_test_vectors = tfidf_vectorizer.transform(x_test['keyword'])

In [None]:
from scipy.sparse import hstack
matrix_final = hstack([train_vectors, key_train_vectors])

In [None]:
matrix_final = matrix_final.astype('float32')

In [None]:
import lightgbm as lgb
d_train = lgb.Dataset(matrix_final, label=y_train)

params = {
    'learning_rate' : 0.02,
    'boosting_type' : 'gbdt',
    'objective' : 'binary',
    'metric' : 'binary_logloss',
    'num_leaves' : 50,
    'max_depth' : 5
}

gbm = lgb.train(params, d_train, 5000)

In [None]:
# Held-out matrix with the same column layout as the training matrix:
# text counts followed by keyword counts, cast to float32.
test_final = hstack([test_vectors, key_test_vectors]).astype('float32')

# Turn the predicted probabilities into hard labels in one vectorised step
# (probability >= 0.5 -> 1). The values stay floats (0.0 / 1.0), as the
# original element-wise loop left them.
y_pred = np.where(gbm.predict(test_final) >= 0.5, 1.0, 0.0)

from sklearn.metrics import accuracy_score
# accuracy_score expects (y_true, y_pred); the score is the same either way,
# but the documented order keeps it consistent with classification_report.
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

In [None]:
print(classification_report(y_test, y_pred))

### Features

In [None]:
tweets_features = pd.read_csv("train_features.csv")
test_features = pd.read_csv("test_features.csv")
keyword_w2v = pd.read_csv("keyword_features.csv")

In [None]:
tweets_features.head(1)

In [None]:
test_features.head(1)

In [None]:
tweets_features = tweets_features.drop(columns=['text', 'text_without_stopwords', 'target'])

In [None]:
tweets_features.head(1)

#### Merge all features

In [None]:
tweets.head(1)

In [None]:
keyword_w2v.head(1)

In [None]:
tweets_features.head(1)

In [None]:
# Attach the precomputed per-tweet features, matching rows on the shared 'id' column.
merged = tweets.merge(tweets_features, on='id')

In [None]:
# Attach the keyword embedding features. merge() defaults to an inner join, so
# any tweet whose keyword has no row in keyword_w2v is dropped here.
# NOTE(review): tweets['keyword'] had '%20' replaced by a space and missing
# values set to 'NULL' earlier — confirm keyword_w2v uses the same spelling.
total_features = merged.merge(keyword_w2v, left_on='keyword', right_on='keyword')

In [None]:
total_features.head()

In [None]:
features = total_features.columns
features = features.drop('target')

#### Features with tweet vectorizer

In [None]:
x_train, x_test, y_train, y_test = \
train_test_split(total_features[features], total_features['target'], test_size = 0.25, random_state = 123)

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
tfidf_vectorizer = CountVectorizer(ngram_range=(1,2), lowercase=True, stop_words='english', max_features=11000)
train_vectors = tfidf_vectorizer.fit_transform(x_train['text'])
test_vectors = tfidf_vectorizer.transform(x_test['text'])

In [None]:
# Densify the n-gram counts and give x_train a 0..n-1 index so the count rows
# line up positionally with the feature rows.
train_array = train_vectors.todense()
x_train.reset_index(drop=True, inplace=True)

train_matrix = pd.DataFrame(train_array)
train_matrix['id'] = x_train['id']

# 'id' is the only label the two frames share (the count columns are integer
# labels), so it is the join key; the raw text-like columns and the key itself
# are dropped afterwards.
X_train = (
    x_train
    .merge(train_matrix, on='id')
    .drop(columns=['keyword', 'location', 'text', 'id'])
)

In [None]:
# Same construction as for the training matrix, applied to the held-out split.
test_array = test_vectors.todense()
x_test.reset_index(drop=True, inplace=True)

test_matrix = pd.DataFrame(test_array)
test_matrix['id'] = x_test['id']

# 'id' is the only label the two frames share, so it is the join key.
X_test = (
    x_test
    .merge(test_matrix, on='id')
    .drop(columns=['keyword', 'location', 'text', 'id'])
)

In [None]:
import lightgbm as lgb
d_train = lgb.Dataset(X_train, y_train)

# BUG FIX: this used to read `params : {...}`, which Python parses as a bare
# variable annotation, not an assignment. The dict was never bound, so
# lgb.train() silently reused the `params` defined in the earlier LightGBM
# cell (num_leaves=50, max_depth=5, no max_bin).
# NOTE(review): with max_depth=2 a tree can have at most 4 leaves, so
# num_leaves=500 is not the binding limit — confirm this is intended.
params = {
    'learning_rate' : 0.02,
    'boosting_type' : 'gbdt',
    'objective' : 'binary',
    'metric' : 'binary_logloss',
    'num_leaves' : 500,
    'max_depth' : 2,
    'max_bin': 1000
}

# 10000 boosting rounds with the parameters above.
gbm = lgb.train(params, d_train, 10000)

In [None]:
# Turn the predicted probabilities into hard labels in one vectorised step
# (probability > 0.5 -> 1). The values stay floats (0.0 / 1.0), as the
# original element-wise loop left them.
y_pred = np.where(gbm.predict(X_test) > 0.5, 1.0, 0.0)

from sklearn.metrics import accuracy_score
# accuracy_score expects (y_true, y_pred); the score is the same either way,
# but the documented order keeps it consistent with classification_report.
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

In [None]:
print(classification_report(y_test, y_pred))

-------

#### Character process

In [2]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [3]:
text = pd.read_csv("train.csv", usecols=['text', 'target'])
test_text = pd.read_csv("test.csv", usecols=['text'])

In [4]:
text.drop_duplicates(subset = 'text', keep = False, inplace = True)
text.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7434 entries, 0 to 7612
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    7434 non-null   object
 1   target  7434 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 174.2+ KB


In [5]:
# Lower-case both corpora with the vectorised string accessor instead of a
# per-row Python lambda (missing values, if any, pass through instead of raising).
text['text'] = text['text'].str.lower()
test_text['text'] = test_text['text'].str.lower()

In [6]:
text.head()

Unnamed: 0,text,target
0,our deeds are the reason of this #earthquake m...,1
1,forest fire near la ronge sask. canada,1
2,all residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,just got sent this photo from ruby #alaska as ...,1


In [7]:
test_text.head()

Unnamed: 0,text
0,just happened a terrible car crash
1,"heard about #earthquake is different cities, s..."
2,"there is a forest fire at spot pond, geese are..."
3,apocalypse lighting. #spokane #wildfires
4,typhoon soudelor kills 28 in china and taiwan


In [8]:
tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK')
tk.fit_on_texts(text['text'])

In [9]:
# Some really ugly characters show up in the learned vocabulary
# (e.g. '\x89', '\x9d' and assorted accented symbols).
tk.word_index

{'UNK': 1,
 ' ': 2,
 'e': 3,
 't': 4,
 'a': 5,
 'o': 6,
 'i': 7,
 'n': 8,
 's': 9,
 'r': 10,
 'h': 11,
 'l': 12,
 'c': 13,
 'd': 14,
 'u': 15,
 'p': 16,
 'm': 17,
 '/': 18,
 'g': 19,
 'f': 20,
 'y': 21,
 'w': 22,
 '.': 23,
 'b': 24,
 'k': 25,
 'v': 26,
 ':': 27,
 '#': 28,
 'j': 29,
 "'": 30,
 '?': 31,
 'x': 32,
 '@': 33,
 'z': 34,
 '0': 35,
 '1': 36,
 'q': 37,
 '-': 38,
 '2': 39,
 '5': 40,
 '3': 41,
 '4': 42,
 '7': 43,
 '9': 44,
 '6': 45,
 '!': 46,
 '8': 47,
 '\n': 48,
 '_': 49,
 '\x89': 50,
 'û': 51,
 ';': 52,
 '&': 53,
 ')': 54,
 '(': 55,
 '*': 56,
 'ª': 57,
 '|': 58,
 '[': 59,
 ']': 60,
 'å': 61,
 '+': 62,
 'ï': 63,
 'ê': 64,
 '=': 65,
 '÷': 66,
 '%': 67,
 'ò': 68,
 '$': 69,
 '\x9d': 70,
 '~': 71,
 'ó': 72,
 'ì': 73,
 '©': 74,
 '¢': 75,
 '£': 76,
 '^': 77,
 '¨': 78,
 'è': 79,
 '\\': 80,
 '¼': 81,
 '}': 82,
 'ñ': 83,
 '¤': 84,
 '¡': 85,
 '`': 86,
 '{': 87,
 ',': 88,
 'ã': 89,
 'ü': 90,
 'ç': 91,
 'â': 92,
 '«': 93,
 '>': 94,
 '´': 95,
 '¬': 96}

In [10]:
alphabet = "abcdefghijklmnopqrstuvwxyz0123456789,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"

# 1-based index for every character of the fixed alphabet
# (index 0 is not assigned to any character).
char_dict = {symbol: position for position, symbol in enumerate(alphabet, start=1)}

# Replace the vocabulary learned by fit_on_texts with the fixed alphabet.
tk.word_index = dict(char_dict)
# UNK gets the highest index
tk.word_index[tk.oov_token] = max(char_dict.values()) + 1

In [11]:
tk.word_index

{'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26,
 '0': 27,
 '1': 28,
 '2': 29,
 '3': 30,
 '4': 31,
 '5': 32,
 '6': 33,
 '7': 34,
 '8': 35,
 '9': 36,
 ',': 37,
 ';': 38,
 '.': 39,
 '!': 40,
 '?': 41,
 ':': 42,
 "'": 43,
 '"': 44,
 '/': 45,
 '\\': 46,
 '|': 47,
 '_': 48,
 '@': 49,
 '#': 50,
 '$': 51,
 '%': 52,
 '^': 53,
 '&': 54,
 '*': 55,
 '~': 56,
 '`': 57,
 '+': 58,
 '-': 59,
 '=': 60,
 '<': 61,
 '>': 62,
 '(': 63,
 ')': 64,
 '[': 65,
 ']': 66,
 '{': 67,
 '}': 68,
 'UNK': 69}

In [12]:
# Ahora el texto se representa con una secuencia de caracteres
sequences = tk.texts_to_sequences(text['text'])
test_sequences = tk.texts_to_sequences(test_text['text'])

In [13]:
sequences[0]

[15,
 21,
 18,
 69,
 4,
 5,
 5,
 4,
 19,
 69,
 1,
 18,
 5,
 69,
 20,
 8,
 5,
 69,
 18,
 5,
 1,
 19,
 15,
 14,
 69,
 15,
 6,
 69,
 20,
 8,
 9,
 19,
 69,
 50,
 5,
 1,
 18,
 20,
 8,
 17,
 21,
 1,
 11,
 5,
 69,
 13,
 1,
 25,
 69,
 1,
 12,
 12,
 1,
 8,
 69,
 6,
 15,
 18,
 7,
 9,
 22,
 5,
 69,
 21,
 19,
 69,
 1,
 12,
 12]

In [14]:
test_sequences[0]

[10,
 21,
 19,
 20,
 69,
 8,
 1,
 16,
 16,
 5,
 14,
 5,
 4,
 69,
 1,
 69,
 20,
 5,
 18,
 18,
 9,
 2,
 12,
 5,
 69,
 3,
 1,
 18,
 69,
 3,
 18,
 1,
 19,
 8]

In [15]:
# Padding de cada secuencia para que todas tengan el mismo largo
data = pad_sequences(sequences, maxlen=1014, padding='post')
test_data = pad_sequences(test_sequences, maxlen=1014, padding='post')

In [16]:
data = np.array(data)
data.shape

(7434, 1014)

In [20]:
test_data = np.array(test_data)
test_data.shape

(3263, 1014)

In [21]:
train_classes = text['target'].values

In [22]:
train_classes[:30]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0])

#### CHAR CNN

In [23]:
size = len(tk.word_index)
size

69

In [24]:
# Row 0 is all zeros; every following row is the one-hot vector of one
# character: the character with index k gets a 1 in column k-1. Rows are
# appended in the iteration order of tk.word_index.
one_hot_rows = [np.zeros(size)]
for char_index in tk.word_index.values():
    one_hot = np.zeros(size)
    one_hot[char_index - 1] = 1
    one_hot_rows.append(one_hot)

embedding_weights = np.array(one_hot_rows)

In [25]:
print(embedding_weights.shape)

(70, 69)


In [26]:
from keras.layers import Input, Embedding, Activation, Flatten, Dense
from keras.layers import Conv1D, MaxPooling1D, Dropout
from keras.models import Model

In [27]:
# Parameters (first char-CNN variant)
# input_size matches the padded sequence length; embedding_size matches the
# width of the one-hot rows in `embedding_weights`.
input_size = 1014
embedding_size = 69
# Each entry is [number of filters, kernel size, pool size]; a pool size of -1
# means no pooling layer is added after that convolution (see the loop below).
conv_layers = [[256, 7, 2], 
               [256, 7, 2], 
               [256, 2, -1], 
               [256, 2, -1], 
               [256, 2, -1], 
               [256, 2, 1]]

fully_connected_layers = [1024, 1024]
# A single sigmoid output unit, trained with binary cross-entropy.
num_of_classes = 1
dropout_p = 0.5
optimizer = 'adam'
loss = 'binary_crossentropy'

# Embedding table with size+1 rows, initialised from the one-hot
# `embedding_weights` matrix built earlier in the notebook.
embedding_layer = Embedding(size+1, 
                            embedding_size,
                            input_length=input_size,
                            weights=[embedding_weights])

# Input: one padded sequence of character indices per sample
inputs = Input(shape=(input_size,), name='input', dtype='int64')

# Embedding 
x = embedding_layer(inputs)

# Conv stack: Conv1D -> ReLU, followed by max pooling unless pool size is -1
for filter_num, filter_size, pooling_size in conv_layers:
    x = Conv1D(filter_num, filter_size)(x) 
    x = Activation('relu')(x)
    if pooling_size != -1:
        x = MaxPooling1D(pool_size=pooling_size)(x)
        
x = Flatten()(x)

# Fully connected layers, each followed by dropout
for dense_size in fully_connected_layers:
    x = Dense(dense_size, activation='relu')(x)
    x = Dropout(dropout_p)(x)
    
# Output layer: one sigmoid unit, i.e. one probability per sample
predictions = Dense(num_of_classes, activation='sigmoid')(x)

# Build model
model1 = Model(inputs=inputs, outputs=predictions)
model1.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
model1.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, 1014)]            0         
_________________________________________________________________
embedding (Embedding)        (None, 1014, 69)          4830      
_________________________________________________________________
conv1d (Conv1D)              (None, 1008, 256)         123904    
_________________________________________________________________
activation (Activation)      (None, 1008, 256)         0         
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 504, 256)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 498, 256)          459008    
_________________________________________________________________
activation_1 (Activation)    (None, 498, 256)          0     

In [28]:
# Hold out 25% of the padded character sequences for validation.
x_train, x_test, y_train, y_test = \
train_test_split(data, train_classes, test_size = 0.25, random_state = 123)

from keras.callbacks import EarlyStopping
# Stop once val_loss has failed to improve for 2 consecutive epochs.
callback = EarlyStopping(monitor = 'val_loss', patience = 2, verbose=1)
callbacks = [callback]

# FIX: pass the `callbacks` list that is built above. The original passed the
# bare `callback` object and left the list unused; Model.fit documents
# `callbacks` as a list of callback instances.
model1.fit(x_train, y_train,
          validation_data=(x_test, y_test),
          batch_size=128,
          epochs=6,
          verbose=2,
          callbacks=callbacks)

Epoch 1/6
44/44 - 136s - loss: 0.6944 - accuracy: 0.5726 - val_loss: 0.6785 - val_accuracy: 0.5675
Epoch 2/6
44/44 - 141s - loss: 0.6528 - accuracy: 0.6151 - val_loss: 0.6365 - val_accuracy: 0.6541
Epoch 3/6
44/44 - 136s - loss: 0.6396 - accuracy: 0.6326 - val_loss: 0.6213 - val_accuracy: 0.6595
Epoch 4/6
44/44 - 144s - loss: 0.6093 - accuracy: 0.6691 - val_loss: 0.5821 - val_accuracy: 0.7052
Epoch 5/6
44/44 - 143s - loss: 0.5665 - accuracy: 0.7175 - val_loss: 0.5326 - val_accuracy: 0.7214
Epoch 6/6
44/44 - 142s - loss: 0.4324 - accuracy: 0.8131 - val_loss: 0.5312 - val_accuracy: 0.7633


<tensorflow.python.keras.callbacks.History at 0x1514a7b90>

In [None]:
# FIX: the original predicted on `submit_data`, a name that is never defined
# in this notebook. `test_data` holds the padded character sequences built
# from test.csv, which is also where the submission ids are read from below.
y_submit = model1.predict(test_data)

# FIX: model1 ends in a single sigmoid unit, so each prediction row has exactly
# one value. The original compared l[0] with l[1] (a two-column layout), which
# raises IndexError here. Threshold the probability instead; a tie (0.5) goes
# to class 1, as in the original comparison.
s = (y_submit[:, 0] >= 0.5).astype(int).tolist()

# NOTE: this rebinds `test_text` (previously the lower-cased test tweets) to
# the id column of test.csv, as the original cell did.
test_text = pd.read_csv("test.csv", usecols=['id'])
test_text['target'] = s

test_text.to_csv('submit_prueba_35.csv', index=False)

In [31]:
# Parameters (second char-CNN variant)
# input_size matches the padded sequence length; embedding_size matches the
# width of the one-hot rows in `embedding_weights`.
input_size = 1014
embedding_size = 69
# Each entry is [number of filters, kernel size, pool size]; a pool size of -1
# means no pooling layer is added after that convolution (see the loop below).
# A pool size of 1 adds a pooling layer that leaves the sequence length
# unchanged (the summary below shows 1008 -> 1008).
conv_layers = [[256, 7, 1], 
               [256, 7, 1], 
               [256, 1, -1], 
               [256, 1, -1], 
               [256, 1, -1], 
               [256, 1, 1]]

fully_connected_layers = [1024, 1024]
# A single sigmoid output unit, trained with binary cross-entropy.
num_of_classes = 1
dropout_p = 0.5
optimizer = 'adam'
loss = 'binary_crossentropy'

# Embedding table with size+1 rows, initialised from the one-hot
# `embedding_weights` matrix built earlier in the notebook.
embedding_layer = Embedding(size+1, 
                            embedding_size,
                            input_length=input_size,
                            weights=[embedding_weights])

# Input: one padded sequence of character indices per sample
inputs = Input(shape=(input_size,), name='input', dtype='int64')

# Embedding 
x = embedding_layer(inputs)

# Conv stack: Conv1D -> ReLU, followed by max pooling unless pool size is -1
for filter_num, filter_size, pooling_size in conv_layers:
    x = Conv1D(filter_num, filter_size)(x) 
    x = Activation('relu')(x)
    if pooling_size != -1:
        x = MaxPooling1D(pool_size=pooling_size)(x)
        
x = Flatten()(x)

# Fully connected layers, each followed by dropout
for dense_size in fully_connected_layers:
    x = Dense(dense_size, activation='relu')(x)
    x = Dropout(dropout_p)(x)
    
# Output layer: one sigmoid unit, i.e. one probability per sample
predictions = Dense(num_of_classes, activation='sigmoid')(x)

# Build model
model2 = Model(inputs=inputs, outputs=predictions)
model2.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
model2.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, 1014)]            0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 1014, 69)          4830      
_________________________________________________________________
conv1d_12 (Conv1D)           (None, 1008, 256)         123904    
_________________________________________________________________
activation_12 (Activation)   (None, 1008, 256)         0         
_________________________________________________________________
max_pooling1d_6 (MaxPooling1 (None, 1008, 256)         0         
_________________________________________________________________
conv1d_13 (Conv1D)           (None, 1002, 256)         459008    
_________________________________________________________________
activation_13 (Activation)   (None, 1002, 256)         0   

In [33]:
# Hold out 25% of the padded character sequences for validation.
x_train, x_test, y_train, y_test = \
train_test_split(data, train_classes, test_size = 0.25, random_state = 123)

from keras.callbacks import EarlyStopping
# Stop as soon as val_loss fails to improve for one epoch.
callback = EarlyStopping(monitor = 'val_loss', patience = 1, verbose=1)
callbacks = [callback]

# FIX: pass the `callbacks` list that is built above. The original passed the
# bare `callback` object and left the list unused; Model.fit documents
# `callbacks` as a list of callback instances.
model2.fit(x_train, y_train,
          validation_data=(x_test, y_test),
          batch_size=128,
          epochs=10,
          verbose=2,
          callbacks=callbacks)

Epoch 1/10
44/44 - 321s - loss: 0.6758 - accuracy: 0.5670 - val_loss: 0.6429 - val_accuracy: 0.6498
Epoch 2/10
44/44 - 302s - loss: 0.6466 - accuracy: 0.6326 - val_loss: 0.6286 - val_accuracy: 0.6692
Epoch 3/10
44/44 - 331s - loss: 0.6313 - accuracy: 0.6511 - val_loss: 0.6183 - val_accuracy: 0.6638
Epoch 4/10
44/44 - 335s - loss: 0.6213 - accuracy: 0.6604 - val_loss: 0.6086 - val_accuracy: 0.6525
Epoch 5/10
44/44 - 309s - loss: 0.5739 - accuracy: 0.7031 - val_loss: 0.5543 - val_accuracy: 0.7257
Epoch 6/10
44/44 - 298s - loss: 0.4835 - accuracy: 0.7742 - val_loss: 0.5292 - val_accuracy: 0.7552
Epoch 7/10
44/44 - 298s - loss: 0.3713 - accuracy: 0.8377 - val_loss: 0.5945 - val_accuracy: 0.7171
Epoch 00007: early stopping


<tensorflow.python.keras.callbacks.History at 0x164680890>

In [35]:
# Parameters (third, shallower char-CNN variant)
# input_size matches the padded sequence length; embedding_size matches the
# width of the one-hot rows in `embedding_weights`.
input_size = 1014
embedding_size = 69
# Each entry is [number of filters, kernel size, pool size]; a pool size of -1
# means no pooling layer is added after that convolution (see the loop below).
# A pool size of 1 adds a pooling layer that leaves the sequence length
# unchanged (the summary below shows 1008 -> 1008).
conv_layers = [[256, 7, 1], 
               [256, 1, -1], 
               [256, 1, 1]]

fully_connected_layers = [1024, 1024]
# A single sigmoid output unit, trained with binary cross-entropy.
num_of_classes = 1
dropout_p = 0.5
optimizer = 'adam'
loss = 'binary_crossentropy'

# Embedding table with size+1 rows, initialised from the one-hot
# `embedding_weights` matrix built earlier in the notebook.
embedding_layer = Embedding(size+1, 
                            embedding_size,
                            input_length=input_size,
                            weights=[embedding_weights])

# Input: one padded sequence of character indices per sample
inputs = Input(shape=(input_size,), name='input', dtype='int64')

# Embedding 
x = embedding_layer(inputs)

# Conv stack: Conv1D -> ReLU, followed by max pooling unless pool size is -1
for filter_num, filter_size, pooling_size in conv_layers:
    x = Conv1D(filter_num, filter_size)(x) 
    x = Activation('relu')(x)
    if pooling_size != -1:
        x = MaxPooling1D(pool_size=pooling_size)(x)
        
x = Flatten()(x)

# Fully connected layers, each followed by dropout
for dense_size in fully_connected_layers:
    x = Dense(dense_size, activation='relu')(x)
    x = Dropout(dropout_p)(x)
    
# Output layer: one sigmoid unit, i.e. one probability per sample
predictions = Dense(num_of_classes, activation='sigmoid')(x)

# Build model
model3 = Model(inputs=inputs, outputs=predictions)
model3.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
model3.summary()

Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, 1014)]            0         
_________________________________________________________________
embedding_4 (Embedding)      (None, 1014, 69)          4830      
_________________________________________________________________
conv1d_21 (Conv1D)           (None, 1008, 256)         123904    
_________________________________________________________________
activation_21 (Activation)   (None, 1008, 256)         0         
_________________________________________________________________
max_pooling1d_11 (MaxPooling (None, 1008, 256)         0         
_________________________________________________________________
conv1d_22 (Conv1D)           (None, 1008, 256)         65792     
_________________________________________________________________
activation_22 (Activation)   (None, 1008, 256)         0   

In [36]:
# Hold out 25% of the padded character sequences for validation.
x_train, x_test, y_train, y_test = \
train_test_split(data, train_classes, test_size = 0.25, random_state = 123)

from keras.callbacks import EarlyStopping
# Stop as soon as val_loss fails to improve for one epoch.
callback = EarlyStopping(monitor = 'val_loss', patience = 1, verbose=1)
callbacks = [callback]

# FIX: pass the `callbacks` list that is built above. The original passed the
# bare `callback` object and left the list unused; Model.fit documents
# `callbacks` as a list of callback instances.
model3.fit(x_train, y_train,
          validation_data=(x_test, y_test),
          batch_size=128,
          epochs=10,
          verbose=2,
          callbacks=callbacks)

Epoch 1/10
44/44 - 179s - loss: 0.6925 - accuracy: 0.5598 - val_loss: 0.6527 - val_accuracy: 0.6089
Epoch 2/10
44/44 - 197s - loss: 0.6396 - accuracy: 0.6352 - val_loss: 0.6010 - val_accuracy: 0.6918
Epoch 3/10
44/44 - 166s - loss: 0.5769 - accuracy: 0.7013 - val_loss: 0.5584 - val_accuracy: 0.7294
Epoch 4/10
44/44 - 174s - loss: 0.4244 - accuracy: 0.8124 - val_loss: 0.6095 - val_accuracy: 0.7278
Epoch 00004: early stopping


<tensorflow.python.keras.callbacks.History at 0x1615a1f10>