In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show the data files that are available to this kernel.
print(os.listdir(os.path.join("..", "input")))


['test_tweets_anuFYb8.csv', 'train_E6oV3lV.csv']


In [2]:
# Load the training CSV, drop the row identifier, and shift the labels to a
# 1-based coding (0 -> 2, 1 -> 1). The label cell further down subtracts 1
# again, so the remap here must stay in sync with it.
df = (
    pd.read_csv('../input/train_E6oV3lV.csv')
    .drop(columns='id')
    .assign(label=lambda frame: frame['label'].map({0: 2, 1: 1}))
)

In [3]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import copy

Using TensorFlow backend.


In [4]:
# Hold out 20% of the rows for validation.
# - random_state pins the split so Restart & Run All reproduces the same
#   train/test partition (the original call was unseeded).
# - stratify keeps the label proportions identical in both partitions, which
#   matters when one class is much rarer than the other.
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

In [5]:
def _lowercased_tweets(frame):
    """Return the frame's 'tweet' column as a list of lower-cased strings."""
    return [tweet.lower() for tweet in frame['tweet'].values]


# Work on independent copies so the split frames themselves stay untouched.
train_df = copy.deepcopy(train)
test_df = copy.deepcopy(test)

train_texts = _lowercased_tweets(train_df)
test_texts = _lowercased_tweets(test_df)


In [6]:
# Fixed character inventory; anything outside it is encoded as the OOV token.
alphabet = "abcdefghijklmnopqrstuvwxyz0123456789 ,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"

# Character-level tokenizer. The index learned by fit_on_texts is replaced
# just below with the fixed alphabet index.
tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK')
tk.fit_on_texts(train_texts)

# Characters are numbered from 1 in alphabet order; 0 is left free for padding.
char_dict = {char: position for position, char in enumerate(alphabet, start=1)}

# Install the fixed index on the tokenizer and give the OOV token the next free id.
tk.word_index = dict(char_dict)
tk.word_index[tk.oov_token] = max(char_dict.values()) + 1

# Encode every tweet as a list of character ids.
# The encoded test tweets get their own name instead of overwriting
# `test_texts`: rebinding the text list to integer sequences made this cell
# fail (or silently misbehave) when run a second time.
train_sequences = tk.texts_to_sequences(train_texts)
test_sequences = tk.texts_to_sequences(test_texts)

# Pad (with 0, at the end) or cut every sequence to 150 ids.
# NOTE: 150 must match `input_size` used for the model's Input layer.
train_data = pad_sequences(train_sequences, maxlen=150, padding='post')
test_data = pad_sequences(test_sequences, maxlen=150, padding='post')

# These are embedding indices, so keep them integral and aligned with the
# model's `Input(..., dtype='int64')` rather than casting to float32.
train_data = np.array(train_data, dtype='int64')
test_data = np.array(test_data, dtype='int64')

from keras.utils import to_categorical

# Width of the one-hot label vectors. Passing it explicitly guarantees that the
# train and test targets have the same number of columns; without it,
# to_categorical sizes the output from the largest label it happens to see.
NUM_LABEL_CLASSES = 2

# Labels were remapped to 1/2 when the CSV was loaded; shift them to 0/1.
# Net effect relative to the CSV: original label 1 -> class 0, original 0 -> class 1.
train_class_list = [label - 1 for label in train_df['label'].values]
test_class_list = [label - 1 for label in test_df['label'].values]

# One-hot targets for the categorical cross-entropy loss.
train_classes = to_categorical(train_class_list, num_classes=NUM_LABEL_CLASSES)
test_classes = to_categorical(test_class_list, num_classes=NUM_LABEL_CLASSES)

In [7]:
# Number of entries in the tokenizer index: one per alphabet character plus
# the OOV token. Index 0 (padding) is not counted here.
vocab_size = len(tk.word_index)

In [8]:
# One-hot initial weights for the embedding layer, shape (vocab_size + 1, vocab_size).
# Row 0 stays all-zero for the padding index; row i is the one-hot vector of
# character index i (1-based), i.e. a 1 in column i - 1.
# Each row is written at its own index rather than appended in dict-iteration
# order, so the result no longer depends on how `word_index` is ordered.
embedding_weights = np.zeros((vocab_size + 1, vocab_size))
for char, i in tk.word_index.items():
    embedding_weights[i, i - 1] = 1


In [9]:
from keras.layers import Input, Embedding, Activation, Flatten, Dense
from keras.layers import Conv1D, MaxPooling1D, Dropout
from keras.models import Model

In [10]:
# --- Input / embedding -------------------------------------------------------
input_size = 150       # characters per example; same value as the padding length
embedding_size = 70    # must equal the width of the one-hot embedding weights

# --- Convolutional stack -----------------------------------------------------
# Each row is [number of filters, kernel size, pool size];
# a pool size of -1 means "no max-pooling after this convolution".
conv_layers = [
    [256, 7, 3],
    [256, 7, 3],
    [256, 3, -1],
    [256, 3, -1],
    [256, 3, -1],
    [256, 3, 3],
]

# --- Classifier head ---------------------------------------------------------
fully_connected_layers = [1024, 1024]   # units of each dense layer
dropout_p = 0.5                         # dropout rate after each dense layer
num_of_classes = 2                      # size of the softmax output

# --- Training setup ----------------------------------------------------------
loss = 'categorical_crossentropy'
optimizer = 'adam'

In [11]:
# Embedding lookup initialised with the one-hot matrix built above.
# input_dim is vocab_size + 1 because index 0 is reserved for padding.
# No `trainable` argument is passed, so the Keras default applies.
embedding_layer = Embedding(
    input_dim=vocab_size + 1,
    output_dim=embedding_size,
    input_length=input_size,
    weights=[embedding_weights],
)

In [12]:
# Integer character ids, one row of `input_size` ids per example.
inputs = Input(shape=(input_size,), name='input', dtype='int64')

# Embedding followed by the convolutional stack described by `conv_layers`.
x = embedding_layer(inputs)
for n_filters, kernel_width, pool_width in conv_layers:
    x = Conv1D(n_filters, kernel_width)(x)
    x = Activation('relu')(x)
    if pool_width == -1:
        # -1 marks a convolution that is not followed by pooling.
        continue
    x = MaxPooling1D(pool_size=pool_width)(x)
x = Flatten()(x)

# Dense head: each fully connected layer is followed by dropout.
for n_units in fully_connected_layers:
    x = Dense(n_units, activation='relu')(x)
    x = Dropout(dropout_p)(x)

# Softmax over the output classes.
predictions = Dense(num_of_classes, activation='softmax')(x)

model = Model(inputs=inputs, outputs=predictions)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           (None, 150)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 150, 70)           4970      
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 144, 256)          125696    
_________________________________________________________________
activation_1 (Activation)    (None, 144, 256)          0         
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 48, 256)           0         
_________________________________________________________________
conv1d_2 (Conv1D)    

In [13]:
# Shuffle the training rows with a dedicated, seeded generator so the order is
# the same on every Restart & Run All (the original used the unseeded global
# NumPy state). Inputs and targets are indexed with the same permutation so
# they stay aligned.
shuffle_rng = np.random.RandomState(42)
indices = shuffle_rng.permutation(train_data.shape[0])

x_train = train_data[indices]
y_train = train_classes[indices]

# The held-out split is used as-is for validation.
x_test = test_data
y_test = test_classes

In [14]:
# Training
# Fit for 10 epochs in mini-batches of 128 and report metrics on the held-out
# split after every epoch (verbose=2 prints one line per epoch).
# In the recorded run below, val_loss is lowest at epoch 4 and rises afterwards
# while the training loss keeps falling.
model.fit(
    x=x_train,
    y=y_train,
    epochs=10,
    batch_size=128,
    validation_data=(x_test, y_test),
    verbose=2,
)



Instructions for updating:
Use tf.cast instead.
Train on 25569 samples, validate on 6393 samples
Epoch 1/10
 - 8s - loss: 0.2342 - acc: 0.9275 - val_loss: 0.1753 - val_acc: 0.9249
Epoch 2/10
 - 3s - loss: 0.1532 - acc: 0.9428 - val_loss: 0.1418 - val_acc: 0.9485
Epoch 3/10
 - 3s - loss: 0.0928 - acc: 0.9668 - val_loss: 0.1041 - val_acc: 0.9571
Epoch 4/10
 - 3s - loss: 0.0589 - acc: 0.9800 - val_loss: 0.0875 - val_acc: 0.9717
Epoch 5/10
 - 3s - loss: 0.0435 - acc: 0.9855 - val_loss: 0.1206 - val_acc: 0.9559
Epoch 6/10
 - 3s - loss: 0.0340 - acc: 0.9891 - val_loss: 0.1127 - val_acc: 0.9668
Epoch 7/10
 - 3s - loss: 0.0241 - acc: 0.9920 - val_loss: 0.1443 - val_acc: 0.9698
Epoch 8/10
 - 3s - loss: 0.0242 - acc: 0.9914 - val_loss: 0.1556 - val_acc: 0.9673
Epoch 9/10
 - 3s - loss: 0.0143 - acc: 0.9951 - val_loss: 0.1586 - val_acc: 0.9676
Epoch 10/10
 - 3s - loss: 0.0132 - acc: 0.9953 - val_loss: 0.1504 - val_acc: 0.9697


<keras.callbacks.History at 0x7f4f80f06b38>