## CNN - tf.keras

In [235]:
import pandas as pd
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
stop_words = stopwords.words('english')

In [236]:
training_data = pd.read_csv('train.csv')

#### Combining the 3 columns ( keyword + location + text ) - filling the NAN by a blank or ''

In [237]:
training_data['text'] = training_data['keyword'].fillna('') + training_data['location'].fillna('')  \
                        + training_data['text'].fillna('')
training_data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [238]:
training_data = training_data.drop(columns=['id','keyword','location'],axis=1)
training_data.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


#### Data Cleaning step involves tokenizing, removing the stopwords and removing non-alphanumeric tokens

In [239]:
def clean_data(text):
    tokens = text.split()
    no_stopwords = [x for x in tokens if x not in stop_words]
    no_nonalphanum = [x.lower() for x in no_stopwords if x.isalnum()]
    return ' '.join(no_nonalphanum)

In [240]:
training_data['text'] = training_data['text'].apply(clean_data)
print(training_data.shape)
training_data.head()

(7613, 2)


Unnamed: 0,text,target
0,our deeds reason may allah forgive us,1
1,forest fire near la ronge canada,1
2,all residents asked notified no evacuation she...,1
3,people receive evacuation orders california,1
4,just got sent photo ruby smoke pours school,1


In [241]:
test_data = pd.read_csv('test.csv')
test_id = test_data['id']
test_data['text'] = test_data['keyword'].fillna('') + test_data['location'].fillna('')  \
                        + test_data['text'].fillna('')
test_data = test_data.drop(columns=['id','keyword','location'],axis=1)
test_data['text'] = test_data['text'].apply(clean_data)
print(test_data.shape)
test_data.head()

(3263, 1)


Unnamed: 0,text
0,just happened terrible car crash
1,heard different stay safe
2,forest fire spot geese fleeing across i cannot...
3,apocalypse
4,typhoon soudelor kills 28 china taiwan


In [242]:
combined_data = pd.DataFrame()
combined_data = combined_data.append(training_data,ignore_index=True,sort=False)
combined_data = combined_data.append(test_data,ignore_index=True,sort=False)
combined_data = combined_data.text
combined_data.shape

(10876,)

In [243]:
x_train,x_test,y_train,y_test = train_test_split(training_data.text,training_data.target)

In [244]:
from string import punctuation
from os import listdir
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras import regularizers

#### using nltk tokenizer to tokenize and convert text to numbers 

In [245]:
train_docs = training_data.text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(combined_data)

encoded_docs = tokenizer.texts_to_sequences(x_train)
max_length = max([len(s.split()) for s in train_docs])
x_train = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

encoded_docs = tokenizer.texts_to_sequences(x_test)
x_test = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

vocab_size = len(tokenizer.word_index) + 1


#### CNN with a L2 regularization

In [248]:

model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=max_length))
model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(32, activation='relu',kernel_regularizer=regularizers.l2(0.05)))
model.add(Dense(1, activation='sigmoid'))
print(model.summary())


Model: "sequential_16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_17 (Embedding)     (None, 26, 100)           2058100   
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 19, 32)            25632     
_________________________________________________________________
max_pooling1d_10 (MaxPooling (None, 9, 32)             0         
_________________________________________________________________
flatten_15 (Flatten)         (None, 288)               0         
_________________________________________________________________
dense_29 (Dense)             (None, 32)                9248      
_________________________________________________________________
dense_30 (Dense)             (None, 1)                 33        
Total params: 2,093,013
Trainable params: 2,093,013
Non-trainable params: 0
___________________________________________

In [250]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x_train, y_train, epochs=10, verbose=2, validation_data=(x_test,y_test))
# evaluate
loss, acc = model.evaluate(x_test, y_test, verbose=0)
print('Test Accuracy: %f' % (acc*100))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 5709 samples, validate on 1904 samples
Epoch 1/10
 - 7s - loss: 1.2943 - accuracy: 0.6322 - val_loss: 0.5561 - val_accuracy: 0.7553
Epoch 2/10
 - 7s - loss: 0.4169 - accuracy: 0.8341 - val_loss: 0.5208 - val_accuracy: 0.7652
Epoch 3/10
 - 6s - loss: 0.2158 - accuracy: 0.9294 - val_loss: 0.6054 - val_accuracy: 0.7621
Epoch 4/10
 - 7s - loss: 0.1180 - accuracy: 0.9714 - val_loss: 0.7020 - val_accuracy: 0.7442
Epoch 5/10
 - 6s - loss: 0.0864 - accuracy: 0.9799 - val_loss: 0.7772 - val_accuracy: 0.7463
Epoch 6/10
 - 7s - loss: 0.0671 - accuracy: 0.9828 - val_loss: 0.8314 - val_accuracy: 0.7589
Epoch 7/10
 - 8s - loss: 0.0586 - accuracy: 0.9835 - val_loss: 0.8441 - val_accuracy: 0.7453
Epoch 8/10
 - 7s - loss: 0.0535 - accuracy: 0.9849 - val_loss: 0.7686 - val_accuracy: 0.7468
Epoch 9/10
 - 8s - loss: 0.0481 - accuracy: 0.9858 - val_loss: 0.8039 - val_accuracy: 0.7453
Epoch 10/10
 - 7s - loss: 0.0448 - accuracy: 0.9860 - val_loss: 0.8599 - val_accuracy: 0.7442
Test Accuracy: 74.422

In [251]:
prediction = model.predict_classes(x_test)
accuracy_score(y_test,prediction)

0.7442226890756303

In [252]:
encoded_docs = tokenizer.texts_to_sequences(test_data.text)
test_data = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

prediction = model.predict_classes(test_data)
prediction = prediction.reshape(-1,1)
predictions = pd.DataFrame()
for i,j in zip(test_id,prediction):
    new = pd.DataFrame({'id':i,'target':j})
    predictions = predictions.append(new,ignore_index=True)
predictions.to_csv('submission.csv',index=False)

#### Accuracy :---
#####                - Local Accuracy : 74.42 (over split training data)
#####                - Online Accuracy : 74.94 (After fitting over all training data)