### Import Libraries & Data

In [1]:
import tensorflow as tf
import numpy as np 
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Colab-only step: mount Google Drive so files under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Both CSV files live in the same Drive folder; keeping the folder in a single
# variable means it only has to be edited once when the notebook runs elsewhere.
data_dir = '/content/drive/MyDrive/Colab Notebooks'
training_data = pd.read_csv(f'{data_dir}/train.csv')
testing_data = pd.read_csv(f'{data_dir}/test.csv')

# Preview the first rows of the training set.
training_data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
# Column dtypes and non-null counts; useful for spotting missing values before modelling.
training_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


### Natural Language Processing in TensorFlow

Global variables 

In [5]:
# Hyperparameters shared by the tokenizer, the padding step and the model.
vocab_size = 20000     # used as Tokenizer(num_words=...) and as the Embedding input size
embedding_dim = 16     # length of each learned word vector
max_length = 30        # every tweet is padded/truncated to this many tokens
trunc_type = 'post'    # over-long tweets lose tokens from the end
oov_tok = "<OOV>"      # placeholder token for words outside the vocabulary

# Seed NumPy and TensorFlow so weight initialisation and batch shuffling are
# repeatable across "Restart & Run All" (GPU kernels may still add small
# run-to-run differences).
seed = 42
np.random.seed(seed)
tf.random.set_seed(seed)

Create the tokenizer and fit it on the training text

In [6]:
# Learn the word index from the training tweets only; words the tokenizer
# does not keep are encoded with the OOV token.
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(training_data["text"])
word_index = tokenizer.word_index

In [7]:
# Look at the raw tweet text that will be tokenized.
training_data["text"]


0       Our Deeds are the Reason of this #earthquake M...
1                  Forest fire near La Ronge Sask. Canada
2       All residents asked to 'shelter in place' are ...
3       13,000 people receive #wildfires evacuation or...
4       Just got sent this photo from Ruby #Alaska as ...
                              ...                        
7608    Two giant cranes holding a bridge collapse int...
7609    @aria_ahrary @TheTawniest The out of control w...
7610    M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...
7611    Police investigating after an e-bike collided ...
7612    The Latest: More Homes Razed by Northern Calif...
Name: text, Length: 7613, dtype: object

In [8]:
# Encode each training tweet as a list of word ids, then bring every list to
# exactly `max_length` ids.
training_sequences = tokenizer.texts_to_sequences(training_data["text"])
training_padded = pad_sequences(
    training_sequences,
    truncating=trunc_type,
    maxlen=max_length,
)

In [9]:
# First three encoded tweets; zeros are padding.
training_padded[:3]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,  120, 4634,   25,    5,  869,
           9,   22,  264,  139, 1620, 4635,   90,   41],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,  190,   46,  230,  800, 6955, 6956, 1405],
       [   0,    0,    0,    0,    0,    0,    0,    0,   41, 1752, 1621,
           8, 6957,    7, 6958,   25,  137, 6959,   21, 1753,   40,  442,
         257,   58, 2159,    7,  715, 1406,   25, 1107]], dtype=int32)

In [10]:
# Expect (number of training tweets, max_length).
training_padded.shape

(7613, 30)

In [11]:
# Encode the test tweets with the tokenizer fitted on the training text and
# pad/truncate them the same way as the training set. Without `truncating`,
# pad_sequences uses its default ('pre') and would cut over-long test tweets
# from the opposite end to the one used for the training data.
testing_sequences = tokenizer.texts_to_sequences(testing_data.text)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, truncating=trunc_type)

In [12]:
# First three encoded test tweets; zeros are padding.
testing_padded[:3]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,   35,  914,    6, 1952,  131,   93],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,  475,
          57,  264,   12, 1202, 2649,  606, 2322,  246],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          78,   12,    6,  190,   46,   20,  826, 3576,    1,   25, 5168,
         872,    5,  770,   11, 1415,  506,   98,   41]], dtype=int32)

In [13]:
# Expect (number of test tweets, max_length).
testing_padded.shape

(3263, 30)

### Neural Network

In [14]:
# Minimal binary classifier: embed each token id, flatten the
# (max_length, embedding_dim) block into one vector, and score it with a
# single sigmoid unit.
model = tf.keras.Sequential()
model.add(
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length)
)
model.add(tf.keras.layers.Flatten())
# Optional hidden layer, currently disabled:
# model.add(tf.keras.layers.Dense(6, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [15]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked
# for readability of the training log.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [16]:
# Print the layer stack with output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 30, 16)            320000    
_________________________________________________________________
flatten (Flatten)            (None, 480)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 481       
Total params: 320,481
Trainable params: 320,481
Non-trainable params: 0
_________________________________________________________________


In [30]:
# Train on the full training set for five passes.
# NOTE(review): re-running this cell keeps training from the current weights
# instead of starting over; rebuild and recompile the model first for a clean run.
model.fit(x=training_padded, y=training_data["target"], epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fdb9d959550>

In [18]:
# Score every padded test tweet with the trained model.
testing_target = model.predict(x=testing_padded)

In [19]:
# Inspect the raw model outputs (one sigmoid score per test row).
testing_target

array([[0.45646805],
       [0.6036356 ],
       [0.7591853 ],
       ...,
       [0.7920799 ],
       [0.9335045 ],
       [0.4225679 ]], dtype=float32)

In [20]:
# One prediction column, one row per test tweet.
testing_target.shape

(3263, 1)

In [21]:
# Confirm the container type returned by predict().
type(testing_target)

numpy.ndarray

In [22]:
# Spot-check a single score.
print(testing_target[900, 0])

0.22171783


In [23]:
# Turn the sigmoid scores into hard 0/1 labels: scores above 0.5 become 1,
# everything else 0. Comparing the whole array at once handles any number of
# rows, and the original float dtype is kept so the values stay 0.0 / 1.0.
testing_target = (testing_target > 0.5).astype(testing_target.dtype)

In [24]:
# Check the thresholded predictions.
testing_target

array([[0.],
       [1.],
       [1.],
       ...,
       [1.],
       [1.],
       [0.]], dtype=float32)

In [25]:
# Cast the predictions to integers (astype(int) drops any fractional part).
testing_target = testing_target.astype(int)

In [26]:
# Check the integer predictions.
testing_target

array([[0],
       [1],
       [1],
       ...,
       [1],
       [1],
       [0]])

In [27]:
# Submission table: the test ids next to the predicted label for each row.
sub = pd.DataFrame({"id": testing_data["id"]})
sub["target"] = testing_target

In [28]:
# Preview the submission table.
sub

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,0
3259,10865,1
3260,10868,1
3261,10874,1


In [29]:
# Write the submission file without the DataFrame index column.
sub.to_csv(path_or_buf="submission.csv", index=False)