In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
sample_submission = pd.read_csv('sample_submission.csv')
y_test = sample_submission['target']

In [3]:
x_train, y_train, x_test = train_data['text'], train_data['target'], test_data['text']
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((7613,), (7613,), (3263,), (3263,))

In [13]:
def clean_text(data):
    data = data.str.replace('[^a-zA-Z]', ' ', regex = True)
    data = data.str.replace(r'\$\w*', '', regex = True)
    data = data.str.replace(r'^RT[\s]+', '', regex = True)
    data = data.str.replace(r'https?:\/\/.*[\r\n]*', '', regex = True)
    data = data.str.replace(r'#', '', regex = True)
    return data

In [14]:
x_train_cleaned = clean_text(x_train)
x_test_cleaned = clean_text(x_test)

In [17]:
vocab_size = 10000
embedding_dim = 32
max_length = 120
trunc_type='post'
oov_tok = "<OOV>"

tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(x_train_cleaned)

training_sequences = tokenizer.texts_to_sequences(x_train_cleaned)
training_padded = pad_sequences(training_sequences, maxlen=max_length, truncating=trunc_type)

testing_senquences = tokenizer.texts_to_sequences(x_test_cleaned)
testing_padded = pad_sequences(testing_senquences, maxlen=max_length, truncating=trunc_type)

In [21]:
print(x_train.loc[x_train.str.find('hearts')!=-1], end='\n'*2)
print(x_train_cleaned.loc[x_train_cleaned.str.find('hearts')!=-1])

57      Set our hearts ablaze and every city was a gif...
4670    Oh no. The Boots &amp; hearts social media inu...
4931    They are the real heroes... RIP Brave hearts.....
Name: text, dtype: object

57      Set our hearts ablaze and every city was a gif...
4670    Oh no  The Boots  amp  hearts social media inu...
4931    They are the real heroes    RIP Brave hearts  ...
Name: text, dtype: object


In [23]:
print(x_train[57])
print(x_train_cleaned[57])

Set our hearts ablaze and every city was a gift And every skyline was like a kiss upon the lips @Û_ https://t.co/cYoMPZ1A0Z
Set our hearts ablaze and every city was a gift And every skyline was like a kiss upon the lips      https   t co cYoMPZ A Z


In [31]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(6, activation=tf.nn.relu),
    tf.keras.layers.Dense(1, activation=tf.nn.sigmoid)
])

model.compile(loss='binary_crossentropy',
             optimizer=tf.optimizers.Adam(learning_rate=0.01),
             metrics=['accuracy'])

model.summary()

model.fit(training_padded, y_train, epochs=10, validation_data=(testing_padded , y_test))

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 120, 32)           320000    
_________________________________________________________________
flatten_3 (Flatten)          (None, 3840)              0         
_________________________________________________________________
dense_6 (Dense)              (None, 6)                 23046     
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 7         
Total params: 343,053
Trainable params: 343,053
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1e32ba27dc0>

In [32]:
model.evaluate(testing_padded, y_test)



[2.2957985401153564, 0.7051792740821838]

In [33]:
y_predictions = model.predict(testing_padded)
y_pred = [1 if p >0.5 else 0 for p in y_predictions]
y_predictions[0]

array([0.74177605], dtype=float32)

In [34]:
len(testing_padded), len(y_test)

(3263, 3263)