<a href="https://colab.research.google.com/github/mkrsteska/BSA2020_Team_Tissot_Project_2/blob/master/code/Universal%20Sentence%20Encoder%20and%20Keras%20Sequential%20Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import tensorflow as tf
from tensorflow import keras
import tensorflow_hub as hub
from tqdm import tqdm
import numpy as np
import pandas as pd
from preprocess_tweets import preprocess_tweet_use

Using TensorFlow backend.


In [0]:
# Load Universal Sentence Encoder
use = hub.load("https://tfhub.dev/google/universal-sentence-encoder-large/5")

### **Model #1**

In [0]:
# Read the data 
df = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")
df = df[["text", "target"]]

In [0]:
train_text = df.text.values
test_text = df_test.text.values
y_train = df.target.values

In [0]:
# Encoding tweets into embedding vectors using universal sentence encoder

X_train = []
for r in tqdm(train_text):
  emb = use([r])
  text_emb = tf.reshape(emb, [-1]).numpy()
  X_train.append(text_emb)
X_train = np.array(X_train)

100%|██████████| 7613/7613 [04:35<00:00, 27.59it/s]


In [0]:
# Encoding tweets into embedding vectors using universal sentence encoder

X_test = []
for r in tqdm(test_text):
  emb = use([r])
  text_emb = tf.reshape(emb, [-1]).numpy()
  X_test.append(text_emb)

X_test = np.array(X_test)

100%|██████████| 3263/3263 [01:53<00:00, 28.68it/s]


In [0]:
print ("X_train shape", X_train.shape)
print ("y_train shape", y_train.shape)
print ("X_test shape", X_test.shape)

X_train shape (7613, 512)
y_train shape (7613,)
X_test shape (3263, 512)


In [0]:
model = keras.Sequential()

model.add(keras.layers.Dense(units=256, input_shape=(X_train.shape[1], ),activation='relu'))

model.add(keras.layers.Dropout(rate=0.2))

model.add(keras.layers.Dense(units=128, activation='relu'))

model.add(keras.layers.Dropout(rate=0.2))

model.add(keras.layers.Dense(1, activation='sigmoid'))

model.compile( 
    loss='binary_crossentropy', 
    optimizer=keras.optimizers.Adam(0.001),
    metrics=['accuracy']
)

In [0]:
history = model.fit(
    X_train, y_train, 
    epochs = 10, 
    batch_size = 512,
    validation_split = 0.1,
    verbose = 1,
    shuffle = True
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [0]:
df_test = pd.read_csv("test.csv")
ids = df_test['id'].to_numpy()

In [0]:
predictions = model.predict_classes(X_test)[:, 0]

In [0]:
pd.DataFrame({'id': ids, 'target': predictions}).to_csv('5. Submission_Tensorflow_Keras.csv', index=False)

### **Model #2**
**with preprocessed tweets**

In [0]:
#Read the data
df = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")
df = df[["text", "target"]]

In [0]:
train_text = df.text.apply(preprocess_tweet_use)
test_text = df_test.text.apply(preprocess_tweet_use)
y_train = df.target

In [0]:
# Encoding tweets into embedding vectors using universal sentence encoder

X_train = []
for r in tqdm(train_text):
  emb = use([r])
  text_emb = tf.reshape(emb, [-1]).numpy()
  X_train.append(text_emb)
X_train = np.array(X_train)

100%|██████████| 7613/7613 [04:23<00:00, 28.84it/s]


In [0]:
# Encoding tweets into embedding vectors using universal sentence encoder

X_test = []
for r in tqdm(test_text):
  emb = use([r])
  text_emb = tf.reshape(emb, [-1]).numpy()
  X_test.append(text_emb)

X_test = np.array(X_test)

100%|██████████| 3263/3263 [01:49<00:00, 29.66it/s]


In [0]:
model = keras.Sequential()

model.add(keras.layers.Dense(units=256, input_shape=(X_train.shape[1], ),activation='relu'))

model.add(keras.layers.Dropout(rate=0.1))

model.add(keras.layers.Dense(units=128, activation='relu'))

model.add(keras.layers.Dropout(rate=0.1))

model.add(keras.layers.Dense(1, activation='sigmoid'))

model.compile( 
    loss='binary_crossentropy', 
    optimizer=keras.optimizers.Adam(0.001),
    metrics=['accuracy']
)

In [0]:
history = model.fit(
    X_train, y_train, 
    epochs = 12, 
    batch_size = 1024,
    validation_split = 0.1,
    verbose = 1,
    shuffle = True
)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


In [0]:
predictions = model.predict_classes(X_test)[:, 0]

In [0]:
pd.DataFrame({'id': ids, 'target': predictions}).to_csv('6. Submission_Tensorflow_Keras.csv', index=False)

### **Model #3**
**with preprocessed tweets**

In [0]:
X_train_expand = np.expand_dims(X_train, axis=2)

X_train_expand.shape

(7613, 512, 1)

In [0]:
model = keras.Sequential()

model.add(keras.layers.Conv1D(32, kernel_size=3, activation='elu', padding='same', input_shape=(512, 1)))

model.add(keras.layers.Conv1D(32, kernel_size=3, activation='elu', padding='same'))

model.add(keras.layers.Conv1D(32, kernel_size=3, activation='relu', padding='same'))

model.add(keras.layers.MaxPooling1D(pool_size=3))

model.add(keras.layers.Bidirectional(keras.layers.LSTM(100, dropout=0.2, recurrent_dropout=0.2)))

model.add(keras.layers.Dense(units=256, activation='relu'))

model.add(keras.layers.Dropout(rate=0.1))

model.add(keras.layers.Dense(units=256, activation='relu'))

model.add(keras.layers.Dropout(rate=0.1))

model.add(keras.layers.Dense(units=128, activation='relu'))

model.add(keras.layers.Dropout(rate=0.1))

model.add(keras.layers.Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer=keras.optimizers.Adam(0.001), metrics=['accuracy'])













In [0]:
history = model.fit(
    X_train_expand, y_train, 
    epochs = 50, 
    batch_size = 512,
    validation_split = 0.1,
    verbose = 1,
    shuffle = True
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [0]:
X_test_expand = np.expand_dims(X_test, axis=2)

X_test_expand.shape

(3263, 512, 1)

In [0]:
predictions = model.predict_classes(X_test_expand)[:, 0]

In [0]:
pd.DataFrame({'id': ids, 'target': predictions}).to_csv('7. Submission_Tensorflow_Keras.csv', index=False)