**Installation**

In [None]:
!pip install -q tensorflow-recommenders
!pip install -q --upgrade tensorflow-datasets
!pip install --upgrade scikit-learn
!pip install -q tfds-nightly tensorflow matplotlib

[K     |████████████████████████████████| 85 kB 2.7 MB/s 
[K     |████████████████████████████████| 4.0 MB 4.9 MB/s 
[K     |████████████████████████████████| 4.1 MB 5.0 MB/s 
[?25h

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**Import Libraries**

In [None]:
import pandas as pd
import io
import string
import numpy as np
import matplotlib.pyplot as plt
import os
import time
import re
import shutil
import string
import tensorflow as tf
import tensorflow_datasets as tfds
import sklearn

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras import optimizers
from tensorflow.keras import losses
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

gpus = tf.config.list_physical_devices("GPU")
if gpus:
  print(len(gpus), "Logical GPUs")
  print("Go to Runtime > Change runtime and select a GPU hardware accelerator.")
else:
  print("No GPU was detected. This code can be very slow without a GPU.")

# to make this notebook's output stable across runs
np.random.seed(20)
tf.random.set_seed(20)

No GPU was detected. This code can be very slow without a GPU.


In [None]:
# Import other common libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score 
import itertools
import re
import spacy

In [None]:
import matplotlib.pyplot as plt

def plot_graphs(history, metric):
  plt.plot(history.history[metric])
  plt.plot(history.history['val_'+metric], '')
  plt.xlabel("Epochs")
  plt.ylabel(metric)
  plt.legend([metric, 'val_'+metric])


**Import Dataset** - Train and Test

In [None]:
trainfile = "/content/drive/MyDrive/testGC/training.1600000.processed.noemoticon.csv"
testfile = "/content/drive/MyDrive/testGC/vaccination_all_tweets.csv"
train_columns = ['polarity','id','date','query_string','twitter_user','tweet']
test_columns = ['id','user_name','user_location','user_description','user_created','user_followers','user_friends','user_favourites','user_verified','date','text','hashtags','source','retweets','favorites','is_retweet']

In [None]:
# Import Train data set
df_train = pd.read_csv(trainfile,
                 header=None, 
                 names=train_columns, 
                 encoding='latin-1')
df_train.head()

Unnamed: 0,polarity,id,date,query_string,twitter_user,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [None]:
# Import Test data set
df_test = pd.read_csv(testfile,
                 encoding='latin-1',
                 low_memory=False)
df_test.head()

Unnamed: 0,id,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,retweets,favorites,is_retweet
0,1340539111971516416,Rachel Roh,"La Crescenta-Montrose, CA",Aggregator of Asian American news; scanning di...,2009-04-08 17:52:46,405,1692,3247,False,2020-12-20 06:06:44,Same folks said daikon paste could treat a cyt...,['PfizerBioNTech'],Twitter for Android,0,0,False
1,1338158543359250433,Albert Fong,"San Francisco, CA","Marketing dude, tech geek, heavy metal & '80s ...",2009-09-21 15:27:30,834,666,178,False,2020-12-13 16:27:13,While the world has been on the wrong side of ...,,Twitter Web App,1,1,False
2,1337858199140118533,elið±ð¹ðªðºð,Your Bed,"heil, hydra ðâº",2020-06-25 23:30:28,10,88,155,False,2020-12-12 20:33:45,#coronavirus #SputnikV #AstraZeneca #PfizerBio...,"['coronavirus', 'SputnikV', 'AstraZeneca', 'Pf...",Twitter for Android,0,0,False
3,1337855739918835717,Charles Adler,"Vancouver, BC - Canada","Hosting ""CharlesAdlerTonight"" Global News Radi...",2008-09-10 11:28:53,49165,3933,21853,True,2020-12-12 20:23:59,"Facts are immutable, Senator, even when you're...",,Twitter Web App,446,2129,False
4,1337854064604966912,Citizen News Channel,,Citizen News Channel bringing you an alternati...,2020-04-23 17:58:42,152,580,1473,False,2020-12-12 20:17:19,Explain to me again why we need a vaccine @Bor...,"['whereareallthesickpeople', 'PfizerBioNTech']",Twitter for iPhone,0,0,False


In [None]:
#train - choose only 3 columns
df_train = df_train[['date','tweet','polarity']]
df_train = df_train.dropna(subset=['tweet'])
df_train.head()

Unnamed: 0,date,tweet,polarity
0,Mon Apr 06 22:19:45 PDT 2009,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0
1,Mon Apr 06 22:19:49 PDT 2009,is upset that he can't update his Facebook by ...,0
2,Mon Apr 06 22:19:53 PDT 2009,@Kenichan I dived many times for the ball. Man...,0
3,Mon Apr 06 22:19:57 PDT 2009,my whole body feels itchy and like its on fire,0
4,Mon Apr 06 22:19:57 PDT 2009,"@nationwideclass no, it's not behaving at all....",0


In [None]:
#test - choose only 2 columns and 1 cols for sentiment
df_test = df_test[['date','text']]
df_test = df_test.dropna(subset=['text'])
df_test.loc[:,'polarity'] = np.nan

In [None]:
df_test = df_test.rename({'text':'tweet'}, axis=1)
df_test.head()

Unnamed: 0,date,tweet,polarity
0,2020-12-20 06:06:44,Same folks said daikon paste could treat a cyt...,
1,2020-12-13 16:27:13,While the world has been on the wrong side of ...,
2,2020-12-12 20:33:45,#coronavirus #SputnikV #AstraZeneca #PfizerBio...,
3,2020-12-12 20:23:59,"Facts are immutable, Senator, even when you're...",
4,2020-12-12 20:17:19,Explain to me again why we need a vaccine @Bor...,


**Split Dataset**

In [None]:
train_df, val_df = np.split(df_train.sample(frac=1), [int(0.9*len(df_train))])
test_df = df_test
print(len(train_df), 'training examples')
print(len(val_df), 'validation examples')
print(len(test_df), 'test examples')

1440000 training examples
160000 validation examples
202456 test examples


In [None]:
#train_df_sample = train_df.sample(200000,random_state=10)
train_df_sample = train_df
X_train = train_df_sample.drop("polarity", axis=1) # drop labels for training set
y_train = train_df_sample["polarity"].copy()

In [None]:
#val_df_sample = val_df.sample(16000,random_state=10)
val_df_sample = val_df
X_val = val_df_sample.drop("polarity", axis=1) # drop labels for validation set
y_val = val_df_sample["polarity"].copy()

In [None]:
#test_df_sample = test_df.sample(10000,random_state=10)
test_df_sample = test_df
X_test = test_df_sample.drop("polarity", axis=1) # drop labels for test set
y_test = test_df_sample["polarity"].copy()

In [None]:
print("X_train: " + str(X_train.shape) + str(type(X_train)))
print("y_train: "+ str(y_train.shape)+ str(type(y_train)))
print("X_val: " + str(X_val.shape) + str(type(X_val)))
print("y_val: "+ str(y_val.shape)+ str(type(y_val)))
print("X_test: " + str(X_test.shape) + str(type(X_test)))
print("y_test: "+ str(y_test.shape)+ str(type(y_test)))

X_train: (1440000, 2)<class 'pandas.core.frame.DataFrame'>
y_train: (1440000,)<class 'pandas.core.series.Series'>
X_val: (160000, 2)<class 'pandas.core.frame.DataFrame'>
y_val: (160000,)<class 'pandas.core.series.Series'>
X_test: (202456, 2)<class 'pandas.core.frame.DataFrame'>
y_test: (202456,)<class 'pandas.core.series.Series'>


**Preprocessing**

In [None]:
# Punctuation Removal
punctuation_removal = string.punctuation
def remove_punctuation(text):
    """custom function to remove the punctuation"""
    # remove https links
    clean_tweet = re.sub(r'http\S+', '', text)
    # remove username
    clean_tweet = re.sub('@[^\s]+','', clean_tweet)
    # convert text to lowercase
    clean_tweet = clean_tweet.lower()
    # remove numbers
    clean_tweet = re.sub('\d', ' ', clean_tweet)
    # remove whitespaces
    clean_tweet = ' '.join(clean_tweet.split())

    return clean_tweet.translate(str.maketrans('', '', punctuation_removal))

In [None]:
X_train["tweet"] = X_train["tweet"].apply(lambda text: remove_punctuation(text))

X_val["tweet"] = X_val["tweet"].apply(lambda text: remove_punctuation(text))

X_test["tweet"] = X_test["tweet"].apply(lambda text: remove_punctuation(text))

In [None]:
#remove stopwords
#!pip install nltk
#import nltk

#from nltk.corpus import stopwords
#nltk.download('stopwords')

#stop_words = set(stopwords.words('english'))

#X_train["tweet"] = X_train["tweet"].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
#X_val["tweet"] = X_val["tweet"].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
#X_test["tweet"] = X_test["tweet"].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

In [None]:
vocab_size = 50000
embedding_dim = 16
max_length = 100
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"

In [None]:
X_train['tweet']

313228     get to a show girl do whatcha gotta do what if...
253601                     ex boyfriends suck pete the alien
1041037                          which one did you like best
217422                damn my back is hella hurtting at work
495067                    unfortunately my days already over
                                 ...                        
622215       dont feel well sooooo no rocco for me tonight  
174561     starting my nd shift im going to miss like an ...
77153      having spent a fair whack of yesterday working...
71907                                          is very tired
298735     blownleft over chipotle probably how im going ...
Name: tweet, Length: 1440000, dtype: object

In [None]:
X_val['tweet']

381986                            sluggish saturdayin office
456844                          hahah spare me some hw elves
1417046    glad i just got something off my chest it feel...
1140067        home from workboreddd got to see today missin
544028                               fine then lol go see it
                                 ...                        
556250     oh how i love foggy weather and iphoneipod os ...
1334495                                     morning twitters
1085711            finally watching my benjamin button movie
879066     watching oneman quotbrown eyed girlquot a cape...
1031523            okay good knight twitter world hubby time
Name: tweet, Length: 160000, dtype: object

In [None]:
X_test['tweet']

0         same folks said daikon paste could treat a cyt...
1         while the world has been on the wrong side of ...
2         coronavirus sputnikv astrazeneca pfizerbiontec...
3         facts are immutable senator even when youre no...
4         explain to me again why we need a vaccine wher...
                                ...                        
202451    rdif sputnikv is already registered in countri...
202452    breaking  russiaâs health minister mikhail m...
202453     bbmp bengaluru covidvaccine availability for ...
202454    argentinaâs ceo on sputnikv authorization ba...
202455     bbmp bengaluru covidvaccine availability for ...
Name: tweet, Length: 202456, dtype: object

**Label and Tweet Encoding**

In [None]:
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_val = label_encoder.fit_transform(y_val)
y_test = label_encoder.fit_transform(y_test)

In [None]:
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train['tweet'])
tokenizer.fit_on_texts(X_val['tweet'])
tokenizer.fit_on_texts(X_test['tweet'])
word_index = tokenizer.word_index

train_sequences = tokenizer.texts_to_sequences(X_train['tweet'])
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

val_sequences = tokenizer.texts_to_sequences(X_val['tweet'])
val_padded = pad_sequences(val_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

test_sequences = tokenizer.texts_to_sequences(X_test['tweet'])
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [None]:
# Need this block to get it to work with TensorFlow 2.x
training_padded = np.array(train_padded)
training_labels = np.array(y_train)

val_padded = np.array(val_padded)
val_labels = np.array(y_val)

test_padded = np.array(test_padded)
test_labels = np.array(y_test)

**Model**

In [None]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim)),
    tf.keras.layers.Dense(embedding_dim, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 16)          800000    
                                                                 
 bidirectional (Bidirectiona  (None, 32)               4224      
 l)                                                              
                                                                 
 dense (Dense)               (None, 16)                528       
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 804,769
Trainable params: 804,769
Non-trainable params: 0
_________________________________________________________________


In [None]:
# predict on a sample text without padding.

predictions = model.predict(np.array([val_sequences[0]]))
print(predictions[0])

[0.49941152]


In [None]:
# predict on a sample text with padding

predictions = model.predict(np.array([val_padded[0]]))
print(predictions[0])

[0.50117886]


In [None]:
model.compile(loss='binary_crossentropy', 
              optimizer='adam', 
              metrics=['binary_accuracy'])

**Execute Model**

In [None]:
num_epochs = 5
history = model.fit(training_padded, 
                    training_labels, 
                    epochs=num_epochs, 
                    validation_data=(val_padded, val_labels),
                    verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
test_loss, test_acc = model.evaluate(test_padded, test_labels)

print('Test Loss:', test_loss)
print('Test Accuracy:', test_acc)


Test Loss: 0.8680432438850403
Test Accuracy: 0.5316118001937866


In [None]:
plt.figure(figsize=(16, 8))
plt.subplot(1, 2, 1)
plot_graphs(history, 'binary_accuracy')
plt.ylim(None, 1)
plt.subplot(1, 2, 2)
plot_graphs(history, 'loss')
plt.ylim(0, None)

In [None]:
predictions = model.predict(test_padded)
print(predictions)

[[0.2416096 ]
 [0.3554325 ]
 [0.460093  ]
 ...
 [0.34999275]
 [0.917688  ]
 [0.34999275]]


In [None]:
df = pd.DataFrame(predictions, columns=['predictions'])
df.to_csv('prediction-RNN.csv', index=False)

**Stack two or more LSTM layers**

In [None]:
#model = tf.keras.Sequential([
#    tf.keras.layers.Embedding(vocab_size, 64, mask_zero=True),
#    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64,  return_sequences=True)),
#    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
#    tf.keras.layers.Dense(64, activation='sigmoid'),
#    tf.keras.layers.Dropout(0.5),
#    tf.keras.layers.Dense(1)
#])

In [None]:
#model.compile(loss=tf.keras.losses.BinaryCrossentropy(),
#              optimizer=tf.keras.optimizers.Adam(),
#              metrics=['binary_accuracy'])

In [None]:
#history = model.fit(training_padded, training_labels, 
#                    epochs=5,
#                    validation_data=(test_padded, test_labels),
#                    validation_steps=30,
#                    verbose=1)

In [None]:
#test_loss, test_acc = model.evaluate(testing_dataset)

#print('Test Loss:', test_loss)
#print('Test Accuracy:', test_acc)


In [None]:
#plt.figure(figsize=(16, 8))
#plt.subplot(1, 2, 1)
##plot_graphs(history, 'binary_accuracy')
#plt.ylim(None, 1)
#plt.subplot(1, 2, 2)
#plot_graphs(history, 'loss')
#plt.ylim(0, None)

In [None]:
#predictions = model.predict(test_padded)
#print(predictions)

In [None]:
#df = pd.DataFrame(predictions, columns=['predictions'])
#df.to_csv('prediction-RNN-2layer.csv', index=False)