![title](LogoEPL.jpg)
<b><p style='text-align: center;'> Algorithms in Data Science  </p> </b>


<b><p style='text-align: center;'> Trump and Biden: Fans' tweet Classification: </p> </b>
<b><p style='text-align: center;'> BERT </p> </b>


<i><p style='text-align: Center;'> Nima Farnoodian , Charles Rongione, Breno Tiburcio</p> </i>
 


## Installing the required pip packages

In [None]:
import sys
# Invoke pip through sys.executable, i.e. the interpreter running this kernel,
# rather than whichever `pip` happens to be first on the shell PATH.
!{sys.executable} -m pip install bert-for-tf2
!{sys.executable} -m pip install sentencepiece
!{sys.executable} -m pip install tensorflow
!{sys.executable} -m pip install tensorflow_hub
# NOTE(review): tensorflow_hub is installed on the line above and upgraded here;
# the --upgrade call alone would presumably be enough — confirm before removing.
!{sys.executable} -m pip install --upgrade tensorflow_hub

## Importing and processing 

In [112]:
import pandas as pd
import math
import random
import numpy as np

# Folder holding the four comment CSV files. Edit this single constant to point
# the notebook at another copy of the data.
DATA_DIR = "C:/Users/b_tib/OneDrive - UCL/1F/oLINMA2472/Project/HW2/comments"

# Integer class labels written to the "Candidate" column.
TRUMP_LABEL = 0
BIDEN_LABEL = 1


def _load_labelled_comments(file_name, label, data_dir=DATA_DIR):
    """Read one comments CSV and tag every row with a candidate label.

    Parameters
    ----------
    file_name : str
        Name of the CSV file inside ``data_dir``.
    label : int
        Value stored in the new "Candidate" column for every row.
    data_dir : str, optional
        Folder containing the CSV; defaults to ``DATA_DIR``.

    Returns
    -------
    pandas.DataFrame
        The CSV contents with a constant "Candidate" column inserted at
        column position 2 (so the file must have at least two columns).
    """
    comments = pd.read_csv(data_dir + "/" + file_name)
    comments.insert(2, "Candidate", label)
    return comments


Trump_train = _load_labelled_comments("The_Donald_train.csv", TRUMP_LABEL)
Biden_train = _load_labelled_comments("JoeBiden_train.csv", BIDEN_LABEL)

Trump_test = _load_labelled_comments("The_Donald_test.csv", TRUMP_LABEL)
Biden_test = _load_labelled_comments("JoeBiden_test.csv", BIDEN_LABEL)

# Biden rows come first, then Trump rows; the original row indices are kept.
train_set = pd.concat([Biden_train, Trump_train])
test_set = pd.concat([Biden_test, Trump_test])

# Split each dataframe into plain Python lists: the comment text ("body") is
# processed on its own and re-paired with the labels later on.
body_train = list(train_set['body'])
body_test = list(test_set['body'])

cand_train = list(train_set['Candidate'])
cand_test = list(test_set['Candidate'])
    

 The following script cleans all the comments:

In [246]:
import re

# Matches a tag-like span: "<", one or more characters other than ">", then ">".
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return ``text`` with every ``<...>`` tag-like span deleted."""
    return re.sub(TAG_RE, '', text)

def preprocess_text(sen):
    """Reduce one raw comment to letters separated by single spaces.

    Tag-like spans are removed first, then three regex substitutions are
    applied in order. A single letter is only dropped when it has whitespace
    on both sides, so one at the very start or end of the text is kept, and
    the result may still begin or end with one space.
    """
    substitutions = (
        ('[^a-zA-Z]', ' '),          # every non-letter (punctuation, digits, ...) -> space
        (r"\s+[a-zA-Z]\s+", ' '),    # lone letter surrounded by whitespace -> one space
        (r'\s+', ' '),               # collapse runs of whitespace
    )
    cleaned = remove_tags(sen)
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

def process_text(sentences):
    """Clean every comment in ``sentences`` with ``preprocess_text``.

    Returns a new list with the cleaned comments in the same order.
    """
    return [preprocess_text(raw_comment) for raw_comment in sentences]

In [247]:
# Clean the comment text of both splits.
body_train_processed=process_text(body_train)
body_test_processed=process_text(body_test)
# Sanity check: show the last ten cleaned training comments.
print(body_train_processed[-10:])

['You misspelled asshoe ', 'Reddit being cocksuckers as usual that what ', 'Yeah that what heard also But have to ask how is the secret service allowed to turn away someone with court orders signed by judge and notarized ', 'How many days has it been since this latest attempt at election interference Reddit has to be noticing the traffic decrease ', 'Look at the comment count vs the number of actual comments Enjoy your new communist overlords m sure reddit controlled moderation will be blast ', 'Look at the comment count vs the actual number of comments They are in the process of fine tuning automodderation to delete random comments It only matter of time before they unveil the new censorship in all its glory ', 'I got wanings including for upvoting wrongthink Am shadowbanned spez guess all were for upvoting wrongthink ', 'Less than half the comments are showing This is messed up ', 'You seem just little too content there champ', 'Yeah says but there like ']


## Stop Words

In [115]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
  
# Lazily-built cache of NLTK's English stop-word list, shared by all calls to
# nltk_ST so the corpus is loaded once instead of once per sentence.
_ENGLISH_STOP_WORDS = None


def nltk_ST(example_sent):
    """Remove English stop words from one sentence.

    The sentence is split with NLTK's ``word_tokenize``; tokens that are not in
    NLTK's English stop-word list are joined back together with single spaces.
    Matching is exact (case-sensitive): a token is dropped only when it is
    equal to an entry of the list.
    NOTE(review): if the NLTK list is all lowercase, capitalised stop words
    survive this filter — confirm whether lowercasing first is wanted.

    Parameters
    ----------
    example_sent : str
        Sentence to filter.

    Returns
    -------
    str
        The remaining tokens separated by single spaces ('' if none remain).
    """
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        # Deferred to the first call so defining the function does not require
        # the stop-word corpus to be available yet.
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))

    return ' '.join(token
                    for token in word_tokenize(example_sent)
                    if token not in _ENGLISH_STOP_WORDS)


In [134]:
def body_ST(body_processed):
    """Strip stop words from every cleaned comment.

    Applies ``nltk_ST`` to each element of ``body_processed`` and returns the
    results as a new list in the same order.
    """
    return [nltk_ST(comment) for comment in body_processed]

# Remove stop words from the cleaned comments of both splits.
body_train_processed_ST=body_ST(body_train_processed)
body_test_processed_ST=body_ST(body_test_processed)

# Sanity check: show the first three test comments after stop-word removal.
print(body_test_processed_ST[0:3])

['Beto badass strikes', 'Imagine three together Look saying shot magical Democratic Party Past Present Future type moment three die simply die', 'Lmao fuck concern troll']


## BERT Tokenizer (TensorFlow)

In [117]:
import bert
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers### Creating Tokenizer

# Alias for the tokenizer class (not an instance); it is instantiated on the
# last line of this cell.
BertTokenizer = bert.bert_tokenization.FullTokenizer
# Load the uncased BERT module from TF Hub as a Keras layer. trainable=False is
# passed; in the visible notebook the layer is only used below to read the
# vocabulary file and the lowercase flag.
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=False)
# Path of the vocabulary file shipped with the loaded module.
vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
# The module's do_lower_case flag, forwarded to the tokenizer.
to_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
# Tokenizer instance built from that vocabulary file and lowercase flag.
tokenizer = BertTokenizer(vocabulary_file, to_lower_case)

### Tokenizing

Passing comments through the tokenizer:

In [118]:
def tokenize_body(text):
    """Turn one comment string into BERT vocabulary ids.

    The text is split into tokens by the module-level ``tokenizer`` and the
    tokens are then mapped to their ids; nothing else is added to the sequence.
    """
    word_pieces = tokenizer.tokenize(text)
    return tokenizer.convert_tokens_to_ids(word_pieces)

In [119]:
def body_2_labels(bodies_processed, candidate, seed=None, tokenize_fn=None):
    """Tokenize comments, pair each with its label and shuffle the pairs.

    Parameters
    ----------
    bodies_processed : iterable of str
        Pre-processed comment texts.
    candidate : sequence
        Labels, indexable by position; ``candidate[i]`` is the label of the
        i-th comment.
    seed : int, optional
        When given, the shuffle uses a private ``random.Random(seed)`` so the
        resulting order is reproducible. When omitted, the module-level
        ``random.shuffle`` is used, as before.
    tokenize_fn : callable, optional
        Function mapping one comment string to its token ids. Defaults to
        ``tokenize_body``.

    Returns
    -------
    list of tuple
        ``(token_ids, label)`` pairs in shuffled order.

    Raises
    ------
    IndexError
        If ``candidate`` has fewer entries than ``bodies_processed``.
    """
    if tokenize_fn is None:
        tokenize_fn = tokenize_body

    # Build the (tokens, label) pairs directly; the previous version also
    # stored each comment's length and then discarded it again.
    comments_labels = [(tokenize_fn(body), candidate[i])
                       for i, body in enumerate(bodies_processed)]

    if seed is None:
        random.shuffle(comments_labels)
    else:
        random.Random(seed).shuffle(comments_labels)

    return comments_labels

In [187]:
# Build the shuffled (token_ids, label) pairs for each split. Despite the
# "_labels" suffix, every entry holds the tokenized comment as well as its label.
train_labels = body_2_labels(body_train_processed_ST,cand_train)
test_labels = body_2_labels(body_test_processed_ST,cand_test)


### Batching

Training __TensorFlow 2.0 models__:
A batch size of 32 means the neural network updates its weights after every 32 comments.

In [274]:
# Wrap the (token_ids, label) pairs as tf.data datasets; both elements of each
# pair are declared as int32.
processed_train_set = tf.data.Dataset.from_generator(lambda: train_labels, output_types=(tf.int32,tf.int32))
processed_test_set = tf.data.Dataset.from_generator(lambda: test_labels, output_types=(tf.int32,tf.int32))


BATCH_SIZE_TRAIN = 32 # 705 batches (authors' count, not re-checked)
BATCH_SIZE_TEST = 32 # 6 batches (authors' count, not re-checked)
# padded_shapes=((None,), ()): token sequences are padded to the longest
# sequence of their batch, labels are scalars and need no padding.
batched_train_set = processed_train_set.padded_batch(BATCH_SIZE_TRAIN, padded_shapes=((None, ),()))
batched_test_set = processed_test_set.padded_batch(BATCH_SIZE_TEST, padded_shapes=((None, ),()))

TOTAL_BATCHES = math.ceil(len(train_labels) / BATCH_SIZE_TRAIN)

# Dataset.shuffle returns a NEW dataset and leaves the original untouched, so
# its result has to be kept. The previous code discarded it, which meant the
# training batches were never shuffled. The buffer covers all batches.
train_data = batched_train_set.shuffle(TOTAL_BATCHES)
test_data = batched_test_set

Checking what the first batch looks like:

In [260]:
# Pull the first training batch to inspect it: a (padded token ids, labels) pair.
next(iter(train_data))

(<tf.Tensor: shape=(32, 150), dtype=int32, numpy=
 array([[ 2002,  2064,  3828, ...,     0,     0,     0],
        [ 1045,  2812,  4593, ...,     0,     0,     0],
        [ 2017,  2757, 11504, ...,     0,     0,     0],
        ...,
        [10166,  4942,  4760, ...,     0,     0,     0],
        [ 3929,  5993,  2057, ...,     0,     0,     0],
        [ 2027,  2769,  9936, ...,     0,     0,     0]])>,
 <tf.Tensor: shape=(32,), dtype=int32, numpy=
 array([0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1,
        1, 0, 0, 1, 1, 0, 1, 0, 1, 0])>)

# DCNN Model

The model we chose for testing consists of three convolutional hidden layers with kernel sizes ranging from two to four.
After each convolution, a pooling step (global max pooling in this case) is applied to reduce variance and computational cost.  

source: https://colab.research.google.com/drive/12noBxRkrZnIkHqvmdfFW2TGdOXFtNePM#scrollTo=6eh7sIquja5t

In [261]:
class DCNN(tf.keras.Model):
    """Text classifier built from an embedding and three parallel 1-D convolutions.

    Token ids are embedded and fed to three Conv1D branches with kernel sizes
    2, 3 and 4. Each branch is reduced with global max pooling; the pooled
    features are concatenated and passed through a dense layer, dropout and
    the output layer.
    """
    
    def __init__(self,
                 vocab_size,
                 emb_dim=128,
                 nb_filters=50,
                 FFN_units=512,
                 nb_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="dcnn"):
        """Create the layers of the network.

        Args:
            vocab_size: first argument of the Embedding layer (number of
                distinct token ids it can look up).
            emb_dim: size of each embedding vector.
            nb_filters: number of filters in each of the three convolutions.
            FFN_units: number of units of the hidden dense layer.
            nb_classes: 2 builds a single sigmoid output unit; any other value
                builds ``nb_classes`` softmax output units.
            dropout_rate: rate of the dropout layer.
            training: accepted but not used anywhere in this constructor.
            name: model name forwarded to ``tf.keras.Model``.
        """
        super(DCNN, self).__init__(name=name)

        self.embedding = layers.Embedding(vocab_size,
                                          emb_dim)
        # Three convolutions over the embedded sequence, differing only in
        # kernel size (2, 3 and 4 consecutive tokens).
        self.bigram = layers.Conv1D(filters=nb_filters,
                                    kernel_size=2,
                                    padding="valid",
                                    activation="relu")
        self.trigram = layers.Conv1D(filters=nb_filters,
                                     kernel_size=3,
                                     padding="valid",
                                     activation="relu")
        self.fourgram = layers.Conv1D(filters=nb_filters,
                                      kernel_size=4,
                                      padding="valid",
                                      activation="relu")
        # One pooling layer instance is reused for all three branches in call().
        self.pool = layers.GlobalMaxPool1D()

        self.dense_1 = layers.Dense(units=FFN_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        # Binary case: one sigmoid unit; otherwise one softmax unit per class.
        if nb_classes == 2:
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=nb_classes,
                                           activation="softmax")

    def call(self, inputs, training):
        """Run the forward pass.

        Args:
            inputs: batch of token-id sequences fed to the embedding layer
                (assumed shape (batch_size, seq_len) — TODO confirm).
            training: forwarded to the dropout layer.

        Returns:
            The output of the last dense layer: one sigmoid value per example
            when the model was built with ``nb_classes == 2``, otherwise
            ``nb_classes`` softmax values per example.
        """
        x = self.embedding(inputs)
        # Shape comments below assume Keras' default channels-last layout.
        x_1 = self.bigram(x) # (batch_size, seq_len-1, nb_filters)
        x_1 = self.pool(x_1) # (batch_size, nb_filters)
        x_2 = self.trigram(x) # (batch_size, seq_len-2, nb_filters)
        x_2 = self.pool(x_2) # (batch_size, nb_filters)
        x_3 = self.fourgram(x) # (batch_size, seq_len-3, nb_filters)
        x_3 = self.pool(x_3) # (batch_size, nb_filters)

        merged = tf.concat([x_1, x_2, x_3], axis=-1) # (batch_size, 3 * nb_filters)
        merged = self.dense_1(merged)
        merged = self.dropout(merged, training)
        output = self.last_dense(merged)

        return output

In [281]:
# Hyper-parameters. Bracketed ranges and extra numbers are the authors' tuning
# notes carried over from the original comments.
VOCAB_LENGTH = len(tokenizer.vocab)  # number of entries in the tokenizer vocabulary
EMB_DIM = 100 # embedding vector size; notes: 200, [200,1000], 50
CNN_FILTERS = 32 # filters per convolution; notes: [32,64], 32
DNN_UNITS = 512   # units of the hidden dense layer; notes: [32,512]
DROPOUT_RATE = 0.20 # rate of the single dropout layer after the hidden dense layer; notes: [0.1,0.4]
NB_EPOCHS = 1     # training epochs; notes: [3,10]
OUTPUT_CLASSES = 2 # 2 selects the single-unit sigmoid output in DCNN

text_model = DCNN(vocab_size=VOCAB_LENGTH,
                        emb_dim=EMB_DIM,
                        nb_filters=CNN_FILTERS,
                        FFN_units=DNN_UNITS,
                        nb_classes=OUTPUT_CLASSES,
                        dropout_rate=DROPOUT_RATE)

In [282]:
# Compiling Model
# The loss and metric are chosen to match the output layer DCNN builds for
# OUTPUT_CLASSES: a single sigmoid unit for 2 classes, softmax otherwise.
if OUTPUT_CLASSES == 2:
    text_model.compile(loss="binary_crossentropy", #log_cosh
                       optimizer="adam",
                       metrics=["accuracy"])
else:
    text_model.compile(loss="sparse_categorical_crossentropy",
                       optimizer="adam",
                       metrics=["sparse_categorical_accuracy"])
    
# Fitting Model
# NOTE(review): the test split is passed as validation data, so the evaluate
# call below scores the model on the same data already used for validation
# during fit — consider holding out a separate validation split.
text_model.fit(train_data, epochs = NB_EPOCHS, validation_data = test_data)


# Testing Model
# With the metrics compiled above, the printed result is the loss followed by
# the metric value.
results = text_model.evaluate(test_data)
print(results)

[0.3392384946346283, 0.8401842713356018]
