# Detecting whether a Tweet is Anti Mask or Pro Mask
<!-- Github link: https://github.com/mro9395/ML_projects/tree/main/03Twitter_Real_or_Missinformation -->

## Install required packages

In [38]:
# Import packages

import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import Flatten
from keras.layers import Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from keras.metrics import CategoricalAccuracy
from keras.metrics import SparseCategoricalAccuracy

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from math import sqrt

## Loading required data

In [6]:
# Load dataset of Tweets
tweets = pd.read_csv('TweetBase.csv')

In [11]:
# Keep only tweets and labels
data = tweets[['text','Human label']]

# Drop null values
data = data.dropna()

In [121]:
# Get overall description of dataset
data.describe()[:2]

Unnamed: 0,tweet,label
count,3343,3115
unique,3102,2


In [86]:
# Group by
data.groupby('label').count()

Unnamed: 0_level_0,tweet
label,Unnamed: 1_level_1
antimask,328
promask,2787


In [24]:
# Rename columns
data.columns = ['tweet','label']

In [56]:
# Rename values and only keep antimask and promask tweets, drop neutral
data['label'] = data['label'].map({1:'promask',-1:'antimask'})
data = data.drop(data[data['label']==0].index)

In [57]:
# Split data in training and test
trainingdata = data.sample(frac = 0.2)
testdata = data.drop(trainingdata.index)

In [78]:
# Print shapes of data
print(trainingdata.shape)
print(testdata.shape)

(669, 2)
(2674, 2)


In [120]:
# # Show samples of tweets

# print('1. Tweets that demonstrate misinformation:\n')
# for i in trainingdata[trainingdata['label']=='promask'].tweet[:7]:
#     print('~', i)

# print('\n 2. Tweets that demonstrate real information:\n')
# for i in trainingdata[trainingdata['label']=='antimask'].tweet[:7]:
#     print('~', i)

**Discussion of dataset**: With this dataset, I aim to build a model to predict whether a tweet has a Pro Mask or Anti Mask sentiment given the COVID-19 pandemic circumstances. Labels describe whether the tweets depict a Pro Mask or Anti Mask sentiment. These labeling has been made by humans so the range of error of mislabeling is very low. Out of the 3343 tweets of this dataset, 328 are Anti Mask tweets and 2787 are Pro Mask. The features are tokens of the content of tweets, while the target is a binary variable (Pro Mask vs Anti Mask).l or false fact-checked claims.

I have personally collected the dataset that contains tweets between December 25, 2020 and January 22, 2021 using Twitter API. These dates were chosen since a COVID-19 cases surge occurred then. The geographical area contains only Florida.


## Preprocessing

### Prepare preprocessor

In [59]:
# Build vocabulary from training text data
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(trainingdata.tweet)

# preprocessor tokenizes words and makes sure all documents have the same length
def preprocessor(data, maxlen, max_words):

    sequences = tokenizer.texts_to_sequences(data)

    word_index = tokenizer.word_index
    X = pad_sequences(sequences, maxlen=maxlen)

    return X

### Prepare Train and Test Data

In [80]:
# tokenize and pad X data
X_train = preprocessor(trainingdata.tweet, maxlen=60, max_words=10000)
X_test = preprocessor(testdata.tweet, maxlen=60, max_words=10000)

# ohe encode Y data
y_train = pd.get_dummies(trainingdata.label)
y_test = pd.get_dummies(testdata.label)

In [81]:
# Print shapes of data
print(X_train.shape)
print(X_test.shape)

(669, 60)
(2674, 60)


## Model building

In [82]:
# Declare maximum length of tweet

maxlen = 60

In [83]:
# Set function to evaluate models

def model_eval_metrics(y_true, y_pred):
    accuracy_eval = accuracy_score(y_true, y_pred)
    f1_score_eval = f1_score(y_true, y_pred,average="macro",zero_division=0)
    precision_eval = precision_score(y_true, y_pred,average="macro",zero_division=0)
    recall_eval = recall_score(y_true, y_pred,average="macro",zero_division=0)
    mse_eval = 0
    rmse_eval = 0
    mae_eval = 0
    r2_eval = 0
    metricdata = {'accuracy': [accuracy_eval], 'f1_score': [f1_score_eval], 'precision': [precision_eval], 'recall': [recall_eval], 'mse': [mse_eval], 'rmse': [rmse_eval], 'mae': [mae_eval], 'r2': [r2_eval]}
    finalmetricdata = pd.DataFrame.from_dict(metricdata)
    return finalmetricdata

## Model 1: Embeddings with bidirectional LSTM

Here we use an embeddings layer of 64 features followed by a bidirectional LSTM of 32 features before flatten and a dense layer. This model has few layers than the others.

In [101]:
model = Sequential()
model.add(Embedding(10000, 64, input_length=maxlen))
model.add(Bidirectional(LSTM(32)))
model.add(Flatten())
model.add(Dense(2, activation='softmax'))

model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
history = model.fit(X_train, y_train, epochs=10, batch_size=16, validation_split=0.2)
score = model.evaluate(X_test, y_test, batch_size=16)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [102]:
# Calculate metrics of evaluation 

y_pred = model.predict(X_test).argmax(axis=1)
predicted_labels = [y_test.columns[i] for i in y_pred]

y_test_labels=y_test.idxmax(axis=1) #extract labels from one hot encoded y_test object
y_test_labels=list(y_test.idxmax(axis=1)) #returns a pandas series of predicted labels

model_eval_metrics( y_test_labels,predicted_labels)

Unnamed: 0,accuracy,f1_score,precision,recall,mse,rmse,mae,r2
0,0.833583,0.537883,0.662618,0.540049,0,0,0,0


## Model 2: Embeddings with bidirectional and stacked LSTM

Here we use an embeddings layer with 32 features rather than 64, one bidirectional LSTM and three stacked LSTM layers before the dense layer.

In [114]:
model = Sequential()
model.add(Embedding(10000, 32, input_length=maxlen))
model.add(Bidirectional(LSTM(32, return_sequences=True)))
model.add(LSTM(32, return_sequences=True))
model.add(LSTM(32, return_sequences=True))
model.add(LSTM(16))
model.add(Dense(2, activation='sigmoid'))

model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
history = model.fit(X_train, y_train,
                    epochs=10,
                    batch_size=16,
                    validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [115]:
# Calculate metrics of evaluation 

y_pred = model.predict(X_test).argmax(axis=1)
predicted_labels = [y_test.columns[i] for i in y_pred]

y_test_labels=y_test.idxmax(axis=1) #extract labels from one hot encoded y_test object
y_test_labels=list(y_test.idxmax(axis=1)) #returns a pandas series of predicted labels

model_eval_metrics( y_test_labels,predicted_labels)

Unnamed: 0,accuracy,f1_score,precision,recall,mse,rmse,mae,r2
0,0.771503,0.599975,0.596817,0.603875,0,0,0,0


## Model 3: Embeddings with stacked 1D Convolutions

Here a model with an embeddings layer will be used along three stacked 1D Convolution layers using MaxPooling. Finally, a flatten process is used before the dense layer.

In [110]:
model = Sequential()
model.add(Embedding(10000, 32, input_length=maxlen))
model.add(layers.Conv1D(32, 7, activation='relu')) 
model.add(layers.MaxPooling1D())
model.add(layers.Conv1D(32, 7, activation='relu')) 
model.add(layers.MaxPooling1D())
model.add(layers.Conv1D(32, 7, activation='relu')) 
model.add(layers.MaxPooling1D())
model.add(Flatten())
model.add(Dense(2, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
history = model.fit(X_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [111]:
# Calculate metrics of evaluation 

y_pred = model.predict(X_test).argmax(axis=1)
predicted_labels = [y_test.columns[i] for i in y_pred]

y_test_labels=y_test.idxmax(axis=1) #extract labels from one hot encoded y_test object
y_test_labels=list(y_test.idxmax(axis=1)) #returns a pandas series of predicted labels

model_eval_metrics( y_test_labels,predicted_labels)

Unnamed: 0,accuracy,f1_score,precision,recall,mse,rmse,mae,r2
0,0.835453,0.457419,0.917695,0.501134,0,0,0,0


**Discussion**: The three models have different results, although the accuracy is similar between model 1 and 3. Given that this model deals with imbalanced classes (there are a lot more Pro Mask tweets than Anti Mask), accuracy might not be the best metric to compare. So, I will use F1 score. According to this, the Model 2 with a Bidirectional LSTM and three stacked LSTM has the best score. The more complex architecture of this model might explain why it balances well both recall and precision.

Even though Model 1 has more potential meaningful features (64) the F1 is not as high as Model 2, which implies that keeping the value to 32 might be sufficient for this dataset. Anyhow, Model 1 has a higher precision. So, if our aim is to have very specific results, Model 1 might be a better alternative than 2. But the best performing model in terms of precision is Model 3, the use of three 1D Convolution layers provides a very high precision, but a bad recall. The use of MaxPooling instead of Average Pooling might be causing the model to be more sensible to distinguish the tweets.

## Feeding synthetic tweets

In [119]:
# Use model 2
y_pred = model.predict(preprocessor(pd.Series(['masks dont work',
                                               'covid is a plan',
                                               'end masks',
                                               'masks save lives',
                                               'vaccines are good',
                                               'people must use masks']), maxlen=60, max_words=10000)).argmax(axis=1)
predicted_labels = [y_test.columns[i] for i in y_pred]
predicted_labels

['antimask', 'antimask', 'antimask', 'promask', 'promask', 'antimask']

**Discussion**: Using Model 2 and these samples, we observe that it delivers a good result classifying Anti Mask and pro Mask sentiments. Only the last one has been misclassified. It seems that the use of strong modifiers like 'must' or 'dont' might make the tweet more probable of being classified as Anti Mask. Positive words like 'save', 'lives' or 'good' might be have the oppposite effect (like the vaccine example). It looks promising that the model could differentiate 'covid is a plan' as Anti Mask given that the sentence looks pretty neutral.