# Data Mining: lab 2.1

In [1]:
!pip install transformers
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import TFDistilBertForSequenceClassification
from transformers import AutoTokenizer
from transformers import TFAutoModelForSequenceClassification
from transformers import pipeline



In [2]:
# Load data as a dataframe

df = pd.read_pickle('labeled_tweets.p')

In [3]:
# Basic info

print(f"Number of rows: {df.shape[0]}")
print(f"Number of attributes: {df.shape[1]}")

# Overview of dataset
df.head()

Number of rows: 2882
Number of attributes: 3


Unnamed: 0,text_a,text_b,label
0,RT @WaysMeansCmte: Republican Senators need to...,Laid-off workers set up soup kitchens in front...,2
1,Jeff Van Drew sold out his district and his co...,Pitch in to help Amy Kennedy defeat Jeff Van D...,0
2,Speaker Pelosi has failed the American people—...,House Minority Leader McCarthy: Pelosi touts D...,1
3,To learn more about global efforts to #EndPoli...,"Home | End Polio. With your help, we can end p...",1
4,RT @realDailyWire: BREAKING: Hunter Biden Rece...,Hunter Biden Received Millions From Wife Of Ex...,0


In [4]:
# Inspect the dataset: how many tweets are there per different value of 'label'?

df.groupby(by="label").count()

Unnamed: 0_level_0,text_a,text_b
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1275,1275
1,1377,1377
2,230,230


### Logistic Regession

In [5]:
# Split the test in training and test set. For this BOW analysis we will use the words contained in the text_a features.
X = list(df.text_a.values)
y = list(df.label.values)
labels = ['affitmative', 'negotiated', 'oppositional']

X_train_str, X_test_str, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [6]:
# Vectorize the string features
cv = CountVectorizer()
cv.fit(X_train_str) # create the vocabulary

X_train = cv.transform(X_train_str)
X_test = cv.transform(X_test_str)

In [7]:
# Create and train the model
lr = LogisticRegression(solver='lbfgs', max_iter=1000)
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [8]:
# Determine how well the algorithm predicts the target variable
y_pred = lr.predict(X_test)

print("Classification report:\n")
print(classification_report(y_test, y_pred, target_names=labels))
print("Confusion matrix:\n")
print(confusion_matrix(y_test, y_pred))

Classification report:

              precision    recall  f1-score   support

 affitmative       0.57      0.62      0.60       256
  negotiated       0.55      0.59      0.57       270
oppositional       0.20      0.04      0.07        51

    accuracy                           0.55       577
   macro avg       0.44      0.42      0.41       577
weighted avg       0.53      0.55      0.54       577

Confusion matrix:

[[159  95   2]
 [105 159   6]
 [ 14  35   2]]


In [9]:
# How does it perform with a random set of label values?
y_rand = np.random.randint(0, 3, len(y_pred))

print(classification_report(y_test, y_rand, target_names=labels))
print(confusion_matrix(y_test, y_rand))

              precision    recall  f1-score   support

 affitmative       0.43      0.32      0.37       256
  negotiated       0.51      0.34      0.41       270
oppositional       0.11      0.43      0.17        51

    accuracy                           0.34       577
   macro avg       0.35      0.37      0.32       577
weighted avg       0.44      0.34      0.37       577

[[83 78 95]
 [91 92 87]
 [20  9 22]]


Our classifier gives a more accurate prediction that a random guess of the label. Another fact that justifes it is that the f1-score is higher than 0.5.

In [10]:
# Try to interpret the model. Based on the words with the highest coefficients, what do you think the model has learned?

vocabulary = cv.get_feature_names()
regression_coefficients = lr.coef_[0] # get the LR weights
vocab_coef_combined = list(zip(regression_coefficients, vocabulary)) # this combines two separate lists [1, 2], ['word1', 'word2'] into one list [[1, 'word1'], [2, 'word2']]

feature_importance = pd.DataFrame(vocab_coef_combined,
                      columns=['coef', 'word'])
feature_importance.sort_values('coef', ascending=False).head(10)

Unnamed: 0,coef,word
5583,0.738226,helping
10305,0.71353,rt
2839,0.704435,complete
3189,0.635509,cruz
1820,0.607602,below
6493,0.595849,job
5114,0.590124,generation
11553,0.585061,team
7098,0.581903,live
2887,0.57807,confirmation


In [11]:
# Use TF-IDF features instead of raw counts. Does it improve the performance of your model?

tfidf = TfidfVectorizer() 

tfidf.fit(X_train_str) # create the vocabulary

X_train_idf = tfidf.transform(X_train_str)
X_test = tfidf.transform(X_test_str)

In [12]:
y_pred = lr.predict(X_test)

print("Classification report:\n")
print(classification_report(y_test, y_pred, target_names=labels))
print("Confusion matrix:\n")
print(confusion_matrix(y_test, y_pred))

Classification report:

              precision    recall  f1-score   support

 affitmative       0.47      0.96      0.63       256
  negotiated       0.63      0.11      0.19       270
oppositional       0.00      0.00      0.00        51

    accuracy                           0.48       577
   macro avg       0.37      0.36      0.27       577
weighted avg       0.50      0.48      0.37       577

Confusion matrix:

[[246  10   0]
 [239  31   0]
 [ 43   8   0]]


  _warn_prf(average, modifier, msg_start, len(result))


Using TF-IDF vectorizer doesn't improve the performance of the model: both macro and weighted averages of precision, recall and f1 score are lower than when using raw frequencies.

### BERT

#### Fine-tune a DistilBERT model on our task and evaluate its performance: does it perform any better than using a logistic regression classifier?

In [13]:
# Create sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=1)

In [14]:
import tensorflow as tf
from transformers import DistilBertTokenizerFast

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=128) # convert input strings to BERT encodings
test_encodings = tokenizer(X_test, truncation=True, padding=True,  max_length=128)
val_encodings = tokenizer(X_val, truncation=True, padding=True, max_length=128)

train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    y_train
)).shuffle(1000).batch(16) # convert the encodings to Tensorflow objects
val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    y_val
)).batch(64)
test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    y_test
)).batch(64)

In [15]:
# Compile model
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-cased', 
                                                           num_labels=len(labels))
callbacks = [
        tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, 
                      mode='min', baseline=None, 
                      restore_best_weights=True)]

optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

model.compile(optimizer=optimizer, loss=loss)

Some layers from the model checkpoint at distilbert-base-cased were not used when initializing TFDistilBertForSequenceClassification: ['activation_13', 'vocab_transform', 'vocab_layer_norm', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['pre_classifier', 'dropout_19', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use it fo

In [16]:
# Train the model for ten epochs
model.fit(train_dataset, epochs=10, callbacks=callbacks, validation_data=val_dataset, batch_size=16)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


<tensorflow.python.keras.callbacks.History at 0x7fd1cb161be0>

In [17]:
logits = model.predict(test_dataset)
y_preds = np.argmax(logits[0], axis=1)
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.64      0.48      0.55       116
           1       0.59      0.80      0.68       147
           2       0.00      0.00      0.00        25

    accuracy                           0.60       288
   macro avg       0.41      0.43      0.41       288
weighted avg       0.56      0.60      0.57       288



  _warn_prf(average, modifier, msg_start, len(result))


The results aren't overwhelmingly better than the other's for any of the algorithms. However, we will favor BERT against logistic regression because the f1-score of the accuracy is higher (0.62 > 0.55).

#### Instead of using a DistilBERT which is trained of all kinds of texts, we could also make use of BERTweet, which is – the name says it all – specifically trained on tweets. To use BERTweet, we’ll have to make a few adjustments to the code in the manual [also make sure you install the emoji package (!pip install emoji)].

In [20]:
!pip install emoji



In [18]:
# Create sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=1)

# The two steps below are different for BERTweet model
# Load bertweet tokenizer
tokenizer = AutoTokenizer.from_pretrained('vinai/bertweet-base', normalization=True)

# Load bertweet model
model = TFAutoModelForSequenceClassification.from_pretrained('vinai/bertweet-base', num_labels=len(labels))


# Convert input strings to BERT encodings
train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=128) 
test_encodings = tokenizer(X_test, truncation=True, padding=True,  max_length=128)
val_encodings = tokenizer(X_val, truncation=True, padding=True, max_length=128)

# Convert the encodings to Tensorflow objects
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    y_train
)).shuffle(1000).batch(16) 
val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    y_val
)).batch(64)
test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    y_test
)).batch(64)

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.
Some layers from the model checkpoint at vinai/bertweet-base were not used when initializing TFRobertaForSequenceClassification: ['lm_head']
- This IS expected if you are initializing TFRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use 

In [19]:
# Fix bug of transformers package
model.roberta.return_dict = False

# Compile model
callbacks = [
        tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, 
                      mode='min', baseline=None, 
                      restore_best_weights=True)]

optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

model.compile(optimizer=optimizer, loss=loss)

In [20]:
model.fit(train_dataset, epochs=10, callbacks=callbacks, validation_data=val_dataset, batch_size=16)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


<tensorflow.python.keras.callbacks.History at 0x7fd1c011d128>

In [21]:
# Check report of the model
logits = model.predict(test_dataset)
y_preds = np.argmax(logits[0], axis=1)
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.40      1.00      0.57       116
           1       0.00      0.00      0.00       147
           2       0.00      0.00      0.00        25

    accuracy                           0.40       288
   macro avg       0.13      0.33      0.19       288
weighted avg       0.16      0.40      0.23       288



  _warn_prf(average, modifier, msg_start, len(result))


According to the classification report above, the BERTweet model doesn't classify the tweets significantly better than the standard BERT model: on the contrary, the acuraccy f1-score is lower by 8 decimals.

#### Look at the confusion matrix of BERTtweets predictions: what can you conclude about the classification of tweets with the label oppositional?

In [22]:
print(confusion_matrix(y_test, y_preds))

[[116   0   0]
 [147   0   0]
 [ 25   0   0]]


The algortihm didn't label any tweet as "oppositional", not even the ones that actually were (which have been labelled as 'affirmative')

#### During our Q&A yesterday, Tim de Winkel said the interpreting retweets as a communicative act takes some contextual knowledge: you have to know how retweets are generally used. Can you relate this to the performance of the models?

Knowing that users are likely to retweet tweets they agree with will implicitly create a bias against oppositional retweets, as there will be very few occassions where an user retweets content he does not agree with (some exception would be mocking the opponent, or retweeting old tweets of them where they said something controversial by today's standards)

## Zero shot classification

#### Think of the topics that were relevant during the U.S. elections in the last three months. Use these topics as candidate labels.

#### Classify the first ten tweets in the labeled tweets.p dataset, and evaluate the predicted labels manually. Would this model fruitful to use to label the topics of these tweets?

In [23]:
classifier = pipeline("zero-shot-classification")
candidate_labels = ['vote', 'election', 'biden', 'trump', 'mail', 'fraud' , 'president', 'ballots']
first_ten_tweets = df.text_a[10].to_list()[:10]

results = classifier(first_ten_tweets, candidate_labels)

Some weights of the model checkpoint at facebook/bart-large-mnli were not used when initializing BartModel: ['model.encoder.version', 'model.decoder.version']
- This IS expected if you are initializing BartModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BartModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at facebook/bart-large-mnli were not used when initializing BartForSequenceClassification: ['model.encoder.version', 'model.decoder.version']
- This IS expected if you are initializing BartForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification m

In [24]:
results

[{'labels': ['biden',
   'vote',
   'president',
   'trump',
   'election',
   'ballots',
   'mail',
   'fraud'],
  'scores': [0.6972934007644653,
   0.07789458334445953,
   0.05175100639462471,
   0.04757499694824219,
   0.043722447007894516,
   0.03964070603251457,
   0.032259780913591385,
   0.009863041341304779],
  'sequence': "I was honored to join this ceremony with Legion Post #9 and VFW Post #1617 in Derry to honor Gold Star Mother Nancy Geary.\n\nWe will always remember the sacrifice of her son, LCpl. Michael Geary, as well as Nancy's sacrifice, courage, and resiliency.\n\nhttps://t.co/Yl0Bm4wkyD"},
 {'labels': ['biden',
   'vote',
   'mail',
   'president',
   'trump',
   'ballots',
   'election',
   'fraud'],
  'scores': [0.5052504539489746,
   0.17508500814437866,
   0.08642823994159698,
   0.06787031143903732,
   0.06729519367218018,
   0.05534692108631134,
   0.030514873564243317,
   0.012209057807922363],
  'sequence': "RT @nextstepsidaho: @SenatorRisch @IdSBOE Thanks fo