<a href="https://colab.research.google.com/github/m-newhauser/rep-or-dem-tweets/blob/main/finetune_full_architecture_tftrainer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

# Install if necessary
try:
    import transformers
    import preprocessor as p
except ImportError:
    print('Installing packages')
    !pip install transformers==4.6.0
    !pip install tweet-preprocessor

In [4]:
# Imports
import random
import pandas as pd
import numpy as np
import csv
import tensorflow as tf
import preprocessor as p

from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import (
    TFDistilBertForSequenceClassification,
    TFTrainer,
    TFTrainingArguments,
)

from tensorflow.keras.optimizers.schedules import PolynomialDecay
from tensorflow.keras.optimizers import Adam

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, confusion_matrix

from google.colab import drive

random.seed(123)

In [5]:
# Mount Google Drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# Define path to save model to Google Drive
save_model_path = "/content/drive/MyDrive/ColabData/models/distilbert-political-tweets-model"

## Pre-process data

In [5]:
# Read in raw data -- https://fivethirtyeight.datasettes.com/fivethirtyeight/twitter-ratio%2Fsenators#export
# tweets_raw = pd.read_csv("senators.csv")

# # Save to parquet (only have to do this the first time)
# tweets_raw.to_parquet("senators.parquet")

# Read in random sample of tweets
tweets_raw = pd.read_parquet("/content/drive/MyDrive/ColabData/senators.parquet").sample(n=10000, random_state=123)

In [6]:
# Remove numbers, emojis and &'s
p.set_options(p.OPT.NUMBER, p.OPT.EMOJI)

tweets = (tweets_raw
          .drop(columns=["created_at", "url", "bioguide_id"])
          .assign(
              text_clean=tweets_raw["text"].apply(p.clean).str.replace("&amp;", "and ").str[:512], # remove &'s and truncate
              party=np.where(tweets_raw.user == "SenSanders", "D", tweets_raw.party) # Change Bernie Sanders from I -> D
              )
          .query('party != "I"') # Remove tweets from Independent senators
          )

In [7]:
# Print some info about the dataset
print(f"{tweets.shape[0]} total tweets in dataset\n")

print(f"Tweets by party:\n{tweets.party.value_counts()}")

9894 total tweets in dataset

Tweets by party:
R    5013
D    4881
Name: party, dtype: int64


In [8]:
# Create a column with numeric labels
label_mapping = {
    "D": 0,
    "R": 1
}

tweets['label'] = np.where(tweets['party'] == "D", 0, 1)

In [9]:
# Convert to list
texts = list(tweets.text_clean)
labels = list(tweets.label)

# Split training dataset into test and train
(train_texts, test_texts, train_labels, test_labels) = train_test_split(
    texts, labels, test_size=0.3
)

### Tokenize data for DistilBERT

In [10]:
# Load DistilBERT tokenizer and tokenize (encode) the texts
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

train_encodings = tokenizer(train_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)


Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

### Create encodings

In [11]:
# Wrap encodings in a Tensor Flow dataset
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
))

test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    test_labels
))

## Fine-tune entire DistilBERT architecture (layers)

In [12]:
# Create a dict of metrics to calculate during training
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }


# Provide args for fine-tuning DistilBERT on our data
training_args = TFTrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=6,              # total # of training epochs
    per_device_train_batch_size=32,  # batch size per device during training
    per_device_eval_batch_size=32,   # batch size for evaluation
    learning_rate=2e-05,             # start with a low learning rate when fine-tuning
    warmup_steps=250,                # number of warmup steps for learning rate scheduler ([500, 1000] are normal but start low)
    weight_decay=0.01,               # strength of weight decay
    evaluation_strategy="epoch",
    logging_dir='./logs',            # directory for storing logs
    logging_steps=1,
    eval_steps=10
)

# Instantiate the pre-trained model
with training_args.strategy.scope():
    model = TFDistilBertForSequenceClassification.from_pretrained(
        "distilbert-base-uncased", 
        num_labels=2
    )

# Create the trainer
trainer = TFTrainer(
    model=model,  # the instantiated 🤗 Transformers model to be trained
    args=training_args,  # training arguments, defined above
    train_dataset=train_dataset,  # training dataset
    eval_dataset=test_dataset,  # evaluation dataset,
    compute_metrics=compute_metrics # custom function with metrics to compute
)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_transform', 'vocab_layer_norm', 'activation_13', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier', 'dropout_19', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use i

In [13]:
# Train the model
trainer.train()

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Cause: while/else statement not yet supported
Cause: while/else statement not yet supported


In [14]:
# Evaluate the model
trainer.evaluate()



{'eval_accuracy': 0.9055779569892473,
 'eval_f1': 0.9062395729062396,
 'eval_loss': 0.2606573412495275,
 'eval_precision': 0.9059372915276851,
 'eval_recall': 0.9065420560747663}

In [16]:
# Save model and tokenizer (to Google Drive)
trainer.save_model(save_model_path)
tokenizer.save_pretrained(save_model_path)

('/content/drive/MyDrive/ColabData/models/distilbert-political-tweets-model/tokenizer_config.json',
 '/content/drive/MyDrive/ColabData/models/distilbert-political-tweets-model/special_tokens_map.json',
 '/content/drive/MyDrive/ColabData/models/distilbert-political-tweets-model/vocab.txt',
 '/content/drive/MyDrive/ColabData/models/distilbert-political-tweets-model/added_tokens.json',
 '/content/drive/MyDrive/ColabData/models/distilbert-political-tweets-model/tokenizer.json')

In [15]:
# Make predictions on the test set
test_predictions = trainer.predict(test_dataset)

# Apply softmax to get final predicted labels for test set
test_predictions_labels = test_predictions.predictions.argmax(-1)



In [16]:
# Create an output dataframe with truth and predicted labels on test set
predictions_df = pd.DataFrame({
    "text": test_texts,
    "label": test_labels,
    "pred": test_predictions_labels
})

# Now merge it with other Twitter information
predictions_df = (tweets[["rowid", "user", "state", "party", "text_clean"]]
                  .merge(predictions_df, left_on="text_clean", right_on="text")
                  .drop(columns=["text_clean"])
                  )

In [17]:
# Accuracy by party
(predictions_df
 .groupby("party")
 .apply(lambda x: accuracy_score(x["label"], x["pred"]))
 )

party
D    0.918977
R    0.949424
dtype: float64

In [18]:
# Accuracy by state
(predictions_df
 .groupby("state")
 .apply(lambda x: accuracy_score(x["label"], x["pred"]))
 .sort_values()
 )

state
FL    0.818182
WV    0.830769
CO    0.836066
ND    0.862745
OH    0.865385
MS    0.891892
CA    0.891892
MI    0.897059
AZ    0.902439
MD    0.907407
AR    0.910256
NE    0.914894
RI    0.915493
MA    0.916667
NY    0.916667
IL    0.916667
NV    0.926829
MT    0.928571
IA    0.930233
MO    0.933333
DE    0.934211
LA    0.934783
VA    0.935065
NC    0.935484
WI    0.937500
NH    0.938462
CT    0.938776
NJ    0.940299
GA    0.942029
OR    0.942029
IN    0.943662
HI    0.951220
NM    0.951613
OK    0.952381
AK    0.955556
VT    0.955882
AL    0.956522
KY    0.959459
TX    0.961538
WY    0.961538
MN    0.965517
ID    0.966102
WA    0.969231
KS    0.970588
PA    0.971014
SD    0.971014
TN    0.985075
UT    0.987179
ME    1.000000
SC    1.000000
dtype: float64

In [19]:
# Accuracy by user
(predictions_df
 .groupby("user")
 .apply(lambda x: accuracy_score(x["label"], x["pred"]))
 .sort_values()
 )

user
SenBillNelson      0.428571
SenBennetCO        0.720000
Sen_JoeManchin     0.757576
SenatorHeitkamp    0.760000
SenCortezMasto     0.800000
                     ...   
SenDeanHeller      1.000000
SenatorCollins     1.000000
LindseyGrahamSC    1.000000
SenPatRoberts      1.000000
McConnellPress     1.000000
Length: 99, dtype: float64

## Push fine-tuned model to Huggingface 🤗 repo

In [8]:
# Install git-lfs
import huggingface_hub
huggingface_hub.lfs.install_lfs_in_userspace()

In [9]:
# Log in to Huggingface CLI
!transformers-cli login


        _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
        _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
        _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
        _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
        _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

        
Username: m-newhauser
Password: 
Login successful
Your token: aBSsFWbqJRmQMbVYegoDJqSgfLXfcUmyhctMKGqPpJbIyjlXvycKNvnBprjxWutvVtjTDBoRDafcLDRlDrFRwOrwjcVZCxWkpBIONAUavmeYFltVHdWaqPeMiTPgHuiT 

Your token has been saved to /root/.huggingface/token


In [10]:
# Only enter when needed
hub_password = "xxxx"

# Configure git settings
!git config --global user.email "mary.newhauser@gmail.com"
!git config --global user.name "m-newhauser"

# Load the model that was saved locally
saved_model = TFDistilBertForSequenceClassification.from_pretrained(save_model_path)

# Set the repo url username:password
repo_url = f"https://m-newhauser:{hub_password}@huggingface.co/m-newhauser/distilbert-political-tweets"

# Now push the tokenizer to the hub
tokenizer.push_to_hub(repo_url=repo_url)
saved_model.push_to_hub(repo_url=repo_url)