<a href="https://colab.research.google.com/github/m-newhauser/rep-or-dem-tweets/blob/main/finetune_binary_full_architecture_pt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Links
* [Huggingface Course - Write your training loop in PyTorch](https://huggingface.co/course/chapter3/4?fw=pt) (Article)
* [Huggingface Course - A full training](https://colab.research.google.com/github/huggingface/notebooks/blob/master/course/chapter3/section4.ipynb#scrollTo=WARodF9Sa6Yq) (Notebook)

In [1]:
# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

# Install if necessary
try:
    import transformers
    import preprocessor as p
except ImportError:
    print('Installing packages')
    !pip install datasets transformers[sentencepiece] tweet-preprocessor

In [2]:
# Imports
import random
import torch
import sqlite3
import csv
import pandas as pd
import numpy as np
import tensorflow as tf
import preprocessor as p

from sqlalchemy import create_engine
from datasets import Dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, confusion_matrix

from google.colab import drive

random.seed(123)

# Mount Google Drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Define root dir in Google Drive
root_dir = "/content/drive/MyDrive/colab_data"

# PARAMS
train_size = 0.8
checkpoint = "distilbert-base-uncased"
# checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"

In [4]:
# Set the device automatically (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

## Pre-process data

In [5]:
# Connect to locally created sqlite DB
conn = sqlite3.connect(f"{root_dir}/raw_data/TWEETS.db")  # path to db

# Select only tweets from current session of Congress in 2021
tweets_df = pd.read_sql("SELECT * FROM senators WHERE date BETWEEN '2021-01-20' AND '2021-12-31'", conn)

# Print total number of tweets
print(f"{tweets_df.shape[0]} total tweets in dataset\n")

99693 total tweets in dataset



In [6]:
# Remove numbers, emojis and &'s
p.set_options(p.OPT.NUMBER, p.OPT.EMOJI)

tweets_df = (tweets_df
          .assign(
              text=tweets_df["text"].apply(p.clean).str.replace("&amp;", "and ").str[:512], # remove &'s and truncate
              party=np.where(tweets_df.party == "Independent", "Democrat", tweets_df.party) # Change Independent senator's party to Democrat
           ) 
          .drop(columns="index")
          )

In [7]:
# Create a list of classes and map them using id2label
id2label = {str(i): label for i, label in enumerate(tweets_df["party"].unique().tolist())}
label2id = {v: k for k, v in id2label.items()}

print(label2id)

{'Republican': '0', 'Democrat': '1'}


In [8]:
# Create a "labels" column from the label2id mapping
tweets_df = (tweets_df
             .assign(labels=tweets_df["party"].map(label2id)) # Create a labels column (for expected DistilBERT input)
)

In [9]:
# Put clean data in a dataset split into train and test sets
dataset = Dataset.from_pandas(tweets_df).train_test_split(train_size=train_size, seed=123)

# Cast labels column as class labels
dataset = dataset.class_encode_column("labels")

Flattening the indices:   0%|          | 0/80 [00:00<?, ?ba/s]

Casting to class labels:   0%|          | 0/80 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/8 [00:00<?, ?ba/s]

Flattening the indices:   0%|          | 0/20 [00:00<?, ?ba/s]

Casting to class labels:   0%|          | 0/20 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/2 [00:00<?, ?ba/s]

### Tokenize data for DistilBERT

In [10]:
from transformers import AutoTokenizer

# Load DistilBERT tokenizer and tokenize (encode) the texts
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [11]:
# Make a list of columns to remove before tokenization
cols_to_remove = [col for col in dataset["train"].column_names if col != "labels"]

# Tokenize and encode the dataset
def tokenize(batch):
    tokenized_batch = tokenizer(batch['text'], padding=True, truncation=True, max_length=128)
    return tokenized_batch

dataset_enc = dataset.map(tokenize, batched=True, remove_columns=cols_to_remove, num_proc=4)

# Set dataset format for PyTorch
dataset_enc.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

In [12]:
from transformers import DataCollatorWithPadding

# Instantiate a data collator with dynamic padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Create data loaders for to reshape data for PyTorch model
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    dataset_enc["train"], shuffle=True, batch_size=8, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    dataset_enc["test"], batch_size=8, collate_fn=data_collator
)

In [13]:
# Dynamically set number of class labels based on dataset
num_labels = dataset["train"].features["labels"].num_classes

## Fine-tune entire DistilBERT architecture (layers)

In [15]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'pre_classi

In [16]:
# Model parameters
learning_rate = 5e-5
num_epochs = 5

In [17]:
from transformers import AdamW

# Create the optimizer
optimizer = AdamW(model.parameters(), lr=learning_rate)

from transformers import get_scheduler

# Further define learning rate scheduler
num_training_batches = len(train_dataloader)
num_training_steps = num_epochs * num_training_batches
lr_scheduler = get_scheduler(
    "linear",                   # linear decay
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [18]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

# Move model to device
model.to(device)

# Train the model with PyTorch training loop
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/49850 [00:00<?, ?it/s]

In [19]:
# Save model to disk
model.save_pretrained(f"{root_dir}/models/distilbert-political-tweets")

In [21]:
from datasets import load_metric

metric = load_metric("glue", "mrpc")
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

{'accuracy': 0.9076182356186369, 'f1': 0.9116716217512228}

In [28]:
model = AutoModelForSequenceClassification.from_pretrained(f"{root_dir}/models/distilbert-political-tweets").to(device)

In [42]:
model.device

device(type='cuda', index=0)

In [48]:
raw_inputs = [
    "Corporate greed is Kroger giving its CEO a 296% pay raise over the past decade, increasing profits by 23.5% in 2021, seeing its stock price jump 36% in the past year and spending $1.5 billion on stock buybacks and dividends, while 44% of Kroger workers can't afford rent.",
    "Do you know why 1 out of 4 Americans can't afford their prescription drugs? Do you know why thousands of Americans die every year because they can’t afford their medicine? Do you know why millions of Americans ration their insulin? I'll tell you why: greed, greed, greed.",
    "Tolerating lawlessness and anarchy is not compassionate. It doesn’t help vulnerable communities for politicians to passively watch them devolve into war zones. Democrats need to drop the soft-on-crime nonsense and give American families the protection they deserve."
]
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt").to(device) # Move the tensor to the GPU
outputs = model(**inputs)
outputs

SequenceClassifierOutput([('logits', tensor([[-6.5196,  6.4241],
                                   [-6.3935,  6.3234],
                                   [ 5.9433, -5.0645]], device='cuda:0', grad_fn=<AddmmBackward0>))])

In [49]:
import torch

predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
print(predictions)

tensor([[2.3912e-06, 1.0000e+00],
        [3.0001e-06, 1.0000e+00],
        [9.9998e-01, 1.6573e-05]], device='cuda:0', grad_fn=<SoftmaxBackward0>)


In [47]:
model.config.id2label

{0: 'LABEL_0', 1: 'LABEL_1'}

In [None]:
# Accuracy by party
(predictions_df
 .groupby("party")
 .apply(lambda x: accuracy_score(x["label"], x["pred"]))
 )

party
D    0.918977
R    0.949424
dtype: float64

In [None]:
# Accuracy by state
(predictions_df
 .groupby("state")
 .apply(lambda x: accuracy_score(x["label"], x["pred"]))
 .sort_values()
 )

state
FL    0.818182
WV    0.830769
CO    0.836066
ND    0.862745
OH    0.865385
MS    0.891892
CA    0.891892
MI    0.897059
AZ    0.902439
MD    0.907407
AR    0.910256
NE    0.914894
RI    0.915493
MA    0.916667
NY    0.916667
IL    0.916667
NV    0.926829
MT    0.928571
IA    0.930233
MO    0.933333
DE    0.934211
LA    0.934783
VA    0.935065
NC    0.935484
WI    0.937500
NH    0.938462
CT    0.938776
NJ    0.940299
GA    0.942029
OR    0.942029
IN    0.943662
HI    0.951220
NM    0.951613
OK    0.952381
AK    0.955556
VT    0.955882
AL    0.956522
KY    0.959459
TX    0.961538
WY    0.961538
MN    0.965517
ID    0.966102
WA    0.969231
KS    0.970588
PA    0.971014
SD    0.971014
TN    0.985075
UT    0.987179
ME    1.000000
SC    1.000000
dtype: float64

In [None]:
# Accuracy by user
(predictions_df
 .groupby("user")
 .apply(lambda x: accuracy_score(x["label"], x["pred"]))
 .sort_values()
 )

user
SenBillNelson      0.428571
SenBennetCO        0.720000
Sen_JoeManchin     0.757576
SenatorHeitkamp    0.760000
SenCortezMasto     0.800000
                     ...   
SenDeanHeller      1.000000
SenatorCollins     1.000000
LindseyGrahamSC    1.000000
SenPatRoberts      1.000000
McConnellPress     1.000000
Length: 99, dtype: float64

## Push fine-tuned model to Huggingface 🤗 repo

In [None]:
# Install git-lfs
import huggingface_hub
huggingface_hub.lfs.install_lfs_in_userspace()

In [None]:
# Log in to Huggingface CLI
!transformers-cli login


        _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
        _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
        _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
        _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
        _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

        
Username: m-newhauser
Password: 
Login successful
Your token: aBSsFWbqJRmQMbVYegoDJqSgfLXfcUmyhctMKGqPpJbIyjlXvycKNvnBprjxWutvVtjTDBoRDafcLDRlDrFRwOrwjcVZCxWkpBIONAUavmeYFltVHdWaqPeMiTPgHuiT 

Your token has been saved to /root/.huggingface/token


In [None]:
# Only enter when needed
hub_password = "xxxx"

# Configure git settings
!git config --global user.email "mary.newhauser@gmail.com"
!git config --global user.name "m-newhauser"

# Load the model that was saved locally
saved_model = TFDistilBertForSequenceClassification.from_pretrained(save_model_path)

# Set the repo url username:password
repo_url = f"https://m-newhauser:{hub_password}@huggingface.co/m-newhauser/distilbert-political-tweets"

# Now push the tokenizer to the hub
tokenizer.push_to_hub(repo_url=repo_url)
saved_model.push_to_hub(repo_url=repo_url)