In [None]:
import os
import cv2
import time
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim

import torchvision

from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

Image Data from [here](https://www.kaggle.com/andrewmvd/animal-faces)
- Animal Faces

## Images

In [None]:
# What's in this dataset?
os.listdir('course_data/afhq')

In [None]:
# three labels
os.listdir('course_data/afhq/train')

In [None]:
# within each folder are the images
os.listdir('course_data/afhq/train/cat')[:5]

In [None]:
# create a dataframe for our data
# expected layout: <data_path>/<dataset>/<label>/<image file>
data_path = 'course_data/afhq'

rows = []
# sorted() keeps the row order reproducible; os.listdir returns entries in arbitrary order
for dataset in sorted(os.listdir(data_path)):
    dataset_dir = f'{data_path}/{dataset}'
    # skip stray files (e.g. .DS_Store) that sit next to the dataset folders
    if not os.path.isdir(dataset_dir):
        continue

    for label in sorted(os.listdir(dataset_dir)):
        label_dir = f'{dataset_dir}/{label}'
        if not os.path.isdir(label_dir):
            continue

        for image in sorted(os.listdir(label_dir)):
            rows.append({
                'image_file': image,
                'label': label,
                'dataset': dataset,
                # a bit redundant, could build from other data in __getitem__ if wanted
                'image_path': label_dir,
            })

df = pd.DataFrame(rows)
print(len(df))
df.head()

In [None]:
# training and validation data
df_train = df[df['dataset'] == 'train'].reset_index(drop=True)
df_val = df[df['dataset'] == 'val'].reset_index(drop=True)
len(df_train), len(df_val)

Before creating a Dataset class, let's think about what we want as the input to the network.

In [None]:
import cv2

# pull up an image
row = df.iloc[0]
image_path = row['image_path']
fname = row['image_file']
path = image_path+'/'+fname
img = cv2.imread(path)

# what is an image?
img

In [None]:
# 512x512 image with 3 channels
print(img.shape)

# pixel intensity goes from 0 to 255
print(np.max(img), np.min(img))

In [None]:
# look at the image
plt.imshow(img)

In [None]:
# why is it weird? cv2 opens in BGR instead of RGB
plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

## Dataset

In [None]:
# Let's create a Dataset for our animal faces! 
class AnimalFacesDataset(Dataset):
    """Map-style dataset that loads the images listed in a DataFrame.

    Each row of ``df`` must provide ``image_path`` (directory of the image),
    ``image_file`` (file name) and ``label`` (a key of ``label_dict``).
    """

    def __init__(self, df):
        self.df = df

        # label dictionary
        self.label_dict = {'cat':0, 'dog':1, 'wild':2}

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(img, label)`` for the row at position ``idx``.

        ``img`` is a float32 tensor with the channel axis moved to the front
        and pixel values divided by 255; ``label`` is the integer class index
        looked up in ``label_dict``.

        Raises:
            FileNotFoundError: if the image file cannot be read.
        """
        row = self.df.iloc[idx]

        # get ingredients for retrieving image
        path = row['image_path'] + '/' + row['image_file']

        # read the img; cv2.imread returns None instead of raising when the
        # file is missing or unreadable, so fail here with a clear message
        img = cv2.imread(path)
        if img is None:
            raise FileNotFoundError(f'could not read image: {path}')

        # convert to RGB (cv2 loads images as BGR)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        # move color channels to correct spot: (H, W, C) -> (C, H, W)
        img = np.transpose(img, (2, 0, 1))

        # convert to [0,1] scale
        img = torch.tensor(img / 255.).float()

        label = torch.tensor(self.label_dict[row['label']])

        return img, label

In [None]:
ds_train = AnimalFacesDataset(df_train)
dl_train = DataLoader(ds_train, batch_size = 8, shuffle=True)

In [None]:
# make sure our recipe works!
# notice the time...
for img, label in tqdm(dl_train):
    None

## Text

IMDB Movie Review Dataset (cleaned)
- Originally from [here](https://ai.stanford.edu/~amaas/data/sentiment/)
- Cleaned into a csv [here](https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews)

In [None]:
df = pd.read_csv('course_data/IMDB Dataset.csv')
df.head()

## Automatic Tokenization with Spacy

In [None]:
# tool for text
import spacy

# load information about words
!python3 -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')

In [None]:
some_text = df.iloc[9]['review']
print(some_text)

# automatically tokenize the text
tokenized_text = nlp(some_text)

In [None]:
# it's not perfect
for token in tokenized_text:
    print(token.text)

In [None]:
idx = 5

token = tokenized_text[idx]

# lemmatization
print('Lemmatization of', token.text, 'is', token.lemma_)

# part of speech tagging
print(token.text, 'is a', token.pos_)

# is it a stop word?
print('The fact that', token.text, 'is a stop word is', token.is_stop)

In [None]:
# sentence segmentation
for sentence in tokenized_text.sents:
    print(sentence)

- tons more fancy features!
- Let's do a simple pipeline where we ignore non-alphabetic characters

In [None]:
import re

a_review = df.iloc[9]['review']

# remove those <br />s
a_review = a_review.replace('<br />', ' ')
print(a_review)

# remove non-alphabetic characters
a_review = re.sub("[^A-Za-z']+", ' ', a_review)
print(a_review)

In [None]:
# disabling some fancy features of spacy for speed
nlp = spacy.load('en_core_web_sm', disable = ['ner', 'parser'])

rows = []
for idx in tqdm(range(len(df))):
    row = df.iloc[idx].copy()

    # replace <br /> tags and every run of characters that is not a letter or
    # an apostrophe with a single space, then lowercase everything
    cleaned_review = re.sub("[^A-Za-z']+", ' ', row['review'].replace('<br />', ' ')).lower()

    # we let spaCy tokenize and lemmatize the text for us
    tokenized_review = nlp(cleaned_review)
    cleaned_tokenized = [token.lemma_ for token in tokenized_review if ((not token.is_stop) or (' ' in token.text))]

    # skip reviews with at most one token left: appending them without a
    # 'cleaned' value would leave NaN in that column, and the later
    # .split(' ') calls on df_clean['cleaned'] fail on NaN
    if len(cleaned_tokenized) <= 1:
        continue

    row['cleaned'] = ' '.join(cleaned_tokenized)
    rows.append(row)

df_clean = pd.DataFrame(rows)

# index=False: otherwise the index is written out and comes back as an
# 'Unnamed: 0' column when the file is read again
df_clean.to_csv('course_data/IMDB_cleaned.csv', index=False)
df_clean.head()

In [None]:
df_clean = pd.read_csv('course_data/IMDB_cleaned.csv')
df_clean.head()

In [None]:
print(df_clean.iloc[9]['review'])
print('\n')
df_clean.iloc[9]['cleaned']

## Prepare for Training

In [None]:
# count words, send infrequent to unknown

# let's get an idea of word frequency
from collections import Counter

# every cleaned review becomes a list of its space-separated tokens
reviews = [cleaned_text.split(' ') for cleaned_text in df_clean['cleaned']]

# tally tokens review by review, then order them from most to least frequent
token_counts = Counter()
for review_tokens in reviews:
    token_counts.update(review_tokens)
word_freq = token_counts.most_common()

In [None]:
# no surprises here
word_freq[:10]

In [None]:
# words only seen once
word_freq[-25:]

In [None]:
# remove words that appear infrequently
word_freq = dict(word_freq)
print(len(word_freq))
min_freq = 5
word_dict = {}

# sending all the unknowns to 0
i = 1
for word in word_freq:
    if word_freq[word] > min_freq:
        word_dict[word] = i
        i += 1
    else:
        word_dict[word] = 0

# dictionary length        
dict_length = max(word_dict.values()) + 1
dict_length

In [None]:
# to collate the tensors into batches, sentence need to be the same size
# we could overwrite the collate function, or we could pick a max sentence size and pad

max_length = 0
for idx in tqdm(range(len(df_clean))):
    row = df_clean.iloc[idx]
    length = len(row['cleaned'].split(' '))
    if length > max_length:
        max_length = length
print(max_length)

In [None]:
class IMDBDataset(Dataset):
    """Dataset of cleaned reviews encoded as fixed-length tensors of word indices.

    Args:
        df: DataFrame with a ``cleaned`` column (space-separated tokens) and a
            ``sentiment`` column (``'negative'`` or ``'positive'``).
        word_dict: mapping from token to integer index.
        max_length: length of every returned index tensor.
    """

    def __init__(self, df, word_dict, max_length):
        self.df = df
        self.word_dict = word_dict
        self.sent_dict = {'negative': 0, 'positive': 1}
        self.max_len = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(x, y)``: a long tensor of ``max_len`` word indices and a float label."""
        row = self.df.iloc[idx]

        # keep at most max_len tokens; without this a longer review would
        # produce negative positions that silently wrap around the tensor
        tokens = row['cleaned'].split(' ')[:self.max_len]

        # get review as a list of integers; tokens missing from the
        # vocabulary fall back to index 0
        token_ids = [self.word_dict.get(token, 0) for token in tokens]

        # embedding likes long tensors
        x = torch.zeros(self.max_len, dtype=torch.long)
        if token_ids:
            # we want to front pad for RNN, so the tokens fill the tail
            x[self.max_len - len(token_ids):] = torch.tensor(token_ids, dtype=torch.long)

        y = torch.tensor(self.sent_dict[row['sentiment']]).float()

        return x, y
ds = IMDBDataset(df_clean, word_dict, max_length)
next(iter(ds))

## Models

In [None]:
# CBOW model for sentiment analysis
# train the embedding during training
class CBOW(nn.Module):
    """Continuous bag-of-words classifier.

    Averages the embeddings of the non-zero tokens of each sequence and maps
    the average to a single logit.

    Args:
        dict_length: number of rows in the embedding table.
        embedding_size: dimension of each embedding vector.
    """

    def __init__(self, dict_length, embedding_size):
        super(CBOW, self).__init__()
        # index 0 is the padding index: its embedding stays at zero and it
        # receives no gradient updates
        self.word_emb = nn.Embedding(dict_length, embedding_size, padding_idx=0)
        self.linear = nn.Linear(embedding_size, 1)
        self.emb_size = embedding_size

    def forward(self, x):
        """Return one logit per sequence.

        Args:
            x: long tensor of word indices, shape (batch, sequence length).

        Returns:
            Tensor of shape (batch,).
        """
        # number of non-padding tokens per sequence, counted on the indices;
        # clamp to 1 so a sequence of only zeros does not divide by zero
        sent_length = torch.count_nonzero(x, dim=1).clamp(min=1).unsqueeze(1)

        x = self.word_emb(x)
        x = torch.sum(x, dim=1) / sent_length
        x = self.linear(x)

        # squeeze only the last axis so a batch of one keeps its batch dimension
        return torch.squeeze(x, dim=-1)

In [None]:
dl = DataLoader(ds, batch_size=1000, shuffle=True)
x, y = next(iter(dl))

cbow_model = CBOW(dict_length, 100)
cbow_model(x).shape

In [None]:
def one_pass(model, dataloader, optimizer, lossFun, backwards=True, print_loss=False):
    """Run one pass over ``dataloader`` and return the average batch loss.

    Args:
        model: module called as ``model(x)`` on each batch.
        dataloader: sized iterable yielding ``(x, y)`` batches.
        optimizer: optimizer stepped after each batch; only used when
            ``backwards`` is true, so it may be ``None`` for evaluation.
        lossFun: callable ``lossFun(y_pred, y)`` returning a scalar tensor.
        backwards: if true, train (backpropagate and step); otherwise evaluate.
        print_loss: if true, print the average loss before returning it.

    Returns:
        Sum of the per-batch losses divided by the number of batches.
    """
    if backwards:
        model.train()
    else:
        model.eval()

    total_loss = 0.0

    # only record the autograd graph when we are going to backpropagate
    with torch.set_grad_enabled(bool(backwards)):
        for x, y in tqdm(dataloader):

            y_pred = model(x)
            loss = lossFun(y_pred, y)
            total_loss += loss.item()

            if backwards:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

    avg_loss = total_loss / len(dataloader)

    if print_loss:
        print(avg_loss)

    return avg_loss

def one_pass_acc(model, dataloader, num_points):
    """Return the accuracy of a binary classifier over ``dataloader``.

    Args:
        model: module returning one logit per example; it is put in eval mode.
        dataloader: iterable yielding ``(x, y)`` batches with 0/1 float labels.
        num_points: total number of examples, used as the denominator.

    Returns:
        Fraction of examples whose thresholded prediction equals the label.
    """
    model.eval()
    total_incorrect = 0

    # evaluation only: no need to record the autograd graph
    with torch.no_grad():
        for x, y in dataloader:
            # sigmoid(logit) > 0.5 is the predicted class
            y_pred = (torch.sigmoid(model(x)) > 0.5).float()

            # a non-zero difference marks a wrong prediction
            total_incorrect += torch.count_nonzero(y - y_pred).item()

    percent_wrong = total_incorrect / num_points
    return 1 - percent_wrong

In [None]:
lossFun = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(cbow_model.parameters(), lr = 0.01)

num_epochs = 5

for epoch in tqdm(range(num_epochs)):
    print('Epoch: ', epoch)
    
    loss = one_pass(cbow_model, dl, optimizer, lossFun)
    print('Loss: ', loss)
    
    acc = one_pass_acc(cbow_model, dl, len(ds))
    print('Accuracy: ', acc)

## HuggingFace

In [None]:
# main HuggingFace package
import transformers as hf

Note that the tokenizer is often intimately linked to the model you are using.
- Below is the tokenizer for the basic BERT model (see [here](https://huggingface.co/bert-base-uncased))
- Note that lemmatization and stop word removal techniques are not used here
- You can also see some sub-word tokens

In [None]:
tokenizer = hf.AutoTokenizer.from_pretrained('bert-base-uncased')

sent = 'hello world, I am flying to Kashyyk!'

tokenizer.tokenize(sent)

In [None]:
# notice the output here
tokenizer(sent)

In [None]:
# what's going on here?
tokenizer('hello')

Special tokens for beginning and end of sequences.
- "CLS" for classification
- "SEP" for separating (sentences)

In [None]:
tokenizer.convert_ids_to_tokens([101, 102])

Recall that all input sequences have to be the same length, so we often must use padding.

In [None]:
# automatic padding
tokenizer(sent, padding="max_length")

In [None]:
# pad by longest sentence in list
tokenizer(['a sentence', 'a longer sentence'], padding="longest")

In [None]:
# fancy
tokenizer(['a sentence',
           'a longer sentence',
           'a way way way longer sentence'], truncation=True, max_length=6, padding='longest')

In [None]:
# return PyTorch tensors!
tokenizer(sent, return_tensors='pt')

In [None]:
# can handle more than one sequence
# notice the token type ids
two_sents = tokenizer('here is a sentence', 'this is the second sentence')
two_sents

In [None]:
tokenizer.convert_ids_to_tokens(two_sents['input_ids'])

In [None]:
model = hf.AutoModel.from_pretrained('bert-base-uncased')

In [None]:
model

In [None]:
# no specified task here
inputs = tokenizer(sent, padding='max_length', return_tensors="pt")
output = model(**inputs)

In [None]:
# two output vectors
len(output)

The first output is the BERT embedding for each token. The second output is a single vector per sequence, obtained by pooling the token embeddings together.

In [None]:
output[0].shape, output[1].shape

Now we're getting into the nitty-gritty of transformers. Let's take a step back in abstraction. What if I want to do something specific with a nice big, pretrained BERT model?

In [None]:
# classification of sentences
model = hf.AutoModelForSequenceClassification.from_pretrained('bert-base-uncased')

In [None]:
name = "distilbert-base-uncased-finetuned-sst-2-english"

tokenizer = hf.AutoTokenizer.from_pretrained(name)
model = hf.AutoModelForSequenceClassification.from_pretrained(name)

sentences = ["I am very angry", "I am very happy", "I am in the middle"]

tokens = tokenizer(sentences , padding=True, truncation=True, return_tensors="pt")

output = model(**tokens)
output

Let's abstract it even more!

In [None]:
classifier = hf.pipeline("sentiment-analysis", model=name)
classifier(sent)

In [None]:
classifier(sentences)

- As always, beware of the [bias](https://huggingface.co/course/chapter1/8?fw=pt) in this model!
- More [tasks](https://huggingface.co/course/chapter7/1?fw=pt)