# And now for something completely different...

## EDA on a Hugging Face dataset

In [None]:
# !pip uninstall -y umap

In [None]:
!pip install "transformers[torch]" datasets scikit-learn umap-learn pandas numpy matplotlib seaborn

In [None]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datasets

pd.set_option('max_colwidth', 200)

### Choose dataset

In [None]:
# datasets.list_datasets()

In [None]:
import pprint
pprint.pprint(datasets.get_dataset_config_info('yahoo_answers_topics'))

In [None]:
builder = datasets.load_dataset_builder('yahoo_answers_topics')
print(f"size of dataset: {round(builder.info.dataset_size/2**30, 2)} GB") # ~0.74 GB
print(f"size of download: {round(builder.info.download_size/2**30, 2)} GB") # ~0.3 GB

In [None]:
qa10topics = datasets.load_dataset('yahoo_answers_topics')

### Check out DatasetDict

In [None]:
qa10topics # huge: 1.4 million training examples

In [None]:
qa10topics['train'].column_names

In [None]:
qa10topics['train'].features

In [None]:
labels = qa10topics['train'].features['topic'].names
labels # this dataset is annotated more for classification than QA

In [None]:
qa10topics.set_format('pandas')
qa10topics['train'][0:3]

In [None]:
# Ditch the test split and the question_content column
qa10topics = None
qa10topics = datasets.load_dataset("yahoo_answers_topics", split="train")
qa10topics = qa10topics.remove_columns('question_content')

___

### Closer look at training split, as pandas dataframe

In [None]:
df = qa10topics.to_pandas() # could use batch size to avoid memory issues
df['topic_name'] = df['topic'].apply(lambda x: labels[x]) # readable labels
df.head(3)

In [None]:
df['topic'].value_counts() # balanced

In [None]:
df.info() # no nulls in any column (but later we will notice blanks)

In [None]:
df.id.nunique() # 'id' is indeed unique id

In [None]:
df.groupby(['topic_name'])['id'].describe() # id independent of topic -- topics scattered, not blocked

In [None]:
df["question_title"].apply(lambda x: len(x.split())).min() # shortest question titles?

In [None]:
df[df["question_title"].apply(lambda x: len(x.split())==1)] # One-word questions coincide with heavy repetition on '?'

In [None]:
df["best_answer"].apply(lambda x: len(x.split())).min() # Most concise answers?

In [None]:
df[df["best_answer"].apply(lambda x: len(x.split()))==0] # A lot of these answers appear blank

In [None]:
df.query("best_answer == ''") # 24,572 rows with blank answer

In [None]:
df.query("best_answer == ''").groupby(['topic_name'])['id'].count().plot(
    kind='bar', title='Unanswered questions, by topic')
plt.show()

### Filter out blank answers

In [None]:
# Matches strings that are empty or contain only whitespace.
# Raw string: '\s' inside a plain literal is an invalid escape sequence and triggers a warning.
pattern = re.compile(r'^\s*$') # blanks

# if using huggingface dataset, ...
# ds.filter(lambda x: len(pattern.findall(x["best_answer"])) == 0)

df = df[~df['best_answer'].str.match(pattern)] # drop blanks

Re-examine topic counts

In [None]:
df['topic_name'].value_counts()

Topics remain well balanced, and the dataset is still huge

In [None]:
df['topic_name'].value_counts().plot(kind='barh', title='Topic counts')
plt.show()

___

### Clean and split for word counts

In [None]:
def cleaner(text: str) -> list:
    """Normalize raw text and split it into lower-case ASCII word tokens.

    Processing steps, in order:
      1. Replace short HTML-like tags ('<', up to 10 characters, '>') with a space.
      2. Delete apostrophes, so contractions stay a single token.
      3. Replace every character that is not an ASCII letter or a space with a space.
      4. Lower-case and split on whitespace.

    Args:
        text: Raw string to clean.

    Returns:
        List of lower-case tokens; an empty list when no letters remain.
    """
    text = re.sub(r'<.{,10}>', ' ', text) # remove some html tags
    text = text.replace("'", '') # remove apostrophes
    text = re.sub(r'[^A-Za-z ]', ' ', text) # if punctuation matters, use re.sub(f'[^{string.printable}]', ' ', text)
    # split() with no argument already collapses runs of spaces and ignores
    # leading/trailing whitespace, so no separate squeeze/strip pass is needed.
    return text.lower().split()

# Raw string: a bare backslash before '.' is not a valid escape in a plain literal.
s = r" hear that the mojave road is amazing!<br />\.. "
cleaner(s)

In [None]:
# Overwrite each raw question title with the token list returned by cleaner().
# NOTE: not re-runnable as-is -- once this has run the column holds lists, and
# cleaner() passes its argument to re.sub, which expects a string.
df = df.assign(question_title = df["question_title"].apply(cleaner))

In [None]:
# Overwrite each raw answer with the token list returned by cleaner().
# NOTE: not re-runnable as-is -- once this has run the column holds lists, and
# cleaner() passes its argument to re.sub, which expects a string.
df = df.assign(best_answer = df["best_answer"].apply(cleaner))

In [None]:
df.head()

### How do questions begin?

In [None]:
# First token of every cleaned question title ('' when the title has no tokens)
df['q_start'] = df['question_title'].apply(
    lambda tokens: tokens[0] if len(tokens) else ''
)

# Frequency of each start word inside each topic
q_start_freq = df.groupby(['topic_name']).value_counts(subset=['q_start'])

# Show the two most frequent start words per topic
q_start_freq.groupby(['topic_name']).nlargest(2).droplevel(1)


- It seems that "who" is more common in Sports & Entertainment,
- while "why" is more common in Politics & Society.
- "how" dominates Computers & Internet.

In [None]:
# One column per topic, holding the first 10 start words listed for that topic in q_start_freq
q_df = pd.DataFrame(
    {topic: q_start_freq.loc[topic].index[:10] for topic in labels}
)
q_df # most topics have same q_start words

Let's see it with the q_start words aligned

In [None]:
# Union of the words in the 10x10 q_df table
top_question_starts = set(q_df.to_numpy().ravel())

# Keep only counts for those start words (author's note: 15 words capture all topics' top 10)
q_viz = q_start_freq.to_frame(name="count").query("q_start in @top_question_starts")

# Topics as rows, start words as columns, counts as cell values
topic_by_start = q_viz.reset_index().pivot(index='topic_name', columns='q_start', values='count')

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(topic_by_start, cmap='Blues', square=True, ax=ax)
ax.set_title('Most common question starts, by topic')
plt.show()

- Why do so many questions start with "I"? 
- especially in health, family, computers 
- Framing the question with challenge or desire? "I want to know...?"


In [None]:
df.query("q_start == 'i'")[:3] # yeah, framing the question

In [None]:
# Helper column is no longer needed. Reassign instead of mutating in place, and
# ignore a missing column so the cell can be re-run without a KeyError.
df = df.drop(columns=["q_start"], errors="ignore")

### Word counts

In [None]:
# Word counts: length of each question / answer entry
df['nwords_q'] = df["question_title"].map(len)
df['nwords_a'] = df["best_answer"].map(len)

In [None]:
df[['nwords_q', 'nwords_a']].describe().astype(int)


- We removed blanks earlier, but that was before cleaning and splitting

In [None]:
# Keep only rows whose question and answer both have at least one word
has_question = df['nwords_q'] > 0
has_answer = df['nwords_a'] > 0
df = df[has_question & has_answer]

In [None]:
df[['nwords_q', 'nwords_a']].describe().astype(int)

- Ok, no more blanks

In [None]:
df['nwords_q'].plot(kind='hist', bins=20, title='Most questions have 5-15 words', xlabel='Number of words in question')
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
plt.title('Many short answers, gamma distribution')
plt.xlabel('Number of words in answer')
plt.ylabel('Frequency')
# Word counts are integers, so use one unit-wide bin per count (150 bins over 0-150).
# 200 bins over the same range gave 0.75-wide bins, which leaves every fourth bin empty;
# rwidth=2 only hid those gaps by drawing overlapping bars.
plt.hist(df['nwords_a'], bins=150, range=(0, 150), histtype='bar')
plt.show()


### if time permits, topic modeling

start with stop-word removal 

In [None]:
from collections import Counter

# Tally every word across all question titles (each entry is an iterable of words)
cum_tallies = Counter(
    word
    for words in df['question_title'].values
    for word in words
)

In [None]:
cum_tallies.most_common(100) # top 100 all look generic

In [None]:
stops = {tup[0] for tup in cum_tallies.most_common(100)}


...nope, no topic modeling

if time permitted, I might list some common Spanish words, French words, etc., then remove examples with high counts of foreign stop words

___
___

## Restart kernel with GPU

### get encodings, embeddings, for umap visualization of small batches

In [None]:
import re
import numpy as np
import torch
import datasets
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader


Load a tokenizer and pre-trained (headless) model


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

chkpt = "deepset/roberta-base-squad2-distilled"
tokenizer = AutoTokenizer.from_pretrained(chkpt)
model = AutoModel.from_pretrained(chkpt).to(device)

tokenizer.vocab_size, tokenizer.model_max_length, tokenizer.model_input_names

In [None]:
device

In [None]:
# # if Dataset instead of IterableDataset 
# from torch.utils.data import DataLoader
# qa10topics.set_format(type="torch")
# training_dataloader = DataLoader(qa10topics['train'], batch_size=batch_size)
# batch = next(iter(training_dataloader))
# batch.update(tokenize(batch))
# batch.update(extract_hidden_states(batch))
# batch['feature_embeddings'].shape # (1, 768)
# X_train = np.array(batch["feature_embeddings"])
# y_train = np.array(batch["topic"])

Prepare an IterableDataset to take small batch

In [None]:
BUFFER_SIZE = 10000
BATCH_SIZE = 100

In [None]:
ds = datasets.load_dataset("yahoo_answers_topics", split="train", streaming=True)

In [None]:
ds = ds.remove_columns('question_content')

In [None]:
# avoid blanks: drop examples whose answer or question title is empty / whitespace-only
# Raw string: '\s' inside a plain literal is an invalid escape sequence and triggers a warning.
pattern = re.compile(r'^\s*$')
# The pattern is anchored at both ends, so a single match() attempt is enough.
ds = ds.filter(lambda x: pattern.match(x["best_answer"]) is None)
ds = ds.filter(lambda x: pattern.match(x["question_title"]) is None)

In [None]:
shuffled_ds = ds.shuffle(seed=8, buffer_size=BUFFER_SIZE)

NB: any function mapped to an IterableDataset must accept and return a dictionary

In [None]:
def tokenize(batch: dict)-> dict:
    """
    Tokenize each (question_title, best_answer) pair in the batch.

    The title is passed as the first sequence and the answer as the second.
    Output is padded to 'max_length', truncated, and returned as PyTorch tensors.

    Author's rationale: the dataset is not annotated for extractive QA and the
    distilled RoBERTa tokenizer does not require (Q, A) input, but pairs are
    tokenized anyway in case a QA pipeline (with metadata fields to limit the
    retriever) is tried later.
    """
    questions = batch["question_title"]
    answers = batch["best_answer"]
    encoded = tokenizer(
        questions,
        answers,
        padding='max_length',
        truncation=True,
        return_tensors="pt",
    )
    return encoded

In [None]:
def extract_hidden_states(batch: dict)-> dict:
    """
    Get feature embeddings from the headless model.

    Only the batch entries named in tokenizer.model_input_names are moved to
    `device` and passed to the model. The returned dict has a single key,
    "feature_embeddings": the last hidden state at sequence position 0 for
    every example, as a NumPy array on the CPU.
    """
    model_input_names = tokenizer.model_input_names
    inputs = {}
    for name, tensor in batch.items():
        if name in model_input_names:
            inputs[name] = tensor.to(device)

    # Inference only: no gradient bookkeeping needed
    with torch.no_grad():
        outputs = model(**inputs)

    first_position = outputs.last_hidden_state[:, 0]
    return {"feature_embeddings": first_position.cpu().numpy()}

In [None]:
dataloader = DataLoader(shuffled_ds, batch_size=BATCH_SIZE)

___

In [None]:
batch = next(iter(dataloader))

In [None]:
batch.update(tokenize(batch))

In [None]:
batch.update(extract_hidden_states(batch))

In [None]:
batch

In [None]:
batch['feature_embeddings'].shape

In [None]:
from sklearn.preprocessing import MinMaxScaler
from umap import UMAP
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
X_train = batch["feature_embeddings"]
y_train = batch["topic"]

In [None]:
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_train) # scale to [0,1], for umap dimension reduction algo

In [None]:
# UMAP is stochastic; a fixed random_state makes the 2-D projection reproducible across runs.
mapper = UMAP(n_components=2, metric="cosine", random_state=8).fit(X_scaled)

Get readable labels

In [None]:
builder = datasets.load_dataset_builder("yahoo_answers_topics")

In [None]:
labels = builder.info.features['topic'].names

Dataframe for plotting

In [None]:
df_embed = pd.DataFrame(mapper.embedding_, columns=["X","y"])
df_embed["label"] = y_train
df_embed.head()

From Hugging Face book:

In [None]:
# One hexbin panel per topic on a 2x5 grid, each with its own colormap
fig, axes = plt.subplots(2, 5, figsize=(12,5))
axes = axes.flatten()
cmaps = ["Greys", "Blues", "Oranges", "Reds", "Purples",
         "Greens", "PuRd", "YlOrBr", "YlGnBu", "RdPu"]

for topic_id, (ax, label, cmap) in enumerate(zip(axes, labels, cmaps)):
    # Points of this batch whose integer label equals the topic's position in `labels`
    topic_points = df_embed.query(f"label == {topic_id}")
    ax.hexbin(topic_points["X"], topic_points["y"], cmap=cmap,
              gridsize=20, linewidths=(0,))
    ax.set_title(label)
    ax.set_xticks([])
    ax.set_yticks([])

plt.tight_layout()
plt.show() # not perfect, but sufficient for a sanity-check: patterns are consistent within-topic batch after batch, but not identical across topics

___

note to self: to save intermediate dataset to cloud storage -- more generally, to use huggingface or tensorflow or keras methods that expect a filesystem (like datagenfromdirectory) -- use FUSE, or gcsfs:
<https://huggingface.co/docs/datasets/v1.11.0/filesystems.html>


In [None]:
# !pip install gcsfs
# import gcsfs
# gcs = gcsfs.GCSFileSystem(project="gcs_project_name")
# encoded_dataset.save_to_disk("gcs://bucket_name/enc_ds", fs=gcs)