# Tag Cluster Analysis

## Summary

This code summarizes the essential steps found in the full analysis.

***

#### 1. Read data

```
colnames = pd.read_csv("tags.txt")
col_parts = [split_words(cn) for cn in colnames]
col_sentences = [' '.join(part) for part in col_parts]
```

#### 2. Train and save the model

```
ByteLevelBPETokenizer
RobertaTokenizer.from_pretrained
torch.device('cuda')
```

#### 3. K-Means cluster

```
embeddings = {sent: get_embeddings(sent) for sent in col_sentences}
n_clusters = 100
```

#### 3.A. BERT K-Means cluster

```
bert_embeddings = np.array([val for key,val in embeddings.items()])
```

#### 3.B. TFIDF K-Means cluster

```
trimmed = [key.replace(' ', '_') for key,val in embeddings.items()]
vectorizer_ntf = TfidfVectorizer(analyzer='char',ngram_range=(1,6))
ngram_embeddings = vectorizer_ntf.fit_transform(trimmed).toarray()
```

#### 3.C. Basic K-Means cluster

```
def get_tokenized(sentence):
    input_ids = tokenizer.encode(sentence, add_special_tokens=True, padding='max_length')
    return input_ids
tok_embeddings = {key: get_tokenized(key) for key,val in embeddings.items()}
tok_embeddings = np.array([val for key,val in tok_embeddings.items()], dtype=np.float32)
```

#### 4. DBSCAN cluster on combined clusters

```
similarity = np.matrix(np.mean([tok_matrix, ngram_matrix, bert_matrix], axis=0))
ens_assigned = DBSCAN().fit(similarity).labels_
```

#### 5. Visualize

```
px.treemap
```
***

## 0. Set up

In [1]:
!pip install transformers
!pip install plotly

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [2]:
import re
import os

import numpy as np
import pandas as pd

import torch
from tqdm import tqdm

from transformers import RobertaTokenizer, RobertaConfig, RobertaForMaskedLM, AdamW
from tokenizers import ByteLevelBPETokenizer

from cuml import KMeans, DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer

import plotly.express as px
import plotly.io as pio

## 1. Read data

In [3]:
# Read a single data row; only the column names are used below.
an7_df = pd.read_csv("an7.csv", nrows=1)

# Discard the 'rec_epoch' column and any empty column name, then skip the
# first two of the names that remain.
_excluded_names = ('rec_epoch', '')
colnames = [name for name in an7_df.columns if name not in _excluded_names][2:]

def split_words(sentence):
    """Split a tag name into lower-cased word and number pieces.

    At each position the pattern tries, in order: an optional capital followed
    by lowercase letters, a run of digits, or a run of capitals. Characters
    that match none of these (separators, punctuation) are dropped.

    Returns a list of lower-cased strings, in order of appearance.
    """
    pattern = '([A-Z]{0,1}[a-z]+|[0-9]+|[A-Z]+)'
    return [piece.lower() for piece in re.findall(pattern, sentence)]

# Break every column name into its lower-cased pieces.
col_parts = list(map(split_words, colnames))

# Sequence-length budget: three times the piece count of the longest name.
_longest_name = max(len(parts) for parts in col_parts)
max_len = 3 * _longest_name

# Number of distinct pieces across all names.
vocab_size = len({piece for parts in col_parts for piece in parts})

# One space-separated "sentence" per column name.
col_sentences = [' '.join(parts) for parts in col_parts]
col_sentences

['30 fic 1036 pv',
 '30 fic 675 6 pv',
 '30 fic 9 2 pv',
 '30 pdic 693 pv',
 '30 pi 943 pv',
 '31 fic 9 1 pv',
 '31 h 4 trip 2 cr pv',
 '31 h 4 trip 37 cr pv',
 '31 h 4 trip 3 cr pv',
 '31 h 4 trip 49 cr pv',
 '31 h 4 trip 4 cr pv',
 '31 h 4 trip 4 tr pv',
 '31 h 4 trip 5 cr pv',
 '31 h 4 trip 9 cr pv',
 '31 h 4 trip 9 tr pv',
 '31 h 4 tripdin 1 16 pv',
 '31 h 4 tripdlogstat pv',
 '31 h 4 tripfo 5 pv',
 '31 h 4 triphitsd pv',
 '31 ti 10 111 pv',
 '320 fc 2054 pv',
 '320 fc 2063 pv',
 '320 fc 2063 sp',
 '320 fc 2106 pv',
 '320 fc 2122 pv',
 '320 lah 36 pv',
 '320 lah 42 pv',
 '320 lahh 161 6 pv',
 '320 lahh 2038 pv',
 '320 lal 2068 pv',
 '320 lal 2087 b pv',
 '320 lc 2050 pv',
 '320 li 2050 1 pv',
 '320 li 216 1 pv',
 '320 li 305 1 pv',
 '320 li 311 2 pv',
 '320 li 314 pv',
 '320 li 401 2 pv',
 '320 p 301 2 pv',
 '320 p 301 3 sp',
 '320 p 301 6 pv',
 '320 p 302 pv',
 '320 p 305 4 pv',
 '320 p 305 6 sp',
 '320 p 307 1 pv',
 '320 p 307 5 sp',
 '320 p 307 6 sp',
 '320 p 309 pv',
 '320 p 30

In [4]:
# Write one tag sentence per line to tag_names.txt.
# The context manager closes the file even if a write fails part-way.
with open("tag_names.txt", "w") as f:
    for col in col_sentences:
        f.write(col + "\n")

## 2. Train the model

Byte-level BPE tokenizer for tags

In [5]:
# Train a byte-level BPE tokenizer on the sentences in tag_names.txt.
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(files=['tag_names.txt'],
                vocab_size=vocab_size,
                min_frequency=1,
                special_tokens=['<s>', '<pad>', '</s>', '<unk>', '<mask>'])

# Create the output directory if it is missing (no error when it already
# exists) and save the tokenizer files into that same directory.
path = './tags'
os.makedirs(path, exist_ok=True)
tokenizer.save_model(path)






['tagbert/vocab.json', 'tagbert/merges.txt']

Prepare text function

In [6]:
def prepare_text(original_text, tokenizer, mask_per=0):
    """Tokenize text and optionally replace a random share of tokens by the mask token.

    Parameters
    ----------
    original_text
        Text handed unchanged to ``tokenizer``.
    tokenizer
        Callable returning a mapping with ``'input_ids'`` and
        ``'attention_mask'``; must expose ``mask_token_id``.
    mask_per : float, default 0
        Probability with which each eligible token is masked. Masking is
        skipped entirely when this is not greater than 0.

    Returns
    -------
    tuple of np.ndarray
        ``(input_ids, labels, attn_mask)``: ``input_ids`` carries the masking,
        ``labels`` holds the unmasked token ids for every position, and
        ``attn_mask`` is the tokenizer's attention mask.

    Notes
    -----
    Reads the notebook-level ``max_len``; sequences are padded or truncated to
    ``max_len + 2`` tokens.
    """
    tokenized = tokenizer(original_text, max_length=max_len+2, padding='max_length', truncation=True)
    input_ids = np.array(tokenized['input_ids'])
    # Independent copy, so masking input_ids below leaves the labels intact.
    labels = input_ids.copy()
    attn_mask = np.array(tokenized['attention_mask'])

    if mask_per > 0:
        # One random draw per token. Using the full shape (rather than len())
        # keeps this correct for 2-D batched input as well as a single 1-D
        # sequence; for 1-D input the draws are identical to before.
        rand_mask = np.random.rand(*input_ids.shape)
        # Only ids greater than the mask token id are eligible.
        # NOTE(review): presumably this excludes the special tokens, which
        # would need ids at or below the mask token id - confirm against the vocab.
        mask_bool = (rand_mask < mask_per) & (input_ids > tokenizer.mask_token_id)

        input_ids[mask_bool] = tokenizer.mask_token_id

    return input_ids, labels, attn_mask

RoBERTa tokenizer and PyTorch inputs, labels, and attention masks

In [7]:
tokenizer = RobertaTokenizer.from_pretrained('tags', max_len=max_len)

input_ids = []
labels = []
masks = []

# Tokenize every sentence, masking eligible tokens with probability 0.15.
for sentence in col_sentences:
    inp, lab, msk = prepare_text(sentence, tokenizer, 0.15)

    input_ids.append(inp)
    labels.append(lab)
    masks.append(msk)

# Stack each list of per-sentence arrays into one array before building the
# tensor: torch.tensor() on a list of ndarrays takes a very slow path and
# emits a UserWarning.
input_ids = torch.tensor(np.array(input_ids))
labels = torch.tensor(np.array(labels))
masks = torch.tensor(np.array(masks))

  input_ids = torch.tensor(input_ids)


Torch data loader

In [8]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over a dict of tensors indexed along their first dimension."""

    def __init__(self, encodings):
        # Keep a reference to the given dict; nothing is copied.
        self.encodings = encodings

    def __len__(self):
        # The sample count is the leading dimension of 'input_ids'.
        ids_shape = self.encodings['input_ids'].shape
        return ids_shape[0]

    def __getitem__(self, i):
        # Pick entry i from every stored tensor, keeping the same keys.
        sample = {}
        for name, values in self.encodings.items():
            sample[name] = values[i]
        return sample

# Bundle the three tensors under the keyword names used when calling the model.
encodings = dict(input_ids=input_ids, attention_mask=masks, labels=labels)
dataset = Dataset(encodings)
# Shuffled mini-batches of 16 samples.
loader = torch.utils.data.DataLoader(
    dataset,
    shuffle=True,
    batch_size=16,
)

Model setup

In [9]:
# RoBERTa offsets position ids past the padding index, so the position table
# needs at least (sequence length + 2) rows. Training sequences are padded to
# max_len + 2 tokens, hence max_len + 4. The former fixed size of 100 is kept
# as a floor, so nothing changes unless the tags get long enough to overflow it.
config = RobertaConfig(
    vocab_size=vocab_size,  # we align this to the tokenizer vocab_size
    max_position_embeddings=max(100, max_len + 4),
    hidden_size=768,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1
)

# Freshly initialised (untrained) masked-language model built from the config.
model = RobertaForMaskedLM(config)

Train the model (PyTorch training loop)

In [10]:
# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# activate training mode
model.train()
# initialize optimizer
# NOTE(review): this AdamW is the one imported from transformers, which newer
# transformers releases deprecate in favour of torch.optim.AdamW - confirm
# against the installed version.
optim = AdamW(model.parameters(), lr=1e-4)

epochs = 2

for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    loop = tqdm(loader, leave=True)
    for batch in loop:
        # reset gradients left over from the previous step
        optim.zero_grad()
        # Move the batch to the device. Batch-local names are used so the
        # notebook-level `input_ids` and `labels` tensors (the full data set)
        # are not overwritten by the last batch.
        batch_input_ids = batch['input_ids'].to(device)
        batch_attention_mask = batch['attention_mask'].to(device)
        batch_labels = batch['labels'].to(device)
        # forward pass; passing labels makes the model return a loss
        outputs = model(batch_input_ids, attention_mask=batch_attention_mask,
                        labels=batch_labels)
        # extract loss
        loss = outputs.loss
        # backpropagate: compute gradients for every parameter that needs them
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

# Save the trained weights and config into ./tags.
model.save_pretrained('./tags')

Epoch 0: 100%|█████████████████████████████████████████████████████████████| 185/185 [00:06<00:00, 28.22it/s, loss=0.18]
Epoch 1: 100%|████████████████████████████████████████████████████████████| 185/185 [00:06<00:00, 30.02it/s, loss=0.129]


***
## 3. K-means clusters

Get embeddings

In [11]:
# Load the tokenizer and the masked-LM weights from the 'tags' directory.
# output_hidden_states=True is requested so the hidden states can be read
# from the model output.
tokenizer = RobertaTokenizer.from_pretrained(
    'tags',
    max_len=max_len,
)
model = RobertaForMaskedLM.from_pretrained(
    'tags',
    output_hidden_states=True,
)

def get_embeddings(sentence):
    """Embed ``sentence`` as the token-wise mean of the model's last hidden layer.

    Uses the notebook-level ``tokenizer`` and ``model``. Returns the pooled
    vector converted with ``tolist()``.
    """
    token_ids = tokenizer.encode(sentence, add_special_tokens=True)
    batch = torch.tensor(token_ids).unsqueeze(0)  # add a leading batch dimension
    with torch.no_grad():
        output = model(batch)

    last_hidden = output.hidden_states[-1]
    # Average over dim 1, then drop the size-1 dimensions.
    pooled = torch.mean(last_hidden, dim=1).squeeze()
    return pooled.tolist()

# Map every sentence to its embedding; a repeated sentence keeps a single entry.
embeddings = {}
for sent in col_sentences:
    embeddings[sent] = get_embeddings(sent)

Set the number of clusters

In [12]:
# Cluster count passed as n_clusters to each K-Means run in section 3.
n_clusters = 100

### 3.A. BERT (mean-pooled hidden states)

In [13]:
# Stack the per-sentence embedding vectors, in dict order, into one array.
bert_embeddings = np.array(list(embeddings.values()))
bert_kmeans = KMeans(n_clusters=n_clusters)
bert_assigned = bert_kmeans.fit(bert_embeddings).labels_

# Show the embeddings, their shape, and the assigned cluster labels.
for shown in (bert_embeddings, bert_embeddings.shape, bert_assigned):
    print(shown)

[[ 0.70432413  0.46304297  0.62492901 ...  0.18706192  0.21794353
   0.39417842]
 [ 0.06814011  0.82242113  0.83895427 ...  0.45808119  0.72384846
  -0.25988039]
 [ 0.52438879 -0.00370699  0.23375988 ...  0.24840973  0.27771083
  -0.12161738]
 ...
 [ 0.13490051  0.09283341  0.7884838  ... -0.12167238  0.5347687
  -0.67930782]
 [ 0.37457299  0.41564214  0.49819303 ...  0.00121878 -0.20404068
   0.18723816]
 [ 0.00856628  0.34612221  0.69429493 ... -0.29135454  0.41447243
  -0.53319287]]
(2949, 768)
[42 42 42 ...  4 65 82]


### 3.B. TF-IDF (character n-grams)

In [14]:
# Re-join each sentence with underscores so a tag reads as one string.
trimmed = [sentence.replace(' ', '_') for sentence in embeddings]
# Character n-grams of length 1 to 6, weighted by TF-IDF.
vectorizer_ntf = TfidfVectorizer(ngram_range=(1,6), analyzer='char')

# Fit on the tags and densify the resulting matrix.
ngram_sparse = vectorizer_ntf.fit_transform(trimmed)
ngram_embeddings = ngram_sparse.toarray()
ngram_kmeans = KMeans(n_clusters=n_clusters)
ngram_assigned = ngram_kmeans.fit(ngram_embeddings).labels_

# Show the features, their shape, and the assigned cluster labels.
for shown in (ngram_embeddings, ngram_embeddings.shape, ngram_assigned):
    print(shown)

[[0.05050783 0.         0.         ... 0.         0.         0.        ]
 [0.0228953  0.         0.         ... 0.         0.         0.        ]
 [0.02878984 0.         0.         ... 0.         0.         0.        ]
 ...
 [0.02071107 0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.02023242 0.         0.         ... 0.         0.         0.        ]]
(2949, 20407)
[33 76 33 ... 38 33 38]


### 3.C. Basic (raw token ids)

In [15]:
def get_tokenized(sentence):
    """Encode ``sentence`` to token ids with the notebook-level ``tokenizer``.

    Special tokens are added and the 'max_length' padding strategy is applied.
    """
    return tokenizer.encode(sentence, padding='max_length', add_special_tokens=True)

# One row of token ids per sentence (the keys of `embeddings`), stored as float32.
tok_embeddings = np.array(
    [get_tokenized(sentence) for sentence in embeddings],
    dtype=np.float32,
)
tok_kmeans = KMeans(n_clusters=n_clusters)
tok_assigned = tok_kmeans.fit(tok_embeddings).labels_

# Show the token-id rows, their shape, and the assigned cluster labels.
for shown in (tok_embeddings, tok_embeddings.shape, tok_assigned):
    print(shown)

[[  0. 364. 408. ...   1.   1.   1.]
 [  0. 364. 408. ...   1.   1.   1.]
 [  0. 364. 408. ...   1.   1.   1.]
 ...
 [  0. 474. 270. ...   1.   1.   1.]
 [  0. 730. 264. ...   1.   1.   1.]
 [  0. 569. 265. ...   1.   1.   1.]]
(2949, 24)
[96 55 96 ...  0 86  0]


***
## 4. DBSCAN cluster on combined

Combine cluster methods

In [16]:
def _coassociation(assigned):
    """Return an (n, n) integer matrix with 1 where items i and j share a label, else 0."""
    assigned = np.array(assigned)
    return (assigned[None, :] == assigned[:, None]).astype(int)


bert_matrix = _coassociation(bert_assigned)
ngram_matrix = _coassociation(ngram_assigned)
tok_matrix = _coassociation(tok_assigned)

# Element-wise mean of the three matrices: for each pair of tags, the share of
# clustering methods that put them in the same cluster. Kept as a plain
# ndarray; np.matrix is no longer recommended by NumPy.
similarity = np.mean([tok_matrix, ngram_matrix, bert_matrix], axis=0)

DBSCAN cluster

In [17]:
# Fit DBSCAN with its default parameters on the rows of the similarity matrix.
ens_dbscan = DBSCAN()
ens_dbscan.fit(similarity)
ens_assigned = ens_dbscan.labels_
ens_assigned

array([  0,   1,   0, ...,  -1,  -1, 105], dtype=int32)

## 5. Visualize combined cluster assignments

Cluster labels

In [18]:
# One row per tag with its ensemble cluster label.
ens_df = pd.DataFrame({'tag': trimmed, 'cluster': ens_assigned})

# Per cluster: how many tags it holds and a comma-separated list of them.
# Rows whose cluster label is not greater than -1 are dropped.
agg_ens_df = (
    ens_df
    .groupby('cluster')
    .agg(
        count=('tag', 'count'),
        label=('tag', lambda tags: ', '.join(tags)),
    )
    .reset_index()
    .loc[lambda frame: frame['cluster'] > -1]
)
agg_ens_df


Unnamed: 0,cluster,count,label
1,0,6,"30_fic_1036_pv, 30_fic_9_2_pv, 31_fic_9_1_pv, ..."
2,1,6,"30_fic_675_6_pv, 30_fic_675_3_pv, 31_fic_675_1..."
3,2,10,"31_h_4_trip_2_cr_pv, 31_h_4_trip_3_cr_pv, 31_h..."
4,3,9,"320_p_305_4_pv, 320_p_305_6_sp, 320_p_301_4_sp..."
5,4,7,"320_p_307_1_pv, 320_p_307_5_sp, 320_p_307_6_sp..."
...,...,...,...
111,110,5,"70_lahh_220_pv, 70_lahh_317_pv, 70_lahh_507_pv..."
112,111,5,"70_p_206_b_pv, 70_p_208_b_pv, 70_p_209_b_pv, 7..."
113,112,6,"70_pdi_217_f_pv, 70_pdi_217_h_pv, 70_pdi_217_i..."
114,113,5,"70_xv_118_1_b_sp, 70_xv_139_1_a_sp, 70_xv_117_..."


In [19]:
# Render figures with the 'iframe' renderer (the author also noted
# 'notebook' and 'jupyterlab' as options).
pio.renderers.default = 'iframe'

# Treemap nested by cluster, then label; tile area follows the tag count.
fig = px.treemap(
    agg_ens_df,
    values='count',
    path=['cluster', 'label'],
)
fig.show()