# Exploring ATA6 embeddings using descriptions from the ATA Master List


## Todo: 
1. Lowercase everything, because RoBERTa is case-sensitive. 
1. Use a domain-adapted RoBERTa model. 
1. Combine Component and Location label_values whenever possible (lower-priority)
1. Use the updated ATA master list 
1. Visualize clusters at 4-digit level 

In [1]:
!pip install faiss-gpu

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 85.5/85.5 MB 12.9 MB/s eta 0:00:00
Installing collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2

In [2]:
import re 
from pathlib import Path

import numpy as np
import pandas as pd 
from transformers import RobertaModel, RobertaTokenizer
from datasets import Dataset
import matplotlib.pyplot as plt 
from sklearn.preprocessing import MinMaxScaler
import torch 
from umap import UMAP
import faiss

In [None]:
# Directory and file name of the ATA master-list CSV.
DATA_PATH = Path(r'../input/737-ata-4plus')
data_file = '2023-03-31_737_ata-master-list-4plus.csv'

# Name of the column whose text is tokenized and embedded below.
col_text = 'Title'

# Load the master list into a DataFrame.
df = pd.read_csv(DATA_PATH / data_file)

In [None]:
# Wrap the DataFrame in a `datasets.Dataset` so it can be processed with `.map` below.
data = Dataset.from_pandas(df)

In [None]:
# Run on the GPU when torch can see one, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Tokenizer and encoder are loaded from the same checkpoint so their vocabularies match.
model_ckpt = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(model_ckpt)
model = RobertaModel.from_pretrained(model_ckpt)
model = model.to(device)

In [None]:
def tokenize_batch(batch):
    """Tokenize the ``col_text`` entries of ``batch`` with padding and truncation enabled."""
    texts = batch[col_text]
    return tokenizer(texts, padding=True, truncation=True)

In [None]:
# Example with first two texts: sanity-check the tokenizer output on a small slice.
tokenize_batch(data[:2])

In [None]:
# Encoding the full dataset.
# batch_size=None hands the whole dataset to tokenize_batch as one batch, so
# padding=True pads every text to a single common length.
data_encoded = data.map(tokenize_batch, batched=True, batch_size=None)

In [None]:
# Echo the tokenized dataset to inspect it.
data_encoded

In [None]:
def extract_hidden_states(batch):
    """Run the model on ``batch`` and return each row's first-token hidden state.

    Only the entries of ``batch`` that the tokenizer lists as model inputs are
    forwarded; they are moved to ``device`` first. The result is a dict with a
    single key, ``'cls_hidden_state'``, holding a NumPy array on the CPU.
    """
    wanted = tokenizer.model_input_names
    inputs = {}
    for name, tensor in batch.items():
        if name in wanted:
            inputs[name] = tensor.to(device)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        outputs = model(**inputs)

    # Position 0 of every sequence is the first (start-of-sequence) token.
    first_token_state = outputs.last_hidden_state[:, 0, :]
    return {'cls_hidden_state': first_token_state.cpu().numpy()}

In [None]:
# Expose input_ids and attention_mask as torch tensors; extract_hidden_states calls `.to(device)` on them.
data_encoded.set_format('torch', columns=['input_ids', 'attention_mask'])

In [None]:
# Run the model over the dataset in batches; each row gains a 'cls_hidden_state' column.
data_hidden_state = data_encoded.map(extract_hidden_states, batched=True)

In [None]:
# Echo the dataset to confirm the new 'cls_hidden_state' column is present.
data_hidden_state

In [None]:
# Inspect the shape of the stored first-token hidden states.
data_hidden_state['cls_hidden_state'].shape

In [None]:
# Feature matrix: the stored first-token hidden states as a NumPy array.
cls_vectors = data_hidden_state['cls_hidden_state']
X = np.array(cls_vectors)
X.shape

In [None]:
# View the embedding matrix as a table (one row per description, one column per dimension).
pd.DataFrame(X)

In [None]:
# Labels: the first two characters of every ATA code.
ata_codes = data_hidden_state['ATA']
y = np.array([code[:2] for code in ata_codes])

## Visualization with UMAP

In [None]:
# Rescale every embedding dimension to [0, 1] before projecting.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# 2-D UMAP projection using cosine distance; the seed is fixed for reproducibility.
reducer = UMAP(n_components=2, metric='cosine', random_state=2023)
mapper = reducer.fit(X_scaled)

mapper

In [None]:
# 2-D UMAP coordinates alongside the 2-character ATA label of each row.
df_umap_2d = pd.DataFrame(mapper.embedding_, columns=['X', 'Y']).assign(label=y)

In [None]:
# Echo the projected coordinates and labels.
df_umap_2d

In [None]:
# Density of all embedded descriptions in the 2-D UMAP projection.
txt = 'UMAP visualization of 768D embeddings of ATA6 descriptions from 737 ATA Master File'
fig, ax = plt.subplots()
ax.hexbin(
    df_umap_2d['X'],
    df_umap_2d['Y'],
    cmap="Greys",
    gridsize=20,
)
ax.set_title(txt)
# plt.show() instead of fig.show(): it matches the per-label plot below and
# avoids the "non-GUI backend" warning Figure.show() can emit.
plt.show()

In [None]:
# One hexbin density panel per 2-character ATA label, on shared axes.
txt = 'UMAP visualization of 768D embeddings of ATA6 descriptions from 737 ATA Master File'
labels = df_umap_2d['label'].unique()
cmap = "Greys"

# Size the grid from the number of labels instead of hard-coding 2x2, so more
# than four labels no longer run past the end of `axes`. The grid never
# shrinks below 2x2, which keeps the layout unchanged for four or fewer labels.
n_cols = 2
n_rows = max(2, (len(labels) + n_cols - 1) // n_cols)
fig, axes = plt.subplots(
    n_rows, n_cols, figsize=(9, 4.5 * n_rows), sharex=True, sharey=True
)
axes = axes.flatten()

for ax, label in zip(axes, labels):
    df_umap_sub = df_umap_2d[df_umap_2d['label'] == label]
    ax.hexbin(
        df_umap_sub['X'],
        df_umap_sub['Y'],
        cmap=cmap,
        gridsize=20,
    )
    ax.set_title(label)
plt.suptitle(txt)
plt.tight_layout()
plt.show()
    

## Nearest-neighbour lookups 

In [None]:
# Build a FAISS index over the 'cls_hidden_state' column so rows can be retrieved by embedding similarity.
data_hidden_state.add_faiss_index(column='cls_hidden_state')

In [None]:
# These inputs are NER labels. A few are repeated in lower case to compare
# how casing affects the retrieved neighbours.
query_list = [
    'GPS', 'EXIT DOOR', 'SLAT CBS', 'FUEL SPAR VALVE', 'GLOBAL POSITIONING SYSTEM', 'GPS SENSOR UNIT', 
    'NDB', 'NAVIGATION DATABASE', 'GLOBAL POSITIONING SYSTEM (GPS)', 'ECS ACCESS DOORS', 'AIRSTAIR', 
    'IFF ANTENNA', 'RADAR SYS', 'radar sys', 'FSEU', 'DOOR PRESSURE SEAL', 'ACCESS DOOR', 'AMBER LIGHT', 
    'FUEL', 'TCAS', 'PITOT COVERS', 'STANDBY AIRSPEED INDICATOR', 'STNDBY AIRSPEED IND',
    '201504R10SPEEDBRAKE AUTOSTOW ACTUATOR AUTOMATIC', 'AIRSTAIRS', 'airstairs', 'FUEL SCAVENGE SYSTEMS'
]

# Number of neighbours retrieved (and printed) for every query.
n_neighbours = 5

for idx, query in enumerate(query_list):
    print(f'\nInput number: {idx} ------------')
    print(f'Input text: {query}')

    query_encoding = tokenizer(query, return_tensors='pt', truncation=True, padding=True)
    inputs = {k: v.to(device) for k, v in query_encoding.items()}

    # Inference only: skip autograd bookkeeping, as extract_hidden_states does.
    with torch.no_grad():
        query_cls_embedding = model(**inputs).last_hidden_state[:, 0, :].cpu().numpy()

    _, retrieved_examples = data_hidden_state.get_nearest_examples(
        'cls_hidden_state', query_cls_embedding, k=n_neighbours
    )

    # Iterate over what was actually retrieved rather than a hard-coded count,
    # so every hit is printed and a short result list cannot raise IndexError.
    for input_ids in retrieved_examples['input_ids']:
        tokens = tokenizer.convert_ids_to_tokens(input_ids)
        # Strip the padding markers added when the dataset was tokenized.
        print(tokenizer.convert_tokens_to_string(tokens).replace('<pad>', ''))