# Inference with CLAVE

<a target="_blank" href="https://colab.research.google.com/github/davidaf3/CLAVE/blob/master/src/run_clave.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

This notebook shows how you can run inference on CLAVE and creates a Gradio UI that lets you experiment with the model.

## Setup

Install the necessary dependencies. This only install the packages that are not available in Colab. If you are not using Colab, you might need to install `torch`, `requests`, and `tqdm`.

In [20]:
%pip install rarfile gradio



Clone CLAVE's repo and move into it. If you are running this notebook locally and have already clone the repo, this step is not necessary.

In [None]:
!git clone https://github.com/davidaf3/CLAVE.git
%cd CLAVE/src

## Download the model weights
First, download the model weights and SentencePiece parameter from the provided URLs:

In [21]:
from tqdm import tqdm
import requests


res = requests.get(
    "https://www.reflection.uniovi.es/bigcode/download/2024/CLAVE/model.rar",
    stream=True,
)

with tqdm(
    total=int(res.headers.get("content-length", 0)), unit="B", unit_scale=True
) as progress_bar:
    with open("model.rar", "wb") as f:
        for data in res.iter_content(1024):
            progress_bar.update(len(data))
            f.write(data)

res = requests.get(
    "https://www.reflection.uniovi.es/bigcode/download/2024/CLAVE/tokenizer_data.zip",
    stream=True,
)

with tqdm(
    total=int(res.headers.get("content-length", 0)), unit="B", unit_scale=True
) as progress_bar:
    with open("tokenizer_data.zip", "wb") as f:
        for data in res.iter_content(1024):
            progress_bar.update(len(data))
            f.write(data)

100%|██████████| 277M/277M [00:24<00:00, 11.4MB/s]
100%|██████████| 1.03M/1.03M [00:00<00:00, 1.18MB/s]


Extract the downloaded `model.rar` and `tokenizer_data.zip` files:

In [22]:
import rarfile
import zipfile


with rarfile.RarFile("model.rar") as f:
    f.extractall(path=".")

with zipfile.ZipFile("tokenizer_data.zip") as f:
    f.extractall(path=".")

## Load the weights
Create a new model (`FineTunedModel` class) and load the weights from the extracted file (`CLAVE.pt`):

In [23]:
import torch
from model import FineTunedModel
from tokenizer import SpTokenizer


device = "cuda" if torch.cuda.is_available() else "cpu"

model = FineTunedModel(
    SpTokenizer.get_vocab_size(), 512, 512, 8, 2048, 6, use_layer_norm=True
).to(device)
model_checkpoint = torch.load("CLAVE.pt", map_location=device)
weights = {
    k[10:] if k.startswith("_orig_mod") else k: v
    for k, v in model_checkpoint["model_state_dict"].items()
}
model.load_state_dict(weights)
model.eval()

FineTunedModel(
  (encoder): Encoder(
    (transformer_encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-5): 6 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
          )
          (linear1): Linear(in_features=512, out_features=2048, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=2048, out_features=512, bias=True)
          (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (embedding): Embedding(16000, 512)
    (pos_embedding): Embedding(2048, 512)
    (embedding_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (embedding_dropout): Dropout(p=0.1, inplace=

## Start the UI
Start the Gradio UI configured to run the `verify_authorship` function. This function tokenizes the inputs, processes the tokens with CLAVE to obtain an embedding for each input, and computes the distance between the embeddings.

In [None]:
import gradio as gr
import torch.nn.functional as F
from utils import pad_and_split_tokens


tokenizer = SpTokenizer()
threshold = 0.1050


def verify_authorship(source_code_1, source_code_2):
    with torch.inference_mode():
        tokens_1 = pad_and_split_tokens(tokenizer.tokenizes(source_code_1))[0]
        tokens_2 = pad_and_split_tokens(tokenizer.tokenizes(source_code_2))[0]
        embedding_1 = model(torch.tensor([tokens_1], device=device))
        embedding_2 = model(torch.tensor([tokens_2], device=device))
        distance = (1 - F.cosine_similarity(embedding_1, embedding_2)).item()
        return [
            distance,
            "Yes" if distance <= threshold else "No",
        ]


ui = gr.Interface(
    fn=verify_authorship,
    inputs=[
        gr.Code(language="python", label="Source code 1"),
        gr.Code(language="python", label="Source code 2"),
    ],
    outputs=[gr.Number(label="Distance"), gr.Text(label="Same author?")],
    allow_flagging="never",
)
ui.launch()

In [27]:
import gradio as gr
import torch
import torch.nn.functional as F
import shap
import sentencepiece as spm
from utils import pad_and_split_tokens

# Load SentencePiece tokenizer
tokenizer_path = "/content/CLAVE/src/tokenizer_data/tokenizer.model"
sp = spm.SentencePieceProcessor(model_file=tokenizer_path)

# Custom wrapper for SHAP
class CustomTokenizerWrapper:
    def __init__(self, sp):
        self.sp = sp

    def encode(self, text):
        tokens = self.sp.encode(text, out_type=str)
        return {"input_ids": tokens}

    def decode(self, tokens):
        return self.sp.decode(tokens)

wrapped_tokenizer = CustomTokenizerWrapper(sp)
masker = shap.maskers.Text(wrapped_tokenizer.encode)

# Load trained model
device = "cuda" if torch.cuda.is_available() else "cpu"
model = torch.load("CLAVE.pt", map_location=device)
model.eval()

# SHAP explainer
explainer = shap.Explainer(model, masker)

# Function to highlight code
def highlight_code(code, shap_values):
    tokens = sp.encode(code, out_type=str)
    highlighted_code = ""

    for i, token in enumerate(tokens):
        color = "red" if shap_values[i] > 0 else "green"  # Red = Similarity, Green = Difference
        highlighted_code += f'<span style="color: {color};">{token}</span> '

    return f'<pre>{highlighted_code}</pre>'

# Function for authorship verification
def verify_authorship(source_code_1, source_code_2):
    with torch.inference_mode():
        tokens_1 = pad_and_split_tokens(sp.encode(source_code_1, out_type=int))[0]
        tokens_2 = pad_and_split_tokens(sp.encode(source_code_2, out_type=int))[0]

        embedding_1 = model(torch.tensor([tokens_1], device=device))
        embedding_2 = model(torch.tensor([tokens_2], device=device))

        distance = (1 - F.cosine_similarity(embedding_1, embedding_2)).item()

        # SHAP explanation
        shap_values_1 = explainer([tokens_1])
        shap_values_2 = explainer([tokens_2])

        highlighted_code_1 = highlight_code(source_code_1, shap_values_1.values[0])
        highlighted_code_2 = highlight_code(source_code_2, shap_values_2.values[0])

        return distance, "Yes" if distance <= 0.1050 else "No", highlighted_code_1, highlighted_code_2

# Gradio UI
ui = gr.Interface(
    fn=verify_authorship,
    inputs=[
        gr.Code(language="python", label="Source code 1"),
        gr.Code(language="python", label="Source code 2"),
    ],
    outputs=[
        gr.Number(label="Distance"),
        gr.Text(label="Same author?"),
        gr.HTML(label="Highlighted Code 1"),
        gr.HTML(label="Highlighted Code 2"),
    ],
    allow_flagging="never",
)
ui.launch()

AttributeError: 'dict' object has no attribute 'eval'