In [1]:
from PIL import Image
import requests
import torch
from transformers import (BitsAndBytesConfig, AutoProcessor,
                          LlavaForConditionalGeneration)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Checkpoint identifier handed to both from_pretrained calls below.
model_name = 'llava-hf/llava-1.5-7b-hf'

# Quantisation settings: 4-bit loading with bfloat16 as the compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# `model` and `processor` are notebook-level names used by later cells.
model = LlavaForConditionalGeneration.from_pretrained(
    model_name,
    quantization_config=quantization_config,
)
processor = AutoProcessor.from_pretrained(model_name)



Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.32s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
def extract_logits(processor, image, generated_description, min_prob=1e-3, model=None):
    """Score an existing description token by token with one forward pass.

    The fixed instruction prompt followed by ``generated_description`` is run
    through the model together with ``image``. For every description token the
    softmax distribution produced at the preceding position is collected and
    reduced to the candidates worth keeping.

    Args:
        processor: Object exposing ``.tokenizer`` and callable as
            ``processor(text=..., images=..., return_tensors="pt")``.
        image: Image forwarded unchanged to ``processor``.
        generated_description: Text whose tokens are scored; it is appended
            directly to the prompt (no separator is inserted).
        min_prob: Candidates whose probability is not greater than this value
            are dropped, except for the token that actually occurs in the
            description, which is always kept.
        model: Model to run. When omitted, the notebook-level ``model`` global
            is used, which is what this function did before the parameter
            existed.

    Returns:
        ``(generated_text, generated_scores)``. ``generated_scores`` is a list
        of ``(token, {candidate: probability})`` pairs with probabilities
        rounded to 4 decimals; ``generated_text`` is the concatenation of the
        tokens. A description that contributes no tokens yields ``('', [])``.
    """
    if model is None:
        # Backward-compatible fallback to the model loaded in an earlier cell.
        model = globals()["model"]

    tokenizer = processor.tokenizer

    def readable(token_id):
        # The "▁" word-boundary marker used by the tokenizer becomes a plain space.
        return tokenizer.convert_ids_to_tokens(token_id).replace("▁", " ")

    instruction = "Please provide a thorough description of this image"
    prompt = f"USER: <image>\n{instruction}\nASSISTANT:"
    full_text = prompt + generated_description

    # The description tokens are whatever the full text adds on top of the prompt.
    full_tokens = tokenizer.tokenize(full_text)
    num_description_tokens = len(full_tokens) - len(tokenizer.tokenize(prompt))
    if num_description_tokens <= 0:
        # Nothing to score. Without this guard the slice below would select the
        # whole prompt and the loop would index past the available logits.
        return '', []
    description_tokens = full_tokens[-num_description_tokens:]

    inputs = processor(text=full_text, images=image, return_tensors="pt")

    with torch.no_grad():  # inference only, no gradients needed
        forward_outputs = model(**inputs, return_dict=True)

    # Read the logits by name: integer indexing into a ModelOutput skips fields
    # that are None (e.g. the loss when no labels are given), so a fixed index
    # is not guaranteed to be the logits.
    # One extra trailing position is taken because the logits at position k
    # predict token k + 1; row i therefore scores description token i.
    forward_logits = forward_outputs.logits[0][-(num_description_tokens + 1):]

    generated_ids = [tokenizer.convert_tokens_to_ids(t) for t in description_tokens]
    generated_scores = []

    for i, token_id in enumerate(generated_ids):
        probabilities = torch.softmax(forward_logits[i], dim=-1).tolist()
        candidates = {}
        for candidate_id, probability in enumerate(probabilities):
            if probability > min_prob or candidate_id == token_id:
                # NOTE(review): keys are stripped, so two vocabulary entries that
                # differ only by the leading word-boundary marker share one key
                # and the higher id overwrites the lower one.
                candidates[readable(candidate_id).strip()] = round(probability, 4)
        generated_scores.append((readable(token_id), candidates))

    generated_text = ''.join(token for token, _ in generated_scores)
    return generated_text, generated_scores

In [4]:
import pandas as pd 
# Load the results spreadsheet; its first column becomes the row index.
df = pd.read_excel(
    '/home/student/HallucinationsLLM/result_data1.xlsx',
    index_col=0,
)

In [5]:
# Take the image link and the stored description from the first row.
image_url = df.iloc[0]['image_link']
generated_description = df.iloc[0]['description']

# Bound the download time and fail loudly on an HTTP error instead of passing
# an error response body to PIL.
response = requests.get(image_url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)

# Per-token candidate distributions for that description.
generated_text, generated_scores = extract_logits(processor, image, generated_description, min_prob=1e-3)

In [7]:
# Stored 'logits' value of the first row, for comparison with a fresh run.
df['logits'].iloc[0]

'[{\' A\': 0.003916521091014147, \' The\': 0.9288559556007385, \' In\': 0.02594110369682312, \' It\': 0.0011577213881537318, \' This\': 0.037744082510471344}, {\' image\': 0.7400271892547607, \' black\': 0.0030243222136050463, \' picture\': 0.0012219366617500782, \' scene\': 0.2478780746459961, \' street\': 0.003763829357922077, \' photo\': 0.0013007447123527527}, {\' is\': 0.05028582736849785, \' dep\': 0.33306851983070374, \' show\': 0.051077716052532196, \' port\': 0.08290731906890869, \' shows\': 0.07667673379182816, \' capt\': 0.23618094623088837, \' features\': 0.11156398057937622, \' displays\': 0.03246697038412094, \' presents\': 0.019386911764740944}, {\'ict\': 0.9999663829803467}, {\'s\': 0.9999306201934814}, {\' a\': 0.9809731245040894, \' an\': 0.0146643603220582, \' people\': 0.001281357486732304}, {\' b\': 0.31018805503845215, \' l\': 0.2737399935722351, \' v\': 0.01669824682176113, \' very\': 0.005637079942971468, \' long\': 0.0017058333614841104, \' group\': 0.013843333

In [9]:
# Number of characters in the textual form of the freshly computed scores.
len(repr(generated_scores))

38679

In [10]:
# Display the (token, {candidate: probability}) pairs returned by extract_logits.
generated_scores

[(' The',
  {'A': 0.0039, 'The': 0.9288, 'In': 0.0255, 'It': 0.0012, 'This': 0.0377}),
 (' image',
  {'image': 0.7334,
   'black': 0.0034,
   'picture': 0.0012,
   'scene': 0.2535,
   'street': 0.0039,
   'photo': 0.0013}),
 (' dep',
  {'is': 0.0522,
   'dep': 0.3351,
   'show': 0.0506,
   'port': 0.0821,
   'shows': 0.076,
   'capt': 0.234,
   'features': 0.1105,
   'displays': 0.0322,
   'presents': 0.0195}),
 ('ict', {'ict': 1.0}),
 ('s', {'s': 0.9999}),
 (' a', {'a': 0.9803, 'an': 0.0151, 'people': 0.0013}),
 (' l',
  {'b': 0.3045,
   'l': 0.2646,
   'v': 0.0164,
   'very': 0.0057,
   'long': 0.0017,
   'group': 0.0134,
   'small': 0.0116,
   'large': 0.0028,
   'color': 0.0011,
   'city': 0.0186,
   'night': 0.0036,
   'town': 0.0022,
   'dark': 0.0012,
   'sun': 0.0012,
   'wide': 0.0014,
   'beautiful': 0.0017,
   'crow': 0.0546,
   'quiet': 0.0028,
   'street': 0.0204,
   'narrow': 0.0126,
   'crowd': 0.0025,
   'busy': 0.2299}),
 ('ively', {'ively': 0.9998}),
 (' street',
  {'

In [4]:
import pandas as pd
# Load the final dataset (first column as index) and show the stored 'logits'
# string of its first row.
df1 = pd.read_excel(
    "/home/student/HallucinationsLLM/team5_final_data.xlsx",
    index_col=0,
)
df1['logits'].iloc[0]

'[(\' The\', {\'The\': 0.9288, \'This\': 0.0377, \'In\': 0.0255, \'A\': 0.0039, \'It\': 0.0012}), (\' image\', {\'image\': 0.7334, \'scene\': 0.2535, \'street\': 0.0039, \'black\': 0.0034, \'photo\': 0.0013, \'picture\': 0.0012}), (\' dep\', {\'dep\': 0.3351, \'capt\': 0.234, \'features\': 0.1105, \'port\': 0.0821, \'shows\': 0.076, \'is\': 0.0522, \'show\': 0.0506, \'displays\': 0.0322, \'presents\': 0.0195}), (\'ict\', {\'ict\': 1.0}), (\'s\', {\'s\': 0.9999}), (\' a\', {\'a\': 0.9803, \'an\': 0.0151, \'people\': 0.0013}), (\' l\', {\'b\': 0.3045, \'l\': 0.2646, \'busy\': 0.2299, \'crow\': 0.0546, \'street\': 0.0204, \'city\': 0.0186, \'v\': 0.0164, \'group\': 0.0134, \'narrow\': 0.0126, \'small\': 0.0116}), (\'ively\', {\'ively\': 0.9998}), (\' street\', {\'street\': 0.4675, \'city\': 0.2792, \'scene\': 0.0501, \'dow\': 0.0276, \'urban\': 0.0212, \'and\': 0.0207, \'night\': 0.0199, \',\': 0.0193, \'town\': 0.0146, \'Asian\': 0.0146}), (\' scene\', {\'scene\': 0.5215, \'in\': 0.1403,

In [5]:
import ast 
# Parse the stored string back into Python objects; literal_eval only accepts
# literals, so no code in the cell value is executed.
ast.literal_eval(df1['logits'].iloc[0])

[(' The',
  {'The': 0.9288, 'This': 0.0377, 'In': 0.0255, 'A': 0.0039, 'It': 0.0012}),
 (' image',
  {'image': 0.7334,
   'scene': 0.2535,
   'street': 0.0039,
   'black': 0.0034,
   'photo': 0.0013,
   'picture': 0.0012}),
 (' dep',
  {'dep': 0.3351,
   'capt': 0.234,
   'features': 0.1105,
   'port': 0.0821,
   'shows': 0.076,
   'is': 0.0522,
   'show': 0.0506,
   'displays': 0.0322,
   'presents': 0.0195}),
 ('ict', {'ict': 1.0}),
 ('s', {'s': 0.9999}),
 (' a', {'a': 0.9803, 'an': 0.0151, 'people': 0.0013}),
 (' l',
  {'b': 0.3045,
   'l': 0.2646,
   'busy': 0.2299,
   'crow': 0.0546,
   'street': 0.0204,
   'city': 0.0186,
   'v': 0.0164,
   'group': 0.0134,
   'narrow': 0.0126,
   'small': 0.0116}),
 ('ively', {'ively': 0.9998}),
 (' street',
  {'street': 0.4675,
   'city': 0.2792,
   'scene': 0.0501,
   'dow': 0.0276,
   'urban': 0.0212,
   'and': 0.0207,
   'night': 0.0199,
   ',': 0.0193,
   'town': 0.0146,
   'Asian': 0.0146}),
 (' scene',
  {'scene': 0.5215,
   'in': 0.1403,