In [2]:
%cd /content
!git clone -b v1.0 https://github.com/camenduru/LLaVA
%cd /content/LLaVA
!pip install -q gradio .



/content
fatal: destination path 'LLaVA' already exists and is not an empty directory.
/content/LLaVA
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for llava (pyproject.toml) ... [?25l[?25hdone


In [3]:
import os
import requests
from PIL import Image
from io import BytesIO
from llava.conversation import conv_templates, SeparatorStyle
from llava.utils import disable_torch_init
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from llava.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria
from transformers import TextStreamer


[2023-12-16 16:30:21,977] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [4]:
from transformers import AutoTokenizer, BitsAndBytesConfig
from llava.model import LlavaLlamaForCausalLM
import torch

model_path = "4bit/llava-v1.5-13b-3GB"
kwargs = {"device_map": "auto"}
kwargs['load_in_4bit'] = True
kwargs['quantization_config'] = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4'
)
model = LlavaLlamaForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)

vision_tower = model.get_vision_tower()
if not vision_tower.is_loaded:
    vision_tower.load_model()
vision_tower.to(device='cuda')
image_processor = vision_tower.image_processor

Loading checkpoint shards:   0%|          | 0/9 [00:00<?, ?it/s]

In [5]:
def caption_image(image_file, prompt):
    if image_file.startswith('http') or image_file.startswith('https'):
        response = requests.get(image_file)
        image = Image.open(BytesIO(response.content)).convert('RGB')
    else:
        image = Image.open(image_file).convert('RGB')
    disable_torch_init()
    conv_mode = "llava_v0"
    conv = conv_templates[conv_mode].copy()
    roles = conv.roles
    image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'].half().cuda()
    inp = f"{roles[0]}: {prompt}"
    inp = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + inp
    conv.append_message(conv.roles[0], inp)
    conv.append_message(conv.roles[1], None)
    raw_prompt = conv.get_prompt()
    input_ids = tokenizer_image_token(raw_prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
    stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
    keywords = [stop_str]
    stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
    with torch.inference_mode():
      output_ids = model.generate(input_ids, images=image_tensor, do_sample=True, temperature=0.2,
                                  max_new_tokens=1024, use_cache=True, stopping_criteria=[stopping_criteria])
    outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
    conv.messages[-1][-1] = outputs
    output = outputs.rsplit('</s>', 1)[0]
    return output

In [6]:
output = caption_image(f'https://pbs.twimg.com/media/Eo8N3JLVoAAlDJT?format=jpg&name=small', 'This is from a tweet by Tim Hortons, what can the content of the tweet be? I just want the tweet, no frills. The tweet should not be in quotes')
print(output)

"Saturday mornings made better with Tim Hortons! Enjoy a delicious breakfast sandwich and a refreshing iced coffee for only P145. Available all day! #TimHortons #SaturdayMorning #BreakfastSandwich #IcedCoffee"


In [7]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
path1 = "/content/drive/MyDrive/interiit/content_simulation_test_company.xlsx"
path2 = "/content/drive/MyDrive/interiit/content_simulation_test_time.xlsx"

In [9]:
import pandas as pd

In [10]:
df = pd.read_excel(path1)

In [11]:
urlLs = []

for index, row in df.iterrows():
    media_str = row['media']
    if 'Photo' in media_str:
        full_start = media_str.find("fullUrl='") + len("fullUrl='")
        full_end = media_str.find("'", full_start)
        full_url = media_str[full_start:full_end]

        urlLs.append(full_url)

    elif 'Video' in media_str:
        thumbnail_start = media_str.find("thumbnailUrl='") + len("thumbnailUrl='")
        thumbnail_end = media_str.find("'", thumbnail_start)
        thumbnail_url = media_str[thumbnail_start:thumbnail_end]

        urlLs.append(thumbnail_url)

    print(index)
    index+=1


urlsDf = pd.DataFrame({'Urls': urlLs})

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186


In [12]:
out = caption_image(f'https://pbs.twimg.com/media/Eo8N3JLVoAAlDJT?format=jpg&name=small', f"This is from a tweet by {df['username'][0]} on {df['date'][0]}, what can the tweet be? I just want the tweet, no other content.")
print(out)

"Saturday mornings made better with a heart-shaped breakfast sandwich and a cold drink from Tim Hortons. Available all day! #TimHortons #HeartShapedBreakfastSandwich #SaturdayMorning"


In [15]:
captions = []
ind = 6000
for x in urlsDf['Urls']:
  try:
    answer = caption_image(x, f"This is from a tweet by {df['username'][ind]} on {df['date'][ind]}, what can the tweet be? I just want the tweet, no other content.")
  except:
    captions.append("NaN")

  if ind>6500:
    break
  else:
    captions.append(answer)
    print(answer)
    print(ind)
    ind+=1

"A bee is sitting on a flower, and it's looking at the camera. #nature #bee #flower"
6000
"Investing in green energy is essential for a sustainable future. #renewableenergy #climatechange"
6001
"Breaking: Manchester City have been banned from the Champions League for two seasons. #UCL #ManCity"
6002
"Two soccer players from opposing teams are going after the ball. The player in the red and white striped jersey is trying to kick it away from the player in the blue and white jersey. #soccer #competition"
6003
"Congratulations to the gymnastics team on their win! So proud of their hard work and dedication. #gymnastics #teamwork #winning"
6004
The tweet by chasepwilliams on 2018-05-03 00:00:49 reads: "I'm watching Casualty and I'm really enjoying it. It's a great show!"
6005
"I'm at home, enjoying my football. I don't need to leave."
6006
"If you're not the one shooting the ball, you're not the one that matters the most. #basketball #shooting #teamwork"
6007
"Two men wearing black jackets 