In [1]:
#!pip install transformers streamlit torch dill
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer
import torch 
from dill import dump,load
from PIL import Image
import streamlit as st

In [2]:
# Single source of truth for the Hugging Face checkpoint used by all three artifacts.
CHECKPOINT = 'nlpconnect/vit-gpt2-image-captioning'

# Load each pretrained component and persist it with dill.
# Every dump goes through a `with` block so the file is flushed and closed
# deterministically instead of whenever the garbage collector gets to it.
model = VisionEncoderDecoderModel.from_pretrained(CHECKPOINT)
with open('model.pkl', 'wb') as fh:
    dump(model, fh)

feature_extractor = ViTFeatureExtractor.from_pretrained(CHECKPOINT)
with open('Feature_extractor.pkl', 'wb') as fh:
    dump(feature_extractor, fh)

tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
with open('tokenizer.pkl', 'wb') as fh:
    dump(tokenizer, fh)



In [3]:
# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Persist the chosen device; the `with` block guarantees the file is closed.
# NOTE(review): the pickled value reflects THIS machine's hardware — a 'cuda'
# device loaded on a CPU-only host would presumably fail on first use; confirm
# how the consumer of device.pkl handles that.
with open('device.pkl', 'wb') as fh:
    dump(device, fh)

In [4]:
def gen_caption(image, max_length=20, num_beams=6, num_return_sequences=3):
    """Generate candidate captions for an image using beam search.

    Relies on the notebook-level ``feature_extractor``, ``model``, ``tokenizer``
    and ``device`` objects being defined before the function is called.

    Args:
        image: Whatever ``PIL.Image.open`` accepts (the value is passed to it
            unchanged).
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        num_beams: Forwarded to ``model.generate`` as ``num_beams``.
        num_return_sequences: Forwarded to ``model.generate`` as
            ``num_return_sequences``. The sibling ``caption_0``/``caption_1``/
            ``caption_2`` helpers index positions 0-2 of the result, so they
            need at least three sequences.

    Returns:
        list[str]: The decoded captions with special tokens removed and
        surrounding whitespace stripped, one per sequence returned by
        ``model.generate``.
    """
    # Open inside a context manager so the handle PIL opens is released as soon
    # as the pixels are in memory. convert('RGB') loads the data and returns a
    # new image (a copy when the mode is already RGB), so the result remains
    # usable after the source image is closed.
    with Image.open(image) as source_image:
        rgb_image = source_image.convert('RGB')

    pixel_values = feature_extractor(images=rgb_image, return_tensors='pt').pixel_values
    # The input tensor must live on the same device the model was moved to.
    pixel_values = pixel_values.to(device)

    output_ids = model.generate(
        pixel_values,
        max_length=max_length,
        num_beams=num_beams,
        num_return_sequences=num_return_sequences,
    )

    decoded = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return [text.strip() for text in decoded]



def caption_0(image):
    """Return the first caption produced by ``gen_caption`` for ``image``."""
    return gen_caption(image)[0]



def caption_1(image):
    """Return the second caption produced by ``gen_caption`` for ``image``."""
    return gen_caption(image)[1]



def caption_2(image):
    """Return the third caption produced by ``gen_caption`` for ``image``."""
    return gen_caption(image)[2]

# Persist the caption helpers with dill, one file per function.
# Each file is written inside a `with` block so it is flushed and closed
# immediately rather than left to the garbage collector.
# NOTE(review): the functions reference notebook-level names (model, tokenizer,
# feature_extractor, device, Image, gen_caption); presumably the loader must
# provide those names itself — confirm against the code that loads these files.
for pkl_path, func in (
    ('gen_caption.pkl', gen_caption),
    ('caption_0.pkl', caption_0),
    ('caption_1.pkl', caption_1),
    ('caption_2.pkl', caption_2),
):
    with open(pkl_path, 'wb') as fh:
        dump(func, fh)

In [None]:
# Shell escape: start Streamlit on app.py (app.py is not part of this notebook).
# NOTE(review): presumably app.py loads the .pkl files written by the cells
# above — confirm against app.py.
!streamlit run app.py