In [1]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from tqdm import tqdm
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import Sequence
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, GlobalAveragePooling2D, Activation, Dropout, Flatten, Dense, Input, Layer
from tensorflow.keras.layers import Embedding, LSTM, add, Concatenate, Reshape, concatenate, Bidirectional
from tensorflow.keras.applications import VGG16, ResNet50, DenseNet201
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from textwrap import wrap
from nltk import pos_tag
from wordcloud import WordCloud
from nltk.tokenize import word_tokenize


plt.rcParams['font.size'] = 12
sns.set_style("dark")
warnings.filterwarnings('ignore')

In [None]:
import tensorflow as tf
print(tf.__version__)

# **Image Captioning**

**What is Image Captioning ?**
- Image Captioning is the process of generating textual description of an image. It uses both Natural Language Processing and Computer Vision to generate the captions.
- This task lies at the intersection of computer vision and natural language processing. Most image captioning systems use an encoder-decoder framework, where an input image is encoded into an intermediate representation of the information in the image, and then decoded into a descriptive text sequence.

**CNNs + RNNs (LSTMs)**
- To perform Image Captioning we will require two deep learning models combined into one for the training purpose
- CNNs extract the features from the image of some vector size aka the vector embeddings. The size of these embeddings depend on the type of pretrained network being used for the feature extraction
- LSTMs are used for the text generation process. The image embeddings are concatenated with the word embeddings and passed to the LSTM to generate the next word
- For a more illustrative explanation of this architecture check the Modelling section for a picture representation

<img src="https://miro.medium.com/max/1400/1*6BFOIdSHlk24Z3DFEakvnQ.png">

In [2]:
image_path = '../input/flickr8k/Images'

In [3]:
data = pd.read_csv("../input/flickr8k/captions.txt")
data.head()

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...
1,1000268201_693b08cb0e.jpg,A girl going into a wooden building .
2,1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .
3,1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playh...
4,1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a woo...


In [4]:
def readImage(path,img_size=224):
    img = load_img(path,color_mode='rgb',target_size=(img_size,img_size))
    img = img_to_array(img)
    img = img/255.
    
    return img

def display_images(temp_df):
    temp_df = temp_df.reset_index(drop=True)
    plt.figure(figsize = (20 , 20))
    n = 0
    for i in range(15):
        n+=1
        plt.subplot(5 , 5, n)
        plt.subplots_adjust(hspace = 0.7, wspace = 0.3)
        image = readImage(f"../input/flickr8k/Images/{temp_df.image[i]}")
        plt.imshow(image)
        plt.title("\n".join(wrap(temp_df.caption[i], 20)))
        plt.axis("off")

# **Visualization**
- Images and their corresponding captions

In [None]:
display_images(data.sample(15))

### <span style="font-weight: bold; color: #007bff;">Distribution of Comments per Image:</span>

Visualize the distribution of comments per image to understand how many comments are typically associated with each image.

In [None]:
comments_per_image = data.groupby('image')['caption'].count()

images_with_4_comments = comments_per_image[comments_per_image == 4]
images_with_5_comments = comments_per_image[comments_per_image == 5]

# Plot the bar chart
plt.figure(figsize=(10, 5))
plt.bar([3.5, 4.5], [len(images_with_4_comments), len(images_with_5_comments)], tick_label=['4 Comments', '5 Comments'])
plt.xlabel('Number of Comments')
plt.ylabel('Number of Images')
plt.title('Distribution of Comments per Image')
plt.show()

### <span style="font-weight: bold; color: #007bff;">Word Cloud:</span>

Generate a word cloud to visualize the most frequent words in the comments.

In [None]:

all_comments_text = ' '.join(data['caption'].fillna(''))

wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_comments_text)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud of Comments')
plt.show()


In [None]:
data['pos_tags'] = data['caption'].fillna("").apply(lambda x: pos_tag(word_tokenize(x.lower())))

nouns = [word[0] for tags in data['pos_tags'] for word in tags if word[1].startswith('NN')]

wordcloud_nouns = WordCloud(width=800, height=400, background_color='white').generate(' '.join(nouns))
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud_nouns, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud of Nouns in Comments')
plt.show()

### <span style="font-weight: bold; color: #007bff;">Caption Length Distribution:</span>

Analyze the distribution of caption lengths to understand the range of caption lengths in the dataset.

In [None]:
# Caption length distribution
data['caption_length'] = data['caption'].fillna('').apply(lambda x: len(x.split()))
plt.hist(data['caption_length'], bins=range(1, max(data['caption_length']) + 1))
plt.xlabel('Caption Length (in words)')
plt.ylabel('Number of Captions')
plt.title('Distribution of Caption Lengths')
plt.show()


# **Caption Text Preprocessing Steps**
- Convert sentences into lowercase
- Remove special characters and numbers present in the text
- Remove extra spaces
- Remove single characters
- Add a starting and an ending tag to the sentences to indicate the beginning and the ending of a sentence

In [5]:
def text_preprocessing(data):
    data['caption'] = data['caption'].apply(lambda x: x.lower())
    data['caption'] = data['caption'].apply(lambda x: x.replace("[^A-Za-z]",""))
    data['caption'] = data['caption'].apply(lambda x: x.replace("\s+"," "))
    data['caption'] = data['caption'].apply(lambda x: " ".join([word for word in x.split() if len(word)>1]))
    data['caption'] = "startseq "+data['caption']+" endseq"
    return data

## __Preprocessed Text__

In [6]:
data = text_preprocessing(data)
captions = data['caption'].tolist()
captions[:10]

['startseq child in pink dress is climbing up set of stairs in an entry way endseq',
 'startseq girl going into wooden building endseq',
 'startseq little girl climbing into wooden playhouse endseq',
 'startseq little girl climbing the stairs to her playhouse endseq',
 'startseq little girl in pink dress going into wooden cabin endseq',
 'startseq black dog and spotted dog are fighting endseq',
 'startseq black dog and tri-colored dog playing with each other on the road endseq',
 'startseq black dog and white dog with brown spots are staring at each other in the street endseq',
 'startseq two dogs of different breeds looking at each other on the road endseq',
 'startseq two dogs on pavement moving toward each other endseq']

## __Tokenization and Encoded Representation__
- The words in a sentence are separated/tokenized and encoded in a one hot representation
- These encodings are then passed to the embeddings layer to generate word embeddings

<img src='https://lena-voita.github.io/resources/lectures/word_emb/lookup_table.gif'>

In [7]:
import numpy as np
import pickle

# Assuming data is your DataFrame containing captions
captions = data['caption'].tolist()

# Split captions into words and build the vocabulary
words = [caption.split() for caption in captions]
unique = list(set(word for sublist in words for word in sublist))

# Save and load unique words (optional, for future use)
# with open("unique.p", "wb") as pickle_d:
#     pickle.dump(unique, pickle_d) 
# unique = pickle.load(open('unique.p', 'rb'))

# Create mappings
word2idx = {val: index for index, val in enumerate(unique)}
idx2word = {index: val for index, val in enumerate(unique)}

# Example usage of mappings
print(word2idx['startseq'])  # Should print the index of 'startseq'
print(idx2word[379])        # Should print the word corresponding to index 5553

# Save the mappings to files
with open("word2idx.pkl", "wb") as f:
    pickle.dump(word2idx, f)

with open("idx2word.pkl", "wb") as f:
    pickle.dump(idx2word, f)

8590
smile


In [8]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(captions)
vocab_size = len(tokenizer.word_index) + 1
max_length = max(len(caption.split()) for caption in captions)

images = data['image'].unique().tolist()
nimages = len(images)

split_index = round(0.85*nimages)
train_images = images[:split_index]
val_images = images[split_index:]

train = data[data['image'].isin(train_images)]
test = data[data['image'].isin(val_images)]

train.reset_index(inplace=True,drop=True)
test.reset_index(inplace=True,drop=True)

tokenizer.texts_to_sequences([captions[1]])[0]

[1, 18, 315, 63, 195, 116, 2]

In [9]:
with open('tokenizer.pkl', 'wb') as handle:
    pickle.dump(tokenizer, handle)

# **Image Feature Extraction**
- DenseNet 201 Architecture is used to extract the features from the images
- Any other pretrained architecture can also be used for extracting features from these images
- Since the Global Average Pooling layer is selected as the final layer of the DenseNet201 model for our feature extraction, our image embeddings will be a vector of size 1920

<img src="https://imgur.com/wWHWbQt.jpg">

In [None]:
model = DenseNet201(weights='imagenet', include_top=False, pooling='avg')
fe = Model(inputs=model.input, outputs=model.layers[-2].output)

img_size = 224
features = {}
for image in tqdm(data['image'].unique().tolist()):
    img = load_img(os.path.join(image_path,image),target_size=(img_size,img_size))
    img = img_to_array(img)
    img = img/255.
    img = np.expand_dims(img,axis=0)
    feature = fe.predict(img, verbose=0)
    features[image] = feature

# **Data Generation**
- Since Image Caption model training like any other neural network training is a highly resource utillizing process we cannot load the data into the main memory all at once, and hence we need to generate the data in the required format batch wise
- The inputs will be the image embeddings and their corresonding caption text embeddings for the training process
- The text embeddings are passed word by word for the caption generation during inference time

In [None]:
class CustomDataGenerator(Sequence):
    
    def __init__(self, df, X_col, y_col, batch_size, directory, tokenizer, 
                 vocab_size, max_length, features,shuffle=True):
    
        self.df = df.copy()
        self.X_col = X_col
        self.y_col = y_col
        self.directory = directory
        self.batch_size = batch_size
        self.tokenizer = tokenizer
        self.vocab_size = vocab_size
        self.max_length = max_length
        self.features = features
        self.shuffle = shuffle
        self.n = len(self.df)
        
    def on_epoch_end(self):
        if self.shuffle:
            self.df = self.df.sample(frac=1).reset_index(drop=True)
    
    def __len__(self):
        return self.n // self.batch_size
    
    def __getitem__(self,index):
    
        batch = self.df.iloc[index * self.batch_size:(index + 1) * self.batch_size,:]
        X1, X2, y = self.__get_data(batch)        
        return (X1, X2), y
    
    def __get_data(self,batch):
        
        X1, X2, y = list(), list(), list()
        
        images = batch[self.X_col].tolist()
           
        for image in images:
            feature = self.features[image][0]
            
            captions = batch.loc[batch[self.X_col]==image, self.y_col].tolist()
            for caption in captions:
                seq = self.tokenizer.texts_to_sequences([caption])[0]

                for i in range(1,len(seq)):
                    in_seq, out_seq = seq[:i], seq[i]
                    in_seq = pad_sequences([in_seq], maxlen=self.max_length)[0]
                    out_seq = to_categorical([out_seq], num_classes=self.vocab_size)[0]
                    X1.append(feature)
                    X2.append(in_seq)
                    y.append(out_seq)
            
        X1, X2, y = np.array(X1), np.array(X2), np.array(y)
                
        return X1, X2, y

# **Modelling**
- The image embedding representations are concatenated with the first word of sentence ie. starseq and passed to the LSTM network 
- The LSTM network starts generating words after each input thus forming a sentence at the end

<img src='https://raw.githubusercontent.com/yunjey/pytorch-tutorial/master/tutorials/03-advanced/image_captioning/png/model.png'>

In [None]:
## input1 = Input(shape=(1920,))
##input2 = Input(shape=(max_length,))

##img_features = Dense(256, activation='relu')(input1)
##img_features_reshaped = Reshape((1, 256), input_shape=(256,))(img_features)

##sentence_features = Embedding(vocab_size, 256, mask_zero=False)(input2)
##merged = concatenate([img_features_reshaped,sentence_features],axis=1)
##sentence_features = LSTM(256)(merged)
##x = Dropout(0.5)(sentence_features)
##x = add([x, img_features])
##x = Dense(128, activation='relu')(x)
##x = Dropout(0.5)(x)
##output = Dense(vocab_size, activation='softmax')(x)

##caption_model = Model(inputs=[input1,input2], outputs=output)
##caption_model.compile(loss='categorical_crossentropy',optimizer='adam', metrics=['accuracy'])


from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, concatenate, add, Attention, RepeatVector, TimeDistributed, Activation
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

# Define constants
embedding_size = 256
max_length = 34
vocab_size = 10000  # Change this to your actual vocabulary size

# Define image model
image_input = Input(shape=(1920,))
img_features = Dense(256, activation='relu')(image_input)
img_features_repeated = RepeatVector(max_length)(img_features)

# Define language model
language_input = Input(shape=(max_length,))
sentence_features = Embedding(vocab_size, 256, mask_zero=True)(language_input)
sentence_features = LSTM(128, return_sequences=True)(sentence_features)  # Reduced number of units
sentence_features = TimeDistributed(Dense(256))(sentence_features)

# Apply attention
attention = Attention()([img_features_repeated, sentence_features])
merged = concatenate([img_features_repeated, attention], axis=-1)

# Further processing with LSTM
x = LSTM(128, return_sequences=True)(merged)  # Reduced number of units
x = LSTM(256, return_sequences=False)(x)      # Reduced number of units
x = Dropout(0.5)(x)

# Ensure same dimensions before add
img_features_resized = Dense(256)(img_features)  # Adjust dimensions to match LSTM output

x = add([x, img_features_resized])
x = Dense(128, activation='relu')(x)          # Reduced number of units
x = Dropout(0.5)(x)
output = Dense(vocab_size, activation='softmax')(x)

# Model definition
caption_model = Model(inputs=[image_input, language_input], outputs=output)
caption_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


In [None]:
from tensorflow.keras.utils import plot_model

## **Model Modification**
- A slight change has been made in the original model architecture to push the performance. The image feature embeddings are added to the output of the LSTMs and then passed on to the fully connected layers
- This slightly improves the performance of the model orignally proposed back in 2014: __Show and Tell: A Neural Image Caption Generator__ (https://arxiv.org/pdf/1411.4555.pdf)

In [None]:
plot_model(caption_model, to_file='caption_model.png', show_shapes=True)


In [None]:
caption_model.summary()

In [None]:

train_generator = CustomDataGenerator(df=train,X_col='image',y_col='caption',batch_size=32,directory=image_path,
                                      tokenizer=tokenizer,vocab_size=vocab_size,max_length=max_length,features=features)

validation_generator = CustomDataGenerator(df=test,X_col='image',y_col='caption',batch_size=32,directory=image_path,
                                      tokenizer=tokenizer,vocab_size=vocab_size,max_length=max_length,features=features)

In [None]:
model_name = "model.h5"
checkpoint = ModelCheckpoint(model_name,
                            monitor="val_loss",
                            mode="min",
                            save_best_only = True,
                            verbose=1)

earlystopping = EarlyStopping(monitor='val_loss',min_delta = 0, patience = 10, verbose = 2, restore_best_weights=True)

learning_rate_reduction = ReduceLROnPlateau(monitor='val_loss', 
                                            patience=3, 
                                            verbose=1, 
                                            factor=0.2, 
                                            min_lr=0.00000001)

## **Let's train the Model !**

<img src='https://miro.medium.com/max/1400/1*xIXqf46yYonSXkUOWcOCvg.gif'>

In [None]:
history = caption_model.fit(
        train_generator,
        epochs=50,
        validation_data=validation_generator,
        callbacks=[checkpoint,earlystopping,learning_rate_reduction])

# **Inference**
- Learning Curve (Loss Curve)
- Assessment of Generated Captions (by checking the relevance of the caption with respect to the image, BLEU Score will not be used in this kernel)

## **Learning Curve**
- The model has clearly overfit, possibly due to less amount of data
- We can tackle this problem in two ways
    1. Train the model on a larger dataset Flickr40k
    2. Attention Models

In [None]:
plt.figure(figsize=(20,8))
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

## **Caption Generation Utility Functions**
- Utility functions to generate the captions of input images at the inference time.
- Here the image embeddings are passed along with the first word, followed by which the text embedding of each new word is passed to generate the next word

In [None]:
def idx_to_word(integer,tokenizer):
    
    for word, index in tokenizer.word_index.items():
        if index==integer:
            return word
    return None

In [None]:
def predict_caption(model, image, tokenizer, max_length, features, word2idx, idx2word):
    """
    Generate a caption for an image using a trained model.
    
    Parameters:
    - model: The trained captioning model.
    - image: The image file name for which to generate the caption.
    - tokenizer: The tokenizer used for encoding captions.
    - max_length: The maximum length of the generated caption.
    - features: A dictionary mapping image filenames to their extracted features.
    - word2idx: A dictionary mapping words to their respective indices.
    - idx2word: A dictionary mapping indices to their respective words.
    
    Returns:
    - A string containing the generated caption.
    """
    
    # Extract the feature vector for the given image
    feature = features[image]
    
    # Initialize the input text with the start token
    in_text = "startseq"
    
    # Generate the caption word by word
    for _ in range(max_length):
        # Convert the current input text to a sequence of indices
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        
        # Pad the sequence to the maximum length
        sequence = pad_sequences([sequence], maxlen=max_length, padding='post')
        
        # Predict the next word in the sequence
        y_pred = model.predict([np.array([feature]), np.array(sequence)])
        
        # Get the index of the predicted word
        y_pred = np.argmax(y_pred)
        
        # Convert the index to the corresponding word
        word = idx2word.get(y_pred)
        
        # If the word is None, break the loop
        if word is None:
            break
        
        # Append the predicted word to the input text
        in_text += " " + word
        
        # If the predicted word is the end token, break the loop
        if word == 'endseq':
            break
            
    # Return the generated caption, excluding the start and end tokens
    return ' '.join(in_text.split()[1:-1])

# Example usage:
# caption = predict_caption(final_model, 'example_image.jpg', tokenizer, max_length, features, word2idx, idx2word)
# print(caption)


## **Taking 15 Random Samples for Caption Prediction**

In [None]:
samples = test.sample(15)
samples.reset_index(drop=True,inplace=True)

In [None]:
for index,record in samples.iterrows():

    img = load_img(os.path.join(image_path,record['image']),target_size=(224,224))
    img = img_to_array(img)
    img = img/255.
    
    caption = predict_caption(caption_model,record['image'], tokenizer, max_length, features, word2idx, idx2word)
    samples.loc[index,'caption'] = caption

# **Results**
- As we can clearly see there is some redundant caption generation e.g. Dog running through the water, overusage of blue shirt for any other coloured cloth
- The model performance can be further improved by training on more data and using attention mechanism so that our model can focus on relevant areas during the text generation
- We can also leverage the interprettability of the attention mechanism to understand which areas of the image leads to the generation of which word

In [None]:
display_images(samples)

<p style='font-size: 18px'><strong>Conclusion: </strong>This may not be the best performing model, but the objective of this kernel is to give a gist of how Image Captioning problems can be approached. In the future work of this kernel <strong>Attention model</strong> training and <strong>BLEU Score</strong> assessment will be performed.</p>

In [10]:
!pip install streamlit -q

[0m

In [11]:
import numpy as np
import tensorflow as tf
from PIL import Image
from sklearn.cluster import KMeans
import seaborn as sns
from tensorflow.keras.initializers import Orthogonal

In [None]:
model.summary

In [12]:
# Download ngrok
!wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip

# Unzip the downloaded file
!unzip ngrok-stable-linux-amd64.zip

--2024-05-21 11:06:31--  https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
Resolving bin.equinox.io (bin.equinox.io)... 54.161.241.46, 18.205.222.128, 54.237.133.81, ...
Connecting to bin.equinox.io (bin.equinox.io)|54.161.241.46|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13921656 (13M) [application/octet-stream]
Saving to: ‘ngrok-stable-linux-amd64.zip’


2024-05-21 11:06:33 (13.9 MB/s) - ‘ngrok-stable-linux-amd64.zip’ saved [13921656/13921656]

Archive:  ngrok-stable-linux-amd64.zip
  inflating: ngrok                   


In [13]:
!chmod +x ngrok

In [14]:
!./ngrok authtokens 2gTgesqVJ5fXr6FiY4N7R46L7hs_5FKChinrwKGCVpZw34uEA

NAME:
   ngrok - tunnel local ports to public URLs and inspect traffic

DESCRIPTION:
    ngrok exposes local networked services behinds NATs and firewalls to the
    public internet over a secure tunnel. Share local websites, build/test
    webhook consumers and self-host personal services.
    Detailed help for each command is available with 'ngrok help <command>'.
    Open http://localhost:4040 for ngrok's web interface to inspect traffic.

EXAMPLES:
    ngrok http 80                    # secure public URL for port 80 web server
    ngrok http -subdomain=baz 8080   # port 8080 available at baz.ngrok.io
    ngrok http foo.dev:80            # tunnel to host:port instead of localhost
    ngrok http https://localhost     # expose a local https server
    ngrok tcp 22                     # tunnel arbitrary TCP traffic to port 22
    ngrok tls -hostname=foo.com 443  # TLS traffic for foo.com to port 443
    ngrok start foo bar baz          # start tunnels from the configuration file

VERSI

In [15]:
%%writefile app.py
import streamlit as st
st.title('Image Captioning App')

Writing app.py


In [19]:
%%writefile app.py
import streamlit as st
import numpy as np
import os
from tqdm import tqdm
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications import DenseNet201
from tensorflow.keras.applications.densenet import preprocess_input
import pickle
from PIL import Image
from io import BytesIO

max_length= 34
# Load your trained captioning model
caption_model = load_model('/kaggle/input/image_model/tensorflow2/model1/1/model (1).h5')

with open('/kaggle/working/tokenizer.pkl', 'rb') as handle:
    tokenizer = pickle.load(handle)
    
def idx_to_word(integer,tokenizer):
    
    for word, index in tokenizer.word_index.items():
        if index==integer:
            return word
    return None
    
model = DenseNet201(weights='imagenet', include_top=False, pooling='avg')
fe = Model(inputs=model.input, outputs=model.layers[-2].output)


from keras.layers import GlobalAveragePooling2D

def predict_caption(model, image_file, tokenizer, max_length):
    # Convert the file to bytes and open it as an image
    img = Image.open(BytesIO(image_file.getvalue()))

    # Convert the image to an array and preprocess it
    img = img_to_array(img)
    img = preprocess_input(img)

    # Reshape the image to match the input shape of the DenseNet model
    img = np.expand_dims(img, axis=0)

    # Extract features using DenseNet201
    feature = fe.predict(img, verbose=0)

    # Apply global average pooling to the feature map
    gap = GlobalAveragePooling2D()(feature)

    in_text = "startseq"
    for i in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], max_length)

        y_pred = model.predict([gap,sequence])
        y_pred = np.argmax(y_pred)
        
        word = idx_to_word(y_pred, tokenizer)
        
        if word is None:
            break
            
        in_text+= " " + word
        
        if word == 'endseq':
            break
        
        final_caption = in_text.replace('startseq', '').replace('endseq', '').strip()
    return final_caption 

# Streamlit app
st.title('Image Captioning App')

uploaded_file = st.file_uploader('Choose an image...', type='jpg')

if uploaded_file is not None:
    # Generate the caption
    caption = predict_caption(caption_model, uploaded_file, tokenizer, max_length)
    
    # Display the image and the caption
    st.image(uploaded_file, caption='Uploaded Image', use_column_width=True)
    st.write('Generated Caption: ', caption)

Overwriting app.py


In [17]:
!npm install localtunnel

[K[?25h[37;40mnpm[0m [0m[30;43mWARN[0m [0m[35msaveError[0m ENOENT: no such file or directory, open '/kaggle/working/package.json'
[0m[37;40mnpm[0m [0m[34;40mnotice[0m[35m[0m created a lockfile as package-lock.json. You should commit this file.
[0m[37;40mnpm[0m [0m[30;43mWARN[0m [0m[35menoent[0m ENOENT: no such file or directory, open '/kaggle/working/package.json'
[0m[37;40mnpm[0m [0m[30;43mWARN[0m[35m[0m working No description
[0m[37;40mnpm[0m [0m[30;43mWARN[0m[35m[0m working No repository field.
[0m[37;40mnpm[0m [0m[30;43mWARN[0m[35m[0m working No README data
[0m[37;40mnpm[0m [0m[30;43mWARN[0m[35m[0m working No license field.
[0m
+ localtunnel@2.0.2
added 22 packages from 22 contributors and audited 22 packages in 1.542s

3 packages are looking for funding
  run `npm fund` for details

found 1 [93mmoderate[0m severity vulnerability
  run `npm audit fix` to fix them, or `npm audit` for details
[K[?25h7m            [27m[

In [None]:
!streamlit run app.py & npx localtunnel --port 8501


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to False.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Network URL: [0m[1mhttp://172.19.2.2:8501[0m
[34m  External URL: [0m[1mhttp://35.204.119.156:8501[0m
[0m
[K[?25h####......] - refresh-package-json:localtunnel: timing action:finalize[0m[K[0m[Knpx: installed 22 in 4.096s
your url is: https://dull-streets-enjoy.loca.lt
