# **TODO: write the introduction for this notebook (Flickr8k caption data preparation)**

# **Mount Google Drive**

In [None]:
# Mount Google Drive at /content/gdrive so that later cells can read from and
# write to paths under /content/gdrive/MyDrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# **Import required libraries**

In [None]:
# os: operating-system and file-path utilities.
import os
# pickle: object serialization (used in this notebook to store the caption mapping).
import pickle
# numpy: array and numerical operations.
import numpy as np
# tqdm (notebook variant): progress bar wrapped around an iterable.
from tqdm.notebook import tqdm

# Tokenizer: converts text into sequences of integer tokens.
from tensorflow.keras.preprocessing.text import Tokenizer
# pad_sequences: brings sequences to a common length, filling the remaining
# positions with zeros.
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Model: the Keras model class.
from tensorflow.keras.models import Model
# to_categorical: one-hot encodes class indices.
# plot_model: renders the model architecture as an image.
from tensorflow.keras.utils import to_categorical, plot_model
# Layers plus the `add` merge operation for building the network
# (not used in the cells visible here; presumably used later in the notebook).
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add

# **Set up the Kaggle API key to use the Kaggle command-line tool**

In [None]:
# Create the Kaggle config directory.
# -p: succeed silently if it already exists, so the cell can be re-run.
! mkdir -p ~/.kaggle

In [None]:
# Copy the Kaggle API token (kaggle.json) from Google Drive into ~/.kaggle.
! cp '/content/gdrive/MyDrive/API key/kaggle.json' ~/.kaggle

In [None]:
# Restrict the token file to owner read/write only (mode 600).
! chmod 600 ~/.kaggle/kaggle.json

## **Search for and download the dataset on Kaggle**

In [None]:
# List the datasets published by the Kaggle user 'adityajn105'.
! kaggle datasets list --user 'adityajn105'

ref                     title                         size  lastUpdated          downloadCount  voteCount  usabilityRating  
----------------------  ----------------------------  ----  -------------------  -------------  ---------  ---------------  
adityajn105/flickr8k    Flickr 8k Dataset              1GB  2020-04-27 07:27:19          28568        208  0.75             
adityajn105/flickr30k   Flick 30k Dataset              8GB  2020-04-27 08:07:18           1043          8  0.625            
adityajn105/glove6b50d  GLOVE 6B 50D Word Embeddings  68MB  2020-04-27 08:58:27            418          2  0.625            


In [None]:
# List the files contained in the 'adityajn105/flickr8k' dataset.
! kaggle datasets files 'adityajn105/flickr8k'

name          size  creationDate         
------------  ----  -------------------  
captions.txt   3MB  2020-04-27 07:27:19  


In [None]:
# Download the 'adityajn105/flickr8k' dataset archive and unzip it
# (--unzip); the log below shows it landing in /content.
! kaggle datasets download 'adityajn105/flickr8k' --unzip

Downloading flickr8k.zip to /content
 99% 1.02G/1.04G [00:06<00:00, 200MB/s]
100% 1.04G/1.04G [00:06<00:00, 171MB/s]


In [None]:
# Create the folder that will hold the dataset.
# -p: succeed silently if it already exists, so the cell can be re-run.
! mkdir -p '/content/flickr8k'

In [None]:
# Move the captions file into the dataset folder.
! mv '/content/captions.txt' '/content/flickr8k'

In [None]:
# Move the Images directory into the dataset folder.
! mv '/content/Images' '/content/flickr8k'

# **Define the base directory**

In [None]:
# Dataset root: the folder into which captions.txt and Images were moved.
BASE_DIR = '/content/flickr8k'

# **Prepare Captions Data for Generator**

## **Load caption data**

In [None]:
# Read the caption file located under BASE_DIR. Each kept line has the form
# "<image file name>,<caption>\n".
captions_path = os.path.join(BASE_DIR, 'captions.txt')

# An explicit encoding keeps the read independent of the platform default.
with open(captions_path, 'r', encoding='utf-8') as f_caption:
  # Discard the first line (presumably the CSV header); the default argument
  # avoids a StopIteration if the file is empty.
  next(f_caption, None)
  captions_data = f_caption.readlines()

# Preview only the first few lines instead of dumping the whole list.
captions_data[:10]

['1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set of stairs in an entry way .\n',
 '1000268201_693b08cb0e.jpg,A girl going into a wooden building .\n',
 '1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .\n',
 '1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playhouse .\n',
 '1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a wooden cabin .\n',
 '1001773457_577c3a7d70.jpg,A black dog and a spotted dog are fighting\n',
 '1001773457_577c3a7d70.jpg,A black dog and a tri-colored dog playing with each other on the road .\n',
 '1001773457_577c3a7d70.jpg,A black dog and a white dog with brown spots are staring at each other in the street .\n',
 '1001773457_577c3a7d70.jpg,Two dogs of different breeds looking at each other on the road .\n',
 '1001773457_577c3a7d70.jpg,Two dogs on pavement moving toward each other .\n',
 '1002674143_1b742ab4b8.jpg,A little girl covered in paint sits in front of a painted rainbow 

In [None]:
# Number of caption lines read (the skipped first line is not counted).
print("Length caption data:", len(captions_data))

Length caption data: 40455


## **Split each line and group the captions by image ID**

In [None]:
# Group the captions by image: {image_id: [caption, caption, ...]}
mapping = {}

for raw_line in tqdm(captions_data):

  # Everything before the first comma is the image file name; the rest is the
  # caption text, which may itself contain commas.
  file_name, comma, rest = raw_line.partition(',')

  # A line without any comma carries no caption: skip it.
  if not comma:
    continue

  # Image ID = the file name up to its first dot (extension removed).
  image_id = file_name.partition('.')[0]

  # Commas inside the caption are replaced by single spaces.
  caption = rest.replace(',', ' ')

  # Append to this image's caption list, creating the list on first use.
  mapping.setdefault(image_id, []).append(caption)

  0%|          | 0/40455 [00:00<?, ?it/s]

In [None]:
# Number of distinct image IDs that received at least one caption.
len(mapping)

8091

## **Preprocess Text Data**

In [None]:
import re

def clean(mapping):
  """Normalize every caption in ``mapping`` in place.

  Each caption is lowercased, stripped of every character that is not an
  ASCII letter or a space, split into words, filtered to words longer than
  one character, and wrapped as ``'startseq <words> endseq'``.

  The function is idempotent: a caption that already carries the
  ``startseq``/``endseq`` tags is not wrapped a second time, so re-running
  the notebook cell leaves the mapping unchanged.

  Args:
    mapping: dict mapping an image ID to a list of caption strings. The
      lists are modified in place.

  Returns:
    None.
  """
  for captions in mapping.values():
    for i, caption in enumerate(captions):
      # Lowercase, then drop digits, punctuation, newlines, etc.
      caption = re.sub('[^A-Za-z ]', '', caption.lower())

      # split() collapses repeated spaces; one-character words are dropped.
      words = [word for word in caption.split() if len(word) > 1]

      # Already tagged by a previous run: remove the tags before re-adding
      # them so the caption does not end up with doubled tags.
      if len(words) >= 2 and words[0] == 'startseq' and words[-1] == 'endseq':
        words = words[1:-1]

      # Joining a list avoids a double space when no words remain.
      captions[i] = " ".join(['startseq'] + words + ['endseq'])

In [None]:
# Captions of one sample image before text preprocessing
mapping['1000268201_693b08cb0e']

['A child in a pink dress is climbing up a set of stairs in an entry way .\n',
 'A girl going into a wooden building .\n',
 'A little girl climbing into a wooden playhouse .\n',
 'A little girl climbing the stairs to her playhouse .\n',
 'A little girl in a pink dress going into a wooden cabin .\n']

In [None]:
# Preprocess all captions; clean() modifies the lists inside `mapping` in place
clean(mapping)

In [None]:
# Captions of the same sample image after text preprocessing
mapping['1000268201_693b08cb0e']

['startseq child in pink dress is climbing up set of stairs in an entry way endseq',
 'startseq girl going into wooden building endseq',
 'startseq little girl climbing into wooden playhouse endseq',
 'startseq little girl climbing the stairs to her playhouse endseq',
 'startseq little girl in pink dress going into wooden cabin endseq']

### **Store the image-ID-to-captions mapping ("Caption with imgID")**

In [None]:
# Create the destination folder on Google Drive.
# -p: succeed silently if it already exists (a plain mkdir prints
# "File exists" on re-run) and create missing parent folders.
! mkdir -p "/content/gdrive/MyDrive/Đồ án/Đồ án Thị giác máy tính nâng cao/Final/Caption with imgID"

mkdir: cannot create directory ‘/content/gdrive/MyDrive/Đồ án/Đồ án Thị giác máy tính nâng cao/Final/Caption with imgID’: File exists


In [None]:
import numpy as np
import pickle

# Store the image-ID -> captions mapping as both a .pkl and a .npy file in the
# current working directory, reading each file back as a round-trip check.

# --- .pkl ---
# Save
with open('caption_with_imgID_flickr.pkl', 'wb') as f:
    pickle.dump(mapping, f)
# Load (only unpickle files you created yourself: unpickling untrusted data
# can execute arbitrary code)
with open('caption_with_imgID_flickr.pkl', 'rb') as f:
    loaded_mapping = pickle.load(f)
assert loaded_mapping == mapping, "pickle round-trip changed the mapping"
print(type(loaded_mapping), len(loaded_mapping))

# --- .npy ---
# Save
np.save('caption_with_imgID_flickr.npy', mapping)
# Load: allow_pickle must be the boolean True (the string 'TRUE' only worked
# because any non-empty string is truthy); .item() extracts the stored dict.
loaded_mapping = np.load('caption_with_imgID_flickr.npy', allow_pickle=True).item()
assert loaded_mapping == mapping, "npy round-trip changed the mapping"
print(type(loaded_mapping), len(loaded_mapping))

<class 'dict'> 8091
<class 'dict'> 8091


In [None]:
# Copy both saved files to the "Caption with imgID" folder on Google Drive.
# NOTE(review): the files are written with relative paths, so this assumes the
# notebook's working directory is /content — confirm.
! cp '/content/caption_with_imgID_flickr.pkl' '/content/gdrive/MyDrive/Đồ án/Đồ án Thị giác máy tính nâng cao/Final/Caption with imgID'
! cp '/content/caption_with_imgID_flickr.npy' '/content/gdrive/MyDrive/Đồ án/Đồ án Thị giác máy tính nâng cao/Final/Caption with imgID'

### **Make backup/recovery file**

In [None]:
# Keep a second copy of both files in a "Backup files" subfolder.
# -p: succeed silently if the folder already exists (a plain mkdir prints
# "File exists" on re-run) and create missing parent folders.
! mkdir -p '/content/gdrive/MyDrive/Đồ án/Đồ án Thị giác máy tính nâng cao/Final/Caption with imgID/Backup files'
! cp '/content/caption_with_imgID_flickr.pkl' '/content/gdrive/MyDrive/Đồ án/Đồ án Thị giác máy tính nâng cao/Final/Caption with imgID/Backup files'
! cp '/content/caption_with_imgID_flickr.npy' '/content/gdrive/MyDrive/Đồ án/Đồ án Thị giác máy tính nâng cao/Final/Caption with imgID/Backup files'

mkdir: cannot create directory ‘/content/gdrive/MyDrive/Đồ án/Đồ án Thị giác máy tính nâng cao/Final/Caption with imgID/Backup files’: File exists
