# MS-COCO image captioning persian version translation

[Dataset page](https://www.kaggle.com/awsaf49/coco-2017-dataset)

In [None]:
# !wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip

--2021-12-13 06:53:52--  http://images.cocodataset.org/annotations/annotations_trainval2017.zip
Resolving images.cocodataset.org (images.cocodataset.org)... 52.216.25.156
Connecting to images.cocodataset.org (images.cocodataset.org)|52.216.25.156|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 252907541 (241M) [application/zip]
Saving to: ‘annotations_trainval2017.zip’


2021-12-13 06:54:09 (13.9 MB/s) - ‘annotations_trainval2017.zip’ saved [252907541/252907541]



In [None]:
# !unzip ./annotations_trainval2017 -d coco_dataset

Archive:  ./annotations_trainval2017.zip
  inflating: coco_dataset/annotations/instances_train2017.json  
  inflating: coco_dataset/annotations/instances_val2017.json  
  inflating: coco_dataset/annotations/captions_train2017.json  
  inflating: coco_dataset/annotations/captions_val2017.json  
  inflating: coco_dataset/annotations/person_keypoints_train2017.json  
  inflating: coco_dataset/annotations/person_keypoints_val2017.json  


In [4]:
!pip install -q sentence_transformers
!pip install -q mtranslate

In [5]:
import pandas as pd
import numpy as np
import json
import random

from sentence_transformers import SentenceTransformer
from mtranslate import translate
from tqdm import tqdm
import torch

# Data

### Prepare data to be translated

In [None]:
json_file = open("/content/coco_dataset/annotations/captions_train2017.json")
parsed_js = json.load(json_file)
json_file.close()

In [None]:
parsed_js.keys()

dict_keys(['info', 'licenses', 'images', 'annotations'])

In [None]:
caption_ds = pd.DataFrame(parsed_js['annotations'])
caption_ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 591753 entries, 0 to 591752
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   image_id  591753 non-null  int64 
 1   id        591753 non-null  int64 
 2   caption   591753 non-null  object
dtypes: int64(2), object(1)
memory usage: 13.5+ MB


In [None]:
caption_ds

Unnamed: 0,image_id,id,caption
0,203564,37,A bicycle replica with a clock as the front wh...
1,322141,49,A room with blue walls and a white sink and door.
2,16977,89,A car that seems to be parked illegally behind...
3,106140,98,A large passenger airplane flying through the ...
4,106140,101,There is a GOL plane taking off in a partly cl...
...,...,...,...
591748,133071,829655,a slice of bread is covered with a sour cream ...
591749,410182,829658,A long plate hold some fries with some sliders...
591750,180285,829665,Two women sit and pose with stuffed animals.
591751,133071,829693,White Plate with a lot of guacamole and an ext...


In [None]:
caption_ds['image_id'].value_counts()

52109     7
23247     7
336464    7
279818    6
476074    6
         ..
372193    5
368103    5
392683    5
333302    5
8196      5
Name: image_id, Length: 118287, dtype: int64

In [None]:
np.random.choice(np.array(caption_ds.loc[caption_ds['image_id'] == 52109].caption))

'Three teenagers play tennis on a court surrounded by greenery.'

In [None]:
caption_ds.tail()

Unnamed: 0,image_id,id,caption
591748,133071,829655,a slice of bread is covered with a sour cream ...
591749,410182,829658,A long plate hold some fries with some sliders...
591750,180285,829665,Two women sit and pose with stuffed animals.
591751,133071,829693,White Plate with a lot of guacamole and an ext...
591752,133071,829717,A dinner plate has a lemon wedge garnishment.


In [None]:
# find unique image_ids
unique_ids = np.unique(caption_ds['image_id'])
len(unique_ids)

118287

In [None]:
# Pick a random caption from each image's captions
def randomPick(image_id):
    """
    Given an image_id choose a caption among its annotated captions randomly.
    """
    correspond_captions = np.array(caption_ds.loc[caption_ds['image_id'] == image_id].caption) # Find image related captions.
    picked_cap = random.choice(correspond_captions) # choose a random caption
    return picked_cap

# Define a list that will be contained image_ids and there corresponding randomly choosed caption.
picked_ds = []
for image_id in unique_ids:
    caption = randomPick(image_id)
    picked_ds.append(dict({'image_id': image_id, 'caption': caption})) # append it to list

In [None]:
picked_ds

In [None]:
df = pd.DataFrame(picked_ds)
df

Unnamed: 0,image_id,caption
0,9,Closeup of bins of food that include broccoli ...
1,25,A giraffe eating food from the top of the tree.
2,30,A flower vase is sitting on a porch stand.
3,34,The zebra is eating grass in the sun.
4,36,"A woman posing for the camera, holding a pink,..."
...,...,...
118282,581906,"A damaged, leather suit case sitting on a dirt..."
118283,581909,An old boat sits on a trailer hitch.
118284,581913,A group of donuts sitting in a box.
118285,581921,A man riding an orange snow board jumping off ...


In [None]:
df.to_csv("./coco_selected.cvs", index=False)

In [None]:
caption_ds.to_csv("/content/drive/MyDrive/Coco/coco_captions_original.csv", index=False)118

# Translation
Working on selected captions: 

* split them into some chunks, then translate chunks one by one.

In [6]:
class MultiLangSimilarity():
    def __init__(self, model_name, device='cpu'):
        self.model = SentenceTransformer(model_name, device=device)

    def __call__(self, text):
        return self.model.encode(text,  
                                convert_to_tensor=True,
                                normalize_embeddings=True)

    def score(self, a, b):
        a, b = self([a, b])
        return torch.dot(a, b).item()


class Translator():
    def __init__(self, model_name:str, min_score:float=.9, device:str='cpu'):
        self.min_score = min_score
        self.similar = MultiLangSimilarity(model_name=model_name, device=device)

    def __call__(self, sentences:list):
        outputs = []
        for i, sentence in tqdm(enumerate(sentences), total=len(sentences)):
            aug = translate(sentence, from_language='en', to_language='fa')
            score = self.similar.score(sentence, aug)
            if score >= self.min_score:
                outputs.append({'id': i, 'en': sentence, 'fa': aug, '%': score})
        return outputs

In [14]:
# Read selected csv
df = pd.read_csv('/home/kaen/Projects/Datasets/Coco/coco_selected.csv')
df

Unnamed: 0,image_id,caption
0,9,Closeup of bins of food that include broccoli ...
1,25,A giraffe eating food from the top of the tree.
2,30,A flower vase is sitting on a porch stand.
3,34,The zebra is eating grass in the sun.
4,36,"A woman posing for the camera, holding a pink,..."
...,...,...
118282,581906,"A damaged, leather suit case sitting on a dirt..."
118283,581909,An old boat sits on a trailer hitch.
118284,581913,A group of donuts sitting in a box.
118285,581921,A man riding an orange snow board jumping off ...


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118287 entries, 0 to 118286
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   image_id  118287 non-null  int64 
 1   caption   118287 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.8+ MB


In [9]:
# Split data into some chunks
chunks = np.array([df[x:x+5000] for x in range(0, len(df), 5000)])
chunks

  chunks = np.array([df[x:x+5000] for x in range(0, len(df), 5000)])


array([      image_id                                            caption
0            9  Closeup of bins of food that include broccoli ...
1           25    A giraffe eating food from the top of the tree.
2           30         A flower vase is sitting on a porch stand.
3           34              The zebra is eating grass in the sun.
4           36  A woman posing for the camera, holding a pink,...
...        ...                                                ...
4995     24591     Skier on slope near chair lift in alpine area.
4996     24600  A red double decker bus driving in front of a ...
4997     24601  A woman swings her hand out to hit a ball as p...
4998     24608  A parked motorcycle towing a four-wheeled cart...
4999     24609  A person sitting in a small, narrow boat fille...

[5000 rows x 2 columns],
             image_id                                            caption
5000     24621        A person riding a skateboard on the cement.
5001     24625  The woman is straigh

In [10]:
len(chunks)

24

In [15]:
# Append translated result to our dataframe
def appendToDataFrame(result, data_frame):
    for cap in result:
        data_frame.loc[cap['en'], 'fa_caption'] = cap['fa']
        print('Result appended to the dataset...')

def saveDataFrame(data_frame, write_to='/home/kaen/Projects/Datasets/Coco/coco_translated.csv'):
    data_frame.to_csv(write_to, index=False)
    print('Dataset modiifed...')  

In [16]:
if __name__ == '__main__':
  # loop through chunks
  for chunk in chunks:  
    captions = chunk.caption  # Retrieve captions from data chunk
    model_name = 'paraphrase-multilingual-mpnet-base-v2'
    translator = Translator(model_name=model_name, min_score=0.85, device='cpu')
    output = translator(sentences=captions)
    print(output)
    # Append the output to the dataframe
    appendToDataFrame(output, df)
    saveDataFrame(df)

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/723 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

ConnectionError: HTTPSConnectionPool(host='cdn-lfs.huggingface.co', port=443): Read timed out.