# 🦄 SCB10X - Data Preparation Notebook
- Translation script for translting **LLaVA** dataset via https://llava-vl.github.io/
- written by Senmee 🍜 @ 18 Nov, 2023

In [1]:
import json
import pandas as pd
import re
import os
from glob import glob
from tqdm import tqdm

## 1. Get data from json
**LLaVA** json has format like
```
[
  {
    'id': '000000033471',
    'image': 'coco/train2017/000000033471.jpg',
    'conversations': [
      {'from': 'human', 'value': '<image>\nWhat are the colors of the bus in the image?'},
      {'from': 'gpt',   'value': 'The bus in the image is white and red.'},
      ...]
  },
  {...},
]
```
This command session will convert them into dataframe and save as `.xlsx` for translating via google translate web UI

🚨 Note that we need to collect \<image\> in conversation for setting position of image tag in each sentence

In [3]:
# Clean function
def clean_text(text):
  text = re.sub(r'\s+', ' ', text)        # Replace multiple white space with single white space
  return text

### Convert json to Image

In [4]:
# Export data
RAW_DIR = './raw_data'                               
TARGET_DIR = './data'

# Get all filename
file_list = glob(os.path.join(RAW_DIR, '*'))
len(file_list), file_list

(7,
 ['/content/gdrive/MyDrive/SCB10X/raw_dataset/complex_reasoning_77k.json',
  '/content/gdrive/MyDrive/SCB10X/raw_dataset/conversation_58k.json',
  '/content/gdrive/MyDrive/SCB10X/raw_dataset/detail_23k.json',
  '/content/gdrive/MyDrive/SCB10X/raw_dataset/llava_instruct_80k.json',
  '/content/gdrive/MyDrive/SCB10X/raw_dataset/llava_instruct_150k.json',
  '/content/gdrive/MyDrive/SCB10X/raw_dataset/llava_v1_5_mix665k.json',
  '/content/gdrive/MyDrive/SCB10X/raw_dataset/chat.json'])

In [8]:
# Get all data
def get_data(json_path):
  # Read json
  with open(json_path, 'r') as file:
    data = json.load(file)
  name = json_path.split('/')[-1].split('.')[0]
  # Collect data
  datas = []
  for row in tqdm(data):
    id = row['id']
    if 'image' in row.keys():
      image = row['image']
      count = 0
      for value in row['conversations']:
        data_dict = {
            'id': id,
            'image': image,
            'from': value['from'],
            'value': clean_text(value['value']),
            'order': count
        }
        datas.append(data_dict)
        count += 1

  # Convert & Save dataframe
  df = pd.DataFrame(datas)
  filename = name
  df.to_csv(os.path.join(TARGET_DIR, filename+'.csv'), index=False)
  return df

In [9]:
for path in file_list:
  df = get_data(path)

100%|██████████| 665298/665298 [00:45<00:00, 14522.83it/s]


In [11]:
print(len(df))
df

6712322


Unnamed: 0,id,image,from,value,order
0,000000033471,coco/train2017/000000033471.jpg,human,<image> What are the colors of the bus in the ...,0
1,000000033471,coco/train2017/000000033471.jpg,gpt,The bus in the image is white and red.,1
2,000000033471,coco/train2017/000000033471.jpg,human,What feature can be seen on the back of the bus?,2
3,000000033471,coco/train2017/000000033471.jpg,gpt,The back of the bus features an advertisement.,3
4,000000033471,coco/train2017/000000033471.jpg,human,Is the bus driving down the street or pulled o...,4
...,...,...,...,...,...
6712317,07c44727285d8060,textvqa/train_images/07c44727285d8060.jpg,gpt,The pages of opened book is arranged ina heart...,1
6712318,bd75fd0264fdaf5b,textvqa/train_images/bd75fd0264fdaf5b.jpg,human,<image> Provide a one-sentence caption for the...,0
6712319,bd75fd0264fdaf5b,textvqa/train_images/bd75fd0264fdaf5b.jpg,gpt,A sketch of a man in behind the counter in a b...,1
6712320,4508de4f680374a7,textvqa/train_images/4508de4f680374a7.jpg,human,<image> Provide a one-sentence caption for the...,0


## Split dataframe
- Separate entire dataframe into 10k rows per file in `.xlsx` format

In [None]:
import os

DATA_DIR = './data/'
SPLIT_DIR = './split_data/'
RESULT_DIR = './result/'
TRANS_DIR = './trans_data/'

## Split dataframe
- Separate them into 10k rows per file in `.xlsx` format

In [None]:
import pandas as pd
from glob import glob
from tqdm import tqdm
from joblib import Parallel, delayed
import os

# Get all file location
file_list = glob(os.path.join(DATA_DIR, '*'))

def split_file(path, sub_length=10000):
    name = path.split('/')[-1].split('.')[0]

    df = pd.read_csv(path)
    df['value'] = df['value'].astype('str')
    df['value'] = df['value'].apply(lambda text: text.encode('unicode_escape').decode('utf-8')) # Remove unreadable character
    length = len(df)
    num_files = length // sub_length
    for i in tqdm(range(num_files)):
        sub_df = pd.DataFrame(df.loc[i*sub_length:(i+1)*sub_length-1, 'value'])
        sub_df.to_excel(os.path.join(SPLIT_DIR, f'{name}-{i}.xlsx'), index=False)
    if length%sub_length != 0:
        sub_df = pd.DataFrame(df.loc[num_files*sub_length:, 'value'])
        sub_df.to_excel(os.path.join(SPLIT_DIR, f'{name}-{num_files}.xlsx'), index=False)

In [None]:
for path in tqdm(file_list):
    split_file(path)

100%|██████████| 5/5 [00:55<00:00, 11.16s/it]


In [None]:
# Check number of separated files
len(glob(os.path.join(SPLIT_DIR, '*')))

191

## 3. Translate data
- Upload each `.xlsx` files into google translate web UI https://translate.google.co.th/?hl=th&sl=auto&tl=th&op=docs

## 4. Combine data
Combine all splitted data and join them with the raw dataframe

In [None]:
import re

def sort_by_number(text):
    number = text.split('-')[-1].split('.')[0]
    return int(number)

def combine_df(path):
    name = path.split('/')[-1].split('.')[0]
    trans_list = glob(os.path.join(TRANS_DIR, name+'*'))
    trans_list = sorted(trans_list, key=sort_by_number)
    raw_df = pd.read_csv(path)
    trans_data = []
    for i, trans_path in enumerate(tqdm(trans_list)):
        df = pd.read_excel(trans_path)
        trans_data.append(df)

    trans_df = pd.concat(trans_data, ignore_index=False).reset_index(drop=True)
    trans_df.rename(columns={'ค่า': 'translate_value'}, inplace=True)

    combine_df = pd.concat([raw_df, trans_df], axis=1)
    print(f'Finish {name}: {len(combine_df)} rows')
    combine_df.to_csv(os.path.join(RESULT_DIR, name+'.csv'), index=False)

In [None]:
file_list = glob(os.path.join(DATA_DIR, '*'))
for path in tqdm(file_list):
    combine_df(path)