# 🦄 SCB10X - Data Preparation Notebook

In [1]:
import json
import pandas as pd
import re
import os
from glob import glob
from tqdm import tqdm

## Connect Google Drive


In [2]:
# Colab-only step: mount Google Drive at /content/gdrive so that the dataset
# folders referenced below (under /content/gdrive/MyDrive) can be read and written.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Data Translation
- Use the `LLaVA` datasets as the source for preparing the Thai dataset

In [3]:
# Clean function
def clean_text(text):
  """Collapse every run of whitespace (spaces, tabs, newlines) into one space.

  Leading and trailing whitespace is reduced to a single space rather than
  stripped, and any '<image>' tag inside the text is left untouched.
  """
  whitespace_run = re.compile(r'\s+')
  return whitespace_run.sub(' ', text)

In [4]:
# Dataset locations on the mounted Drive
RAW_DIR = '/content/gdrive/MyDrive/SCB10X/raw_dataset' # 🚨 Change your path here
TARGET_DIR = '/content/gdrive/MyDrive/SCB10X/dataset'   # 🚨 Change your path here

# Collect the raw JSON files only: every listed path is later parsed with
# json.load, so stray non-JSON files or sub-folders must not end up in the list.
# NOTE: glob returns paths in arbitrary order, so list positions are not stable.
file_list = glob(os.path.join(RAW_DIR, '*.json'))
len(file_list), file_list

(7,
 ['/content/gdrive/MyDrive/SCB10X/raw_dataset/complex_reasoning_77k.json',
  '/content/gdrive/MyDrive/SCB10X/raw_dataset/conversation_58k.json',
  '/content/gdrive/MyDrive/SCB10X/raw_dataset/detail_23k.json',
  '/content/gdrive/MyDrive/SCB10X/raw_dataset/llava_instruct_80k.json',
  '/content/gdrive/MyDrive/SCB10X/raw_dataset/llava_instruct_150k.json',
  '/content/gdrive/MyDrive/SCB10X/raw_dataset/llava_v1_5_mix665k.json',
  '/content/gdrive/MyDrive/SCB10X/raw_dataset/chat.json'])

In [5]:
# Pick the file to process by name rather than by list position: glob returns
# paths in arbitrary order, and a positional pick such as file_list[-2] raises
# IndexError when this cell is re-run on the already-filtered one-element list.
SELECTED_FILE = 'llava_v1_5_mix665k.json'
file_list = [p for p in file_list if os.path.basename(p) == SELECTED_FILE]
assert file_list, f'{SELECTED_FILE} not found in {RAW_DIR}'
file_list

['/content/gdrive/MyDrive/SCB10X/raw_dataset/llava_v1_5_mix665k.json']

In [8]:
# Get all data
def get_data(json_path, target_dir=None):
  """Flatten one conversation JSON file into a per-turn table and save it as CSV.

  Every record that has an 'image' key contributes one output row per entry of
  its 'conversations' list; records without an 'image' key are skipped.

  Args:
    json_path: Path to a JSON file holding a list of records. Each record is
      read for 'id', 'conversations' (a list of dicts with 'from' and 'value')
      and, when present, 'image'.
    target_dir: Directory that receives the CSV. Defaults to the notebook-level
      TARGET_DIR. The directory is created when it does not exist yet.

  Returns:
    pandas.DataFrame with the columns id, image, from, value, order, where
    `order` is the 0-based position of the turn inside its conversation and
    `value` has been passed through clean_text. The same frame is written to
    <target_dir>/<file name without its last extension>.csv (no index column).
  """
  if target_dir is None:
    target_dir = TARGET_DIR

  # Read as UTF-8 explicitly instead of relying on the platform default encoding.
  with open(json_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

  # Strip only the final extension so names containing extra dots
  # (e.g. 'set_v1.5.json') keep their full stem and cannot collide.
  name = os.path.splitext(os.path.basename(json_path))[0]

  columns = ['id', 'image', 'from', 'value', 'order']

  # Collect one dict per conversation turn
  datas = []
  for row in tqdm(data):
    if 'image' not in row:
      continue  # record without an image: not exported
    row_id = row['id']
    image = row['image']
    for order, turn in enumerate(row['conversations']):
      datas.append({
          'id': row_id,
          'image': image,
          'from': turn['from'],
          'value': clean_text(turn['value']),
          'order': order
      })

  # Passing the column list keeps the header even when no record qualified.
  df = pd.DataFrame(datas, columns=columns)

  # Save next to the other exports, creating the folder on first use
  os.makedirs(target_dir, exist_ok=True)
  df.to_csv(os.path.join(target_dir, name + '.csv'), index=False)
  return df

In [9]:
# Convert every selected raw file to CSV via get_data (which also writes the file).
# `df` is rebound on each iteration, so after the loop it holds only the frame
# of the last path in file_list.
for path in file_list:
  df = get_data(path)

100%|██████████| 665298/665298 [00:45<00:00, 14522.83it/s]


In [11]:
# Row count of the flattened frame, followed by its (truncated) rich preview
print(df.shape[0])
df

6712322


Unnamed: 0,id,image,from,value,order
0,000000033471,coco/train2017/000000033471.jpg,human,<image> What are the colors of the bus in the ...,0
1,000000033471,coco/train2017/000000033471.jpg,gpt,The bus in the image is white and red.,1
2,000000033471,coco/train2017/000000033471.jpg,human,What feature can be seen on the back of the bus?,2
3,000000033471,coco/train2017/000000033471.jpg,gpt,The back of the bus features an advertisement.,3
4,000000033471,coco/train2017/000000033471.jpg,human,Is the bus driving down the street or pulled o...,4
...,...,...,...,...,...
6712317,07c44727285d8060,textvqa/train_images/07c44727285d8060.jpg,gpt,The pages of opened book is arranged ina heart...,1
6712318,bd75fd0264fdaf5b,textvqa/train_images/bd75fd0264fdaf5b.jpg,human,<image> Provide a one-sentence caption for the...,0
6712319,bd75fd0264fdaf5b,textvqa/train_images/bd75fd0264fdaf5b.jpg,gpt,A sketch of a man in behind the counter in a b...,1
6712320,4508de4f680374a7,textvqa/train_images/4508de4f680374a7.jpg,human,<image> Provide a one-sentence caption for the...,0


**Next step**: Use Google Sheets to translate the exported CSV files for free.

In [None]:
# Inspect up to 10 records that have no 'image' key (the ones get_data skips).
# `data` only exists as a local inside get_data, so reload the raw file here to
# keep this cell runnable on a fresh kernel.
with open(file_list[0], 'r', encoding='utf-8') as file:
  data = json.load(file)

count = 0
for row in data:
  # Explicit key test instead of a bare `except:`, which would hide unrelated errors
  if 'image' not in row:
    print(row)
    count += 1
    if count == 10:
      break

{'id': 'i6IyJda_0', 'model': '', 'conversations': [{'from': 'human', 'value': 'How to tell if a customer segment is well segmented? In 3 bullet points.'}, {'from': 'gpt', 'value': '1. Homogeneity: The segment should consist of customers who share similar characteristics and behaviors.\n2. Distinctiveness: The segment should be different from other segments in terms of their characteristics and behaviors.\n3. Stability: The segment should remain relatively stable over time and not change drastically. The characteristics and behaviors of customers within the segment should not change significantly.'}]}
{'id': 'A5AbcES_0', 'model': '', 'conversations': [{'from': 'human', 'value': 'In Java, I want to replace string like "This is a new {object} at {place}" with a Map, {object: "student", "point 3, 4"}, and get a result "This is a new student at point 3, 4". How can I do?'}, {'from': 'gpt', 'value': 'You can use the `String.format()` method in Java to replace placeholders in a string with va