In [1]:
import numpy as np
import requests
import json
from tqdm import tqdm
from io import BytesIO
from PIL import Image
import os



In [2]:
save_dir = 'ChartFC/'

# Create the directory if it doesn't exist
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

In [3]:
with open('claim_explanation_verification_pre_tasksets.json') as f:
    data = json.load(f)

In [4]:
files = set(os.listdir(save_dir))

In [5]:
len(files)

1626

In [6]:
success = 0
fail = 0
failed_urls = set()
for example in tqdm(data):
    url = example["chart_img"]
    filename= os.path.basename(url)
    if filename not in files:
        response = requests.get(url)
        # Extract the filename from the URL
        filename = os.path.join(save_dir, filename)
        if response.status_code != 200:
            failed_urls.add(url)
            fail += 1
        else:
            success += 1
            with open(filename, 'wb') as f:
                f.write(response.content)
    else:
        success += 1

100%|██████████| 9300/9300 [02:31<00:00, 61.27it/s] 


In [7]:
print("Success: ", success)
print("Fail: ", fail)
print("Percentage Success: ", success/(success+fail))

Success:  9300
Fail:  0
Percentage Success:  1.0


In [73]:
images = os.listdir(save_dir)

In [74]:
np.random.seed(40)
test_charts = np.random.choice(images, int(0.045 * len(images)), replace=False)

In [75]:
remaining_images = set(files) - set(test_charts)

In [76]:
len(data)

9300

In [77]:
remaining_claims = [item for item in data if os.path.basename(item["chart_img"]) in remaining_images]

In [78]:
second_alpha_claim = [item for item in data if os.path.basename(item["chart_img"]) in test_charts]

In [79]:
len(remaining_claims)

8016

In [80]:
len(second_alpha_claim)

422

In [81]:
remaining_claims = np.array(remaining_claims)

In [82]:
np.random.seed(42)

# Shuffle the indices of the data
indices = np.random.permutation(len(remaining_claims))

# Calculate the number of samples in the training, validation, and testing sets
num_train = int(0.8 * len(remaining_claims))
num_val = int(0.1 * len(remaining_claims))

# Split the indices into training, validation, and testing sets
train_indices = indices[:num_train]
val_indices = indices[num_train:num_train+num_val]
test_indices = indices[num_train+num_val:]

train_data = remaining_claims[train_indices]
val_data = remaining_claims[val_indices]
test_data = remaining_claims[test_indices]

In [83]:
len(train_data)

6412

In [84]:
len(val_data)

801

In [85]:
len(test_data)

803

In [87]:
os.mkdir("ChartFC/train")
os.mkdir("ChartFC/val")
os.mkdir("ChartFC/test")

In [89]:
with open("ChartFC/train/train.json", "w") as f:
    json.dump(train_data.tolist(), f)
with open("ChartFC/val/val.json", "w") as f:
    json.dump(val_data.tolist(), f)
with open("ChartFC/test/test.json", "w") as f:
    json.dump(test_data.tolist(), f)
with open("ChartFC/test/test_unseen.json", "w") as f:
    json.dump(second_alpha_claim, f)