In [1]:
import os
import pandas as pd
import json
import shutil


In [2]:
# Load dataset N24News JSON file
data_path = "N24News/news/nytimes_train.json"
with open(data_path, "r", encoding="utf-8") as f:
    data = json.load(f)
df_test = pd.DataFrame(data)

# Build image_path (.jpg in N24News/imgs)
df_test["image_path"] = df_test["image_id"].apply(
    lambda x: os.path.join("N24News", "imgs", f"{x}.jpg")
)
df_test["image_exists"] = df_test["image_path"].apply(os.path.exists) # check if the image path exits
print("Total test records:", len(df_test))
print("Records with valid images:", df_test["image_exists"].sum())
# Filter for valid records only
df_valid = df_test[df_test["image_exists"]].copy()
# Check if we have enough records for sampling (20 sample datasets * 200 records = 4000 records)
enough_for_disjoint = len(df_valid) >= 4000

Total test records: 48988
Records with valid images: 48988


In [3]:
import os
import json
import shutil
import pandas as pd

# assume df_valid exists and has columns "image_exists", "section", "image_path"
valid_df = df_valid[df_valid["image_exists"]]

# Section → numeric mapping
section_mapping = {
    "Health": 1, "Science": 2, "Television": 3, "Travel": 4, "Movies": 5,
    "Dance": 6, "Real Estate": 7, "Economy": 8, "Sports": 9, "Theater": 10,
    "Opinion": 11, "Music": 12, "Books": 13, "Art & Design": 14, "Style": 15,
    "Media": 16, "Food": 17, "Well": 18, "Fashion & Style": 19, "Technology": 20,
    "Your Money": 21, "Education": 22, "Automobiles": 23, "Global Business": 24
}

base_dir = "Data_Train"
os.makedirs(base_dir, exist_ok=True)

for i in range(1, 21):
    # Sample
    sample_df = valid_df.sample(n=200, random_state=1000 + i).copy()
    sample_df["section_numeric"] = sample_df["section"].map(section_mapping)

    # Paths under Data_Train/
    json_path = os.path.join(base_dir, f"nytimes_{i}.json")
    csv_path  = os.path.join(base_dir, f"nytimes_{i}.csv")
    imgs_dir  = os.path.join(base_dir, f"imgs_{i}")
    os.makedirs(imgs_dir, exist_ok=True)

    # Save JSON
    records = sample_df.drop(columns=["image_exists"]).to_dict(orient="records")
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(records, f, ensure_ascii=False, indent=2)

    # Save CSV
    sample_df.drop(columns=["image_exists"]).to_csv(csv_path, index=False)

    # Copy images
    for _, row in sample_df.iterrows():
        src = row["image_path"]
        if os.path.exists(src):
            dst = os.path.join(imgs_dir, os.path.basename(src))
            shutil.copyfile(src, dst)

    print(f"Saved sample {i} → {json_path}, {csv_path}, {imgs_dir}")


Saved sample 1 → Data_Train/nytimes_1.json, Data_Train/nytimes_1.csv, Data_Train/imgs_1
Saved sample 2 → Data_Train/nytimes_2.json, Data_Train/nytimes_2.csv, Data_Train/imgs_2
Saved sample 3 → Data_Train/nytimes_3.json, Data_Train/nytimes_3.csv, Data_Train/imgs_3
Saved sample 4 → Data_Train/nytimes_4.json, Data_Train/nytimes_4.csv, Data_Train/imgs_4
Saved sample 5 → Data_Train/nytimes_5.json, Data_Train/nytimes_5.csv, Data_Train/imgs_5
Saved sample 6 → Data_Train/nytimes_6.json, Data_Train/nytimes_6.csv, Data_Train/imgs_6
Saved sample 7 → Data_Train/nytimes_7.json, Data_Train/nytimes_7.csv, Data_Train/imgs_7
Saved sample 8 → Data_Train/nytimes_8.json, Data_Train/nytimes_8.csv, Data_Train/imgs_8
Saved sample 9 → Data_Train/nytimes_9.json, Data_Train/nytimes_9.csv, Data_Train/imgs_9
Saved sample 10 → Data_Train/nytimes_10.json, Data_Train/nytimes_10.csv, Data_Train/imgs_10
Saved sample 11 → Data_Train/nytimes_11.json, Data_Train/nytimes_11.csv, Data_Train/imgs_11
Saved sample 12 → Data_T

In [4]:
# 1) Load the full dataset and build image paths
data_path = "N24News/news/nytimes_train.json"
with open(data_path, "r", encoding="utf-8") as f:
    data = json.load(f)
df = pd.DataFrame(data)

df["image_path"] = df["image_id"].apply(
    lambda x: os.path.join("N24News", "imgs", f"{x}.jpg")
)
df["image_exists"] = df["image_path"].apply(os.path.exists)

print("Total records:", len(df))
print("Records with valid images:", df["image_exists"].sum())

# 2) Filter to only valid‐image rows
df_valid = df[df["image_exists"]].copy()

# 3) Sample 4000 rows for testing
sample_df = df_valid.sample(n=4000, random_state=42).copy()

# 4) Map section names → numeric codes
sample_df["section_numeric"] = sample_df["section"].map(section_mapping)

# 5) Prepare output directory
base_test_dir = "Data_test"
os.makedirs(base_test_dir, exist_ok=True)

# 6) Save sampled JSON
out_json = os.path.join(base_test_dir, "nytimes_test_sample4000.json")
with open(out_json, "w", encoding="utf-8") as f:
    records = sample_df.drop(columns=["image_exists"], errors="ignore").to_dict(orient="records")
    json.dump(records, f, ensure_ascii=False, indent=2)

# 7) Save sampled CSV
out_csv = os.path.join(base_test_dir, "nytimes_test_sample4000.csv")
sample_df.drop(columns=["image_exists"], errors="ignore").to_csv(out_csv, index=False)

# 8) Copy sampled images
imgs_test_dir = os.path.join(base_test_dir, "imgs_test_sample4000")
os.makedirs(imgs_test_dir, exist_ok=True)
for _, row in sample_df.iterrows():
    src = row["image_path"]
    if os.path.exists(src):
        dst = os.path.join(imgs_test_dir, os.path.basename(src))
        shutil.copyfile(src, dst)

print(f"Saved 4000-sample →\n  JSON: {out_json}\n  CSV:  {out_csv}\n  IMGS: {imgs_test_dir}")


Total records: 48988
Records with valid images: 48988
Saved 4000-sample →
  JSON: Data_test/nytimes_test_sample4000.json
  CSV:  Data_test/nytimes_test_sample4000.csv
  IMGS: Data_test/imgs_test_sample4000


In [7]:
import os
import shutil

BASE_PATH = "Data_train"
IMG_DIR_TEMPLATE = "imgs_{}"
MERGED_IMG_DIR = os.path.join(BASE_PATH, "imgs_all")

# 1) Create the merged folder if it doesn't exist
os.makedirs(MERGED_IMG_DIR, exist_ok=True)

# 2) Copy all images from imgs_1 … imgs_20 into imgs_all
for i in range(1, 21):
    src_dir = os.path.join(BASE_PATH, IMG_DIR_TEMPLATE.format(i))
    if not os.path.isdir(src_dir):
        continue
    for fname in os.listdir(src_dir):
        src_path = os.path.join(src_dir, fname)
        dst_path = os.path.join(MERGED_IMG_DIR, fname)
        # if you want to avoid overwriting same-named files, you could add a prefix:
        # dst_path = os.path.join(MERGED_IMG_DIR, f"round{i}_{fname}")
        shutil.copy2(src_path, dst_path)

print(f"All images copied into {MERGED_IMG_DIR}")


All images copied into Data_train/imgs_all
