In [1]:
import numpy as np

# Path to the .npz file
npz_file_path = 'C:/Users/ojus/Pictures/icr2025/papers100M-bin/raw/data.npz'

# List the names of the arrays stored in the archive.
# No array is materialized here (only `data.files` is read), so NumPy's
# default allow_pickle=False is sufficient. Leaving pickle disabled avoids
# opting in to object deserialization on a file that has not been inspected yet.
with np.load(npz_file_path) as data:
    print("Arrays in the .npz file:")
    for array_name in data.files:
        print(f"Array name: '{array_name}'")


Arrays in the .npz file:
Array name: 'node_feat'
Array name: 'edge_index'
Array name: 'num_nodes_list'
Array name: 'num_edges_list'
Array name: 'node_year'


In [2]:
import os
import numpy as np

# Define the path to the .npz file.
# This is the same archive path that cell In [1] assigns; it is repeated here
# so the cell does not depend on that earlier assignment.
# NOTE(review): hard-coded absolute local path — adjust when running on another machine.
npz_file_path = 'C:/Users/ojus/Pictures/icr2025/papers100M-bin/raw/data.npz'

def calculate_size_in_mb(file_path):
    """Return the on-disk size of ``file_path`` in megabytes.

    The byte count reported by ``os.path.getsize`` is divided by
    ``1024 * 1024``, so the result is a float.
    """
    bytes_per_mb = 1024 * 1024
    size_in_bytes = os.path.getsize(file_path)
    return size_in_bytes / bytes_per_mb

# Function to process arrays
def process_array(array_name, file_path):
    """Load one array from an .npz archive and return its sum and element count.

    Parameters
    ----------
    array_name : str
        Name of the member array inside the archive.
    file_path : str
        Path to the .npz file.

    Returns
    -------
    tuple
        ``(total_sum, total_entries)``: ``np.sum`` over every element of the
        array, and ``array.size``.

    Notes
    -----
    The whole array is read into memory, so a ``MemoryError`` from NumPy can
    propagate to the caller for very large members.
    """
    # NOTE(review): allow_pickle=True lets np.load unpickle object arrays,
    # which can execute arbitrary code. Only use this on archives you trust.
    with np.load(file_path, allow_pickle=True) as data:
        array = data[array_name]
        total_entries = array.size

        # Force a 64-bit accumulator for bool/integer data. By default np.sum
        # accumulates narrow integers in the platform default integer, which
        # is 32-bit on Windows with NumPy < 2, so large sums could silently
        # wrap around. On platforms with a 64-bit default this is a no-op.
        kind = array.dtype.kind
        if kind in 'bi':
            accumulator = np.int64
        elif kind == 'u':
            accumulator = np.uint64
        else:
            accumulator = None  # floats, objects, ...: keep NumPy's default
        total_sum = np.sum(array, dtype=accumulator)

    return total_sum, total_entries

# Function to calculate stats for the remaining arrays
def calculate_stats_in_npz(file_path, skip_arrays=('node_feat',)):
    """Accumulate the sum and element count over the arrays of an .npz archive.

    Parameters
    ----------
    file_path : str
        Path to the .npz file.
    skip_arrays : collection of str, optional
        Names of member arrays that are not processed. Defaults to
        ``('node_feat',)``, the array the original code skipped "due to size".
        Pass a tuple/list/set of names (a bare string would be matched by
        substring).

    Returns
    -------
    tuple
        ``(total_sum, total_entries)`` summed over every array that was
        processed successfully.

    Notes
    -----
    A ``MemoryError`` raised while processing one array is reported and that
    array is left out of the totals; the remaining arrays are still processed.
    Previously the first such failure ended the whole loop, so every later
    array was silently missing from the totals.
    """
    total_sum = 0
    total_entries = 0

    # NOTE(review): allow_pickle=True permits unpickling object arrays, which
    # can execute arbitrary code. Only use this on archives you trust.
    with np.load(file_path, allow_pickle=True) as data:
        for array_name in data.files:
            if array_name in skip_arrays:
                print(f"Skipping array '{array_name}' due to size.")
                continue

            print(f"Processing array '{array_name}'...")
            try:
                array_sum, entries = process_array(array_name, file_path)
            except MemoryError:
                # Keep going: one oversized array must not discard the others.
                print("MemoryError: Could not process large array in .npz file.")
                continue

            total_sum += array_sum
            total_entries += entries

    return total_sum, total_entries

# File size on disk, in MB
size_in_mb = calculate_size_in_mb(npz_file_path)

# Combined sum / element count over every array that was processed
total_sum, total_entries = calculate_stats_in_npz(npz_file_path)

# Fall back to 0 when nothing was processed, to avoid dividing by zero
mean_value = total_sum / total_entries if total_entries > 0 else 0

print(f"Size of the .npz file: {size_in_mb:.2f} MB")
print(f"Mean value of entries in processable arrays: {mean_value:.2f}")


Skipping array 'node_feat' due to size.
Processing array 'edge_index'...
Processing array 'num_nodes_list'...
Processing array 'num_edges_list'...
Processing array 'node_year'...
Size of the .npz file: 57118.71 MB
Mean value of entries in processable arrays: 43484701.77


In [31]:
import pandas as pd
import os

# File paths: one CSV per split, all located in the same split/time directory
train_csv, test_csv, validate_csv = (
    f'C:/Users/ojus/Pictures/icr2025/papers100M-bin/split/time/{split_name}.csv'
    for split_name in ('train', 'test', 'valid')
)

# Function to calculate mean value from a single-column CSV
def calculate_mean(file_path, chunk_size=10**6):
    """Compute the mean of the first column of a header-less CSV, in chunks.

    Parameters
    ----------
    file_path : str
        Path to the CSV file. It is read with ``header=None``, so every row
        is treated as data and the first column is addressed as ``0``.
    chunk_size : int, optional
        Number of rows read per chunk (default ``10**6``), which bounds peak
        memory use.

    Returns
    -------
    number
        ``sum / count`` of the first column, where ``count`` is pandas'
        non-null count; ``0`` when the file holds no data.
    """
    total_sum = 0
    total_count = 0

    try:
        for chunk in pd.read_csv(file_path, header=None, chunksize=chunk_size):
            first_column = chunk[0]
            total_sum += first_column.sum()
            total_count += first_column.count()
    except pd.errors.EmptyDataError:
        # pandas raises this for a file with nothing to parse; report the same
        # "no data" result as the zero-count guard below instead of crashing.
        return 0

    return total_sum / total_count if total_count > 0 else 0

def get_file_size_mb(file_path):
    """Return the size of ``file_path`` in megabytes (bytes / 1024 / 1024 scale)."""
    size_in_bytes = os.path.getsize(file_path)
    bytes_per_mb = 1024 * 1024
    return size_in_bytes / bytes_per_mb

# For each split, compute the mean first and then the file size
# (same evaluation order as separate statements: tuples evaluate left to right)
train_mean, train_size_mb = calculate_mean(train_csv), get_file_size_mb(train_csv)
test_mean, test_size_mb = calculate_mean(test_csv), get_file_size_mb(test_csv)
validate_mean, validate_size_mb = calculate_mean(validate_csv), get_file_size_mb(validate_csv)

# One summary line per split
print(f"Train Mean: {train_mean}, Train File Size: {train_size_mb:.2f} MB")
print(f"Test Mean: {test_mean}, Test File Size: {test_size_mb:.2f} MB")
print(f"Validate Mean: {validate_mean}, Validate File Size: {validate_size_mb:.2f} MB")


Train Mean: 48483348.9312165, Train File Size: 10.48 MB
Test Mean: 106366818.33732702, Test File Size: 2.04 MB
Validate Mean: 91249065.8791522, Validate File Size: 1.12 MB
