In [10]:
import os
import math
import requests
import subprocess

# Dataverse connection settings used by the upload cells in this notebook.
server_url = 'https://dataverse.harvard.edu'
# Read the API token from the environment so the secret is never saved inside
# the notebook; falls back to the original empty string when the variable is unset.
api_token = os.environ.get('DATAVERSE_API_TOKEN', '')
# Persistent identifier (DOI) of the target dataset.
dataset_id = 'doi:10.7910/DVN/Q2KIES'
# Local file to upload. Alternative values noted by the original author
# (presumably other uploads done with this notebook -- confirm):
#   fnc_transcripts_text_2025.tar.gz
#   ../data/foxnews-transcript-urls-2025.csv.gz
file_path = "../data/fnc_transcripts_html_2025.tar.gz"
# Endpoint the upload cells POST to; the dataset is selected through the
# `persistentId` query parameter rather than a numeric id in the path.
upload_url = f"{server_url}/api/datasets/:persistentId/add"

In [8]:
 # Query-string parameters sent with the upload request: the dataset's
 # persistent identifier and the API key.
 params = dict(persistentId=dataset_id, key=api_token)

In [9]:
# Upload `file_path` to the dataset in a single multipart POST.
# Relies on `file_path`, `dataset_id`, `server_url`, `upload_url` and `params`
# having been defined by earlier cells.
filename = os.path.basename(file_path)

# The context manager closes the file on every exit path, replacing the manual
# open()/finally/close() bookkeeping.
with open(file_path, 'rb') as file_obj:
    files = {
        'file': (filename, file_obj, 'application/x-gzip')
    }

    # Upload the file
    print(f"Uploading {filename} to dataset {dataset_id}...")
    try:
        response = requests.post(
            upload_url,
            params=params,
            files=files
        )

        # Check response
        if response.status_code == 200:
            result = response.json()
            data_file = result['data']['files'][0]['dataFile']
            file_id = data_file['id']
            print(f"Upload successful! File ID: {file_id}")
            # A missing or empty persistentId must not turn a successful upload
            # into a reported failure (the KeyError used to be caught by the
            # broad handler below); fall back to the file page URL instead.
            location = data_file.get('persistentId') or f"{server_url}/file.xhtml?fileId={file_id}"
            print(f"Access the file at: {location}")
        else:
            print(f"Upload failed with status code {response.status_code}")
            print(f"Response: {response.text}")
    except OverflowError as e:
        # Raised as "string longer than 2147483647 bytes" when the in-memory
        # multipart body is larger than a single TLS write accepts (~2 GiB).
        print(f"Error during upload: {str(e)}")
        print("The request body is too large to send in one piece; split the file into smaller parts and upload each part separately.")
    except Exception as e:
        # Best-effort cell: report any other failure instead of raising.
        print(f"Error during upload: {str(e)}")

Uploading fnc_transcripts_html_2025.tar.gz to dataset doi:10.7910/DVN/Q2KIES...
Error during upload: string longer than 2147483647 bytes


In [None]:
# Split `file_path` into 1 GiB parts and upload each part as its own file.
# Relies on `file_path`, `server_url`, `dataset_id` and `api_token` having been
# defined by earlier cells.
chunk_size = "1G"  # 1GB chunks (human-readable label; the byte count used below is chunk_bytes)
chunk_bytes = 1024**3  # size of every part except possibly the last
copy_buffer_bytes = 16 * 1024**2  # read/write granularity while cutting a part


def _write_part(src_path, part_path, offset, length, buffer_bytes=copy_buffer_bytes):
    """Copy up to `length` bytes of `src_path`, starting at `offset`, into `part_path`.

    Pure-Python replacement for the previous `dd bs=1G` call: it works on any
    platform, never holds more than `buffer_bytes` in memory, and raises
    OSError on failure instead of silently leaving a bad part file.

    Returns the number of bytes written (less than `length` only when the
    source ends first, i.e. for the final part).
    """
    written = 0
    with open(src_path, 'rb') as src, open(part_path, 'wb') as dst:
        src.seek(offset)
        while written < length:
            block = src.read(min(buffer_bytes, length - written))
            if not block:
                break  # reached end of the source file
            dst.write(block)
            written += len(block)
    return written


# Get original file size
original_size = os.path.getsize(file_path)
num_chunks = math.ceil(original_size / chunk_bytes)  # Calculate number of 1GB chunks needed
print(f"Original file size: {original_size / (1024**3):.2f} GB")
print(f"Will split into {num_chunks} chunks of 1GB (last chunk may be smaller)")

# Upload URL
upload_url = f"{server_url}/api/datasets/:persistentId/add"
params = {
    'persistentId': dataset_id,
    'key': api_token
}

part_files = []    # every part path, in reassembly order
failed_parts = []  # 1-based numbers of the parts whose upload did not succeed

# Create and upload each chunk
for i in range(num_chunks):
    part_file = f"{file_path}.part{i+1}"
    part_files.append(part_file)

    # Part i covers bytes [i * chunk_bytes, (i + 1) * chunk_bytes); the last
    # part simply stops at end-of-file, so it needs no special case.
    _write_part(file_path, part_file, i * chunk_bytes, chunk_bytes)

    print(f"Uploading part {i+1}/{num_chunks}: {os.path.basename(part_file)}...")

    with open(part_file, 'rb') as file_obj:
        files = {
            'file': (os.path.basename(part_file), file_obj, 'application/octet-stream')
        }

        try:
            response = requests.post(
                upload_url,
                params=params,
                files=files
            )

            if response.status_code == 200:
                result = response.json()
                file_id = result['data']['files'][0]['dataFile']['id']
                print(f"✅ Success! File ID: {file_id}")
            else:
                failed_parts.append(i + 1)
                print(f"❌ Failed with status code {response.status_code}")
                print(f"Response: {response.text}")
        except Exception as e:
            # Best-effort: record the failure and carry on with the next part.
            failed_parts.append(i + 1)
            print(f"❌ Error: {str(e)}")

# The part files are intentionally left on disk after the loop.
if failed_parts:
    print(f"\nUpload finished with errors. Failed parts: {failed_parts}")
    print("Once every part is uploaded, reassemble the file with:")
else:
    print("\nUpload completed. To reassemble the file, use:")
# List the parts explicitly: a `.part*` glob sorts part10 before part2 and
# would reassemble a file with 10 or more parts in the wrong order.
print(f"cat {' '.join(part_files)} > {os.path.basename(file_path)}")

Original file size: 3.56 GB
Will split into 4 chunks of 1GB (last chunk may be smaller)
Uploading part 1/4: fnc_transcripts_html_2025.tar.gz.part1...
✅ Success! File ID: 11014501
Uploading part 2/4: fnc_transcripts_html_2025.tar.gz.part2...
✅ Success! File ID: 11014510
Uploading part 3/4: fnc_transcripts_html_2025.tar.gz.part3...
