In [None]:
# This notebook contains code showing the many steps I took to get the large citations and metadata downloads processed.
# For documentation/provenance purposes. No need to replicate this. Use the sampled data I got at the end.

In [4]:
import time
from neo4j import GraphDatabase
import pandas as pd
import numpy as np
import math
import os

# Neo4j connection details
NEO4J_URI = "bolt://localhost:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "password" # your password to the local DBMS you created on Neo4j Desktop where you want to upload the data

In [3]:
%%time

def filter_citations_with_metadata(citations_file, metadata_file, output_prefix, batch_size=1000000):
    """
    Filter the citations to only include records that have matching metadata
    for both citing and cited papers (based on the 'omid' column in metadata).
    
    Parameters:
        citations_file (str): Path to the CSV file containing citations.
        metadata_file (str): Path to the CSV file containing metadata.
        output_prefix (str): Prefix for the output batch files.
        batch_size (int): Number of original records to process per batch.
    """
    # Load the metadata into a Pandas DataFrame (use 'omid' as the identifier column)
    metadata_df = pd.read_csv(metadata_file, dtype={'omid': 'object'})
    
    # Load the citations data (ensure 'citing' and 'cited' are treated as strings)
    citations_df = pd.read_csv(citations_file, dtype={'citing': 'object', 'cited': 'object'})
    
    # Extract the 'omid' values from metadata (set of valid metadata ids)
    valid_metadata_ids = set(metadata_df['omid'].unique())
    
    # Initialize variables for tracking progress and estimating time
    total_records = len(citations_df)
    processed_records = 0
    start_time = time.time()
    
    # Prepare an empty list to store filtered citations
    filtered_citations = []
    
    # Define the number of batches needed based on total records and batch size
    num_batches = math.ceil(total_records / batch_size)
    
    # Iterate over the citations data
    for _, row in citations_df.iterrows():
        citing_id = row['citing']
        cited_id = row['cited']
        
        # Check if both citing and cited IDs exist in the valid metadata set
        if citing_id in valid_metadata_ids and cited_id in valid_metadata_ids:
            filtered_citations.append(row)
        
        # Update progress and estimate time remaining
        processed_records += 1
        elapsed_time = time.time() - start_time
        remaining_time = (elapsed_time / processed_records) * (total_records - processed_records)
        
        # Print progress every 1000 records
        if processed_records % 100000 == 0 or processed_records == total_records:
            print(f"Processed {processed_records} out of {total_records} original records. "
                  f"Estimated time remaining: {remaining_time / 60:.2f} minutes.")
        
        # If we've processed enough original records for a batch, save that batch
        if processed_records % batch_size == 0:
            batch_df = pd.DataFrame(filtered_citations)
            batch_num = processed_records // batch_size
            batch_df.to_csv(f"{output_prefix}_batch_{batch_num + 1}.csv", index=False)
            print(f"Saved batch {batch_num + 1} with {len(filtered_citations)} filtered records.")
            filtered_citations = []  # Reset for the next batch
    
    # After the loop, save any remaining records that didn't fill a full batch
    if filtered_citations:
        batch_num = num_batches  # The last batch number
        batch_df = pd.DataFrame(filtered_citations)
        batch_df.to_csv(f"{output_prefix}_batch_{batch_num}.csv", index=False)
        print(f"Saved final batch {batch_num} with {len(filtered_citations)} filtered records.")
    
    print(f"Original number of citations: {total_records}")
    print(f"Filtered number of citations: {processed_records}")

# Example usage
citations_file = "filtered_citations.csv"  # Path to your citations file
metadata_file = "filtered_metadata_with_split_ids_4parts.csv"  # Path to your metadata file
output_prefix = "filtered_citations_with_metadata"  # Prefix for the batch output files

# Call the function to filter citations and save in batches
filter_citations_with_metadata(citations_file, metadata_file, output_prefix, batch_size=13000000)




Processed 100000 out of 103094766 original records. Estimated time remaining: 187.26 minutes.
Processed 200000 out of 103094766 original records. Estimated time remaining: 161.98 minutes.
Processed 300000 out of 103094766 original records. Estimated time remaining: 150.71 minutes.
Processed 400000 out of 103094766 original records. Estimated time remaining: 135.24 minutes.
Processed 500000 out of 103094766 original records. Estimated time remaining: 130.12 minutes.
Processed 600000 out of 103094766 original records. Estimated time remaining: 126.48 minutes.
Processed 700000 out of 103094766 original records. Estimated time remaining: 120.18 minutes.
Processed 800000 out of 103094766 original records. Estimated time remaining: 117.04 minutes.
Processed 900000 out of 103094766 original records. Estimated time remaining: 111.83 minutes.
Processed 1000000 out of 103094766 original records. Estimated time remaining: 110.08 minutes.
Processed 1100000 out of 103094766 original records. Estima

In [8]:
%%time

def concatenate_batch_files(batch_file_prefix, output_file, batch_count):
    """
    Concatenate all batch files into a single CSV file.

    Parameters:
        batch_file_prefix (str): Prefix of the batch files (e.g., "filtered_citations_with_metadata_batch_").
        output_file (str): Path to save the concatenated CSV.
        batch_count (int): Total number of batch files to concatenate.
    """
    # List to hold DataFrames from all batch files
    all_batches = []
    
    # Iterate through all batch files and read them into DataFrames
    for i in range(1, batch_count + 1):
        batch_file = f"{batch_file_prefix}_{i}.csv"
        if os.path.exists(batch_file):  # Check if the batch file exists
            print(f"Reading {batch_file}...")
            batch_df = pd.read_csv(batch_file)
            all_batches.append(batch_df)
        else:
            print(f"Warning: {batch_file} does not exist!")
    
    # Concatenate all DataFrames into one
    if all_batches:
        concatenated_df = pd.concat(all_batches, ignore_index=True)
        print(f"Concatenated {len(all_batches)} batch files.")
        
        # Save the concatenated DataFrame to the output file
        concatenated_df.to_csv(output_file, index=False)
        print(f"Saved concatenated file to {output_file}")
    else:
        print("No valid batch files found to concatenate.")

# Example usage:
batch_file_prefix = "filtered_metadata_batch"  # Prefix for batch files
output_file = "all_filtered_metadata.csv"  # Output file to save the concatenated CSV
batch_count = 8  # Total number of batch files you want to concatenate (adjust based on your situation)

# Call the function to concatenate the batch files
concatenate_batch_files(batch_file_prefix, output_file, batch_count)


Reading filtered_metadata_batch_1.csv...
Reading filtered_metadata_batch_2.csv...
Reading filtered_metadata_batch_3.csv...
Reading filtered_metadata_batch_4.csv...
Reading filtered_metadata_batch_5.csv...
Reading filtered_metadata_batch_6.csv...
Reading filtered_metadata_batch_7.csv...
Reading filtered_metadata_batch_8.csv...
Concatenated 8 batch files.
Saved concatenated file to all_filtered_metadata.csv
CPU times: user 2min 35s, sys: 14.1 s, total: 2min 49s
Wall time: 2min 50s


In [7]:
%%time

# Constants for batch processing
BATCH_SIZE = 1000000  # Adjust batch size as needed
OUTPUT_FILE_PREFIX = "filtered_metadata_batch"
CITATIONS_FILE = "all_filtered_citations.csv"
METADATA_FILE = "filtered_metadata_with_split_ids_4parts.csv"

def filter_metadata_by_citations(citations_file, metadata_file, output_file_prefix, batch_size):
    """
    Filter the metadata based on the omid in citing/cited columns from the citations file and save in batches.

    Parameters:
        citations_file (str): Path to the citations file.
        metadata_file (str): Path to the metadata file.
        output_file_prefix (str): Prefix for the output batch files.
        batch_size (int): The number of records to process in each batch.
    """
    # Load citations data (citing and cited columns only)
    print("Loading citations data...")
    citations_df = pd.read_csv(citations_file, usecols=['citing', 'cited'], dtype={'citing': 'object', 'cited': 'object'})
    
    # Combine citing and cited columns into a single set of unique omids
    omid_set = set(citations_df['citing'].dropna().unique()).union(citations_df['cited'].dropna().unique())
    print(f"Extracted {len(omid_set)} unique omids from citations.")
    
    # Load metadata file
    print("Loading metadata file...")
    metadata_df = pd.read_csv(metadata_file, dtype={'omid': 'object'})

    # Filter metadata to include only records where omid is in omid_set
    filtered_metadata_df = metadata_df[metadata_df['omid'].isin(omid_set)]
    print(f"Filtered metadata to {len(filtered_metadata_df)} records matching omids from citations.")

    # Process and save in batches
    total_records = len(filtered_metadata_df)
    start_time = time.time()
    processed_records = 0
    batch_count = 0

    for start in range(0, total_records, batch_size):
        # Get the chunk (batch) of data
        batch_df = filtered_metadata_df.iloc[start:start + batch_size]
        
        # Save the batch to a CSV file
        batch_count += 1
        batch_output_file = f"{output_file_prefix}_{batch_count}.csv"
        batch_df.to_csv(batch_output_file, index=False)
        print(f"Saved batch {batch_count} to {batch_output_file}")
        
        # Update progress and time remaining estimate
        processed_records += len(batch_df)
        elapsed_time = time.time() - start_time
        remaining_time = (elapsed_time / processed_records) * (total_records - processed_records)
        print(f"Processed {processed_records} out of {total_records} records. Estimated time remaining: {remaining_time / 60:.2f} minutes.")
    
    print("Filtering and batch saving complete.")

# Example usage:
filter_metadata_by_citations(CITATIONS_FILE, METADATA_FILE, OUTPUT_FILE_PREFIX, BATCH_SIZE)


Loading citations data...
Extracted 7999487 unique omids from citations.
Loading metadata file...




Filtered metadata to 7999487 records matching omids from citations.
Saved batch 1 to filtered_metadata_batch_1.csv
Processed 1000000 out of 7999487 records. Estimated time remaining: 2.22 minutes.
Saved batch 2 to filtered_metadata_batch_2.csv
Processed 2000000 out of 7999487 records. Estimated time remaining: 1.96 minutes.
Saved batch 3 to filtered_metadata_batch_3.csv
Processed 3000000 out of 7999487 records. Estimated time remaining: 1.68 minutes.
Saved batch 4 to filtered_metadata_batch_4.csv
Processed 4000000 out of 7999487 records. Estimated time remaining: 1.38 minutes.
Saved batch 5 to filtered_metadata_batch_5.csv
Processed 5000000 out of 7999487 records. Estimated time remaining: 1.05 minutes.
Saved batch 6 to filtered_metadata_batch_6.csv
Processed 6000000 out of 7999487 records. Estimated time remaining: 0.71 minutes.
Saved batch 7 to filtered_metadata_batch_7.csv
Processed 7000000 out of 7999487 records. Estimated time remaining: 0.36 minutes.
Saved batch 8 to filtered_met

In [13]:
# File paths
CITATIONS_FILE = "all_filtered_citations.csv"
SAMPLED_CITATIONS_FILE = "sampled_citations.csv"
METADATA_BATCHES = [f"filtered_metadata_batch_{i}.csv" for i in range(1, 9)]  # filtered_metadata_batch_1.csv through filtered_metadata_batch_8.csv
SAMPLED_METADATA_FILE = "sampled_citations_metadata.csv"

# Parameters
SAMPLE_SIZE = 500  # Random sample size for the initial citation selection
MAX_ITERATIONS = 3  # Limit iterations to avoid infinite loops or long processing times

# Function to sample citations and iteratively filter based on citing/cited IDs
def sample_and_filter_citations(citations_file, sample_size=500, max_iterations=10):
    """
    Randomly sample citations, and iteratively filter based on citing/cited IDs.
    
    Parameters:
        citations_file (str): Path to the all_filtered_citations.csv file.
        sample_size (int): The number of records to randomly sample.
        max_iterations (int): Maximum number of iterations to perform when filtering.
    
    Returns:
        pd.DataFrame: Filtered citations after iterative matching.
    """
    # Load citations file in chunks (use a smaller chunk size to manage memory)
    chunk_size = 1000000
    sampled_citations = []
    first_pass = True
    
    start_time = time.time()
    for chunk in pd.read_csv(citations_file, chunksize=chunk_size):
        # Randomly sample `sample_size` citations from the current chunk
        if first_pass:
            # Sample from the entire citations file randomly
            sampled_chunk = chunk.sample(n=sample_size, random_state=42)
            sampled_citations = sampled_chunk[['id', 'citing', 'cited']]
            first_pass = False
        else:
            # Iteratively filter based on citing/cited IDs matching any of the previously sampled IDs
            sampled_ids = set(sampled_citations['citing']).union(set(sampled_citations['cited']))
            filtered_chunk = chunk[chunk['citing'].isin(sampled_ids) | chunk['cited'].isin(sampled_ids)]
            sampled_citations = pd.concat([sampled_citations, filtered_chunk[['id', 'citing', 'cited']]])

        # Update time remaining estimate
        elapsed_time = time.time() - start_time
        records_processed = len(sampled_citations)
        total_records = len(chunk) * max_iterations  # Approximate total record count
        estimated_time_remaining = (elapsed_time / records_processed) * (total_records - records_processed)
        print(f"Processed {records_processed} citations. Estimated time remaining: {estimated_time_remaining / 60:.2f} minutes.")

        # Stop if we've iterated enough times or hit a reasonable limit
        if len(sampled_citations) >= SAMPLE_SIZE * max_iterations:
            break
    
    # Save sampled citations to a CSV file
    print(len(sampled_citations), " records in sampled citations")
    sampled_citations.to_csv(SAMPLED_CITATIONS_FILE, index=False)
    return sampled_citations

# Function to filter metadata based on sampled citations
def filter_metadata_with_sampled_citations(metadata_batches, sampled_citations_df):
    """
    Filter metadata records based on citing and cited IDs from sampled citations.
    
    Parameters:
        metadata_batches (list of str): List of metadata batch files (e.g., filtered_metadata_batch_1.csv).
        sampled_citations_df (pd.DataFrame): DataFrame containing the sampled citations.
    
    Returns:
        pd.DataFrame: Filtered metadata records.
    """
    sampled_omid_ids = set(sampled_citations_df['citing']).union(set(sampled_citations_df['cited']))
    filtered_metadata = []
    
    start_time = time.time()
    for idx, batch in enumerate(metadata_batches):
        batch_df = pd.read_csv(batch)
        
        # Filter metadata based on 'omid' matching citing or cited IDs
        filtered_batch = batch_df[batch_df['omid'].isin(sampled_omid_ids)]
        filtered_metadata.append(filtered_batch)
        
        # Update time remaining estimate
        elapsed_time = time.time() - start_time
        total_batches = len(metadata_batches)
        estimated_time_remaining = (elapsed_time / (idx + 1)) * (total_batches - (idx + 1))
        print(f"Processed batch {batch}. Estimated time remaining: {estimated_time_remaining / 60:.2f} minutes.")
    
    # Concatenate filtered metadata from all batches
    filtered_metadata_df = pd.concat(filtered_metadata, ignore_index=True)
    
    # Save filtered metadata to CSV
    print(len(filtered_metadata_df), " records in filtered metadata")
    filtered_metadata_df.to_csv(SAMPLED_METADATA_FILE, index=False)
    return filtered_metadata_df


def main():
    # Step 1: Sample and filter citations
    sampled_citations_df = sample_and_filter_citations(CITATIONS_FILE, sample_size=SAMPLE_SIZE)
    
    # Step 2: Filter metadata records based on sampled citations
    sampled_metadata_df = filter_metadata_with_sampled_citations(METADATA_BATCHES, sampled_citations_df)
    
    print(f"Sampled citations saved to: {SAMPLED_CITATIONS_FILE}")
    print(f"Sampled metadata saved to: {SAMPLED_METADATA_FILE}")

if __name__ == "__main__":
    main()


Processed 500 citations. Estimated time remaining: 527.85 minutes.
Processed 1255 citations. Estimated time remaining: 440.98 minutes.
Processed 2002 citations. Estimated time remaining: 413.16 minutes.
Processed 2811 citations. Estimated time remaining: 393.84 minutes.
Processed 3748 citations. Estimated time remaining: 367.58 minutes.
Processed 4771 citations. Estimated time remaining: 346.91 minutes.
Processed 5719 citations. Estimated time remaining: 339.35 minutes.
5719  records in sampled citations
Processed batch filtered_metadata_batch_1.csv. Estimated time remaining: 1.00 minutes.
Processed batch filtered_metadata_batch_2.csv. Estimated time remaining: 0.86 minutes.
Processed batch filtered_metadata_batch_3.csv. Estimated time remaining: 0.72 minutes.
Processed batch filtered_metadata_batch_4.csv. Estimated time remaining: 0.57 minutes.
Processed batch filtered_metadata_batch_5.csv. Estimated time remaining: 0.43 minutes.
Processed batch filtered_metadata_batch_6.csv. Estimate