In [1]:
import os
import json
from tqdm import tqdm

def split_jsonl_file(input_file, output_dir, num_chunks=10):
    """
    Split a JSONL file into at most ``num_chunks`` JSON array files.

    The input is read twice: a first pass counts the records, a second pass parses them
    and writes them out. Parsed records are buffered one chunk at a time, so only the
    chunk currently being filled is held in memory. Output files are named ``1.json``,
    ``2.json``, ... inside ``output_dir``; each one contains a JSON array of the parsed
    records in their original order.

    Every chunk except the last holds ``max(1, total // num_chunks)`` records; the last
    chunk receives whatever remains, so fewer than ``num_chunks`` files are produced when
    the input has fewer records than ``num_chunks``. Whitespace-only lines (for example a
    trailing empty line) are skipped and not counted as records.

    Args:
        input_file (str): Path to the input JSONL file.
        output_dir (str): Directory to store the output chunk files (created if missing).
        num_chunks (int): Maximum number of chunks to split the file into. Must be >= 1.

    Raises:
        ValueError: If ``num_chunks`` is smaller than 1.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if num_chunks < 1:
        raise ValueError(f"num_chunks must be at least 1, got {num_chunks}")

    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    def write_chunk(index, records):
        """Dump ``records`` as a JSON array to ``<output_dir>/<index>.json``."""
        chunk_file_path = os.path.join(output_dir, f"{index}.json")
        with open(chunk_file_path, 'w', encoding='utf-8') as chunk_file:
            json.dump(records, chunk_file, ensure_ascii=False, indent=4)

    # First pass: count the records. The file is opened in a `with` block so the handle
    # is closed deterministically, and blank lines are excluded so the count matches
    # what the second pass will actually parse.
    print("Counting total lines in the file...")
    with open(input_file, 'r', encoding='utf-8') as infile:
        total_lines = sum(1 for line in infile if line.strip())

    # Floor division keeps the original distribution (remainder goes to the last chunk);
    # the floor of 1 makes the "fewer records than chunks" case explicit.
    lines_per_chunk = max(1, total_lines // num_chunks)

    chunk_index = 1
    current_chunk_lines = []

    # Second pass: parse and distribute the records.
    with open(input_file, 'r', encoding='utf-8') as infile, \
            tqdm(total=total_lines, desc="Processing", unit="line") as pbar:
        for line in infile:
            # A blank line is not a JSON document; json.loads would raise on it.
            if not line.strip():
                continue

            current_chunk_lines.append(json.loads(line))

            # Flush a full chunk, but never the final one here: the last chunk keeps
            # accumulating so it can absorb the remainder.
            if len(current_chunk_lines) >= lines_per_chunk and chunk_index < num_chunks:
                write_chunk(chunk_index, current_chunk_lines)
                current_chunk_lines = []
                chunk_index += 1

            pbar.update(1)

    # Write any remaining records to the last chunk file
    if current_chunk_lines:
        write_chunk(chunk_index, current_chunk_lines)

    print(f"Splitting complete. Files saved in {output_dir}")

# Example usage: split bookcorpus/bookcorpus.jsonl into 10 chunk files under bookcorpus/chunks
split_jsonl_file(
    input_file="bookcorpus/bookcorpus.jsonl",
    output_dir="bookcorpus/chunks",
    num_chunks=10,
)

Counting total lines in the file...


Processing: 100%|██████████| 74004228/74004228 [05:31<00:00, 223557.61line/s] 


Splitting complete. Files saved in bookcorpus/chunks


In [None]:
import os
import json
from collections import defaultdict
from tqdm import tqdm
from itertools import islice

def generate_ngrams(text, n):
    """Return every run of ``n`` consecutive whitespace-separated tokens of ``text`` as a tuple.

    The result is a list ordered by starting position; it is empty when ``text`` has
    fewer than ``n`` tokens.
    """
    tokens = text.split()
    # Number of window start positions; zero or negative yields no windows.
    window_count = len(tokens) - n + 1

    grams = []
    for start in range(window_count):
        grams.append(tuple(tokens[start:start + n]))
    return grams

def count_ngrams_in_chunks(input_dir, output_file, n_values=(2, 3, 4, 5)):
    """
    Count word n-grams across every ``.json`` chunk file in a directory.

    Each chunk file is expected to contain a JSON array of objects; the ``"text"`` value
    of each object is split on whitespace and its n-grams are tallied. Counts for the
    whole corpus are accumulated in memory and written once, at the end, as a JSON object
    of the form ``{"<n>": {"<space-joined n-gram>": count, ...}, ...}``.

    Entries that are not objects, that lack ``"text"``, or whose ``"text"`` is not a
    string contribute no n-grams. Sizes repeated in ``n_values`` are counted only once.

    Args:
        input_dir (str): Directory containing the chunk files.
        output_file (str): Path to the output JSON file to store n-gram counts. Its parent
            directory is created if it does not exist.
        n_values (iterable of int): The n-gram sizes to count. Each must be >= 1.

    Raises:
        ValueError: If any value in ``n_values`` is smaller than 1.
    """
    # Drop repeated sizes (keeping first-seen order): a duplicate would otherwise walk
    # the same counter twice and double every count for that size.
    sizes = list(dict.fromkeys(n_values))
    for n in sizes:
        if n < 1:
            raise ValueError(f"n-gram sizes must be at least 1, got {n}")

    # Create the output location up front so a missing directory is discovered before
    # the corpus is processed rather than after.
    output_parent = os.path.dirname(output_file)
    if output_parent:
        os.makedirs(output_parent, exist_ok=True)

    # One counter per n-gram size
    ngram_counts = {n: defaultdict(int) for n in sizes}

    # Get the list of chunk files
    chunk_files = sorted(
        os.path.join(input_dir, file) for file in os.listdir(input_dir) if file.endswith('.json')
    )

    # Outer progress bar for chunks
    with tqdm(total=len(chunk_files), desc="Chunks", unit="chunk") as chunk_pbar:
        for chunk_file in chunk_files:
            # Load the chunk and release the file handle before the counting loop.
            with open(chunk_file, 'r', encoding='utf-8') as infile:
                data = json.load(infile)

            # Inner progress bar for lines in the current chunk
            with tqdm(total=len(data), desc=f"Lines in {os.path.basename(chunk_file)}", unit="line", leave=False) as line_pbar:
                for entry in data:
                    text = entry.get("text") if isinstance(entry, dict) else None

                    # A null or non-string "text" has nothing to tokenize; skip it
                    # instead of failing on .split().
                    if isinstance(text, str):
                        for n in sizes:
                            counts = ngram_counts[n]
                            for ngram in generate_ngrams(text, n):
                                counts[ngram] += 1

                    line_pbar.update(1)

            chunk_pbar.update(1)

    # JSON object keys must be strings, so each n-gram tuple is joined with spaces
    # (the integer size keys are stringified by json.dump itself).
    ngram_counts_serializable = {
        n: {" ".join(ngram): count for ngram, count in ngram_counts[n].items()}
        for n in sizes
    }

    # Write the n-gram counts to the output JSON file
    with open(output_file, 'w', encoding='utf-8') as outfile:
        json.dump(ngram_counts_serializable, outfile, ensure_ascii=False, indent=4)

    print(f"N-gram counts saved to {output_file}")

# Example usage: count 2- to 5-grams over the chunk files and store them in ngram_counts.json
count_ngrams_in_chunks(
    input_dir="bookcorpus/chunks",
    output_file="ngram_counts.json",
    n_values=[2, 3, 4, 5],
)