### on SADIRI 1%: Do Candidate/Query Overlap: No

In [None]:
# check the overlap between candidate and query files
from datasets import load_from_disk
directory_path = '/shared/3/projects/hiatus/aggregated_trainset_v2/content_masking_research/down_1_shuffle/train'

In [6]:
# Load the dataset
dataset = load_from_disk(directory_path)

# Display the first few rows of the train split
print(dataset['train'])

Dataset({
    features: ['query_id', 'query_authorID', 'query_text', 'candidate_id', 'candidate_authorID', 'candidate_text'],
    num_rows: 40796
})


In [7]:
# Extract the query_id and candidate_id columns from the train split
query_ids = set(dataset['train']['query_id'])
candidate_ids = set(dataset['train']['candidate_id'])

# Find the intersection of query_id and candidate_id
overlap = query_ids.intersection(candidate_ids)

# Check if there is any overlap
if overlap:
    print("There are overlapping IDs between query_id and candidate_id:")
    print(overlap)
else:
    print("There are no overlapping IDs between query_id and candidate_id.")

There are no overlapping IDs between query_id and candidate_id.


## On SADIRI 100%, amazon: do candidate/query overlap?: No

In [18]:
import pandas as pd
amazon_base = '/shared/3/projects/hiatus/aggregated_trainset_v2/content_masking_research/amazon/'
# Load the train_queries.jsonl file
queries_path = amazon_base + 'train_queries.jsonl'
queries_df = pd.read_json(queries_path, lines=True)

# Load the train_candidates.jsonl file
candidates_path = amazon_base + 'train_candidates.jsonl'
candidates_df = pd.read_json(candidates_path, lines=True)

# Print the keys of the train_candidates.jsonl DataFrame
print("\nKeys in the train_candidates.jsonl DataFrame:")
print(candidates_df.columns)
print(candidates_df[:1])

# Extract query_id and candidate_id from the jsonl datasets
query_ids_jsonl = set(queries_df['documentID'])
candidate_ids_jsonl = set(candidates_df['documentID'])

# Check for overlap between query_id and candidate_id
overlap = query_ids_jsonl.intersection(candidate_ids_jsonl)

if overlap:
    print("There are overlapping IDs between query_id and candidate_id:")
    print(overlap)
else:
    print("There are no overlapping IDs between query_id and candidate_id.")


Keys in the train_candidates.jsonl DataFrame:
Index(['documentID', 'authorIDs', 'fullText', 'spanAttribution', 'isNeedle',
       'collectionNum', 'source', 'dateCollected', 'publiclyAvailable',
       'deidentified', 'languages', 'lengthWords', 'dateCreated',
       'timeCreated', 'sourceSpecific'],
      dtype='object')
                             documentID                 authorIDs  \
0  7041c0a5-6ef6-4e41-8630-5a8409583d89  ['A0103849GBVWICKXD4T6']   

                                            fullText  \
0  Bought this as a possible EDC item, but found ...   

                                     spanAttribution  isNeedle collectionNum  \
0  [{'authorID': 'A0103849GBVWICKXD4T6', 'start':...     False        amazon   

   source dateCollected  publiclyAvailable  deidentified languages  \
0  amazon    2022-12-31               True          True      [en]   

   lengthWords  dateCreated  timeCreated  \
0           72  09 15, 2014  09 15, 2014   

                                

## Get the size of full (not with held-out) SADIRI TRAIN dataset in terms of #words

In [2]:
project_base = "/shared/3/projects/hiatus/aggregated_trainset_v2/content_masking_research/"
data_folders = ["amazon", "ao3", "bookcorpus", "gmane", "nytimes-articles-and-comments", "pubmed", "realnews", "reddit", "stackexchange", "wiki_articles"]
data_folders = [project_base + folder_name for folder_name in data_folders]

In [5]:
import pandas as pd
import os

def count_words_in_fullText(file_path):
    # Load the JSONL file into a DataFrame
    df = pd.read_json(file_path, lines=True)
    
    # Check if the 'fullText' column exists
    if 'fullText' in df.columns:
        # Count the number of words in the 'fullText' column
        word_count = df['fullText'].str.split().str.len().sum()
        return word_count
    else:
        return 0

def process_datasets(dataset_paths):
    total_word_count = 0
    word_counts = {}
    
    for dataset_path in dataset_paths:
        dataset_name = os.path.basename(dataset_path)
        
        # Initialize word count for the dataset
        dataset_word_count = 0
        
        # Process only the specified JSONL files in the dataset folder
        for file_name in ['train_queries.jsonl', 'train_candidates.jsonl']:
            file_path = os.path.join(dataset_path, file_name)
            if os.path.exists(file_path):
                file_word_count = count_words_in_fullText(file_path)
                dataset_word_count += file_word_count
        
        # Store the word count for the dataset
        word_counts[dataset_name] = dataset_word_count
        total_word_count += dataset_word_count
    
    return word_counts, total_word_count

In [33]:
# Process the datasets and get the word counts
word_counts, total_word_count = process_datasets(data_folders)

# Print the word counts for each dataset
for dataset_name, word_count in word_counts.items():
    print(f"Total number of words in {dataset_name}: {word_count}")

# Print the overall total word count
print(f"Overall total number of words: {total_word_count}")

Total number of words in amazon: 31650279
Total number of words in ao3: 573926907
Total number of words in bookcorpus: 57367225
Total number of words in gmane: 141837101
Total number of words in nytimes-articles-and-comments: 24131163
Total number of words in pubmed: 9748317
Total number of words in realnews: 272933709
Total number of words in reddit: 446769021
Total number of words in stackexchange: 153991860
Total number of words in wiki_articles: 34779747
Overall total number of words: 1747135329


In [32]:
word_counts, total_word_count

({'': 34779747}, 1747135329)

### held out SADIRI dataset

In [3]:
data_folders = ["blogcorpus", "food.com-recipes", "sfu-socc", "goodreads", "wiki_discussions"]
data_folders = [project_base + folder_name for folder_name in data_folders]

In [6]:
# Process the datasets and get the word counts
word_counts, total_word_count = process_datasets(data_folders)

# Print the word counts for each dataset
for dataset_name, word_count in word_counts.items():
    print(f"Total number of words in {dataset_name}: {word_count}")

# Print the overall total word count
print(f"Overall total number of words: {total_word_count}")

Total number of words in blogcorpus: 8189607
Total number of words in food.com-recipes: 4346765
Total number of words in sfu-socc: 3007117
Total number of words in goodreads: 53683977
Total number of words in wiki_discussions: 40827678
Overall total number of words: 110055144


## peak at shared/3/datasets/wikipedia/enwiki/pages-articles/enwiki-20230601-pages-articles.clean-text.txt.gz 

In [1]:
wiki_path = "/shared/3/datasets/wikipedia/enwiki/pages-articles/enwiki-20230601-pages-articles.clean-text.txt.gz"

In [2]:
import gzip

def peek_gzipped_file(file_path, num_lines=10):
    with gzip.open(file_path, 'rt', encoding='utf-8') as file:
        for _ in range(num_lines):
            print(next(file))

In [3]:
peek_gzipped_file(wiki_path, num_lines=10)

'Fred' Watson founded the Patent Collapsible Sidecar Company, later renamed to Watsonian Folding Sidecar Company Ltd, after he built a folding sidecar that allowed him to get his motorcycle and sidecar combination through a narrow entrance to the yard at his house.

The first round was held on August 7.

Born in Guildford, Darren is the brother of former England seam bowler Martin Bicknell. However, he struggled to have similar international impact in the 1990s, the selectors instead preferring players such as Jason Gallian and Mark Lathwell. Most of Bicknell's career was spent at Surrey, but he finished his career with seven seasons at Nottinghamshire, before retiring at the end of the 2006 season.

Kairos Na Pame Parakato is a Greek language album by Antonis Remos from 1998 which sold more than 100,000 copies and was certified double platinum."

Thomas Valente (portrayed by Daniel Benzali) was a high-ranking official in the domestic nuclear detection office of the Department of Homel

## peak at /shared/3/datasets/YouTube-Commons  

In [1]:
import dask.dataframe as dd

In [2]:
# Load the dataset using Dask
dataset = dd.read_parquet('/shared/3/datasets/YouTube-Commons')

# Get 10 elements from the dataset
sample = dataset.head(30)

# Display the sample to understand its structure
print(sample)

       video_id                                   video_link  \
0   AjonQW43ZoU  https://www.youtube.com/watch?v=AjonQW43ZoU   
1   TKTxd2icycA  https://www.youtube.com/watch?v=TKTxd2icycA   
2   Bogr9_dWZDY  https://www.youtube.com/watch?v=Bogr9_dWZDY   
3   trkGXvw9KJI  https://www.youtube.com/watch?v=trkGXvw9KJI   
4   ukMZkP-ips0  https://www.youtube.com/watch?v=ukMZkP-ips0   
5   z_l7smopvVg  https://www.youtube.com/watch?v=z_l7smopvVg   
6   pN5EqyT2KNM  https://www.youtube.com/watch?v=pN5EqyT2KNM   
7   8IxzJoOvYXI  https://www.youtube.com/watch?v=8IxzJoOvYXI   
8   JswzVm1akhk  https://www.youtube.com/watch?v=JswzVm1akhk   
9   9_Hun-bAt1o  https://www.youtube.com/watch?v=9_Hun-bAt1o   
10  4CnQdUBRb70  https://www.youtube.com/watch?v=4CnQdUBRb70   
11  PM_O1cxCdxc  https://www.youtube.com/watch?v=PM_O1cxCdxc   
12  kL02ImZMpR4  https://www.youtube.com/watch?v=kL02ImZMpR4   
13  39aTyWu9OzA  https://www.youtube.com/watch?v=39aTyWu9OzA   
14  WahQNXX_0gM  https://www.youtube.com

In [6]:
# Filter the dataset where both transcription_language and original_language are 'en'
filtered_dataset = dataset[(dataset['transcription_language'] == 'en') & (dataset['original_language'] == 'en')]

# Count the number of entries
entry_count = filtered_dataset.shape[0].compute()

# Count unique channel_id
unique_channel_count = filtered_dataset['channel_id'].nunique().compute()

# Sum the word_count
total_word_count = filtered_dataset['word_count'].sum().compute()

# Print the results
print(f"Number of entries: {entry_count}")
print(f"Number of unique channels: {unique_channel_count}")
print(f"Total word count: {total_word_count}")

Number of entries: 1489377
Number of unique channels: 279141
Total word count: 3579987480


## peak at /shared/4/datasets/CommonCrawl/

In [9]:
import lzma
import pandas as pd

# Define the path to your file
file_path = '/shared/4/datasets/CommonCrawl/en.txt.xz'

# Read the .xz file using lzma
with lzma.open(file_path, mode='rt', encoding='utf-8') as file:
    # Read the first 1000 lines (or as many as you need to peek at)
    lines = [next(file) for _ in range(20)]

# Convert the lines to a pandas DataFrame for easier visualization
df = pd.DataFrame(lines)

# Display the first few lines
print(df.head())


                                                   0
0  Belmont Estate is on the market for $63 millio...
1  “Within the city we’ve had homes that have sol...
2  The three-storey home has five bedrooms, twelv...
3  Surrounding the property is a Versailles-inspi...
4  According to Frosch, the listing has received ...


## peak at /shared/4/datasets/thepile

In [13]:
import zstandard as zstd
import json
import os

# Define the directory and file of interest
directory = '/shared/4/datasets/thepile/pile/train'
filename = '00.jsonl.zst'  # Change this to any file you want to peek into

file_path = os.path.join(directory, filename)

# Number of lines to peek at
num_lines_to_peek = 10

# Open the .zst file and read the first few lines
with open(file_path, 'rb') as f:
    dctx = zstd.ZstdDecompressor()
    with dctx.stream_reader(f) as reader:
        buffer = ''
        while num_lines_to_peek > 0:
            chunk = reader.read(1024)  # Read in chunks of 1024 bytes
            if not chunk:
                break
            buffer += chunk.decode('utf-8')
            lines = buffer.split('\n')
            buffer = lines.pop()
            
            for line in lines:
                if num_lines_to_peek == 0:
                    break
                print(json.loads(line))
                num_lines_to_peek -= 1

{'text': 'It is done, and submitted. You can play “Survival of the Tastiest” on Android, and on the web. Playing on the web works, but you have to simulate multi-touch for table moving and that can be a bit confusing.\n\nThere’s a lot I’d like to talk about. I’ll go through every topic, insted of making the typical what went right/wrong list.\n\nConcept\n\nWorking over the theme was probably one of the hardest tasks I had to face.\n\nOriginally, I had an idea of what kind of game I wanted to develop, gameplay wise – something with lots of enemies/actors, simple graphics, maybe set in space, controlled from a top-down view. I was confident I could fit any theme around it.\n\nIn the end, the problem with a theme like “Evolution” in a game is that evolution is unassisted. It happens through several seemingly random mutations over time, with the most apt permutation surviving. This genetic car simulator is, in my opinion, a great example of actual evolution of a species facing a challenge.

In [15]:
filename = '00.jsonl.zst'  # Change this to any file you want to peek into

file_path = os.path.join(directory, filename)

# Number of lines to peek at
num_lines_to_peek = 10000  # Increase the number of lines to ensure we capture all unique pile_set_name

# Set to capture unique pile_set_name values
unique_pile_set_names = set()

# Open the .zst file and read lines
with open(file_path, 'rb') as f:
    dctx = zstd.ZstdDecompressor()
    with dctx.stream_reader(f) as reader:
        buffer = ''
        while num_lines_to_peek > 0:
            chunk = reader.read(1024)  # Read in chunks of 1024 bytes
            if not chunk:
                break
            buffer += chunk.decode('utf-8')
            lines = buffer.split('\n')
            buffer = lines.pop()
            
            for line in lines:
                if num_lines_to_peek == 0:
                    break
                try:
                    data = json.loads(line)
                    if 'meta' in data and 'pile_set_name' in data['meta']:
                        unique_pile_set_names.add(data['meta']['pile_set_name'])
                except json.JSONDecodeError:
                    continue
                num_lines_to_peek -= 1
                
                # Check if we have all unique pile_set_name values
                if len(unique_pile_set_names) == 22:
                    break

# Print the unique pile_set_name values
print(unique_pile_set_names)

{'Ubuntu IRC', 'USPTO Backgrounds', 'NIH ExPorter', 'DM Mathematics', 'HackerNews', 'Gutenberg (PG-19)', 'EuroParl', 'FreeLaw', 'Pile-CC', 'PubMed Central', 'Wikipedia (en)', 'ArXiv', 'OpenSubtitles', 'PubMed Abstracts', 'BookCorpus2', 'YoutubeSubtitles', 'Enron Emails', 'Github', 'PhilPapers', 'Books3', 'StackExchange', 'OpenWebText2'}


## Sample 1.5 billion words from wikipedia

In [8]:
import gzip
import random
from datasets import Dataset
import os

def count_words_in_line(line):
    return len(line.split())

def extract_random_lines(file_path, target_word_count=1_500_000_000, seed=42):
    random.seed(seed)
    total_word_count = 0
    selected_lines = []

    with gzip.open(file_path, 'rt', encoding='utf-8') as file:
        lines = file.readlines()
        random.shuffle(lines)

        for line_number, line in enumerate(lines, 1):
            line_word_count = count_words_in_line(line)
            if total_word_count + line_word_count > target_word_count:
                break
            total_word_count += line_word_count
            selected_lines.append((line_number, line))

    return selected_lines

def save_to_huggingface_format(lines, output_path):
    dataset_dict = {
        "line_number": [line_number for line_number, _ in lines],
        "text": [text for _, text in lines]
    }
    dataset = Dataset.from_dict(dataset_dict)
    dataset.save_to_disk(output_path)

def process_wikipedia_file(input_path, output_path):
    lines = extract_random_lines(input_path)
    save_to_huggingface_format(lines, output_path)

# Replace with your actual file path and desired output path
input_path = "/shared/3/datasets/wikipedia/enwiki/pages-articles/enwiki-20230601-pages-articles.clean-text.txt.gz"
output_path = "/shared/3/projects/hiatus/TOKENIZER_wegmann/data/fitting-corpora/wikipedia"

process_wikipedia_file(input_path, output_path)

Saving the dataset (0/20 shards):   0%|          | 0/25383803 [00:09<?, ? examples/s]

In [5]:
from datasets import load_from_disk

output_path = "/shared/3/projects/hiatus/TOKENIZER_wegmann/data/fitting-corpora/wikipedia"
dataset = load_from_disk(output_path)
first_five_lines = dataset.select(range(5))
for row in first_five_lines:
    print(f"Line {row['line_number']}: {row['text']}")

Loading dataset from disk:   0%|          | 0/20 [00:00<?, ?it/s]

Line 1: Hønefoss in Storelva (Ringerike) valley is left by the line northwest into the Sogna valley, where the route uses its northern valley flanks to Sokna. To get to the Krøderen (lake) valley, Bergen Line follows Rudselva, passes by Langevannet and Breidvannet lakes and also through the 2.3 kilometer Haversting tunnel, which runs parallel to the Norwegian National Road 7, (), Ørgenvik tunnel. Hallingdal valley is used as a natural corridor further north-west, mostly on the left river bank, serving the larger settlements of Flå and Nesbyen. At Svenkerud to the north of Nesbyen the valley flank is changed. Soutwestbound to Gol, the ascent becomes gradually steeper in the following section to Geilo.

Line 2: The Korovin pistol (Пистолет Коровина, Тульский Коровин (ТК), GAU Index 56-A-112) is regarded as the first Soviet semi-automatic pistol.

Line 3: Due to COVID-19 pandemic, this year's Indonesia International Motor Show was split into two events, the IIMS Virtual, which was held vi

## Sample 1.5 billion words from 2021 Twitter API Samples ==> see styletokenizer/create_twitter_sample.py

In [1]:
import bz2
import json
import os
from datasets import Dataset

def count_words(text):
    return len(text.split())

def process_file(file_path, target_word_count_per_file, data):
    cumulative_word_count = 0
    with bz2.open(file_path, 'rt') as file:
        for line in file:
            tweet = json.loads(line)
            text = tweet.get("text", "")
            tweet_id = tweet.get("id", "")
            word_count = count_words(text)
            
            cumulative_word_count += word_count
            data.append({"id": tweet_id, "text": text})
            
            if cumulative_word_count >= target_word_count_per_file:
                return data, cumulative_word_count
    return data, cumulative_word_count


def sample_texts_from_files(directory, target_word_count):
    """
        get the number of compressed files and determine how much to sample from each file,
        st target word count is distributed equally across bz2 files
    """
    files = [os.path.join(root, file_name)
             for root, _, files in os.walk(directory)
             for file_name in sorted(files)
             if not "2022" in file_name and (file_name.endswith('p2.bz2') or file_name.endswith('p1.bz2'))]
    print(files)
    num_files = len(files)
    target_word_count_per_file = target_word_count // num_files
    data = []

    for filen_index, file_path in enumerate(files):
        print(f"At file {filen_index}/{num_files} called {file_path}")
        print(f"Extracting {target_word_count_per_file} words")
        data, _ = process_file(file_path, target_word_count_per_file, data)

    return data

In [2]:
directory = '/nfs/locker/twitter-decahose-locker/2021'
output_path = '/shared/3/projects/hiatus/TOKENIZER_wegmann/data/fitting-corpora/twitter'
target_word_count = 1_500_000_000
data = sample_texts_from_files(directory, target_word_count)
dataset = Dataset.from_dict({"id": [item["id"] for item in data], "text": [item["text"] for item in data]})
dataset.save_to_disk(output_path)

['/nfs/locker/twitter-decahose-locker/2021/decahose.2021-01-01.p1.bz2', '/nfs/locker/twitter-decahose-locker/2021/decahose.2021-01-01.p2.bz2', '/nfs/locker/twitter-decahose-locker/2021/decahose.2021-01-02.p1.bz2', '/nfs/locker/twitter-decahose-locker/2021/decahose.2021-01-02.p2.bz2', '/nfs/locker/twitter-decahose-locker/2021/decahose.2021-01-03.p1.bz2', '/nfs/locker/twitter-decahose-locker/2021/decahose.2021-01-03.p2.bz2', '/nfs/locker/twitter-decahose-locker/2021/decahose.2021-01-04.p1.bz2', '/nfs/locker/twitter-decahose-locker/2021/decahose.2021-01-04.p2.bz2', '/nfs/locker/twitter-decahose-locker/2021/decahose.2021-01-05.p1.bz2', '/nfs/locker/twitter-decahose-locker/2021/decahose.2021-01-05.p2.bz2', '/nfs/locker/twitter-decahose-locker/2021/decahose.2021-01-06.p1.bz2', '/nfs/locker/twitter-decahose-locker/2021/decahose.2021-01-06.p2.bz2', '/nfs/locker/twitter-decahose-locker/2021/decahose.2021-01-07.p1.bz2', '/nfs/locker/twitter-decahose-locker/2021/decahose.2021-01-07.p2.bz2', '/nfs

At file 1/723 called /nfs/locker/twitter-decahose-locker/2021/decahose.2021-01-01.p2.bz2
Extracting 2074688 words
At file 2/723 called /nfs/locker/twitter-decahose-locker/2021/decahose.2021-01-02.p1.bz2
Extracting 2074688 words
At file 3/723 called /nfs/locker/twitter-decahose-locker/2021/decahose.2021-01-02.p2.bz2
Extracting 2074688 words
At file 4/723 called /nfs/locker/twitter-decahose-locker/2021/decahose.2021-01-03.p1.bz2
Extracting 2074688 words
At file 5/723 called /nfs/locker/twitter-decahose-locker/2021/decahose.2021-01-03.p2.bz2
Extracting 2074688 words
At file 6/723 called /nfs/locker/twitter-decahose-locker/2021/decahose.2021-01-04.p1.bz2
Extracting 2074688 words
At file 7/723 called /nfs/locker/twitter-decahose-locker/2021/decahose.2021-01-04.p2.bz2
Extracting 2074688 words
At file 8/723 called /nfs/locker/twitter-decahose-locker/2021/decahose.2021-01-05.p1.bz2
Extracting 2074688 words
At file 9/723 called /nfs/locker/twitter-decahose-locker/2021/decahose.2021-01-05.p2.bz2

KeyboardInterrupt: 

## Sample 1.5 billion for a mixed dataset

SOURCE			GENRE					DOMAIN 								TOTAL 				 SAMPLE


SADIRI			forum					reddit	 						  446,769,021 		   249,000,000
SADIRI			literature				ao3					  			  573,926,907 		   100,000,000
SADIRI			literature				BookCorpus2 					   57,367,225 			50,000,000
ThePile			literature 				Gutenberg before 1919 					?				50,000,000			
SADIRI			news					realnews						  272,933,709 		   169,000,000
SADIRI			news/comments			nytimes-articles-and-comments:     24,131,163 			24,131,163
SADIRI			news/comments			sfu-socc 							3,007,117 			 3,007,117
ThePile			Q&A 					stackexchange 				  	  		? 		   	   200,000,000
SADIRI			reviews					goodreads 					  	   53,683,977  			53,683,977
SADIRI			reviews					amazon	 						   31,650,279 			31,650,279
SADIRI			mails					gmane 						      141,837,101  		   141,837,101
YouTubeCommons	transcripts 			YouTubeCommons 					  	  				   100,000,000
ThePile			transcripts				OpenSubtitles 											50,000,000
ThePile			code				 	Github													50,000,000
WET CommonCrawl	raw text webpages 		CommonCrawl 								    	   100,000,000
s2orc			science					s2orc					  	    				       100,000,000
ThePile			Mathematics 			DM Mathematics  										20,000,000
SADIRI			blogs					blogcorpus	 						 			 		 8,189,607
																		--------------- 	---------------
										TOTAL: 						   		 				 1,500,499,244 

### SADIRI samples

In [2]:
project_base = "/shared/3/projects/hiatus/aggregated_trainset_v2/content_masking_research/"
data_folders = ["reddit", "ao3", "bookcorpus", "realnews", "nytimes-articles-and-comments", "sfu-socc", "goodreads",  "amazon",  "gmane", "blogcorpus"]
data_folders = [project_base + folder_name for folder_name in data_folders]
# word_samples = [249000000, 
#                 100000000, 
#                 50000000, 
#                 169000000, 
#                 24131163, 
#                 3007117,
#                 53683977,
#                 31650279,
#                 141837101,
#                 8189607]
word_samples = [10 for _ in range(len(data_folders))]
assert(len(word_samples) == len(data_folders))

In [11]:
import os
import pandas as pd
from collections import defaultdict

def sample_texts_from_dataframe(data_df, target_word_count):
    sampled_texts = []
    document_ids = []
    current_word_count = 0
    data_df = data_df.sample(frac=1).reset_index(drop=True)  # Shuffle the texts
    for idx, row in data_df.iterrows():
        text = row['fullText']
        words = text.split()
        word_count = len(words)
        sampled_texts.append(text)
        document_ids.append(row['documentID'])
        current_word_count += word_count
        if current_word_count >= target_word_count:
            break
    return {
        "sampled_texts": sampled_texts,
        "sampled_word_count": current_word_count,
        "document_ids": document_ids
    }

def sample_sadiri_texts(dataset_paths, word_samples):
    data_samples = defaultdict(list)
    total_word_count = defaultdict(int)
    
    for dataset_path, word_count in zip(dataset_paths, word_samples):
        dataset_name = os.path.basename(dataset_path)
                
        combined_df = pd.DataFrame()
        
        # Combine both JSONL files into one dataframe
        for file_name in ['train_queries.jsonl', 'train_candidates.jsonl']:
            file_path = os.path.join(dataset_path, file_name)
            if os.path.exists(file_path):
                data_df = pd.read_json(file_path, lines=True)
                print(f"Loaded {len(data_df)} rows from {file_path}")
                combined_df = pd.concat([combined_df, data_df], ignore_index=True)
            else:
                print(f"{file_name} does not exist in {dataset_path}.")
                
        sample_dict = sample_texts_from_dataframe(data_df, word_count)
                
        data_samples['source'] += ["SADIRI" for _ in range(len(sample_dict))]
        data_samples['domain'] += [dataset_name for _ in range(len(sample_dict))]
        data_samples['text'] += sample_dict["sampled_texts"]
        data_samples['id'] += sample_dict["document_ids"]
        total_word_count[dataset_name] += sample_dict["sampled_word_count"]
        print(f"Total words collected for {dataset_name}: {total_word_count[dataset_name]}")
      

    print(data_samples)

In [12]:
sample_sadiri_texts(data_folders, word_samples)

Loaded 2090161 rows from /shared/3/projects/hiatus/aggregated_trainset_v2/content_masking_research/reddit/train_queries.jsonl
Loaded 2090161 rows from /shared/3/projects/hiatus/aggregated_trainset_v2/content_masking_research/reddit/train_candidates.jsonl
Total words collected for reddit: 74
Loaded 317274 rows from /shared/3/projects/hiatus/aggregated_trainset_v2/content_masking_research/ao3/train_queries.jsonl
Loaded 317274 rows from /shared/3/projects/hiatus/aggregated_trainset_v2/content_masking_research/ao3/train_candidates.jsonl
Total words collected for ao3: 544
Loaded 56319 rows from /shared/3/projects/hiatus/aggregated_trainset_v2/content_masking_research/bookcorpus/train_queries.jsonl
Loaded 56319 rows from /shared/3/projects/hiatus/aggregated_trainset_v2/content_masking_research/bookcorpus/train_candidates.jsonl
Total words collected for bookcorpus: 512
Loaded 181024 rows from /shared/3/projects/hiatus/aggregated_trainset_v2/content_masking_research/realnews/train_queries.json

###  SKIPPING as it is unclear what en.txt.xz is -- WET Common Crawl dataset

In [None]:
wet_file_path = '/shared/4/datasets/CommonCrawl/en.txt.xz'

In [45]:
import lzma
import random

def estimate_words_per_line(file_path, sample_size=1000):
    total_lines = 0
    sampled_words = 0
    with lzma.open(file_path, mode='rt', encoding='utf-8') as file:
        for line in file:
            total_lines += 1
            if total_lines <= sample_size:
                sampled_words += len(line.split())
    average_words_per_line = sampled_words / sample_size
    return total_lines, average_words_per_line

def sample_common_crawl(dataset_paths, word_samples):
    sampled_lines = []
    line_numbers = []
    total_current_word_count = 0
    
    # First pass to estimate total number of lines and average words per line
    total_lines, average_words_per_line = estimate_words_per_line(dataset_paths)
    
    print(total_lines)
    
    # Estimate total word count
    estimated_total_word_count = average_words_per_line * total_lines
    
    # Calculate sampling probability
    sampling_probability = word_samples / estimated_total_word_count
    
    # Second pass to perform sampling
    with lzma.open(dataset_paths, mode='rt', encoding='utf-8') as file:
        line_number = 0
        for line in file:
            line_number += 1
            word_count = len(line.split())
            
            # Decide whether to include this line based on the calculated probability
            if random.random() < sampling_probability:
                sampled_lines.append(line.strip())
                line_numbers.append(line_number)
                total_current_word_count += word_count
                
            # Stop if we've reached the target word count
            if total_current_word_count >= word_samples:
                break
    
    # Construct the result dictionary
    result = {
        "id": line_numbers,
        "text": sampled_lines,
        "domain": ["commoncrawl"] * len(sampled_lines),
        "source": ["commoncrawl"] * len(sampled_lines)
    }
    
    return result

In [46]:
dataset_path = '/shared/4/datasets/CommonCrawl/en.txt.xz'
word_samples = 10
sampled_data = sample_common_crawl(dataset_path, word_samples)
print(sampled_data)

KeyboardInterrupt: 

### ThePile Samples (the pile is already shuffled, see https://arxiv.org/pdf/2101.00027)

In [21]:
pile_set_names = ['Gutenberg (PG-19)', 'StackExchange', 'OpenSubtitles', 'Github', 'Pile-CC' ,'DM Mathematics']
# word_counts = [50000000, 200000000, 50000000, 50000000, 100000000, 20000000]
word_counts = [10 for _ in range(len(pile_set_names))]

In [43]:
import os
import zstandard as zstd
import json

def read_lines_from_zst(file_path):
    """Generator to read lines from a .zst compressed file"""
    with open(file_path, 'rb') as f:
        dctx = zstd.ZstdDecompressor()
        with dctx.stream_reader(f) as reader:
            buffer = ''
            while True:
                chunk = reader.read(1024)  # Read in chunks of 1024 bytes
                if not chunk:
                    break
                buffer += chunk.decode('utf-8')
                lines = buffer.split('\n')
                buffer = lines.pop()
                for line in lines:
                    yield line

def sample_pile(pile_set_names, word_counts):
    dir_path = "/shared/4/datasets/thepile/pile/train"
    zst_files = [f for f in os.listdir(dir_path) if f.endswith('.jsonl.zst')]
    
    sampled_lines = []
    sampled_texts = []
    domains = []

    # Dictionary to keep track of the word count for each pile set name
    word_counts_dict = dict(zip(pile_set_names, word_counts))
    current_word_counts = {name: 0 for name in pile_set_names}
    
    line_counter = 0

    def should_continue_sampling():
        """Check if we need to continue sampling, so ANY requirement not yet fulfilled"""
        return any(current_word_counts[name] < word_counts_dict[name] for name in pile_set_names)
    
    for filename in zst_files:
        if not should_continue_sampling():
            break

        
        file_path = os.path.join(dir_path, filename)
        for line in read_lines_from_zst(file_path):
            if not should_continue_sampling():
                break

            try:
                data = json.loads(line)
                pile_set_name = data.get('meta', {}).get('pile_set_name')
                if (pile_set_name in pile_set_names) and (current_word_counts[pile_set_name]<word_counts_dict[pile_set_name]):
                    text = data.get('text', '')
                    text_word_count = len(text.split())
                    sampled_lines.append(line_counter)
                    sampled_texts.append(text)
                    domains.append(pile_set_name)
                    current_word_counts[pile_set_name] += text_word_count
            except json.JSONDecodeError:
                print("decode error")
                continue
            line_counter += 1

    # Return the sampled data
    return {
        "id": sampled_lines,
        "domain": domains,
        "source": ["thePile"] * len(sampled_texts),
        "text": sampled_texts,
    }


In [44]:
result = sample_pile(pile_set_names, word_counts)
print(result)



### YouTube Commons Sample

In [1]:
from dask.distributed import Client
import dask.dataframe as dd

def sample_YouTubeCommons(required_word_count):    
    # Load the dataset using Dask
    dataset = dd.read_parquet('/shared/3/datasets/YouTube-Commons')
     print(f"Dataset loaded")

    # Filter the dataset where both transcription_language and original_language are 'en'
    filtered_dataset = dataset[(dataset['transcription_language'] == 'en') & (dataset['original_language'] == 'en')]
    print(f"Dataset filtered")
    
    # Debugging step: Check the number of rows after filtering
    filtered_count = filtered_dataset.shape[0].compute()
    print(f"Number of rows after filtering: {filtered_count}")

    if filtered_count == 0:
        raise ValueError("No data found after filtering. Check the filter conditions or data content.")

    # Shuffle the dataset
    shuffled_dataset = filtered_dataset.sample(frac=1, random_state=42)

    # Compute the shuffled dataset
    filtered_df = shuffled_dataset.compute()

    # Initialize variables to store the sampled texts and the accumulated word count
    sampled_texts = []
    total_word_count = 0
    sampled_ids = []

    # Iterate through the shuffled dataframe and sample texts until the required word count is reached
    for _, row in filtered_df.iterrows():
        sampled_texts.append(row['text'])
        total_word_count += row['word_count']
        sampled_ids.append(row['video_id'])
        if total_word_count >= required_word_count:
            break
    
    # Ensure that we have sampled enough texts to meet the word count requirement
    if total_word_count < required_word_count:
        raise ValueError("Not enough data to meet the required word count")
    
    return {
        "id": sampled_ids,
        "domain": ["YouTubeCommons"] * len(sampled_texts),
        "source": ["YouTubeCommons"] * len(sampled_texts),
        "text": sampled_texts,
    }



In [2]:
# Example usage
try:
    sampled_data = sample_YouTubeCommons(100)
    print(sampled_data)
except Exception as e:
    print(f"Error: {e}")

Number of rows after filtering: 1489377
{'id': ['UJiOpInAP4U'], 'domain': ['YouTubeCommons'], 'source': ['YouTubeCommons'], 'text': ["tonight live from the scullery theater from the impersonator capital of the world in the heart of downtown las vegas we present the downtown podcast starring your host mr tonight's guest from rat pack las vegas peter pavone and from real gaming lawrence vaughn and now ladies and gentlemen gentlemen let's give it up for the man who doesn't mean jerry seinfeld thank you thank you guys oh that was good yeah jerry scifield you already knew jerry what's to do with all the sniffles we're doing a podcast it's not a late night show we got late night show here we got a podcast here it's the audio it's a radio what is it what's the dude i don't know sorry all right i tried it tried we have an impersonator on tonight and i wanted to impress him i don't think i did nope doesn't look impressed okay so my favorite thing happened this last week uh there was a flight fr

### s2orc sample

In [3]:
import os
import json
import random

s2orc_path = "/shared/3/projects/citation-context/s2orc/s2orc"

def count_words(text):
    return len(text.split())

def read_files_and_sample(path, target_word_count):
    word_count = 0
    sampled_ids = []
    sampled_texts = []
    
    files = [os.path.join(path, f) for f in os.listdir(path) if f.startswith('s2orc_')]
    num_files = len(files)
    words_per_file = target_word_count // num_files
    
    for file_path in files:
        with open(file_path, 'r') as f:
            lines = f.readlines()
            total_lines = len(lines)
            random_indices = random.sample(range(total_lines), total_lines)  # Shuffle the line indices
            
            for idx in random_indices:
                if word_count >= target_word_count:
                    break
                
                line = json.loads(lines[idx])
                corpusid = line.get('corpusid')
                text = line.get('content', {}).get('text', "")
                words = text.split()
                
                if word_count + len(words) > target_word_count:
                    remaining_words = target_word_count - word_count
                    sampled_texts.append(" ".join(words[:remaining_words]))
                    sampled_ids.append(corpusid)
                    word_count += remaining_words
                    break
                else:
                    sampled_texts.append(text)
                    sampled_ids.append(corpusid)
                    word_count += len(words)
                
                if word_count >= words_per_file * (files.index(file_path) + 1):
                    break

        if word_count >= target_word_count:
            break

    return sampled_ids, sampled_texts

def sample_s2orc(required_word_count):
    sampled_ids, sampled_texts = read_files_and_sample(s2orc_path, required_word_count)
    return {
        "id": sampled_ids,
        "domain": ["s2orc"] * len(sampled_texts),
        "source": ["s2orc"] * len(sampled_texts),
        "text": sampled_texts,
    }


SyntaxError: incomplete input (2723164150.py, line 1)

In [None]:
sampled_data = sample_s2orc(10)
print(sampled_data)

## Stats on fitting corpora

In [5]:
from datasets import load_from_disk

# Load the dataset from the specified path with train/dev/test splits
dataset_path = "/shared/3/projects/hiatus/TOKENIZER_wegmann/data/fitting-corpora/wikipedia"
dataset = load_from_disk(dataset_path)

Loading dataset from disk:   0%|          | 0/20 [00:00<?, ?it/s]

In [9]:
train_dataset = dataset['train']
dev_dataset = dataset['dev']
test_dataset = dataset['test']

In [10]:
    print(f"Train word count: {sum(train_dataset['word_count'])}")
    print(f"Dev word count: {sum(dev_dataset['word_count'])}")
    print(f"Test word count: {sum(test_dataset['word_count'])}")

Train word count: 1469999792
Dev word count: 15000029
Test word count: 15000087


In [12]:
if "domain" in train_dataset[0]:
    for split in [train_dataset, dev_dataset, test_dataset]:
        domain_word_count = {}
        for domain, word_count in zip(split["domain"], split["word_count"]):
            if domain not in domain_word_count:
                domain_word_count[domain] = 0
            domain_word_count[domain] += word_count
        print(f"Domain word count distribution for {split}: {domain_word_count}")

            

In [18]:
def count_words(text):
    return len(text.split())

In [20]:
import bz2
import json
data = []
file_path = '/nfs/locker/twitter-decahose-locker/2021/decahose.2021-05-18.p2.bz2'
total_word_count = 0
with bz2.open(file_path, 'rt') as file:
    for line in file:
        tweet = json.loads(line)
        text = tweet.get("text", "")
        tweet_id = tweet.get("id", "")
        word_count = count_words(text)
        data.append({"id": tweet_id, "text": text, "word_count": word_count})
        total_word_count += word_count
        if total_word_count > 3000000:
            break
print(data)

EOFError: Compressed file ended before the end-of-stream marker was reached