### on SADIRI 1%: Do Candidate/Query Overlap: No

In [None]:
# check the overlap between candidate and query files
from datasets import load_from_disk
directory_path = '/shared/3/projects/hiatus/aggregated_trainset_v2/content_masking_research/down_1_shuffle/train'

In [6]:
# Load the dataset
dataset = load_from_disk(directory_path)

# Display the first few rows of the train split
print(dataset['train'])

Dataset({
    features: ['query_id', 'query_authorID', 'query_text', 'candidate_id', 'candidate_authorID', 'candidate_text'],
    num_rows: 40796
})


In [7]:
# Extract the query_id and candidate_id columns from the train split
query_ids = set(dataset['train']['query_id'])
candidate_ids = set(dataset['train']['candidate_id'])

# Find the intersection of query_id and candidate_id
overlap = query_ids.intersection(candidate_ids)

# Check if there is any overlap
if overlap:
    print("There are overlapping IDs between query_id and candidate_id:")
    print(overlap)
else:
    print("There are no overlapping IDs between query_id and candidate_id.")

There are no overlapping IDs between query_id and candidate_id.


## On SADIRI 100%, amazon: do candidate/query overlap?: No

In [18]:
import pandas as pd
amazon_base = '/shared/3/projects/hiatus/aggregated_trainset_v2/content_masking_research/amazon/'
# Load the train_queries.jsonl file
queries_path = amazon_base + 'train_queries.jsonl'
queries_df = pd.read_json(queries_path, lines=True)

# Load the train_candidates.jsonl file
candidates_path = amazon_base + 'train_candidates.jsonl'
candidates_df = pd.read_json(candidates_path, lines=True)

# Print the keys of the train_candidates.jsonl DataFrame
print("\nKeys in the train_candidates.jsonl DataFrame:")
print(candidates_df.columns)
print(candidates_df[:1])

# Extract query_id and candidate_id from the jsonl datasets
query_ids_jsonl = set(queries_df['documentID'])
candidate_ids_jsonl = set(candidates_df['documentID'])

# Check for overlap between query_id and candidate_id
overlap = query_ids_jsonl.intersection(candidate_ids_jsonl)

if overlap:
    print("There are overlapping IDs between query_id and candidate_id:")
    print(overlap)
else:
    print("There are no overlapping IDs between query_id and candidate_id.")


Keys in the train_candidates.jsonl DataFrame:
Index(['documentID', 'authorIDs', 'fullText', 'spanAttribution', 'isNeedle',
       'collectionNum', 'source', 'dateCollected', 'publiclyAvailable',
       'deidentified', 'languages', 'lengthWords', 'dateCreated',
       'timeCreated', 'sourceSpecific'],
      dtype='object')
                             documentID                 authorIDs  \
0  7041c0a5-6ef6-4e41-8630-5a8409583d89  ['A0103849GBVWICKXD4T6']   

                                            fullText  \
0  Bought this as a possible EDC item, but found ...   

                                     spanAttribution  isNeedle collectionNum  \
0  [{'authorID': 'A0103849GBVWICKXD4T6', 'start':...     False        amazon   

   source dateCollected  publiclyAvailable  deidentified languages  \
0  amazon    2022-12-31               True          True      [en]   

   lengthWords  dateCreated  timeCreated  \
0           72  09 15, 2014  09 15, 2014   

                                

## Get the size of full (not with held-out) SADIRI TRAIN dataset in terms of #words

In [2]:
project_base = "/shared/3/projects/hiatus/aggregated_trainset_v2/content_masking_research/"
data_folders = ["amazon", "ao3", "bookcorpus", "gmane", "nytimes-articles-and-comments", "pubmed", "realnews", "reddit", "stackexchange", "wiki_articles"]
data_folders = [project_base + folder_name for folder_name in data_folders]

In [5]:
import pandas as pd
import os

def count_words_in_fullText(file_path):
    # Load the JSONL file into a DataFrame
    df = pd.read_json(file_path, lines=True)
    
    # Check if the 'fullText' column exists
    if 'fullText' in df.columns:
        # Count the number of words in the 'fullText' column
        word_count = df['fullText'].str.split().str.len().sum()
        return word_count
    else:
        return 0

def process_datasets(dataset_paths):
    total_word_count = 0
    word_counts = {}
    
    for dataset_path in dataset_paths:
        dataset_name = os.path.basename(dataset_path)
        
        # Initialize word count for the dataset
        dataset_word_count = 0
        
        # Process only the specified JSONL files in the dataset folder
        for file_name in ['train_queries.jsonl', 'train_candidates.jsonl']:
            file_path = os.path.join(dataset_path, file_name)
            if os.path.exists(file_path):
                file_word_count = count_words_in_fullText(file_path)
                dataset_word_count += file_word_count
        
        # Store the word count for the dataset
        word_counts[dataset_name] = dataset_word_count
        total_word_count += dataset_word_count
    
    return word_counts, total_word_count

In [33]:
# Process the datasets and get the word counts
word_counts, total_word_count = process_datasets(data_folders)

# Print the word counts for each dataset
for dataset_name, word_count in word_counts.items():
    print(f"Total number of words in {dataset_name}: {word_count}")

# Print the overall total word count
print(f"Overall total number of words: {total_word_count}")

Total number of words in amazon: 31650279
Total number of words in ao3: 573926907
Total number of words in bookcorpus: 57367225
Total number of words in gmane: 141837101
Total number of words in nytimes-articles-and-comments: 24131163
Total number of words in pubmed: 9748317
Total number of words in realnews: 272933709
Total number of words in reddit: 446769021
Total number of words in stackexchange: 153991860
Total number of words in wiki_articles: 34779747
Overall total number of words: 1747135329


In [32]:
word_counts, total_word_count

({'': 34779747}, 1747135329)

### held out SADIRI dataset

In [3]:
data_folders = ["blogcorpus", "food.com-recipes", "sfu-socc", "goodreads", "wiki_discussions"]
data_folders = [project_base + folder_name for folder_name in data_folders]

In [6]:
# Process the datasets and get the word counts
word_counts, total_word_count = process_datasets(data_folders)

# Print the word counts for each dataset
for dataset_name, word_count in word_counts.items():
    print(f"Total number of words in {dataset_name}: {word_count}")

# Print the overall total word count
print(f"Overall total number of words: {total_word_count}")

Total number of words in blogcorpus: 8189607
Total number of words in food.com-recipes: 4346765
Total number of words in sfu-socc: 3007117
Total number of words in goodreads: 53683977
Total number of words in wiki_discussions: 40827678
Overall total number of words: 110055144


## peak at shared/3/datasets/wikipedia/enwiki/pages-articles/enwiki-20230601-pages-articles.clean-text.txt.gz 

In [1]:
wiki_path = "/shared/3/datasets/wikipedia/enwiki/pages-articles/enwiki-20230601-pages-articles.clean-text.txt.gz"

In [2]:
import gzip

def peek_gzipped_file(file_path, num_lines=10):
    with gzip.open(file_path, 'rt', encoding='utf-8') as file:
        for _ in range(num_lines):
            print(next(file))

In [3]:
peek_gzipped_file(wiki_path, num_lines=10)

'Fred' Watson founded the Patent Collapsible Sidecar Company, later renamed to Watsonian Folding Sidecar Company Ltd, after he built a folding sidecar that allowed him to get his motorcycle and sidecar combination through a narrow entrance to the yard at his house.

The first round was held on August 7.

Born in Guildford, Darren is the brother of former England seam bowler Martin Bicknell. However, he struggled to have similar international impact in the 1990s, the selectors instead preferring players such as Jason Gallian and Mark Lathwell. Most of Bicknell's career was spent at Surrey, but he finished his career with seven seasons at Nottinghamshire, before retiring at the end of the 2006 season.

Kairos Na Pame Parakato is a Greek language album by Antonis Remos from 1998 which sold more than 100,000 copies and was certified double platinum."

Thomas Valente (portrayed by Daniel Benzali) was a high-ranking official in the domestic nuclear detection office of the Department of Homel

## Sample 1.5 billion words from wikipedia

In [8]:
import gzip
import random
from datasets import Dataset
import os

def count_words_in_line(line):
    return len(line.split())

def extract_random_lines(file_path, target_word_count=1_500_000_000, seed=42):
    random.seed(seed)
    total_word_count = 0
    selected_lines = []

    with gzip.open(file_path, 'rt', encoding='utf-8') as file:
        lines = file.readlines()
        random.shuffle(lines)

        for line_number, line in enumerate(lines, 1):
            line_word_count = count_words_in_line(line)
            if total_word_count + line_word_count > target_word_count:
                break
            total_word_count += line_word_count
            selected_lines.append((line_number, line))

    return selected_lines

def save_to_huggingface_format(lines, output_path):
    dataset_dict = {
        "line_number": [line_number for line_number, _ in lines],
        "text": [text for _, text in lines]
    }
    dataset = Dataset.from_dict(dataset_dict)
    dataset.save_to_disk(output_path)

def process_wikipedia_file(input_path, output_path):
    lines = extract_random_lines(input_path)
    save_to_huggingface_format(lines, output_path)

# Replace with your actual file path and desired output path
input_path = "/shared/3/datasets/wikipedia/enwiki/pages-articles/enwiki-20230601-pages-articles.clean-text.txt.gz"
output_path = "/shared/3/projects/hiatus/TOKENIZER_wegmann/data/fitting-corpora/wikipedia"

process_wikipedia_file(input_path, output_path)

Saving the dataset (0/20 shards):   0%|          | 0/25383803 [00:09<?, ? examples/s]

In [5]:
from datasets import load_from_disk

output_path = "/shared/3/projects/hiatus/TOKENIZER_wegmann/data/fitting-corpora/wikipedia"
dataset = load_from_disk(output_path)
first_five_lines = dataset.select(range(5))
for row in first_five_lines:
    print(f"Line {row['line_number']}: {row['text']}")

Loading dataset from disk:   0%|          | 0/20 [00:00<?, ?it/s]

Line 1: Hønefoss in Storelva (Ringerike) valley is left by the line northwest into the Sogna valley, where the route uses its northern valley flanks to Sokna. To get to the Krøderen (lake) valley, Bergen Line follows Rudselva, passes by Langevannet and Breidvannet lakes and also through the 2.3 kilometer Haversting tunnel, which runs parallel to the Norwegian National Road 7, (), Ørgenvik tunnel. Hallingdal valley is used as a natural corridor further north-west, mostly on the left river bank, serving the larger settlements of Flå and Nesbyen. At Svenkerud to the north of Nesbyen the valley flank is changed. Soutwestbound to Gol, the ascent becomes gradually steeper in the following section to Geilo.

Line 2: The Korovin pistol (Пистолет Коровина, Тульский Коровин (ТК), GAU Index 56-A-112) is regarded as the first Soviet semi-automatic pistol.

Line 3: Due to COVID-19 pandemic, this year's Indonesia International Motor Show was split into two events, the IIMS Virtual, which was held vi

## Sample 1.5 billion words from 2021 Twitter API Samples

In [1]:
import bz2
import json
import os
from datasets import Dataset

def count_words(text):
    return len(text.split())

def process_file(file_path, target_word_count_per_file, data):
    cumulative_word_count = 0
    with bz2.open(file_path, 'rt') as file:
        for line in file:
            tweet = json.loads(line)
            text = tweet.get("text", "")
            tweet_id = tweet.get("id", "")
            word_count = count_words(text)
            
            cumulative_word_count += word_count
            data.append({"id": tweet_id, "text": text})
            
            if cumulative_word_count >= target_word_count_per_file:
                return data, cumulative_word_count
    return data, cumulative_word_count


def sample_texts_from_files(directory, target_word_count):
    """
        get the number of compressed files and determine how much to sample from each file,
        st target word count is distributed equally across bz2 files
    """
    files = [os.path.join(root, file_name)
             for root, _, files in os.walk(directory)
             for file_name in sorted(files)
             if not "2022" in file_name and (file_name.endswith('p2.bz2') or file_name.endswith('p1.bz2'))]
    print(files)
    num_files = len(files)
    target_word_count_per_file = target_word_count // num_files
    data = []

    for filen_index, file_path in enumerate(files):
        print(f"At file {filen_index}/{num_files} called {file_path}")
        print(f"Extracting {target_word_count_per_file} words")
        data, _ = process_file(file_path, target_word_count_per_file, data)

    return data

In [2]:
directory = '/nfs/locker/twitter-decahose-locker/2021'
output_path = '/shared/3/projects/hiatus/TOKENIZER_wegmann/data/fitting-corpora/twitter'
target_word_count = 1_500_000_000
data = sample_texts_from_files(directory, target_word_count)
dataset = Dataset.from_dict({"id": [item["id"] for item in data], "text": [item["text"] for item in data]})
dataset.save_to_disk(output_path)

['/nfs/locker/twitter-decahose-locker/2021/decahose.2021-01-01.p1.bz2', '/nfs/locker/twitter-decahose-locker/2021/decahose.2021-01-01.p2.bz2', '/nfs/locker/twitter-decahose-locker/2021/decahose.2021-01-02.p1.bz2', '/nfs/locker/twitter-decahose-locker/2021/decahose.2021-01-02.p2.bz2', '/nfs/locker/twitter-decahose-locker/2021/decahose.2021-01-03.p1.bz2', '/nfs/locker/twitter-decahose-locker/2021/decahose.2021-01-03.p2.bz2', '/nfs/locker/twitter-decahose-locker/2021/decahose.2021-01-04.p1.bz2', '/nfs/locker/twitter-decahose-locker/2021/decahose.2021-01-04.p2.bz2', '/nfs/locker/twitter-decahose-locker/2021/decahose.2021-01-05.p1.bz2', '/nfs/locker/twitter-decahose-locker/2021/decahose.2021-01-05.p2.bz2', '/nfs/locker/twitter-decahose-locker/2021/decahose.2021-01-06.p1.bz2', '/nfs/locker/twitter-decahose-locker/2021/decahose.2021-01-06.p2.bz2', '/nfs/locker/twitter-decahose-locker/2021/decahose.2021-01-07.p1.bz2', '/nfs/locker/twitter-decahose-locker/2021/decahose.2021-01-07.p2.bz2', '/nfs

At file 1/723 called /nfs/locker/twitter-decahose-locker/2021/decahose.2021-01-01.p2.bz2
Extracting 2074688 words
At file 2/723 called /nfs/locker/twitter-decahose-locker/2021/decahose.2021-01-02.p1.bz2
Extracting 2074688 words
At file 3/723 called /nfs/locker/twitter-decahose-locker/2021/decahose.2021-01-02.p2.bz2
Extracting 2074688 words
At file 4/723 called /nfs/locker/twitter-decahose-locker/2021/decahose.2021-01-03.p1.bz2
Extracting 2074688 words
At file 5/723 called /nfs/locker/twitter-decahose-locker/2021/decahose.2021-01-03.p2.bz2
Extracting 2074688 words
At file 6/723 called /nfs/locker/twitter-decahose-locker/2021/decahose.2021-01-04.p1.bz2
Extracting 2074688 words
At file 7/723 called /nfs/locker/twitter-decahose-locker/2021/decahose.2021-01-04.p2.bz2
Extracting 2074688 words
At file 8/723 called /nfs/locker/twitter-decahose-locker/2021/decahose.2021-01-05.p1.bz2
Extracting 2074688 words
At file 9/723 called /nfs/locker/twitter-decahose-locker/2021/decahose.2021-01-05.p2.bz2

KeyboardInterrupt: 