# Intro

This notebook contains code to prepare any additional data that I used during the competition. For each dataset, I performed some preprocessing and then transformed the data to sentence embeddings.

# Setup

In [3]:
# Import dependencies
import gzip
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
plt.style.use('ggplot')
import seaborn as sns
import re
import math
import torch
from scipy.stats import truncnorm
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
from pathlib import Path
from datasets import load_dataset, concatenate_datasets
import gc
gc.enable()

In [4]:
# import os

# if os.getuid() == 0:
#     print("Running as root user.")
# else:
#     print("Not running as root user.")


In [5]:
import logging

# loggerオブジェクトの作成
logger = logging.getLogger('02log')

# Constants

In [6]:
BASE_INPUT = '/home/masa1357/Dockerdata/gitfile/kaggle_clrp_1st_place_solution/'
BASE_OUTPUT = '/home/masa1357/Dockerdata/gitfile/kaggle_clrp_1st_place_solution/'

# Functions

In [7]:
def create_dir_if_not_exist(out_dir):
    output_dir = Path(out_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    return output_dir

In [8]:
# a utility function to save a pandas dataframe to csv
# it will create directories if they don't exist
def df_to_csv(df, out_dir, out_file):
    output_dir = create_dir_if_not_exist(os.path.join(BASE_OUTPUT, out_dir))
    df.to_csv(output_dir / out_file)

In [9]:
def encode_and_save(sentences, out_dir, data_name, scores=None, model_name='paraphrase-TinyBERT-L6-v2'):
  model = SentenceTransformer(model_name)
  logger.info('Start encoding!')
  encoded = model.encode(sentences, convert_to_tensor=True)
  output_dir = create_dir_if_not_exist(os.path.join(BASE_OUTPUT, out_dir))
  output_dir = '/home/masa1357/Dockerdata/gitfile/kaggle_clrp_1st_place_solution/embeddings'  
  out_file = os.path.join(output_dir, 'encoded-' + data_name + '-' + model_name + '.pt')
  pairs = []
  for idx, sent in enumerate(sentences):
    pair = [sent, encoded[idx]]
    if scores:
      pair.append(score[idx])
  with open(out_file, 'wb') as f:
    torch.save(encoded, f)
    logger.info('save complited!')

In [10]:
def get_simple_wiki():
    simplewiki_path = os.path.join(BASE_OUTPUT, 'data/external/simplewiki-2020-11-01.jsonl.gz')
    if not os.path.exists(simplewiki_path):
        util.http_get('https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/simplewiki-2020-11-01.jsonl.gz', simplewiki_path)
    passages = []
    with gzip.open(simplewiki_path, 'rt', encoding='utf8') as fIn:
        for line in fIn:
            data = json.loads(line.strip())
            passages.extend(data['paragraphs'])
    return passages

In [11]:
def truncated_normal(mean=180, sd=17, low=135, high=205):
    """
    Return a number that belong to a normal distribution
    
    Parameters:
    -----------
    
    mean: (int/float)
        Mean of the distribution
        
    sd: (int/float)
        Standard deviation of the distribution
        
    low: (int/float)
        Lowest number fo the distribution
        
    high: (int/float)
    """
    return truncnorm( (low - mean) / sd, (high - mean) / sd, loc=mean, scale=sd ).rvs()

In [12]:
def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

In [13]:
def get_trainset_word_distribution(text):
  words = text.split()
  cut = math.floor(truncated_normal())
  chunked = chunks(words, cut)
  texts = [' '.join(c) for c in chunked]
  return texts

In [14]:
def clean_file(file):
  attribution = ''
  texts = []
  attribution_start = False
  current_text = ''
  max_len = truncated_normal()
  for ln in file:
    line = ln.strip()
    if line != '':
      if re.search('free to download', line) or attribution_start:
        attribution = attribution + ' ' + line 
        attribution_start = True
      else:
        if len(current_text) < max_len:
          current_text = current_text + ' ' + line
        else:
          texts.append(current_text)
          current_text = line
          max_len = truncated_normal()
  attributions = [attribution for _ in texts]
  return texts, attributions

In [15]:
def get_cb_corpus():
    in_dir = os.path.join(BASE_INPUT, 'data/external/cb_corpus.txt')
    chapters = []
    current_chapter = []
    
    with open(in_dir, 'r') as f:
        for line in tqdm(f):
          ln = line.strip()
          if ln[:7] == 'CHAPTER':
            chapters.append(current_chapter)
            current_chapter = []
          elif not re.match(r'_BOOK_TITLE_|-LCB-|-RCB-', ln) and ln != '':
            rand_div = truncated_normal()
            curr_len = len(' '.join(current_chapter).split(' '))
            if curr_len < rand_div:
              current_chapter.append(ln)
            else:
              chapters.append(current_chapter)
              current_chapter = []
    return chapters

# Wikipedia data

This data contains text snippets from Wikipedia. It was downloaded from https://huggingface.co/datasets/wikitext and some preprocessing was applied.

In [48]:
wikitext_dataset = load_dataset('wikitext', 'wikitext-103-v1')
logging.info('DL COMPLEATED')



  0%|          | 0/3 [00:00<?, ?it/s]

In [49]:
# apply some preprocessing
wikitext_train = wikitext_dataset['train']
wikitext_train = wikitext_train.filter(lambda example: len(example['text'])>100)

def replace_n(example):
  example['text'] = example['text'].replace('\n', ' ')
  return example

wikitext_train = wikitext_train.map(replace_n)

# we only want samples between 600 and 1100 characters
wikitext_train = wikitext_train.filter(lambda example: len(example['text']) < 1100 and len(example['text']) > 600)
logging.info('get wikitexttrain_data!')
# convert the dataset to a dataframe and save it
wikitext_train_pd = wikitext_train.to_pandas()
wikitext_train_pd.head()



Unnamed: 0,text
0,Senjō no Valkyria 3 : <unk> Chronicles ( Japa...
1,The game takes place during the Second Europa...
2,"Partly due to these events , and partly due t..."
3,"Famitsu enjoyed the story , and were particul..."
4,PlayStation Official Magazine - UK praised th...


In [50]:
df_to_csv(df=wikitext_train_pd, out_dir='data/preprocessed', out_file='wiki_snippets.csv')

In [51]:
# convert the dataset to sentence embeddings and save the result
wiki_snippets = wikitext_train_pd.text.tolist()
encode_and_save(sentences=wiki_snippets, out_dir='/embeddings', data_name='wiki_snippets')
gc.collect()

0

# SimpleWiki data

This data contains snippets from Simple Wiki. It was downloaded from https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/simplewiki-2020-11-01.jsonl.gz

In [52]:
simplewiki_snippets = get_simple_wiki()

# filter out snippets which are too long
simplewiki_filtered = [p for p in simplewiki_snippets if len(p) < 1200]

# convert the dataset to a dataframe and save it
simple_df = pd.DataFrame(simplewiki_filtered, columns=['text'])
simple_df.head()
df_to_csv(df=simple_df, out_dir='data/preprocessed', out_file='simplewiki.csv')

# convert the dataset to sentence embeddings and save the result
encode_and_save(sentences=simplewiki_filtered, out_dir='embeddings', data_name='simplewiki') 

# Bookcorpus data
This data contains part of the book corpus. It was downloaded from https://huggingface.co/datasets/bookcorpusopen

**Please note:**

Due to processing resource limitations, only 20% of the bookcorpus dataset were selected. I made the selection randomly. The code can still be used to see how I preprocessed the data, but the resulting selection may produce different results during model training.

In [16]:
import zstandard as zstd
import requests
import json

BASE_URL = "https://the-eye.eu/public/AI/pile/train/"

def download_and_decode_file(file_number, chunk_size=16384):
    # Construct the file name and URL
    file_name = f"{file_number:02d}.jsonl.zst"
    url = BASE_URL + file_name
    
    # Initialize decompressor
    dctx = zstd.ZstdDecompressor()
    
    # Data buffer for chunks
    buffer = b""
    
    # Final decompressed data
    decompressed_data = b""
    
    # Parse the jsonl data
    data = []
    
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        with dctx.stream_reader(response.raw) as reader:
            while True:
                chunk = reader.read(chunk_size)
                if not chunk:
                    break
                
                buffer += chunk
                lines = buffer.splitlines()

                # Keep the last partial line in the buffer for the next chunk
                buffer = lines.pop() if buffer[-1] != 10 else b""
                
                for line in lines:
                    data.append(json.loads(line))
    
    return data

# Example usage
data_00 = download_and_decode_file(0)
print(data_00[0])  # Print the first JSON object from the 00.jsonl.zst file



{'text': 'It is done, and submitted. You can play “Survival of the Tastiest” on Android, and on the web. Playing on the web works, but you have to simulate multi-touch for table moving and that can be a bit confusing.\n\nThere’s a lot I’d like to talk about. I’ll go through every topic, insted of making the typical what went right/wrong list.\n\nConcept\n\nWorking over the theme was probably one of the hardest tasks I had to face.\n\nOriginally, I had an idea of what kind of game I wanted to develop, gameplay wise – something with lots of enemies/actors, simple graphics, maybe set in space, controlled from a top-down view. I was confident I could fit any theme around it.\n\nIn the end, the problem with a theme like “Evolution” in a game is that evolution is unassisted. It happens through several seemingly random mutations over time, with the most apt permutation surviving. This genetic car simulator is, in my opinion, a great example of actual evolution of a species facing a challenge.

In [17]:
import json

with open(BASE_OUTPUT, "w") as f:
    json.dump(data_00, f)


IsADirectoryError: [Errno 21] Is a directory: '/home/masa1357/Dockerdata/gitfile/kaggle_clrp_1st_place_solution/'

In [2]:
from datasets import Dataset
# load the dataset
#bookcorpus = load_dataset('bookcorpusopen')
bookcorpus = Dataset.from_dict({'text': data_00})
# apply some preprocessing
#bookcorpus = bookcorpus['train'].remove_columns('title')

def process_batch(batch):
  out = []
  for text in batch['text']:
    out.extend(get_trainset_word_distribution(text))
  return {'text': out}

bookcorpus_chunked = bookcorpus.map(process_batch, batched=True, batch_size=1000)
bookcorpus_chunked = bookcorpus_chunked.filter(lambda example: len(example['text']) < 1200)

NameError: name 'data_00' is not defined

In [None]:

# convert to pandas, select 20% and save
bookcorpus_df = bookcorpus_chunked.to_pandas()
msk = np.random.rand(len(bookcorpus)) < 0.2
bookcorpus_02 = bookcorpus[msk]
print(type(bookcorpus_02))
bookcorpus_02 = pd.DataFrame(bookcorpus_02)
df = bookcorpus_02


<class 'dict'>


In [None]:

df_to_csv(df, out_dir='data/preprocessed', out_file='bookcorpus.csv')

# convert the dataset to sentence embeddings and save the result
bookcorpus_texts = bookcorpus_02.text.tolist()
encode_and_save(bookcorpus_texts, out_dir='embeddings', data_name='bookcorpus')
gc.collect()

0

# African Storybooks data

This data was downloaded manually from https://www.africanstorybook.org/ .
I downloaded all books starting from letter A up to and including letter D.
The downloaded books were converted from .epub to .txt using Calibre (`ebook-convert input.epub output.txt`).

The full bash script used to convert the books:
```
#!/bin/bash
for filename in *.epub; do
        ebook-convert $filename "$filename.txt"
done
```


In [16]:
# read in the data and clean the texts
in_dir = os.path.join(BASE_INPUT, 'data/external/a_d_txt')
all_texts = []
all_attributions = []
for file in os.listdir(in_dir):
  with open(os.path.join(in_dir, file), 'r') as f:
    txt, attr = clean_file(f)
    if txt != '' and attr != '':    
      all_texts.extend(txt)
      all_attributions.extend(attr)

# create and save as pandas dataframe
asb_df = pd.DataFrame.from_dict({'text': all_texts, 'attribution': all_attributions})
df_to_csv(df=asb_df, out_dir='data/preprocessed', out_file='asb.csv')

# convert the dataset to sentence embeddings and save the result
asb_sents = asb_df.text.tolist()
encode_and_save(sentences=asb_sents, out_dir='embeddings', data_name='asb')
gc.collect()

FileNotFoundError: [Errno 2] No such file or directory: '/home/masa1357/Dockerdata/gitfile/kaggle_clrp_1st_place_solution/data/external/a_d_txt'

# Scraped data
This dataset contains scraped data from wikipedia, wikibooks, simplewiki and kids.frontiersin.org. It was taken from https://www.kaggle.com/teeyee314/readability-url-scrape.

In [22]:
in_dir = os.path.join(BASE_INPUT, 'data/external/external.csv')
scraped_data = pd.read_csv(in_dir)

txts = []
for txt in scraped_data.external_text.values:
    if isinstance(txt, str):  # 確認して、txtが文字列型であることを保証する
        txts.extend(get_trainset_word_distribution(txt))

scraped_df = pd.DataFrame(txts, columns=['text'])
df_to_csv(df=scraped_df, out_dir='data/preprocessed', out_file='kaggle_scraped.csv')

scraped_sents = scraped_df.text.tolist()
encode_and_save(sentences=scraped_sents, out_dir='embeddings', data_name='kaggle_scraped')
gc.collect()

2023-08-11 09:10:14.442936: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


20

# Onestop Corpus data
This dataset was downloaded from https://huggingface.co/datasets/onestop_english

In [23]:
onestop_data = load_dataset('onestop_english')
onestop_data = onestop_data['train']
onestop_df = onestop_data.to_pandas()

df_to_csv(df=onestop_df, out_dir='data/preprocessed', out_file='onestop.csv')

onestop_sents = onestop_df.text.tolist()
encode_and_save(sentences=onestop_sents, out_dir='embeddings', data_name='onestop')
gc.collect()

Downloading builder script:   0%|          | 0.00/5.71k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.03k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/3.91k [00:00<?, ?B/s]

Downloading and preparing dataset onestop_english/default to /root/.cache/huggingface/datasets/onestop_english/default/1.1.0/6b19eec5680862ad1cf1990e98b06a98d1fa4c85f3585dc4dfab93f52b89d9cf...


Downloading data:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/567 [00:00<?, ? examples/s]

Dataset onestop_english downloaded and prepared to /root/.cache/huggingface/datasets/onestop_english/default/1.1.0/6b19eec5680862ad1cf1990e98b06a98d1fa4c85f3585dc4dfab93f52b89d9cf. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

9

# CC News data
This dataset was downloaded from https://huggingface.co/datasets/cc_news

In [24]:
news_data = load_dataset('cc_news')
news_data = news_data['train']
news_data = news_data.filter(lambda example: len(example['text']) < 1200)
news_df = pd.DataFrame(news_data['text'], columns=['text'])

df_to_csv(df=news_df, out_dir='data/preprocessed', out_file='news.csv')

news_sents = news_df.text.tolist()
encode_and_save(sentences=news_sents, out_dir='embeddings', data_name='news')
gc.collect()

Downloading builder script:   0%|          | 0.00/4.32k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.46k [00:00<?, ?B/s]

Downloading and preparing dataset cc_news/plain_text to /root/.cache/huggingface/datasets/cc_news/plain_text/1.0.0/e3d5612f02fe5f11826a0d9614328b1772e27e5d685f4ec438e7f768e4581734...


Downloading data:   0%|          | 0.00/845M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/708241 [00:00<?, ? examples/s]

Dataset cc_news downloaded and prepared to /root/.cache/huggingface/datasets/cc_news/plain_text/1.0.0/e3d5612f02fe5f11826a0d9614328b1772e27e5d685f4ec438e7f768e4581734. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Filter:   0%|          | 0/708241 [00:00<?, ? examples/s]

0

# Children's book corpus data
This dataset was downloaded from https://research.fb.com/downloads/babi/

In [26]:
cb_corpus = get_cb_corpus()
cb_corpus = [' '.join(c) for c in cb_corpus]
cb_corpus = pd.DataFrame(cb_corpus, columns=['text'])
cb_corpus.drop([0])

df_to_csv(df=cb_corpus, out_dir='data/preprocessed', out_file='cb_corpus.csv')

cb_sents = cb_corpus.text.tolist()
encode_and_save(sentences=cb_sents, out_dir='embeddings', data_name='cb_corpus')
gc.collect()

91760it [01:05, 1394.36it/s]


8388