In [1]:
import pandas as pd


def export_corpus_txt(pkl_path, txt_path):
    """Write a pickled, tokenized corpus to a text file, one row per line.

    Every entry of the DataFrame's ``"text"`` column is joined with single
    spaces and written as one newline-terminated line, in row order.

    Parameters
    ----------
    pkl_path : str or path-like
        Pickle file to load with ``pandas.read_pickle``. The loaded object
        must provide a ``"text"`` column whose entries are iterables of str.
    txt_path : str or path-like
        Destination file. It is created, or truncated if it already exists.

    Raises
    ------
    KeyError
        If the loaded DataFrame has no ``"text"`` column.
    TypeError
        If an entry contains non-string items (raised by ``str.join``).
    """
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    df = pd.read_pickle(pkl_path)
    # Pin the encoding so the output bytes do not depend on the platform's
    # default locale (which is not UTF-8 everywhere).
    with open(txt_path, "w", encoding="utf-8") as f:
        f.writelines(" ".join(row) + "\n" for row in df["text"])

# Flatten the pickled corpus into a space-separated text file, one row per line.
export_corpus_txt(
    pkl_path="data/tokenized/corpus_16.pkl",
    txt_path="data/tokenized/corpus_16.txt",
)

In [2]:
import pandas as pd
import json


def export_to_jsonl(pkl_path, jsonl_path):
    """Export the ``id`` and ``text`` columns of a pickled DataFrame as JSON Lines.

    One JSON object of the form ``{"id": ..., "text": ...}`` is written per
    row, in row order, each followed by a newline. Other columns are ignored.

    Parameters
    ----------
    pkl_path : str or path-like
        Pickle file to load with ``pandas.read_pickle``. The loaded object
        must provide ``"id"`` and ``"text"`` columns with JSON-serializable
        values.
    jsonl_path : str or path-like
        Destination file. It is created, or truncated if it already exists.

    Raises
    ------
    KeyError
        If the ``"id"`` or ``"text"`` column is missing.
    TypeError
        If a value cannot be serialized by ``json.dumps``.
    """
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    df = pd.read_pickle(pkl_path)
    # Pin the encoding so the output does not depend on the platform default.
    with open(jsonl_path, "w", encoding="utf-8") as f:
        # Walk the two needed columns directly: iterrows() builds a Series
        # for every row and does not preserve dtypes across columns.
        for row_id, text in zip(df["id"], df["text"]):
            # Serialize first, then write, so a failing record never leaves
            # a partially written line behind.
            f.write(json.dumps({"id": row_id, "text": text}) + "\n")

# Convert the segmented corpus pickle into a JSON Lines file (one record per line).
export_to_jsonl(
    pkl_path="data/tokenized/segmented_corpus_16.pkl",
    jsonl_path="data/tokenized/segmented_corpus_16.jsonl",
)

In [3]:
from datasets import load_dataset 

# Load the "train" split of the "mythezone/financial-corpus-a-share" dataset.
# The recorded output of this cell shows README.md and
# segmented_corpus_16.jsonl being fetched while the split is generated.
dataset = load_dataset("mythezone/financial-corpus-a-share", split="train")

# Sanity check: show the first record, then the number of rows and columns.
print(dataset[0])
print(f"Total examples: {len(dataset)}")
print(f"Total columns: {len(dataset.column_names)}")

README.md:   0%|          | 0.00/1.37k [00:00<?, ?B/s]

segmented_corpus_16.jsonl:   0%|          | 0.00/61.6M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

{'id': '1_0', 'text': ['7956', '6856', '8855', '7C68', '6746', '7877', '8A88', '8A7A', '8978', '0300', '8968', '8C7B', '7867', '8988', '7867', '8856', '7968', '6767', '8978', '6756', '8A79', '8866']}
Total examples: 300775
Total columns: 2


In [5]:
from transformers import PreTrainedTokenizerFast 

# Build a fast tokenizer from the serialized tokenizer definition on disk.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="data/tokenized/financial_tokenizer.json"
)

# Relies on `dataset` created by the load_dataset cell earlier in the notebook.
sample = dataset[0]["text"]
print(sample)

# The record stores a list of tokens; join them with single spaces to form the
# string handed to the tokenizer. return_tensors="pt" requests PyTorch tensors
# (the recorded output prints them as `tensor([[...]])`).
encoding = tokenizer(" ".join(sample), return_tensors="pt")
print(encoding.input_ids)
print(encoding.attention_mask)

['7956', '6856', '8855', '7C68', '6746', '7877', '8A88', '8A7A', '8978', '0300', '8968', '8C7B', '7867', '8988', '7867', '8856', '7968', '6767', '8978', '6756', '8A79', '8866']
tensor([[ 85, 100,  25, 414, 135,  11,  58,  37,   7, 383,  32,  78,   9,  21,
           9,  17,  23, 174,   7,  70,  19,  12]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
