# Build a simple data set
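This notebook downloads the `skeskinen/books3_basic_sentenses_paraphrased` dataset from Hugging Face, encodes each paraphrased sentence as Morse code, removes duplicate rows, and publishes the result as a simple dataset for training a language model.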

In [59]:
from datasets import load_dataset
# Load the "train" split of the source sentence/paraphrase dataset by its
# Hugging Face repository ID.
ds = load_dataset("skeskinen/books3_basic_sentenses_paraphrased", split="train")

README.md:   0%|          | 0.00/528 [00:00<?, ?B/s]

(…)-00000-of-00001-630edd713726adb8.parquet:   0%|          | 0.00/44.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/670204 [00:00<?, ? examples/s]

In [60]:
# Convert the loaded dataset to a pandas DataFrame so it can be inspected.
df = ds.to_pandas()
# Bare last expression: the notebook renders the frame as this cell's output.
df

Unnamed: 0,text,book,pos,smog_index,paraphrase
0,This title was originally cataloged by the Lib...,Are You My Mother_ - P.D. Eastman,0.014706,4.8,The title was cataloged by the Library of Cong...
1,The egg jumped.,Are You My Mother_ - P.D. Eastman,0.113971,4.8,The egg flew into the air.
2,So away she went.,Are You My Mother_ - P.D. Eastman,0.158088,4.8,She went away.
3,"""Where is my mother?""",Are You My Mother_ - P.D. Eastman,0.191176,4.8,Where is my mother?
4,He looked for her.,Are You My Mother_ - P.D. Eastman,0.202206,4.8,He was looking for her.
...,...,...,...,...,...
670199,He seemed to be in his early twenties.,A Gathering of Old Men,0.280271,5.6,He was in his twenties.
670200,"He was about five eight, and weighed round a h...",A Gathering of Old Men,0.280271,5.6,He was around 100 and forty pounds.
670201,Even from this distance you could see he was s...,A Gathering of Old Men,0.280271,5.6,He was scared even from this distance.
670202,"He was unarmed, and he reached back into the c...",A Gathering of Old Men,0.280271,5.6,He reached into the car for a gun and was not ...


In [61]:
import unicodedata

def normalize_text(text):
    """Return ``text`` with diacritics and other combining marks removed.

    The string is first decomposed with Unicode NFKD, which splits accented
    characters into a base character followed by combining marks (and expands
    compatibility characters such as ligatures). Every character whose
    combining class is non-zero is then dropped.

    Nothing else is altered, so the result may still contain non-ASCII
    characters that have no decomposition.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    # combining() returns 0 for base characters and non-zero for marks.
    base_chars = filter(lambda ch: unicodedata.combining(ch) == 0, decomposed)
    return ''.join(base_chars)

In [62]:
import re
# Pre-compiled pattern matching a single newline, comma, or semicolon.
# NOTE(review): no other cell visible in this notebook references re_split —
# confirm whether it is still needed.
re_split = re.compile(r'[\n,;]')

In [83]:
from encode import encode_to_morse

def apply_encoding(batch):
    """Encode a batch of paraphrased sentences as Morse code.

    Written as a batched mapping function: it receives a mapping of column
    name to a list of values and returns new columns of the same length.

    Args:
        batch: Mapping containing a ``"paraphrase"`` list of strings; no
            other key is read.

    Returns:
        A dict with two equal-length lists: ``"line"`` holds the paraphrase
        strings unchanged, and ``"morse"`` holds ``encode_to_morse`` applied
        to each string after ``normalize_text``.
    """
    sentences = list(batch["paraphrase"])
    # skip_unknown=True presumably drops characters that have no Morse
    # mapping — confirm against the encode module.
    encoded = [
        encode_to_morse(normalize_text(sentence), skip_unknown=True)
        for sentence in sentences
    ]
    return {"line": sentences, "morse": encoded}


In [84]:
# Run apply_encoding over the dataset in batches across 8 worker processes.
# The five listed source columns are removed, leaving the "line" and "morse"
# columns that apply_encoding returns.
new_ds = ds.map(
    apply_encoding,
    batched=True,
    num_proc=8,
    remove_columns=["paraphrase", "text", "book", "pos", "smog_index"],
)
# Render the encoded dataset as a DataFrame for inspection.
new_ds.to_pandas()

Map (num_proc=8):   0%|          | 0/670204 [00:00<?, ? examples/s]

Unnamed: 0,line,morse
0,The title was cataloged by the Library of Cong...,- .... . / - .. - .-.. . / .-- .- ... / -.-. ....
1,The egg flew into the air.,- .... . / . --. --. / ..-. .-.. . .-- / .. -....
2,She went away.,... .... . / .-- . -. - / .- .-- .- -.-- .-.-.-
3,Where is my mother?,.-- .... . .-. . / .. ... / -- -.-- / -- --- -...
4,He was looking for her.,.... . / .-- .- ... / .-.. --- --- -.- .. -. -...
...,...,...
670199,He was in his twenties.,.... . / .-- .- ... / .. -. / .... .. ... / - ...
670200,He was around 100 and forty pounds.,.... . / .-- .- ... / .- .-. --- ..- -. -.. / ...
670201,He was scared even from this distance.,.... . / .-- .- ... / ... -.-. .- .-. . -.. / ...
670202,He reached into the car for a gun and was not ...,.... . / .-. . .- -.-. .... . -.. / .. -. - --...


In [85]:
import pandas as pd
from datasets import Dataset

# Round-trip through pandas to drop rows that are exact duplicates across all
# columns. To deduplicate on the sentence alone, call
# drop_duplicates(subset=["line"]) instead.
df = new_ds.to_pandas().drop_duplicates()

# Rebuild a Hugging Face Dataset; preserve_index=False keeps the pandas index
# from being stored as an extra column.
deduped_dataset = Dataset.from_pandas(df, preserve_index=False)

# Render the deduplicated dataset for inspection.
deduped_dataset.to_pandas()

Unnamed: 0,line,morse
0,The title was cataloged by the Library of Cong...,- .... . / - .. - .-.. . / .-- .- ... / -.-. ....
1,The egg flew into the air.,- .... . / . --. --. / ..-. .-.. . .-- / .. -....
2,She went away.,... .... . / .-- . -. - / .- .-- .- -.-- .-.-.-
3,Where is my mother?,.-- .... . .-. . / .. ... / -- -.-- / -- --- -...
4,He was looking for her.,.... . / .-- .- ... / .-.. --- --- -.- .. -. -...
...,...,...
531971,Where did you get those?,.-- .... . .-. . / -.. .. -.. / -.-- --- ..- /...
531972,They're Uncle Clarry's.,- .... . -.-- .----. .-. . / ..- -. -.-. .-.. ...
531973,She could see the back porch and steps.,... .... . / -.-. --- ..- .-.. -.. / ... . . /...
531974,Theo said that he looked normal.,- .... . --- / ... .- .. -.. / - .... .- - / ....


In [86]:
# Spot-check one row: print the stored record, then decode its Morse string
# back to text to check that special characters were normalized.
from decode import decode_from_morse

sample_row = deduped_dataset[15687]
print(sample_row)
print(decode_from_morse(sample_row["morse"]))


{'line': "Domino's pink nose twitched as he crawled out from under the table.", 'morse': '-.. --- -- .. -. --- .----. ... / .--. .. -. -.- / -. --- ... . / - .-- .. - -.-. .... . -.. / .- ... / .... . / -.-. .-. .- .-- .-.. . -.. / --- ..- - / ..-. .-. --- -- / ..- -. -.. . .-. / - .... . / - .- -... .-.. . .-.-.-'}
DOMINO'S PINK NOSE TWITCHED AS HE CRAWLED OUT FROM UNDER THE TABLE.


Change the repository ID in the next cell to your own Hugging Face repository if needed.

In [None]:
# Upload the deduplicated dataset to the Hugging Face Hub under the given
# repository ID; private=False makes the repository public.
deduped_dataset.push_to_hub("philipfourie/books3_basic_sentenses_paraphrased-Morse", private=False)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/532 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/philipfourie/books3_basic_sentenses_paraphrased-Morse/commit/b7b7719e52db13b28540a48e6d8b80a299c8c01f', commit_message='Upload dataset', commit_description='', oid='b7b7719e52db13b28540a48e6d8b80a299c8c01f', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/philipfourie/books3_basic_sentenses_paraphrased-Morse', endpoint='https://huggingface.co', repo_type='dataset', repo_id='philipfourie/books3_basic_sentenses_paraphrased-Morse'), pr_revision=None, pr_num=None)

: 