# Build a simple dataset
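This notebook downloads Taylor Swift lyrics from Hugging Face, splits them into individual lines, encodes each line as Morse code, removes duplicate rows, and publishes the result as a simple dataset for training a language model.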

In [1]:
from datasets import load_dataset
# Load the "huggingartists/taylor-swift" lyrics dataset. The result is indexed
# by split name in the cells below (ds["train"]).
ds = load_dataset("huggingartists/taylor-swift")

In [2]:
# Convert the "train" split to a pandas DataFrame so the raw lyrics can be inspected.
df = ds["train"].to_pandas()
# Preview the first seven rows; as the last expression it becomes the cell output.
df.head(7)

Unnamed: 0,text
0,"Car rides to Malibu\nStrawberry ice cream, one..."
1,"Vintage tee, brand new phone\nHigh heels on co..."
2,"I can see you standing, honey\nWith his arms a..."
3,We could leave the Christmas lights up til Jan...
4,"Im doing good, Im on some new shit\nBeen sayin..."
5,I dont like your little games\nDont like your ...
6,"Betty, I wont make assumptions\nAbout why you ..."


In [3]:
import unicodedata

def normalize_text(text):
    """Return ``text`` with diacritical marks removed.

    The string is first decomposed with Unicode NFKD, which splits an accented
    character into its base character followed by combining marks. Every
    combining mark is then discarded and the remaining characters are joined
    back together.

    Args:
        text: The string to normalize.

    Returns:
        The decomposed string without combining characters. Characters that
        have no decomposition (for example a typographic apostrophe) are left
        as they are, so the result is not guaranteed to be pure ASCII.
    """
    kept_chars = []
    for char in unicodedata.normalize("NFKD", text):
        # combining() returns 0 for base characters and a non-zero class for
        # accents and other combining marks.
        if unicodedata.combining(char) == 0:
            kept_chars.append(char)
    return "".join(kept_chars)

In [4]:
from encode import encode_to_morse

def apply_encoding(batch):
    """Expand a batch of lyric texts into per-line rows with Morse encodings.

    Each entry of ``batch["text"]`` is split on newline characters. Every
    resulting line is normalized with ``normalize_text`` and encoded with
    ``encode_to_morse(..., skip_unknown=True)``.

    Args:
        batch: A mapping whose ``"text"`` entry is an iterable of strings.

    Returns:
        A dict with two parallel lists of equal length: ``"line"`` holds the
        original (un-normalized) lines in input order, and ``"morse"`` holds
        the encoding of each corresponding line. The number of rows generally
        differs from the number of input texts.
    """
    # Flatten all texts into a single list of lines, preserving order.
    lines = [line for text in batch["text"] for line in text.split("\n")]
    # Encode the accent-stripped form of each line; the stored line itself is unchanged.
    morse = [
        encode_to_morse(normalize_text(line), skip_unknown=True)
        for line in lines
    ]
    return {"line": lines, "morse": morse}


In [5]:
# Replace each song with one row per lyric line plus its Morse encoding.
# The mapping runs in batched mode and drops the original "text" column,
# leaving only the "line" and "morse" columns produced by apply_encoding.
new_ds = ds.map(
    apply_encoding,
    batched=True,
    remove_columns="text",
)
# Show the encoded train split as the cell output.
new_ds["train"].to_pandas()

Unnamed: 0,line,morse
0,Car rides to Malibu,-.-. .- .-. / .-. .. -.. . ... / - --- / -- .-...
1,"Strawberry ice cream, one spoon for two",... - .-. .- .-- -... . .-. .-. -.-- / .. -.-....
2,And tradin jackets,.- -. -.. / - .-. .- -.. .. -. / .--- .- -.-. ...
3,Laughin’ bout how small it looks on you,.-.. .- ..- --. .... .. -. / -... --- ..- - / ...
4,Watching reruns of Glee,.-- .- - -.-. .... .. -. --. / .-. . .-. ..- -...
...,...,...
33715,I just wanna keep calling your name until you ...,.. / .--- ..- ... - / .-- .- -. -. .- / -.- . ...
33716,I just wanna keep calling your name until you ...,.. / .--- ..- ... - / .-- .- -. -. .- / -.- . ...
33717,I just wanna keep calling your name until you ...,.. / .--- ..- ... - / .-- .- -. -. .- / -.- . ...
33718,I just wanna keep calling your name until you ...,.. / .--- ..- ... - / .-- .- -. -. .- / -.- . ...


In [6]:
import pandas as pd
from datasets import Dataset

# Turn the encoded train split into a DataFrame and drop rows that are exact
# duplicates across all columns. To de-duplicate on the lyric text only,
# use .drop_duplicates(subset=["line"]) instead.
df = new_ds["train"].to_pandas().drop_duplicates()

# Rebuild a Hugging Face Dataset from the de-duplicated frame, without
# carrying the pandas index over.
deduped_dataset = Dataset.from_pandas(df, preserve_index=False)

# Display the de-duplicated data as the cell output.
deduped_dataset.to_pandas()

Unnamed: 0,line,morse
0,Car rides to Malibu,-.-. .- .-. / .-. .. -.. . ... / - --- / -- .-...
1,"Strawberry ice cream, one spoon for two",... - .-. .- .-- -... . .-. .-. -.-- / .. -.-....
2,And tradin jackets,.- -. -.. / - .-. .- -.. .. -. / .--- .- -.-. ...
3,Laughin’ bout how small it looks on you,.-.. .- ..- --. .... .. -. / -... --- ..- - / ...
4,Watching reruns of Glee,.-- .- - -.-. .... .. -. --. / .-. . .-. ..- -...
...,...,...
15684,Vas-tu chuchoter doucement et lentement ?,...- .- ... -....- - ..- / -.-. .... ..- -.-. ...
15685,"Je suis captivée par toi, bébé, comme le feu d...",.--- . / ... ..- .. ... / -.-. .- .--. - .. .....
15686,Et les étincelles volent,. - / .-.. . ... / . - .. -. -.-. . .-.. .-.. ...
15687,"Oh, bébé, souris",--- .... --..-- / -... . -... . --..-- / ... -...


In [7]:
# Check that special (accented) characters are normalized before encoding:
# decode the Morse of a row whose lyric line contains accents and compare it
# with the original line by eye.
from decode import decode_from_morse

# Row 15687 is the last row of the de-duplicated dataset shown above.
sample_row = deduped_dataset[15687]
print(sample_row)
print(decode_from_morse(sample_row["morse"]))


{'line': 'Oh, bébé, souris', 'morse': '--- .... --..-- / -... . -... . --..-- / ... --- ..- .-. .. ...'}
OH, BEBE, SOURIS


Before running the next cell, change the repository ID passed to `push_to_hub` to your own Hugging Face repository if needed.

In [8]:
# Publish the de-duplicated line/Morse dataset to the Hugging Face Hub.
# The repository ID is hard-coded; replace it with your own namespace.
# NOTE(review): presumably requires being logged in with write access to that repository — confirm.
deduped_dataset.push_to_hub("philipfourie/morse-taylor-swift")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/16 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/311 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/philipfourie/morse-taylor-swift/commit/2ae28a792c9b92c2e3eae5a3f5fea5471b5a45ae', commit_message='Upload dataset', commit_description='', oid='2ae28a792c9b92c2e3eae5a3f5fea5471b5a45ae', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/philipfourie/morse-taylor-swift', endpoint='https://huggingface.co', repo_type='dataset', repo_id='philipfourie/morse-taylor-swift'), pr_revision=None, pr_num=None)