# Create voice donation sentences

When people donate their voice, we ask them to read some verification sentences so that it's not possible to just use a pre-made recording.

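This notebook selects those sentences by filtering the validated sentences of Mozilla's Common Voice dataset, keeping only short, plain-ASCII statements without long words or proper names.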


In [None]:
import pandas as pd

In [None]:
# Input data: download it from https://commonvoice.mozilla.org/en/datasets
# (select "Common Voice Delta Segment 21.0") so that the path below exists.
# `read_table` uses a tab separator by default, which matches the .tsv format.
df = pd.read_table("./data/cv-corpus-21.0-delta-2025-03-14/en/validated_sentences.tsv")

In [None]:
def count_uppercase_letters(s):
    """Return how many characters of ``s`` are uppercase letters."""
    # Each character contributes True (1) or False (0) to the total.
    return sum(map(str.isupper, s))


def is_ascii(s):
    """Return True if every character of the string ``s`` is ASCII.

    ASCII means a code point below 128. The empty string counts as ASCII.
    """
    # Built-in equivalent of checking ord(c) < 128 for every character.
    return s.isascii()


def max_word_length(s):
    """Return the length of the longest whitespace-separated word in ``s``.

    Returns 0 when ``s`` is empty or consists only of whitespace.
    """
    word_lengths = [len(token) for token in s.split()]
    # `default` covers the no-words case, where max() would otherwise raise.
    return max(word_lengths, default=0)


def is_ok_sentence(s):
    """Decide whether ``s`` is usable as a voice donation verification sentence.

    A sentence is accepted when all of the following hold:

    - it is between 30 and 80 characters long (inclusive),
    - it ends with a period,
    - it contains only ASCII characters,
    - none of its words is longer than 10 characters,
    - it contains exactly one uppercase letter.

    Anything that is not a string (for example the NaN pandas uses for a
    missing cell) is rejected instead of raising.

    Returns:
        True if the sentence passes every check, False otherwise.
    """
    if not isinstance(s, str):
        return False

    return (
        # The checks are independent, so the cheapest ones go first.
        30 <= len(s) <= 80
        # No questions or exclamations
        and s.endswith(".")
        and is_ascii(s)
        # Exclude long complicated words
        and max_word_length(s) <= 10
        # Proper names might be difficult to pronounce to non-native speakers
        # and are harder to check automatically, so we exclude them. The one capital
        # letter allowed is for the first letter of the sentence.
        # NOTE: only the number of capitals is checked, not where the capital is.
        and count_uppercase_letters(s) == 1
    )


# Flag each row according to whether its sentence passes the filter.
df["is_ok_sentence"] = df["sentence"].map(is_ok_sentence)

In [None]:
# Preview the sentences that passed the filter.
df["sentence"][df["is_ok_sentence"]]

In [None]:
# Number of sentences that passed the filter.
int(df["is_ok_sentence"].sum())

In [None]:
# Keep at most the first 10000 accepted sentences, in DataFrame order.
sentences = df.loc[df["is_ok_sentence"], "sentence"].head(10000).tolist()

In [None]:
# Write the selected sentences, one per line. The encoding is passed explicitly
# so the output does not depend on the platform's default locale encoding.
with open("../unmute/tts/voice_donation_sentences.txt", "w", encoding="utf-8") as f:
    f.writelines(sentence + "\n" for sentence in sentences)