In [None]:
import csv

import pandas as pd

In [None]:
# Load the tab-separated word list (no header row) and drop incomplete rows.
# NOTE(review): `names` labels two columns while `index_col=0` is also set; this
# presumably relies on the file carrying a leading id column that becomes the
# index - confirm against the actual file layout.
df = pd.read_csv(
	"deu_wikipedia_2021_1M-words.txt",
	sep="\t",
	header=None,
	index_col=0,
	names=["word", "occurences"],
	# Always read words as text, even if a token looks numeric.
	dtype={"word": str},
	# The list is plain TSV without a quoting convention: a literal '"' token
	# must not open a quoted field that swallows the following tabs/newlines.
	quoting=csv.QUOTE_NONE,
	# pandas' default NA markers include ordinary tokens such as "null", "NA",
	# "nan" and "None"; those would become NaN and be removed by dropna() below.
	# Treat only genuinely empty fields as missing.
	keep_default_na=False,
	na_values=[""],
)
df = df.dropna()

In [None]:
# Keep only words made up entirely of German letters and whitespace.
# Digits are outside the character class, so this single anchored match also
# rejects every entry containing a number (no separate `\d` pass is needed).
# `na=False` makes non-string cells count as "no match" instead of putting NaN
# into the boolean mask.
german_only = df['word'].str.contains(r'^[a-zA-ZäöüßÄÖÜ\s]+$', na=False)
# `.copy()` turns the selection into an independent frame, so assigning columns
# on `filtered` later does not trigger SettingWithCopyWarning.
filtered = df[german_only].copy()

# Transliteration table: each German umlaut / sharp s and its ASCII expansion.
umlaut_map = {
    'ä': 'ae',
    'ö': 'oe',
    'ü': 'ue',
    'ß': 'ss',
    'Ä': 'Ae',
    'Ö': 'Oe',
    'Ü': 'Ue',
}


def replace_umlauts(word):
    """Return `word` with every character listed in `umlaut_map` expanded.

    Characters that are not keys of `umlaut_map` are passed through unchanged.
    """
    # None of the expansions contains a mapped character, so one translate pass
    # yields the same string as replacing the keys one after another.
    return word.translate(str.maketrans(umlaut_map))

# Normalise the remaining words: expand umlauts, remove whitespace, upper-case.
# `assign` builds a new frame rather than writing a column into a
# boolean-indexed selection, which is what raises SettingWithCopyWarning.
filtered = filtered.assign(
    word=(
        filtered['word']
        .apply(replace_umlauts)
        # The character filter admits any `\s` whitespace (e.g. non-breaking
        # spaces), so strip all of it instead of only the plain space U+0020.
        .str.replace(r'\s+', '', regex=True)
        .str.upper()
    )
)
# Remove duplicates (different spellings can collapse to the same normalised word)
filtered = filtered.drop_duplicates(subset='word')

In [None]:
# Filter the list for valid crossword words.
# An entry that consisted only of whitespace is reduced to '' once the spaces
# are removed, and '' would pass a pure upper-bound check; require at least one
# letter as well.
# NOTE(review): 21 is presumably the largest supported grid dimension - confirm.
word_lengths = filtered['word'].str.len()
filtered = filtered[word_lengths.between(1, 21)]

In [None]:
# Visualise length distributions
# The lengths are attached to a temporary frame (`assign`) so that this
# reporting cell does not add a column to `filtered` itself; writing into that
# filtered selection would also raise SettingWithCopyWarning.
grouped = filtered.assign(length=filtered['word'].str.len()).groupby('length')

# `size()` gives the number of rows per word length, ordered by length.
for length, count in grouped.size().items():
    print(f"Length: {length}. Words: {count}")

print(f"Total length: {len(filtered)}")

In [None]:
# Order the words by length, then alphabetically. The helper `length` column is
# only needed for sorting and is dropped again before the score is attached.
sorted_df = (
    filtered
    .assign(length=filtered['word'].str.len())
    .sort_values(by=['length', 'word'])
    .drop(columns='length')
    # Every word gets the same fixed score.
    .assign(score=50)
)
# Write one `word;score` line per entry. `index` and `header` are documented as
# boolean switches, so pass False explicitly instead of None.
sorted_df[["word", "score"]].to_csv("filtered.txt", index=False, header=False, sep=";")