In [1]:
import pandas as pd

In [10]:
# Read the raw word list, one entry per line.
# The encoding is pinned explicitly: without it open() falls back to the
# platform's locale encoding, which can mis-decode the umlauts on systems
# whose default is not UTF-8.
# NOTE(review): assumes "wörter.txt" is stored as UTF-8 - confirm.
with open("wörter.txt", "r", encoding="utf-8") as f:
	lines = f.readlines()

def _score_tokens(tokens):
	"""Return one {"word", "score"} record per token of a single line.

	The first token (the root word) scores 50 and every later token (a
	derivative) scores 25. The one exception is the second token, which the
	original comment calls the plural derivative: it also scores 50 when it
	ends in "en" or "e" and the two characters before its last two are not
	one of "st", "er", "en", "et".
	"""
	records = []
	for position, token in enumerate(tokens):
		if position == 0:
			points = 50
		elif (
			position == 1
			and (token.endswith("en") or token.endswith("e"))
			and token[-4:-2] not in ("st", "er", "en", "et")
		):
			points = 50
		else:
			points = 25
		records.append({"word": token, "score": points})
	return records


words = []
for raw_line in lines:
	# Drop surrounding whitespace / line terminators, then split on single spaces.
	words.extend(_score_tokens(raw_line.strip().split(" ")))

In [11]:
# Name the columns explicitly so that an empty `words` list still produces a
# frame with 'word' and 'score' columns; without them the frame has no columns
# and the later df['word'] lookup raises KeyError.
df = pd.DataFrame(words, columns=["word", "score"])

In [12]:
# Keep only rows whose word has no digit and consists solely of
# German letters (including umlauts and ß) and whitespace.
filtered = (
    df
    .loc[lambda frame: ~frame['word'].str.contains(r'\d')]
    .loc[lambda frame: frame['word'].str.contains(r'^[a-zA-ZäöüßÄÖÜ\s]+$')]
)

# Expanded (ASCII) spelling for each umlaut and for the sharp s.
umlaut_map = {
    'ä': 'ae',
    'ö': 'oe',
    'ü': 'ue',
    'ß': 'ss',
    'Ä': 'Ae',
    'Ö': 'Oe',
    'Ü': 'Ue',
}


def replace_umlauts(word):
    """Return `word` with every character listed in `umlaut_map` expanded.

    Characters that are not keys of `umlaut_map` are left unchanged.
    """
    # The table is built per call so that it always reflects the current map.
    return word.translate(str.maketrans(umlaut_map))

# Normalise every word: expand umlauts, strip spaces, upper-case.
# The column is rebuilt with `assign` (which returns a new frame) rather than
# assigned into `filtered`, because `filtered` is a boolean-filtered slice and
# writing a column into it triggers pandas' SettingWithCopyWarning.
filtered = filtered.assign(
    word=(
        filtered['word']
        .apply(replace_umlauts)
        .str.replace(' ', '', regex=False)  # literal space, not a regex
        .str.upper()
    )
)
# Remove duplicates
# NOTE(review): drop_duplicates keeps the first occurrence, so a word seen
# several times keeps the score of its first row - confirm this is intended.
filtered = filtered.drop_duplicates(subset='word')

In [13]:
# Keep only words short enough to count as valid crossword words.
max_word_length = 21
filtered = filtered.loc[filtered['word'].str.len().le(max_word_length)]

In [14]:
# Visualise length distributions
# `assign` returns a new frame, so the 'length' column is added without writing
# into a filtered slice (which triggers pandas' SettingWithCopyWarning).
filtered = filtered.assign(length=filtered['word'].str.len())
grouped = filtered.groupby('length')

# One line per word length, in ascending length order.
for length, count in grouped.size().items():
    print(f"Length: {length}. Words: {count}")

print(f"Total length: {len(filtered)}")

Length: 1. Words: 26
Length: 2. Words: 99
Length: 3. Words: 502
Length: 4. Words: 1613
Length: 5. Words: 4025
Length: 6. Words: 8017
Length: 7. Words: 13192
Length: 8. Words: 20197
Length: 9. Words: 29139
Length: 10. Words: 36426
Length: 11. Words: 41696
Length: 12. Words: 43228
Length: 13. Words: 38965
Length: 14. Words: 32369
Length: 15. Words: 25400
Length: 16. Words: 19261
Length: 17. Words: 13986
Length: 18. Words: 9809
Length: 19. Words: 6627
Length: 20. Words: 4310
Length: 21. Words: 2793
Total length: 351680


In [15]:
# Order the words by length first and alphabetically within each length.
# The helper 'length' column is (re)computed only for sorting and dropped
# again so it does not end up in the exported file.
sorted_df = (
    filtered
    .assign(length=filtered['word'].str.len())
    .sort_values(by=['length', 'word'])
    .drop(columns='length')
)

# Export as ';'-separated rows without a header line or index column.
# `index` and `header` are boolean parameters, so pass False rather than None.
sorted_df.to_csv("filtered.txt", index=False, header=False, sep=";")