In [8]:
import pandas as pd
import re
import requests
import openai

# === Set up OpenAI API (new SDK) ===
client = openai.OpenAI(api_key="INSERT_YOUR_API_KEY_HERE")  # Replace with your actual key

# === Load ENDOH Enriched File (with real frequencies) ===
df = pd.read_csv("ENDOH_enriched.csv")
df = df[['Preferred Label', 'Parents', 'Frequency']].dropna()
df['Preferred Label'] = df['Preferred Label'].astype(str)
df['Word Count'] = df['Preferred Label'].apply(lambda x: len(x.replace('_', ' ').split()))

# === Normalize Word Count and Frequency ===
word_stats = df['Word Count'].describe()
freq_stats = df['Frequency'].describe()
seed_stats = {
    'min_wc': word_stats['min'], 'max_wc': word_stats['max'],
    'min_freq': freq_stats['min'], 'max_freq': freq_stats['max']
}

def normalize_wc(wc, stats):
    return max(0, min(1, 1 - ((wc - stats['min_wc']) / (stats['max_wc'] - stats['min_wc']))))

def normalize_freq(f, stats):
    return max(0, min(1, (f - stats['min_freq']) / (stats['max_freq'] - stats['min_freq'])))

# === GPT-4.5 German Translation ===
def translate_to_german(term):
    prompt = f"Translate the following English concept into fluent German. Only return the translation. No explanation or punctuation.\n\n{term.replace('_', ' ')}"
    try:
        response = client.chat.completions.create(
            model="gpt-4-1106-preview",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.3
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        print(f"[Error translating '{term}']: {e}")
        return term

# === Translation Scoring Utilities ===
def word_count(term):
    cleaned = re.sub(r'[^A-Za-z0-9\\s]', '', term)
    return len(cleaned.split())

def extract_words(term):
    cleaned = re.sub(r'[^A-Za-z0-9\\s]', '', term)
    return set(cleaned.split())

def decompose_german_term(term):
    return set(re.findall(r'[A-ZÄÖÜa-zäöüß]+', term))

def is_compound_word(eng, ger):
    return bool(extract_words(eng) & decompose_german_term(ger))

def translation_score(eng, ger):
    eng_wc = word_count(eng)
    ger_wc = word_count(ger)
    score = 0.0
    if 1 <= eng_wc <= 3:
        score = 1.0 if ger_wc <= eng_wc else 0.8 if ger_wc == eng_wc + 1 else 0.5
    elif 4 <= eng_wc <= 6:
        score = 1.0 if ger_wc < eng_wc else 0.8 if ger_wc == eng_wc else 0.5
    elif 7 <= eng_wc <= 20:
        score = 0.9 if ger_wc < eng_wc * 0.8 else 0.7 if ger_wc < eng_wc else 0.4
    elif 21 <= eng_wc <= 80:
        score = 0.7 if ger_wc < eng_wc * 0.8 else 0.4
    if is_compound_word(eng, ger):
        score += 0.1
    return min(score, 1.0)

# === POS Pattern and Combination Score ===
valid_combos = {
    frozenset(['noun', 'noun']): 1.0,
    frozenset(['adjective', 'noun']): 0.95,
    frozenset(['noun', 'noun', 'noun']): 0.9,
    frozenset(['verb', 'noun']): 0.85,
    frozenset(['noun', 'adjective']): 0.8,
    frozenset(['noun', 'verb']): 0.75,
    frozenset(['adjective', 'noun', 'noun']): 0.7,
    frozenset(['adjective', 'adjective', 'noun']): 0.65,
    frozenset(['noun', 'prepositional phrase']): 0.6,
    frozenset(['adjective', 'adjective', 'adjective', 'noun']): 0.55,
    frozenset(['noun', 'noun', 'prepositional phrase']): 0.5,
    frozenset(['adjective', 'noun', 'noun', 'noun']): 0.45,
    frozenset(['noun', 'noun', 'noun', 'noun']): 0.4,
    frozenset(['noun', 'adjective', 'noun', 'noun']): 0.35,
    frozenset(['adjective', 'noun', 'noun', 'noun', 'noun']): 0.3
}

def check_merriam(word):
    url = f"https://www.dictionaryapi.com/api/v3/references/medical/json/{word}?key=INSERT_YOUR_API_KEY_HERE"
    try:
        r = requests.get(url).json()
        for entry in r:
            if isinstance(entry, dict) and 'meta' in entry and entry['meta']['id'] == word:
                return entry.get('fl')
    except:
        return None
    return None

def combination_score(term):
    tags = [check_merriam(w) for w in term.replace('_', ' ').split() if check_merriam(w)]
    pos_set = frozenset(tags)
    for combo in valid_combos:
        if pos_set.issubset(combo):
            return valid_combos[combo]
    return 0.0

# === Goodness Score Calculation ===
alpha, beta, lambd, theta = 0.15, 0.22, 0.31, 0.27
detailed_scores = []

for i, row in df.iterrows():
    concept = row['Preferred Label']
    wc = row['Word Count']
    freq = row['Frequency']
    norm_wc = normalize_wc(wc, seed_stats)
    norm_freq = normalize_freq(freq, seed_stats)
    german = translate_to_german(concept)
    tscore = translation_score(concept, german)
    combo = combination_score(concept)
    goodness = (alpha * combo) + (beta * norm_wc) + (lambd * tscore) + (theta * norm_freq)

    detailed_scores.append({
        "Concept": concept,
        "German Translation": german,
        "Word Count": wc,
        "Frequency": freq,
        "Normalized WC": round(norm_wc, 3),
        "Normalized Freq": round(norm_freq, 3),
        "Translation Score": round(tscore, 3),
        "Combination Score": round(combo, 3),
        "Goodness Score": round(goodness, 4)
    })

# === Final DataFrame and Output ===
detailed_df = pd.DataFrame(detailed_scores)


# Average Goodness Score
average_goodness = detailed_df["Goodness Score"].mean()
print("\n📊 Average Goodness Score across ENDOH concepts:", round(average_goodness, 4))


📊 Average Goodness Score across ENDOH concepts: 0.4879


In [9]:
detailed_df

Unnamed: 0,Concept,German Translation,Word Count,Frequency,Normalized WC,Normalized Freq,Translation Score,Combination Score,Goodness Score
0,Waste_managment_facility,Abfallwirtschaftsanlage,3,2086,0.6,0.004,1.0,0.4,0.5031
1,Industrial_noise_pollution,Industrielle Lärmbelästigung,3,0,0.6,0.000,1.0,0.4,0.5020
2,Ozone_Depeletion,Ozonschichtabbau,2,0,0.8,0.000,1.0,0.4,0.5460
3,Airborne_disease_transmission_in_public_spaces,Übertragung von Krankheiten durch die Luft in ...,6,49,0.0,0.000,1.0,0.0,0.3100
4,Built_in_environment,Eingebaute Umgebung,3,16514,0.6,0.031,1.0,0.0,0.4504
...,...,...,...,...,...,...,...,...,...
97,Social_Support_network,Soziales Unterstützungsnetzwerk,3,235,0.6,0.000,1.0,0.4,0.5021
98,High_level_radon_home,Haus mit hoher Radonbelastung,4,20,0.4,0.000,1.0,0.4,0.4580
99,Unregulated_cosmetic_use,Unregulierter Kosmetikgebrauch,3,26,0.6,0.000,1.0,0.4,0.5020
100,Indoor_air_purification,Innenraumluftreinigung,3,67,0.6,0.000,1.0,0.4,0.5020


In [10]:
detailed_df.to_csv("endoh_goodness.csv",index=False)