In [81]:
import pandas as pd
import numpy as np
import ast

# loading the dataset
df = pd.read_csv("sub_20_eng.csv")

In [82]:
# checking structure
print(df.columns.tolist())
print("\nmissing values:")
print(df.isnull().sum())
print("\ndata types:")
print(df.dtypes)

['guid', 'course_number', 'course_name', 'course_subtitle', 'district', 'event_type', 'minimum_participants', 'current_participants', 'maximum_participants', 'number_of_sessions', 'start_date', 'end_date', 'target_group', 'keywords', 'description', 'category_version', 'category_text', 'registration_phone', 'registration_email', 'registration_link', 'contact_person_salutation', 'contact_person_title', 'contact_person_last_name', 'contact_person_first_name', 'contact_person_phone', 'contact_person_email', 'locations_address', 'locations_appointments', 'price_amount', 'price_discount_possible', 'price_additional', 'lecturer_salutation', 'lecturer_title', 'lecturer_last_name', 'lecturer_first_name', 'website_type', 'website_last_name', 'website_uri', 'lecturer', 'merkmale_merkmal_last_name', 'merkmale_merkmal_wert', 'locations_address_facility', 'locations_address_postal_code', 'locations_address_city', 'locations_address_street', 'locations_address_room', 'locations_address_longitude', 'l

In [83]:
# flattening the relevant columns
# helper to safely parse stringified lists/dicts
def safe_parse(x):
    try:
        return ast.literal_eval(x)
    except:
        return x

# flatten relevant fields
def extract_description(x):
    x = safe_parse(x)
    if isinstance(x, list) and x and isinstance(x[0], dict):
        return x[0].get('text', '')
    return str(x)

def flatten_keywords(x):
    x = safe_parse(x)
    if isinstance(x, list):
        return ', '.join(str(i) for i in x)
    return str(x)

def extract_first(x, key):
    x = safe_parse(x)
    if isinstance(x, list) and x and isinstance(x[0], dict):
        return x[0].get(key)
    return None

def get_email(x):
    x = safe_parse(x)
    return x.get('mail') if isinstance(x, dict) else None

# apply flattening
df['description_clean'] = df['description'].apply(extract_description)
df['keywords_clean'] = df['keywords'].apply(flatten_keywords)
df['course_weekday'] = df['locations_appointments'].apply(lambda x: extract_first(x, 'weekday'))
df['course_start_date'] = pd.to_datetime(df['locations_appointments'].apply(lambda x: extract_first(x, 'start_date')), errors='coerce')
df['course_start_time'] = df['locations_appointments'].apply(lambda x: extract_first(x, 'start_time'))
df['course_end_time'] = df['locations_appointments'].apply(lambda x: extract_first(x, 'end_time'))
df['registration_email_clean'] = df['registration_email'].apply(get_email)
df['contact_person_fullname'] = df['contact_person_first_name'].fillna('') + ' ' + df['contact_person_last_name'].fillna('')

In [84]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# create text column for search
df['search_text'] = df['course_name'].fillna('') + ' ' + df['keywords_clean'].fillna('') + ' ' + df['description_clean'].fillna('') + df['category_label'].fillna('')

# load model
model = SentenceTransformer('all-MiniLM-L6-v2')

# encode all course descriptions
course_embeddings = model.encode(df['search_text'].tolist(), show_progress_bar=True)

Batches:   0%|          | 0/53 [00:00<?, ?it/s]

In [94]:
# define a function to get the most semantically similar courses based on a user query
def get_semantic_matches(user_query, top_n=30):
    # turn user query into a vector using the embedding model
    query_embedding = model.encode([user_query])

    # compute similarity between user query and every course using cosine similarity
    similarities = cosine_similarity(query_embedding, course_embeddings)[0]

    # get the top_n course indices with the highest similarity
    top_indices = similarities.argsort()[-top_n:][::-1]

    # get a copy of those top rows from the dataframe
    matched_df = df.iloc[top_indices].copy()

    # store their similarity score as a new column
    matched_df['semantic_score'] = similarities[top_indices]

    return matched_df

# example query
user_query = "english writing course for beginners"

# get top semantic matches from the search
sem_matches = get_semantic_matches(user_query)
sem_matches.head()

Unnamed: 0,guid,course_number,course_name,course_subtitle,district,event_type,minimum_participants,current_participants,maximum_participants,number_of_sessions,...,keywords_clean,course_weekday,course_start_date,course_start_time,course_end_time,registration_email_clean,contact_person_fullname,search_text,course_duration_days,semantic_score
624,711249,Mi201-075S,REVISION 101 - How to turn a first draft into ...,,Mitte,Course,6,4,9,24,...,"Auf Englisch, Kreatives Schreiben, CULTURE, SC...",Saturday,2025-05-24,11:00,17:30,,Susann Thust,REVISION 101 - How to turn a first draft into ...,8,0.605297
1591,720619,TS430.077F,Englisch B1/B2 Just for Kicks,Für Teilnehmerinnen und Teilnehmer mit Vorkenn...,Tempelhof-Schöneberg,Course,7,11,14,18,...,"B1, B2, Englisch, Konversation, level_B1, LANG...",Wednesday,2025-04-30,15:00,16:30,,Kirsten Althaus,"Englisch B1/B2 Just for Kicks B1, B2, Englisch...",56,0.580758
1569,720372,TS430.063F,Englisch B1 Speaking and Grammar,Für Teilnehmerinnen und Teilnehmer mit Vorkenn...,Tempelhof-Schöneberg,Course,7,6,14,27,...,"Auffrischung, B1, Englisch, Grammatik, level_B...",Thursday,2025-04-03,17:30,19:45,,Kirsten Althaus,Englisch B1 Speaking and Grammar Auffrischung...,91,0.57786
1570,720375,TS430.064F,Englisch B1 Speaking and Grammar,Für Teilnehmerinnen und Teilnehmer mit Vorkenn...,Tempelhof-Schöneberg,Course,7,5,14,18,...,"B1, Englisch, Grammatik, Konversation, level_B...",Thursday,2025-04-03,13:00,14:30,,Kirsten Althaus,"Englisch B1 Speaking and Grammar B1, Englisch...",91,0.572621
1508,719680,TS430.112F-W,Englisch B2/C1 Online Reading Club,Für Teilnehmerinnen und Teilnehmer mit Vorkenn...,Tempelhof-Schöneberg,Course,7,6,14,18,...,"B2, BigBlueButton, C1, Englisch, level_B2, Lit...",Monday,2025-03-24,18:00,19:30,,Kirsten Althaus,"Englisch B2/C1 Online Reading Club B2, BigBlue...",70,0.56183


In [86]:
# converting date columns in the main df
df['start_date'] = pd.to_datetime(df['start_date'], errors='coerce')  # this is the course start date
df['end_date'] = pd.to_datetime(df['end_date'], errors='coerce')      # this is the course end date
df['course_start_date'] = pd.to_datetime(df['course_start_date'], errors='coerce')  # optional field

# calculating how long the course lasts (in days)
df['course_duration_days'] = (df['end_date'] - df['start_date']).dt.days

# now doing the same for the subset returned by semantic search
sem_matches['course_start_date'] = pd.to_datetime(sem_matches['course_start_date'], errors='coerce')
sem_matches['start_date'] = pd.to_datetime(sem_matches['start_date'], errors='coerce')
sem_matches['end_date'] = pd.to_datetime(sem_matches['end_date'], errors='coerce')

# recompute duration so we can filter or rank on it
sem_matches['course_duration_days'] = (sem_matches['end_date'] - sem_matches['start_date']).dt.days

In [89]:
# defining user preferences for matching
user_preferences = {
    'district': 'neukölln',                     # preferred location
    'budget_max': 500,                          # max price they want to pay
    'preferred_days': ['saturday', 'sunday'],   # preferred course days
    'start_after': pd.to_datetime('2025-05-01'),
    'end_before': pd.to_datetime('2025-07-30'),
    'min_duration_days': 14,                    # course must be at least 2 weeks
    'max_sessions': 50                          # limit on total number of sessions
}

# assigning weights to features used in ranking
weights = {
    'district': 1.0,
    'budget': 0.8,
    'weekday': 0.6,
    'start_date': 0.5,
    'duration': 0.4,
    'sessions': 0.3
}

In [None]:
# applying strict filters before ranking
filtered_df = sem_matches.copy()

# drop rows where start date is missing
filtered_df = filtered_df[pd.notnull(filtered_df['course_start_date'])]

# filter by district
filtered_df = filtered_df[filtered_df['district'].str.lower() == user_preferences['district'].lower()]

# filter by budget
filtered_df = filtered_df[pd.to_numeric(filtered_df['price_amount'], errors='coerce') <= user_preferences['budget_max']]

# filter by start date
filtered_df = filtered_df[
    (filtered_df['course_start_date'] >= user_preferences['start_after']) &
    (filtered_df['course_start_date'] <= user_preferences['end_before'])
]

# filter by weekday
filtered_df = filtered_df[filtered_df['course_weekday'].str.lower().isin([d.lower() for d in user_preferences['preferred_days']])]

# filter by course duration
filtered_df = filtered_df[pd.to_numeric(filtered_df['course_duration_days'], errors='coerce') >= user_preferences['min_duration_days']]

# filter by session count
filtered_df = filtered_df[pd.to_numeric(filtered_df['number_of_sessions'], errors='coerce') <= user_preferences['max_sessions']]

# reset index
filtered_df = filtered_df.reset_index(drop=True)

In [90]:
# assign preference-based score to each row
def preference_score(row):
    score = 0

    if str(row['district']).lower() == user_preferences['district'].lower():
        score += weights['district']

    if pd.notnull(row['price_amount']) and row['price_amount'] <= user_preferences['budget_max']:
        score += weights['budget']

    if str(row['course_weekday']).lower() in [d.lower() for d in user_preferences['preferred_days']]:
        score += weights['weekday']

    if pd.notnull(row['course_start_date']) and user_preferences['start_after'] <= row['course_start_date'] <= user_preferences['end_before']:
        score += weights['start_date']

    if pd.notnull(row['course_duration_days']) and row['course_duration_days'] >= user_preferences['min_duration_days']:
        score += weights['duration']

    if pd.notnull(row['number_of_sessions']) and row['number_of_sessions'] <= user_preferences['max_sessions']:
        score += weights['sessions']

    return score

# apply the score function and sort
sem_matches['match_score'] = sem_matches.apply(preference_score, axis=1)
ranked_df = sem_matches.sort_values(by='match_score', ascending=False).reset_index(drop=True)

In [91]:
# label how strong the semantic match is (optional, nice for UI)
def label_strength(score):
    if score > 0.6:
        return "Strong"
    elif score > 0.3:
        return "Moderate"
    else:
        return "Weak"

ranked_df["match_strength"] = ranked_df["semantic_score"].apply(label_strength)

# display top 10
display(ranked_df[['guid', 'course_name', 'district', 'price_amount', 'course_weekday',
                   'course_start_date', 'category_label', 'semantic_score',
                   'match_score', 'match_strength', 'registration_link',
                   'contact_person_fullname', 'registration_email_clean']].head(10))

Unnamed: 0,guid,course_name,district,price_amount,course_weekday,course_start_date,category_label,semantic_score,match_score,match_strength,registration_link,contact_person_fullname,registration_email_clean
0,711249,REVISION 101 - How to turn a first draft into ...,Mitte,85.3,Saturday,2025-05-24,,0.605297,2.2,Strong,http://www.berlin.de/vhsmitte,Susann Thust,
1,692823,Englisch B1.3 (Onlinekurs),Spandau,56.2,Thursday,2025-05-15,German as a Second Language,0.518765,2.0,Moderate,http://www.vhs-spandau.de,Vanessa-Jessica Pinn,
2,714016,Englisch A2/B1 - Use it or Lose it - Update yo...,Charlottenburg-Wilmersdorf,123.6,Thursday,2025-05-15,German as a Second Language,0.427778,2.0,Moderate,http://www.vhs-city-west.de,Kinan Azzam,
3,719908,Englisch C1 TOEFL Preparation - Online,Tempelhof-Schöneberg,74.4,Wednesday,2025-05-07,German as a Second Language,0.455302,2.0,Moderate,https://www.berlin.de/vhs-tempelhof-schoeneberg/,Kirsten Althaus,
4,692845,Englisch B2/C1 - Let’s read a short story! (Fl...,Spandau,56.2,Friday,2025-05-09,German as a Second Language,0.501603,2.0,Moderate,http://www.vhs-spandau.de,Vanessa-Jessica Pinn,
5,692840,Englisch C1.2 (Onlinekurs),Spandau,56.2,Thursday,2025-05-15,German as a Second Language,0.515205,2.0,Moderate,http://www.vhs-spandau.de,Vanessa-Jessica Pinn,
6,692842,Englisch B2 - für die Generation 55+ (Präsenzk...,Spandau,73.6,Thursday,2025-05-15,German as a Second Language,0.417502,2.0,Moderate,http://www.vhs-spandau.de,Vanessa-Jessica Pinn,
7,720699,Englisch C1/C2 In the news,Tempelhof-Schöneberg,69.56,Monday,2025-05-05,German as a Second Language,0.549889,2.0,Moderate,https://www.berlin.de/vhs-tempelhof-schoeneberg/,Kirsten Althaus,
8,710503,Englisch B1+/B2 Crash Course Speaking Skills,Steglitz-Zehlendorf,147.75,Tuesday,2025-06-24,German as a Second Language,0.48438,1.6,Moderate,www.vhssz.de,Janette Dukes,
9,710524,Englisch B2 Crash Course Speaking Skills,Steglitz-Zehlendorf,146.75,Monday,2025-06-16,German as a Second Language,0.422706,1.6,Moderate,www.vhssz.de,Janette Dukes,
