In [2]:
import json
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import random
from datetime import datetime, timedelta
import ast
import itertools, pulp

  from .autonotebook import tqdm as notebook_tqdm


In [3]:


# --- Helper Functions ---

def safe_parse(x):
    try:
        return ast.literal_eval(x)
    except:
        return x

def extract_first(x, key):
    x = safe_parse(x)
    if isinstance(x, list):
        for entry in x:
            if isinstance(entry, dict) and key in entry:
                return entry[key]
    return None

def extract_description(x):
    x = safe_parse(x)
    if isinstance(x, list):
        for entry in x:
            if isinstance(entry, dict) and entry.get('property') == 'Description':
                return entry.get('text', '')
    return str(x)

def flatten_keywords(x):
    x = safe_parse(x)
    if isinstance(x, list):
        return ', '.join([str(i) for i in x])
    return str(x)

def get_email(x):
    x = safe_parse(x)
    if isinstance(x, dict):
        return x.get('mail')
    return None

def get_contact_fullname(row):
    first = str(row.get('contact_person_first_name', '')).strip()
    last = str(row.get('contact_person_last_name', '')).strip()
    return f"{first} {last}".strip()

def get_address_component(x, component):
    x = safe_parse(x)
    if isinstance(x, list) and len(x) > 0 and isinstance(x[0], dict):
        return x[0].get(component)
    return None

def extract_discount_info(x):
    x = safe_parse(x)
    if isinstance(x, str):
        return x
    return ''



In [4]:
def prepare_course_data(df: pd.DataFrame) -> pd.DataFrame:
    df['description_clean'] = df['description'].apply(extract_description)
    df['keywords_clean'] = df['keywords'].apply(flatten_keywords)

    df['course_weekday'] = df['locations_appointments'].apply(lambda x: extract_first(x, 'weekday'))
    df['course_start_date'] = df['locations_appointments'].apply(lambda x: extract_first(x, 'start_date'))
    df['course_start_time'] = df['locations_appointments'].apply(lambda x: extract_first(x, 'start_time'))
    df['course_end_time'] = df['locations_appointments'].apply(lambda x: extract_first(x, 'end_time'))

    df['registration_email_clean'] = df['registration_email'].apply(get_email)
    df['contact_person_fullname'] = df.apply(get_contact_fullname, axis=1)

    df['address_facility'] = df['locations_address'].apply(lambda x: get_address_component(x, 'facility'))
    df['address_postal_code'] = df['locations_address'].apply(lambda x: get_address_component(x, 'postal_code'))
    df['address_city'] = df['locations_address'].apply(lambda x: get_address_component(x, 'city'))
    df['address_street'] = df['locations_address'].apply(lambda x: get_address_component(x, 'street'))
    df['address_room'] = df['locations_address'].apply(lambda x: get_address_component(x, 'room'))
    df['address_longitude'] = df['locations_address'].apply(lambda x: get_address_component(x, 'longitude'))
    df['address_latitude'] = df['locations_address'].apply(lambda x: get_address_component(x, 'latitude'))
    df['address_accessible'] = df['locations_address'].apply(lambda x: get_address_component(x, 'accessible'))

    df['price_additional_clean'] = df['price_additional'].apply(extract_discount_info)

    df['prop_occupancy_left'] = (df['maximum_participants'] - df['current_participants']) / df['maximum_participants']
    df['prop_minimum_to_reach'] = (df['minimum_participants'] - df['current_participants']) / df['minimum_participants']
    df['prop_minimum_to_reach'] = df['prop_minimum_to_reach'].clip(lower=0)

    np.random.seed(42)
    df['number_of_women'] = df['current_participants'].apply(lambda x: np.random.randint(0, x + 1) if x > 0 else 0)
    df['percent_women'] = np.where(df['current_participants'] > 0,
                                   df['number_of_women'] / df['current_participants'], 0)
    df['prop_men'] = np.where(df['current_participants'] > 0,
                              (df['current_participants'] - df['number_of_women']) / df['current_participants'], 0)

    df['sponsored'] = np.random.choice([1, 0], size=len(df), p=[0.25, 0.75])
    df['gap_to_80_percent_women'] = 0.8 - df['percent_women']
    df['gap_to_80_percent_men'] = 0.8 - df['prop_men']

    # Create one-hot encoded target group columns
    if 'target_group' in df.columns:
        target_groups = df['target_group'].dropna().unique()
        for group in target_groups:
            column_name = f"target_group_{group}"
            df[column_name] = df['target_group'].apply(lambda x: 1 if x == group else 0)

    # Ensure price is numeric
    df['price_amount'] = pd.to_numeric(df.get('price_amount', np.nan), errors='coerce')

    return df

In [5]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

class SemanticSearchEngine:
    def __init__(self, model_name='all-MiniLM-L6-v2'):
        self.model = SentenceTransformer(model_name)
        self.course_embeddings = None
        self.course_texts = None

    def build_index(self, df: pd.DataFrame) -> None:
        df['search_text'] = df['course_name'].fillna('') + ' ' + \
                            df['keywords_clean'].fillna('') + ' ' + \
                            df['description_clean'].fillna('')
        self.course_texts = df['search_text'].tolist()
        self.course_embeddings = self.model.encode(self.course_texts, show_progress_bar=True)

    def query(self, user_query: str, df: pd.DataFrame, top_n: int = 30) -> pd.DataFrame:
        if self.course_embeddings is None:
            raise ValueError("Index not built. Call `build_index(df)` first.")
        query_embedding = self.model.encode([user_query])
        similarities = cosine_similarity(query_embedding, self.course_embeddings)[0]
        top_indices = similarities.argsort()[-top_n:][::-1]
        matched_df = df.iloc[top_indices].copy()
        matched_df['semantic_score'] = similarities[top_indices]
        return matched_df.reset_index(drop=True)


In [6]:
class user_preferenceReranker:
    def __init__(self, weights=None):
        self.weights = weights or {
            'district': 1.0,
            'budget': 0.8,
            'weekday': 0.6,
            'start_date': 0.5
        }

    def rerank(self, df: pd.DataFrame, prefs: dict) -> pd.DataFrame:
        # Ensure price is numeric here too (safety net)
        df['price_amount'] = pd.to_numeric(df['price_amount'], errors='coerce')
        # Remove full courses (no available seats)
        df = df[df['prop_occupancy_left'] > 0].copy()
        
        def score(row):
            s = 0
            # Match district
            if str(row.get('district', '')).lower() == prefs.get('district', '').lower():
                s += self.weights['district']
            # Budget logic
            try:
                if pd.notnull(row['price_amount']) and float(row['price_amount']) <= prefs.get('budget_max', float('inf')):
                    s += self.weights['budget']
            except (TypeError, ValueError):
                pass
            # Preferred weekday
            if row.get('course_weekday') in prefs.get('preferred_days', []):
                s += self.weights['weekday']
            # Date window
            start = prefs.get('start_after')
            end = prefs.get('end_before')
            course_date = row.get('course_start_date')
            if pd.notnull(course_date) and start and end:
                try:
                    if start <= course_date <= end:
                        s += self.weights['start_date']
                except Exception:
                    pass
            return s

        df['match_score'] = df.apply(score, axis=1)
        return df.sort_values(by='match_score', ascending=False).reset_index(drop=True)


In [7]:
class platform_preferenceRanker:
    ALLOWED_TARGET_GROUPS = [
        "People with a migration background",
        "Illiterate people",
        "Women",
        "People with disabilities",
        "Older adults / older people",
        "Other target groups",
        "Children",
        "Adolescents / young people"
    ]

    def __init__(self, df: pd.DataFrame, user_gender: str, selected_target_groups: list):
        self.df = df.copy()
        self.user_gender = user_gender.lower()
        self.gender_col = 'gap_to_80_percent_women' if self.user_gender == 'female' else 'gap_to_80_percent_men'
        cleaned_groups = [tg for tg in selected_target_groups if tg in self.ALLOWED_TARGET_GROUPS]
        if self.user_gender == 'female' and "Women" not in cleaned_groups:
            cleaned_groups.append("Women")
        self.selected_target_groups = cleaned_groups

    def calculate_weight(self, max_score, current_score, rank_index, total, min_score):
        if rank_index > 0:
            return ((max_score - current_score) / rank_index) * 1.05
        return ((max_score - min_score) / total) * 1.05

    def filter_courses(self, course_ids):
        self.df_subset = self.df[self.df['guid'].isin(course_ids)].copy()

    def compute_numeric_score(self):
        cols = ['prop_occupancy_left', 'prop_minimum_to_reach', self.gender_col]
        self.df_subset['numeric_score'] = self.df_subset[cols].sum(axis=1)
        self.df_subset.sort_values(by='numeric_score', ascending=False, inplace=True)
        self.df_subset.reset_index(drop=True, inplace=True)
        self.df_subset['rank_index'] = self.df_subset.index

    def compute_binary_boost(self):
        binary_cols = ['sponsored'] + [f'target_group_{tg}' for tg in self.selected_target_groups]
        self.df_subset['binary_sum'] = self.df_subset[binary_cols].sum(axis=1)
        max_score = self.df_subset['numeric_score'].max()
        min_score = self.df_subset['numeric_score'].min()
        total = len(self.df_subset)
        self.df_subset['weight'] = self.df_subset.apply(
            lambda row: self.calculate_weight(max_score, row['numeric_score'], row['rank_index'], total, min_score),
            axis=1
        )
        self.df_subset['binary_boost'] = self.df_subset['weight'] * self.df_subset['binary_sum']

    def calculate_final_scores(self):
        self.df_subset['final_score'] = self.df_subset['numeric_score'] + self.df_subset['binary_boost']
        self.df_final = self.df_subset.sort_values(by='final_score', ascending=False).reset_index(drop=True)

    def rank(self, course_ids):
        self.filter_courses(course_ids)
        self.compute_numeric_score()
        self.compute_binary_boost()
        self.calculate_final_scores()
        return self.df_final


In [8]:

def compute_consensus_ranking(user_order, platform_order, df):
    # Validate that both rankings contain the same courses
    assert set(user_order) == set(platform_order), "user_order and platform_order must match"
    courses = user_order

    # Rank lookup tables
    user_rank = {c: i for i, c in enumerate(user_order)}
    platform_rank = {c: i for i, c in enumerate(platform_order)}

    # Voting weights
    w_user, w_plat = 0.50, 0.50

    # Build pairwise margins
    m = {}
    for i, j in itertools.combinations(courses, 2):
        sign_user = 1 if user_rank[i] < user_rank[j] else -1
        sign_plat = 1 if platform_rank[i] < platform_rank[j] else -1
        vote = w_user * sign_user + w_plat * sign_plat

        if vote > 0:
            m[(i, j)] = abs(vote)
        elif vote < 0:
            m[(j, i)] = abs(vote)
        else:
            m[(i, j)] = 0.1
            m[(j, i)] = 0.1

    # Define ILP model
    model = pulp.LpProblem("Kemeny", pulp.LpMinimize)
    x = pulp.LpVariable.dicts('x', (courses, courses), 0, 1, cat='Binary')

    # Objective: minimize total conflict
    model += pulp.lpSum(weight * x[j][i] for (i, j), weight in m.items())

    # Antisymmetry and totality
    for i, j in itertools.permutations(courses, 2):
        model += x[i][j] + x[j][i] == 1

    # Transitivity
    for i, j, k in itertools.permutations(courses, 3):
        model += x[i][j] + x[j][k] + x[k][i] >= 1

    # Solve
    model.solve(pulp.PULP_CBC_CMD(msg=False))

    # Build consensus order
    consensus_order = sorted(
        courses,
        key=lambda c: sum(x[c][d].value() for d in courses if d != c),
        reverse=True
    )

    # Desired output columns
    desired_columns = [
        'guid', 'course_name', 'district', 'number_of_sessions', 'course_start_date',
        'course_end_time', 'target_group', 'registration_email_clean',
        'contact_person_last_name', 'contact_person_first_name', 'registration_email_clean',
        'locations_address', 'locations_appointments', 'price_amount',
        'price_discount_possible', 'price_additional', 'website_uri', 'category'
    ]

    # Filter and order results
    consensus_df = df[df['guid'].isin(consensus_order)].copy()
    consensus_df['__rank__'] = consensus_df['guid'].apply(lambda x: consensus_order.index(x))
    consensus_df.sort_values(by='__rank__', inplace=True)
    consensus_df.drop(columns='__rank__', inplace=True)

    # Filter columns that exist in the DataFrame
    columns_present = [col for col in desired_columns if col in consensus_df.columns]
    final_df = consensus_df[columns_present]

    return final_df


In [9]:
# read in the data
import pandas as pd
import openpyxl

df= pd.read_excel(
    "sub_20_eng.xlsx",
    sheet_name="sub_20_eng",
    header=0
    )

df= df.dropna(axis=1, how='all')

In [10]:
df_new = prepare_course_data(df)

In [15]:
user_query = "Englisch"

user_preferences = {
    'district': 'Neukölln',
    'budget_max': 50,
    'preferred_days': ['Saturday', 'Sunday'],
    'start_after': pd.to_datetime('2025-04-30'),
    'end_before': pd.to_datetime('2025-07-31')
}

user_gender = 'female'

selected_target_groups = [
    "People with disabilities",
    "Older adults / older people"
]


In [16]:

semantic_engine = SemanticSearchEngine()
semantic_engine.build_index(df)
sem_matches = semantic_engine.query(user_query, df, top_n=25)

reranker = user_preferenceReranker()
ranked_df = reranker.rerank(sem_matches, user_preferences)

# Get list of top course IDs after semantic + preference re-rank
user_ranked_ids = ranked_df['guid'].tolist()

platform_ranker = platform_preferenceRanker(df, user_gender, selected_target_groups)
platform_ranked_df = platform_ranker.rank(course_ids=user_ranked_ids)

# This is your second ranking: Platform-Informed Ranking
platform_ranked_ids = platform_ranked_df['guid'].tolist()

Batches: 100%|██████████| 53/53 [00:08<00:00,  6.04it/s]


In [17]:
consensus_df = compute_consensus_ranking(user_ranked_ids, platform_ranked_ids, df)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [14]:
print(consensus_df)  # or consensus_df.to_csv("output.csv", index=False)


        guid                                        course_name  \
1508  719680                 Englisch B2/C1 Online Reading Club   
1246  717441               Englisch C1 Konversation & Grammatik   
1080  716012                Englisch B1+ Conversation & Grammar   
71    692843  Englisch B1/B2 - Let's practise your English (...   
63    692835  Englisch A2/B1 f√ºr die Generation 55+ - A new...   
62    692834            Englisch A2 - Let's talk (Pr√§senzkurs)   
1591  720619                      Englisch B1/B2 Just for Kicks   
57    692826  Englisch B1 f√ºr die Generation 55+ - Mit Freu...   
1505  719623              Englisch C1 Lernintensiv am Vormittag   
1319  718052  Englisch A2 f√ºr die Generation 55+ - Improve ...   
1314  718005   Englisch B1 - The gaming experience (Onlinekurs)   
1295  717829    Englisch B1 - Early Morning English (Flexikurs)   
1034  715599               Englisch B1 Konversation & Grammatik   
76    692848      Englisch C1/C2 - Speaking freely (Hybridkurs