In [2]:
# read in the data
import pandas as pd
import openpyxl

# Load the course subset; the first row of "Sheet1" supplies the column names.
df = pd.read_excel(
    "subset_20percent_english_cleaned_Deep.xlsx",
    sheet_name="Sheet1",
    header=0,
)

# Drop columns that contain no values at all.
df = df.dropna(axis=1, how='all')

# Print the column overview (names, non-null counts, dtypes).
# A bare `df.head()` placed before this line would be evaluated and thrown
# away, because a notebook only renders the last expression of a cell.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1681 entries, 0 to 1680
Data columns (total 55 columns):
 #   Column                             Non-Null Count  Dtype         
---  ------                             --------------  -----         
 0   guid                               1681 non-null   int64         
 1   course_number                      1681 non-null   object        
 2   course_name                        1681 non-null   object        
 3   course_subtitle                    867 non-null    object        
 4   district                           1681 non-null   object        
 5   event_type                         1679 non-null   object        
 6   minimum_participants               1681 non-null   int64         
 7   current_participants               1681 non-null   int64         
 8   maximum_participants               1681 non-null   int64         
 9   number_of_sessions                 1681 non-null   int64         
 10  start_date                         1

In [3]:
# The following code assumes that the DataFrame `df` has been loaded and contains the necessary columns.
import numpy as np

# Assume df is your DataFrame already loaded

# 1. prop_occupancy_left
# (max - current) / max
# A course with no capacity (max == 0) has nothing left to fill, so the
# proportion is set to 0 instead of letting the zero denominator produce
# inf/NaN that would later leak into the ranking scores.

df['prop_occupancy_left'] = (
    (df['maximum_participants'] - df['current_participants']) / df['maximum_participants']
).where(df['maximum_participants'] > 0, 0)

# 2. prop_minimum_to_reach
# (min - current) / min
# If negative, set to 0. A minimum of 0 is always reached, so it is 0 as well.

df['prop_minimum_to_reach'] = (
    ((df['minimum_participants'] - df['current_participants']) / df['minimum_participants'])
    .where(df['minimum_participants'] > 0, 0)
    .clip(lower=0)
)

# 3. Simulate number_of_women
# Random integer between 0 and current_participants (both inclusive)
np.random.seed(42)  # For reproducibility

# Kept as a per-row draw on purpose: changing how the numbers are drawn would
# change the simulated values produced from this seed.
df['number_of_women'] = df['current_participants'].apply(lambda x: np.random.randint(0, x+1) if x > 0 else 0)

# Calculate the share of women and the share of men (both stored as 0..1
# proportions, despite the `percent_women` column name).
# Avoid division by zero: courses without participants get 0 for both.

df['percent_women'] = np.where(df['current_participants'] > 0,
                               df['number_of_women'] / df['current_participants'],
                               0)
df['prop_men'] = np.where(df['current_participants'] > 0,
                          (df['current_participants'] - df['number_of_women']) / df['current_participants'],
                          0)

# 4. Simulate sponsored column
# 1 with 25% probability

df['sponsored'] = np.random.choice([1, 0], size=len(df), p=[0.25, 0.75])

# 5. Create gap columns
# 80% target - actual proportion (negative when the share already exceeds 80%)

df['gap_to_80_percent_women'] = 0.8 - df['percent_women']
df['gap_to_80_percent_men'] = 0.8 - df['prop_men']

# 6. Create binary columns for target group cleaned
# Collect the distinct target groups that actually occur in the data
# (missing values are ignored).

target_groups = df['target_group_cleaned'].dropna().unique()

# Create one-hot encoding (binary columns): 1 where the row's target group
# equals the group, 0 otherwise (including rows with a missing target group).
for group in target_groups:
    df[f'target_group_{group}'] = (df['target_group_cleaned'] == group).astype(int)

# Final DataFrame
print(df.head())


     guid    course_number                                        course_name  \
0  604369          FK3.840  Body Percussion ‚Äì the Rhythmic Sound of the ...   
1  610723  FK4.D-ik-179-5a                                       Deutsch B1.1   
2  656403   SZ810-29-04-01                     Deutsch Integrationskurs A1.1    
3  658796   Mi404-B192S-13                          Deutsch Orientierungskurs   
4  683442       Sp4.224.W9                        Deutsch Alphabetisierung W9   

                             course_subtitle                  district  \
0                                        NaN  Friedrichshain-Kreuzberg   
1                              Aufbaumodul 5  Friedrichshain-Kreuzberg   
2                    Basissprachkurs Modul 1       Steglitz-Zehlendorf   
3                                        NaN                     Mitte   
4  Deutsch Alphabetisierung-Integrationskurs                   Spandau   

  event_type  minimum_participants  current_participants  \
0     Co

In [4]:
# Weight calculation and Course Ranking using OOP

class CourseRanker:
    """Rank a selection of courses for one user.

    Each course gets a ``numeric_score`` (the sum of ``prop_occupancy_left``,
    ``prop_minimum_to_reach`` and the gap-to-80% column for the user's gender)
    and a ``binary_boost`` (a per-course weight multiplied by the number of
    set binary flags among ``sponsored`` and the selected target-group
    columns). The final ranking orders courses by the sum of the two.
    """

    # Target-group names accepted from the caller; anything else is dropped.
    ALLOWED_TARGET_GROUPS = [
        "People with a migration background",
        "Illiterate people",
        "Women",
        "People with disabilities",
        "Older adults / older people",
        "Other target groups",
        "Children",
        "Adolescents / young people"
    ]

    def __init__(self, df, user_gender, selected_target_groups):
        """Store a private copy of the course data and the user's preferences.

        Args:
            df: DataFrame holding a ``guid`` column plus the score and flag
                columns read by the ``compute_*`` methods.
            user_gender: Gender string, compared case-insensitively.
                ``'female'`` selects ``gap_to_80_percent_women``; every other
                value selects ``gap_to_80_percent_men``.
            selected_target_groups: Iterable of target-group names. Names not
                listed in ``ALLOWED_TARGET_GROUPS`` are silently ignored.
        """
        self.df = df.copy()
        self.user_gender = user_gender.lower()
        self.gender_col = 'gap_to_80_percent_women' if self.user_gender == 'female' else 'gap_to_80_percent_men'

        # Ensure 'Women' is included if gender is female
        cleaned_groups = [tg for tg in selected_target_groups if tg in self.ALLOWED_TARGET_GROUPS]
        if self.user_gender == 'female' and "Women" not in cleaned_groups:
            cleaned_groups.append("Women")

        self.selected_target_groups = cleaned_groups

    def calculate_weight(self, max_numeric_score, current_numeric_score, rank_index, total_courses, min_numeric_score):
        """Return the boost weight for one course.

        For ``rank_index > 0`` the weight is the course's distance to the top
        numeric score divided by its rank index; for rank index 0 it is the
        full score range divided by the number of courses. Both are scaled
        by 1.05.
        """
        if rank_index > 0:
            weight = ((max_numeric_score - current_numeric_score) / rank_index) * 1.05
        else:
            weight = ((max_numeric_score - min_numeric_score) / total_courses) * 1.05
        return weight

    def filter_courses(self, course_ids):
        """Keep only the rows whose ``guid`` is in ``course_ids`` (as a copy)."""
        self.df_subset = self.df[self.df['guid'].isin(course_ids)].copy()

    def compute_numeric_score(self):
        """Add ``numeric_score`` and ``rank_index`` to the filtered courses.

        The subset is re-ordered by descending numeric score, and
        ``rank_index`` is the 0-based position in that order.
        """
        numeric_cols = ['prop_occupancy_left', 'prop_minimum_to_reach', self.gender_col]
        self.df_subset['numeric_score'] = self.df_subset[numeric_cols].sum(axis=1)
        self.df_subset = self.df_subset.sort_values(by='numeric_score', ascending=False).reset_index(drop=True)
        self.df_subset['rank_index'] = self.df_subset.index

    def compute_binary_boost(self):
        """Add ``binary_sum``, ``weight`` and ``binary_boost`` columns.

        Flag columns that are absent from the data count as "not set". This
        matters because 'Women' is always added for female users, while the
        data may only carry columns for the target groups that occur in it.
        """
        candidate_cols = ['sponsored'] + [f'target_group_{tg}' for tg in self.selected_target_groups]
        # Indexing with a missing column name would raise KeyError, so only
        # the flags that exist in the frame are summed.
        binary_cols = [col for col in candidate_cols if col in self.df_subset.columns]
        self.df_subset['binary_sum'] = self.df_subset[binary_cols].sum(axis=1)
        max_score = self.df_subset['numeric_score'].max()
        min_score = self.df_subset['numeric_score'].min()
        total = len(self.df_subset)

        # A plain loop (rather than a row-wise DataFrame.apply) also works when
        # no course matched the requested ids and the subset is empty.
        weights = [
            self.calculate_weight(max_score, score, rank_index, total, min_score)
            for score, rank_index in zip(self.df_subset['numeric_score'], self.df_subset['rank_index'])
        ]
        self.df_subset['weight'] = weights
        self.df_subset['weight'] = self.df_subset['weight'].astype(float)
        self.df_subset['binary_boost'] = self.df_subset['weight'] * self.df_subset['binary_sum']

    def calculate_final_scores(self):
        """Add ``final_score`` and store the result, best first, in ``df_final``."""
        self.df_subset['final_score'] = self.df_subset['numeric_score'] + self.df_subset['binary_boost']
        self.df_final = self.df_subset.sort_values(by='final_score', ascending=False).reset_index(drop=True)

    def rank(self, course_ids):
        """Run the full pipeline for ``course_ids`` and return the ranked DataFrame.

        The result contains the matching rows plus the helper columns
        ``numeric_score``, ``rank_index``, ``binary_sum``, ``weight``,
        ``binary_boost`` and ``final_score``, sorted by descending
        ``final_score``. It is empty when no ``guid`` matches.
        """
        self.filter_courses(course_ids)
        self.compute_numeric_score()
        self.compute_binary_boost()
        self.calculate_final_scores()
        return self.df_final


In [5]:
import pandas as pd
import numpy as np

# ------------------------
# STEP 1: Create mock dataset
# ------------------------

np.random.seed(42)
n_rows = 100

# Random stand-in for the real course table: it carries exactly the columns
# that CourseRanker reads, with string guids of the form 'Course-<i>'.
df_mock = pd.DataFrame({
    'guid': [f'Course-{i}' for i in range(n_rows)],
    'current_participants': np.random.randint(5, 30, size=n_rows),
    'minimum_participants': np.random.randint(5, 10, size=n_rows),
    'maximum_participants': np.random.randint(30, 40, size=n_rows),
    'prop_occupancy_left': np.random.rand(n_rows),
    'prop_minimum_to_reach': np.random.rand(n_rows),
    'gap_to_80_percent_women': np.random.uniform(-0.3, 0.8, size=n_rows),
    'gap_to_80_percent_men': np.random.uniform(-0.3, 0.8, size=n_rows),
    'sponsored': np.random.choice([0, 1], size=n_rows, p=[0.75, 0.25]),
})

# Add one random 0/1 flag column per target group.
# The names are taken from CourseRanker.ALLOWED_TARGET_GROUPS so the mock
# columns cannot drift from what the ranker accepts, and so the notebook-level
# `target_groups` variable (derived from the real data) is not overwritten.
for tg in CourseRanker.ALLOWED_TARGET_GROUPS:
    df_mock[f'target_group_{tg}'] = np.random.choice([0, 1], size=n_rows)

# ------------------------
# STEP 2: Sample 20 Course IDs
# ------------------------

sample_course_ids = df_mock['guid'].sample(20, random_state=42).tolist()

# ------------------------
# STEP 3: Run the ranking
# ------------------------

# Assume the user is female and selects 2 target groups
# (the ranker adds "Women" on its own for female users)
user_gender = 'female'
selected_target_groups = [
    "People with disabilities",
    "Older adults / older people"
]

# Instantiate and rank
ranker = CourseRanker(df_mock, user_gender, selected_target_groups)
ranked_df = ranker.rank(sample_course_ids)

# Display top 5 results
print(ranked_df[['guid', 'final_score', 'numeric_score', 'binary_sum', 'weight']].head())




        guid  final_score  numeric_score  binary_sum    weight
0  Course-80     2.048286       1.499345           3  0.182980
1  Course-22     1.943120       1.847879           1  0.095241
2  Course-33     1.865406       1.497332           3  0.122691
3  Course-77     1.854092       1.723614           1  0.130479
4  Course-31     1.654663       1.441108           2  0.106777


In [10]:
# Show the course ids of the first 20 rows of the ranking, best first.
print(ranked_df[['guid']].iloc[:20])

         guid
0   Course-80
1   Course-22
2   Course-33
3   Course-77
4   Course-31
5    Course-4
6   Course-39
7   Course-18
8   Course-90
9   Course-45
10  Course-83
11  Course-12
12  Course-53
13   Course-0
14  Course-70
15  Course-76
16  Course-44
17  Course-73
18  Course-30
19  Course-10
