# Data Anonymization Pipeline

In [3]:
import os
import uuid

import pandas as pd

# Directory that receives every anonymized CSV export of this notebook.
# The trailing slash matters: the export cells build file paths by plain
# string concatenation (output_path + "<name>.csv").
output_path = "data/01_anonymized/"
# Create the directory up front so the first `to_csv` call does not fail on a
# fresh checkout where the output folder does not exist yet.
os.makedirs(output_path, exist_ok=True)

## Users

We grab the users enrolled in the course (specified by the course ID 443) from the database. Anonymization takes place later: we still need the identifying information to map the fetched data to the users, so we cannot anonymize it yet. We will use the `uuid` library to generate a unique identifier for each user.

In [4]:
%%sql
-- Users of course 443: a user qualifies when one of their groups equals the
-- student group name of the course.
-- Inner joins are used on purpose: the filter on c.id discards every row in
-- which a left join would have produced NULLs, so the result is the same and
-- the intent is stated explicitly.
select distinct u.id, u.login, u.first_name, u.last_name, u.email
from jhi_user u
join user_groups ug on ug.user_id = u.id
join course c on c.student_group_name = ug.user_groups
where c.id = 443

Unnamed: 0,id,login,first_name,last_name,email
0,29602,go69tex,Chen,Zhang,chen.zhang@tum.de
1,29248,go49soy,Antanas,Baltuska,antanas.baltuska@tum.de
2,30118,go36buv,Niklas,Pattscheck,niklas.pattscheck@tum.de
3,29704,go36qak,Tjorben,Huber,tjorben.huber@tum.de
4,28961,go36caj,Shu-Xiang,Yang,shu-xiang.yang@tum.de
...,...,...,...,...,...
1273,13968,ge78luy,Tim,Schäfer,ge78luy@mytum.de
1274,9212,ge38wel,Yaser A. Y.,Shawa,ge38wel@mytum.de
1275,32686,go73geh,Suhui,Liu,go73geh@mytum.de
1276,21076,ge97jim,Ufuk,Yarışan,ge97jim@mytum.de


In [5]:
# Assign one random pseudonym (UUID4 rendered as a string) to every fetched
# user row.
# The guard keeps the pseudonyms stable when this cell is executed again in
# the same kernel session: regenerating them would silently break the link to
# every CSV file that has already been exported with the previous values.
# A freshly fetched `jhi_user` frame has no "uuid" column and gets new ones.
if "uuid" not in jhi_user.columns:
    jhi_user["uuid"] = [str(uuid.uuid4()) for _ in range(len(jhi_user))]

def find_uuid_by_id(user_id):
    """Return the pseudonym of the user with the given database id.

    Args:
        user_id: Value compared against the ``id`` column of ``jhi_user``.

    Returns:
        The ``uuid`` of the first matching row, or the string ``"NOT_FOUND"``
        when no row of ``jhi_user`` has this id.
    """
    hits = jhi_user["uuid"][jhi_user["id"] == user_id]
    return "NOT_FOUND" if hits.empty else hits.iloc[0]

def find_uuid_by_login(login):
    """Return the pseudonym of the user with the given login name.

    Args:
        login: Value compared against the ``login`` column of ``jhi_user``.

    Returns:
        The ``uuid`` of the first matching row, or the string ``"NOT_FOUND"``
        when no row of ``jhi_user`` has this login.
    """
    hits = jhi_user["uuid"][jhi_user["login"] == login]
    return "NOT_FOUND" if hits.empty else hits.iloc[0]

def find_uuid_by_email_or_first_name_and_last_name(email, first_name, last_name):
    """Return the pseudonym of a user identified by e-mail or by full name.

    An exact e-mail match takes precedence. Only when no row has the given
    e-mail is the lookup repeated with first and last name. Evaluating both
    criteria in one combined OR mask would return whichever matching row comes
    first in ``jhi_user``, so a namesake listed earlier could be picked even
    though another row carries exactly the requested e-mail address.

    Args:
        email: Value compared against the ``email`` column of ``jhi_user``.
        first_name: Value compared against the ``first_name`` column.
        last_name: Value compared against the ``last_name`` column.

    Returns:
        The ``uuid`` of the first row matching the e-mail; otherwise the
        ``uuid`` of the first row matching both names; otherwise the string
        ``"NOT_FOUND"``.
    """
    by_email = jhi_user.loc[jhi_user["email"] == email, "uuid"]
    if not by_email.empty:
        return by_email.iloc[0]

    # Fallback: several users may share a name; the first one in table order
    # is returned, as before.
    same_name = (jhi_user["first_name"] == first_name) & (jhi_user["last_name"] == last_name)
    by_name = jhi_user.loc[same_name, "uuid"]
    if not by_name.empty:
        return by_name.iloc[0]

    return "NOT_FOUND"

def find_uuid_by_first_name_and_last_name_and_email(first_name, last_name, email):
    """Return the pseudonym of the user matching name and e-mail together.

    Args:
        first_name: Value compared against the ``first_name`` column.
        last_name: Value compared against the ``last_name`` column.
        email: Value compared against the ``email`` column.

    Returns:
        The ``uuid`` of the first row of ``jhi_user`` in which all three
        columns match, or the string ``"NOT_FOUND"`` when there is none.
    """
    same_person = (
        jhi_user["first_name"].eq(first_name)
        & jhi_user["last_name"].eq(last_name)
        & jhi_user["email"].eq(email)
    )
    candidates = jhi_user["uuid"][same_person]
    return "NOT_FOUND" if candidates.empty else candidates.iloc[0]

## Competency
We fetch the competencies for the course with ID 443.

In [6]:
%%sql
-- Competencies that belong to course 443.
SELECT
    c.id,
    c.taxonomy,
    c.title,
    c.mastery_threshold,
    c.soft_due_date
FROM competency c
WHERE c.course_id = 443

Unnamed: 0,id,taxonomy,title,mastery_threshold,soft_due_date
0,608,U,Fundamentals of Imperative Program Verification,60,2025-05-22 22:00:00
1,609,A,Pre-/Postconditions,60,2025-05-22 22:00:00
2,610,A,Loop Invariants,60,2025-05-29 22:00:00
3,611,U,"Basics: Expressions, Values and Variables",60,2025-06-12 22:00:00
4,612,U,Basics: Functions,60,2025-06-12 22:00:00
5,613,Y,OCaml Syntax and Semantics,90,2025-06-12 22:00:00
6,614,Y,Data Structures and Pattern Matching,90,2025-06-19 22:00:00
7,615,Y,Functional Programming Paradigms,90,2025-06-26 22:00:00
8,616,Y,Higher-Order Functions,90,2025-06-26 22:00:00
9,617,Y,Polymorphism,90,2025-06-26 22:00:00


In [7]:
# The competency query selects no user-related column, so the frame is written
# to the output folder as fetched.
competency.to_csv(index=False, path_or_buf=output_path + "competency.csv")

## Learning Path
We fetch the learning path for the course with ID 443. We anonymize the user's id.

In [8]:
%%sql
-- Learning-path rows of course 443, keyed by the raw user id.
SELECT
    lp.user_id,
    lp.progress,
    lp.started_by_student
FROM learning_path lp
WHERE lp.course_id = 443

Unnamed: 0,user_id,progress,started_by_student
0,29602,6,1
1,29248,33,0
2,30118,33,0
3,29704,17,0
4,28961,11,1
...,...,...,...
1286,13968,0,0
1287,9212,0,0
1288,32686,0,0
1289,21076,0,0


In [9]:
# Export the learning paths with every raw user id replaced by its uuid.
# `assign` builds a pseudonymized copy instead of overwriting
# learning_path["user_id"]: running this cell a second time therefore cannot
# feed already-converted uuids back into the id lookup, which would turn every
# entry into "NOT_FOUND".
learning_path.assign(
    user_id=learning_path["user_id"].apply(find_uuid_by_id)
).to_csv(output_path + "learning_path.csv", index=False)

## Competency Participation
We fetch the competency participation for the course with ID 443. We anonymize the user's id.

In [10]:
%%sql
-- Per-user competency progress, limited to the competencies of course 443.
SELECT
    cu.user_id,
    cu.competency_id,
    cu.progress,
    cu.confidence,
    cu.confidence_reason,
    cu.last_modified_date
FROM competency_user cu
INNER JOIN competency c
    ON c.id = cu.competency_id
WHERE c.course_id = 443

Unnamed: 0,user_id,competency_id,progress,confidence,confidence_reason,last_modified_date
0,2320,608,68,1.25,MORE_HIGH_WEIGHTED_EXERCISES,2025-08-03 17:19:26
1,2633,608,0,1.00,NO_REASON,2025-05-16 17:00:14
2,2732,608,68,1.25,MORE_HIGH_WEIGHTED_EXERCISES,2025-07-31 12:24:41
3,2733,608,22,1.00,NO_REASON,2025-08-06 09:19:38
4,5171,608,0,1.00,NO_REASON,2025-05-16 17:00:08
...,...,...,...,...,...,...
10327,31971,625,17,1.00,NO_REASON,2025-05-09 07:30:26
10328,32008,625,0,1.00,NO_REASON,2025-05-11 10:13:41
10329,32517,625,17,1.00,NO_REASON,2025-05-31 14:57:12
10330,32579,625,17,1.00,NO_REASON,2025-07-05 15:31:37


In [11]:
# Export the competency progress with every raw user id replaced by its uuid.
# `assign` builds a pseudonymized copy instead of overwriting
# competency_user["user_id"]: running this cell a second time therefore cannot
# feed already-converted uuids back into the id lookup, which would turn every
# entry into "NOT_FOUND".
competency_user.assign(
    user_id=competency_user["user_id"].apply(find_uuid_by_id)
).to_csv(output_path + "competency_user.csv", index=False)

## Science Events
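We fetch the science events of all users enrolled in the course with ID 443 and replace the login stored in `identity` with the user's uuid. **TODO:** the query selects events by user only; it does not restrict them to resources of course 443.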

In [12]:
%%sql
-- Science events of every user enrolled in course 443.
-- The columns are listed explicitly (instead of se.*) so that a column added
-- to science_event later cannot slip into the anonymized export unnoticed.
-- NOTE(review): events are matched by user only and are not limited to
-- resources of course 443 -- confirm that this is intended.
select se.id, se.identity, se.timestamp, se.event_type, se.resource_id
from science_event se
join jhi_user u on se.identity = u.login
where u.id in (
    -- Inner joins: the filter on c.id already discards the NULL rows a left
    -- join would add. The alias differs from the outer query's `u` to avoid
    -- shadowing it.
    select distinct enrolled.id
    from jhi_user enrolled
    join user_groups ug on enrolled.id = ug.user_id
    join course c on ug.user_groups = c.student_group_name
    where c.id = 443
)

Unnamed: 0,id,identity,timestamp,event_type,resource_id
0,1,ge87yiv,2024-03-02 15:13:58,0,600.0
1,12,go69zav,2024-03-02 16:18:58,2,11534.0
2,21,go69tir,2024-03-02 16:54:55,2,11667.0
3,22,go69jip,2024-03-02 16:56:34,2,12462.0
4,48,ge59tug,2024-03-02 18:17:47,2,11591.0
...,...,...,...,...,...
3967285,11160822,go36tuz,2025-08-26 09:10:39,2,16481.0
3967286,11160838,go36coq,2025-08-26 09:29:51,4,443.0
3967287,11160839,go36coq,2025-08-26 09:30:09,3,621.0
3967288,11160841,go36coq,2025-08-26 09:30:22,4,443.0


In [13]:
# Export the science events with the login in `identity` replaced by the
# user's uuid.
#
# The displayed query result has several million rows, so the login is
# translated with one vectorized lookup instead of scanning `jhi_user` once
# per event. The lookup keeps the semantics of `find_uuid_by_login`:
#   * the first row wins when a login occurs more than once (keep="first"),
#   * logins without a match become "NOT_FOUND".
login_to_uuid = (
    jhi_user.dropna(subset=["login"])
    .drop_duplicates(subset="login", keep="first")
    .set_index("login")["uuid"]
)
# `assign` works on a copy, so `science_event["identity"]` keeps the raw
# logins and re-running this cell yields the same file instead of mapping
# uuids a second time.
science_event.assign(
    identity=science_event["identity"].map(login_to_uuid).fillna("NOT_FOUND")
).to_csv(output_path + "science_event.csv", index=False)

## Participations

In [14]:
%%sql
-- Participations in the exercises of course 443.
SELECT
    p.student_id,
    p.exercise_id,
    p.test_run
FROM participation p
INNER JOIN exercise e
    ON e.id = p.exercise_id
WHERE e.course_id = 443

Unnamed: 0,student_id,exercise_id,test_run
0,25732,16636,False
1,29244,16636,False
2,23281,16636,False
3,29909,16636,False
4,30089,16636,False
...,...,...,...
26869,29244,17233,True
26870,30598,17233,True
26871,30701,17233,True
26872,25579,17233,True


In [15]:
# Export the participations with every raw student id replaced by its uuid.
# `assign` builds a pseudonymized copy instead of overwriting
# participation["student_id"]: running this cell a second time therefore
# cannot feed already-converted uuids back into the id lookup, which would
# turn every entry into "NOT_FOUND".
participation.assign(
    student_id=participation["student_id"].apply(find_uuid_by_id)
).to_csv(output_path + "participation.csv", index=False)

## Participant Score
We fetch the participant score for the course with ID 443. We anonymize the user's id.

In [16]:
%%sql
-- Last rated score and points per user for the exercises of course 443.
SELECT
    ps.user_id,
    ps.exercise_id,
    ps.last_rated_score,
    ps.last_rated_points
FROM participant_score ps
INNER JOIN exercise e
    ON e.id = ps.exercise_id
WHERE e.course_id = 443

Unnamed: 0,user_id,exercise_id,last_rated_score,last_rated_points
0,28274,16644,100.0,20.0
1,29841,16644,100.0,20.0
2,15797,16644,80.0,16.0
3,26566,16644,0.0,0.0
4,25327,16644,100.0,20.0
...,...,...,...,...
18768,29884,17233,,
18769,29998,17233,,
18770,29244,17233,,
18771,30598,17233,,


In [17]:
# Export the participant scores with every raw user id replaced by its uuid.
# `assign` builds a pseudonymized copy instead of overwriting
# participant_score["user_id"]: running this cell a second time therefore
# cannot feed already-converted uuids back into the id lookup, which would
# turn every entry into "NOT_FOUND".
participant_score.assign(
    user_id=participant_score["user_id"].apply(find_uuid_by_id)
).to_csv(output_path + "participant_score.csv", index=False)

## Lecture
We fetch the lecture for the course with ID 443. We are only interested in the IDs, not the exact content.

In [18]:
%%sql
-- Ids of the lectures of course 443 (no lecture content is selected).
SELECT
    l.id
FROM lecture l
WHERE l.course_id = 443

Unnamed: 0,id
0,1430
1,1448
2,1454
3,1464
4,1473
5,1481
6,1489
7,1499
8,1508
9,1514


In [19]:
# Only lecture ids were selected, so the frame is written as fetched.
lecture.to_csv(index=False, path_or_buf=output_path + "lecture.csv")

## Lecture Units
We fetch the lecture units for the course with ID 443. We are only interested in the IDs, not the exact content.

In [20]:
%%sql
-- Ids of the lecture units whose lecture belongs to course 443.
SELECT
    lu.id
FROM lecture_unit lu
INNER JOIN lecture l
    ON l.id = lu.lecture_id
WHERE l.course_id = 443

Unnamed: 0,id
0,6657
1,6210
2,6211
3,6212
4,6213
5,6287
6,6288
7,6289
8,6332
9,6333


In [21]:
# Only lecture unit ids were selected, so the frame is written as fetched.
lecture_unit.to_csv(index=False, path_or_buf=output_path + "lecture_unit.csv")

## Exercise
We fetch the exercises for the course with ID 443. We are only interested in the IDs, not the exact content.

In [22]:
%%sql
-- Ids of the exercises of course 443 (no exercise content is selected).
SELECT
    e.id
FROM exercise e
WHERE e.course_id = 443

Unnamed: 0,id
0,16636
1,16637
2,16638
3,16639
4,16640
...,...
66,17229
67,17230
68,17231
69,17232


In [23]:
# Only exercise ids were selected, so the frame is written as fetched.
exercise.to_csv(index=False, path_or_buf=output_path + "exercise.csv")

## Links between Competencies and Resources
We fetch the links between competencies and resources for the course with ID 443.

In [24]:
%%sql
-- Links between exercises and the competencies of course 443.
SELECT
    ce.*
FROM competency_exercise ce
INNER JOIN competency c
    ON c.id = ce.competency_id
WHERE c.course_id = 443

Unnamed: 0,competency_id,exercise_id,link_weight
0,608,16640,0.25
1,608,16643,0.25
2,608,16745,0.50
3,608,16848,0.50
4,609,16642,0.25
...,...,...,...
85,624,17218,0.50
86,625,16636,0.50
87,625,16637,0.50
88,625,16638,0.50


In [25]:
# The link table is written as fetched; the displayed result holds competency
# id, exercise id and link weight only.
competency_exercise.to_csv(index=False, path_or_buf=output_path + "competency_exercise.csv")

In [26]:
%%sql
-- Links between lecture units and the competencies of course 443.
SELECT
    cl.*
FROM competency_lecture_unit cl
INNER JOIN competency c
    ON c.id = cl.competency_id
WHERE c.course_id = 443

Unnamed: 0,competency_id,lecture_unit_id,link_weight
0,608,6210,0.5
1,608,6212,0.5
2,608,6288,0.5
3,609,6213,0.5
4,609,6287,0.5
5,610,6289,0.5
6,610,6332,0.5
7,611,6334,0.5
8,611,6370,0.5
9,612,6372,0.5


In [27]:
# The link table is written as fetched; the displayed result holds competency
# id, lecture unit id and link weight only.
competency_lecture_unit.to_csv(index=False, path_or_buf=output_path + "competency_lecture_unit.csv")

## Exam Exercise Results
We fetch the participant scores for the exam exercises of the course with ID 469 (the digital exam was conducted in a separate course instance due to administrative preferences). We anonymize the user's id.

In [28]:
%%sql
-- Last rated score and points per user for the exam exercises of course 469.
-- An exercise counts as an exam exercise through its exercise group, which
-- points to the exam.
SELECT
    ps.user_id,
    ps.exercise_id,
    ps.last_rated_score,
    ps.last_rated_points
FROM participant_score ps
INNER JOIN exercise e
    ON e.id = ps.exercise_id
INNER JOIN exercise_group eg
    ON eg.id = e.exercise_group_id
INNER JOIN exam ex
    ON ex.id = eg.exam_id
WHERE ex.course_id = 469

Unnamed: 0,user_id,exercise_id,last_rated_score,last_rated_points
0,30735,17432,75.0,6.0
1,30750,17432,75.0,6.0
2,30728,17432,0.0,0.0
3,22547,17432,50.0,4.0
4,14361,17432,50.0,4.0
...,...,...,...,...
6281,26581,17438,0.0,0.0
6282,22509,17438,0.0,0.0
6283,25585,17438,0.0,0.0
6284,25587,17438,0.0,0.0


In [29]:
# Export the exam scores with every raw user id replaced by its uuid.
# `assign` builds a pseudonymized copy instead of overwriting
# participant_score_exam["user_id"]: running this cell a second time therefore
# cannot feed already-converted uuids back into the id lookup, which would
# turn every entry into "NOT_FOUND".
participant_score_exam.assign(
    user_id=participant_score_exam["user_id"].apply(find_uuid_by_id)
).to_csv(output_path + "participant_score_exam.csv", index=False)

## Lime Survey Responses
We map each survey response to the uuid of the matching user and drop the columns that identify the respondent.

In [30]:
# Pre-survey: attach the uuid of the user whose first name, last name and
# e-mail all match the response, then remove the listed columns (among them
# name and e-mail) before writing the file.
lime_survey_pre = (
    pd.read_csv("data/00_in/results-pre.csv")
    .assign(
        uuid=lambda responses: responses.apply(
            lambda response: find_uuid_by_first_name_and_last_name_and_email(
                response["firstname"], response["lastname"], response["email"]
            ),
            axis=1,
        )
    )
    .drop(
        columns=[
            "id",
            "submitdate",
            "lastpage",
            "startlanguage",
            "seed",
            "token",
            "firstname",
            "lastname",
            "email",
        ]
    )
)
lime_survey_pre.to_csv(output_path + "lime_survey_pre.csv", index=False)

In [31]:
# Post-survey: attach the uuid of the user whose first name, last name and
# e-mail all match the response, then remove the listed columns (among them
# name and e-mail) before writing the file.
lime_survey_post = (
    pd.read_csv("data/00_in/results-post.csv")
    .assign(
        uuid=lambda responses: responses.apply(
            lambda response: find_uuid_by_first_name_and_last_name_and_email(
                response["firstname"], response["lastname"], response["email"]
            ),
            axis=1,
        )
    )
    .drop(
        columns=[
            "id",
            "submitdate",
            "lastpage",
            "startlanguage",
            "seed",
            "token",
            "firstname",
            "lastname",
            "email",
        ]
    )
)
lime_survey_post.to_csv(output_path + "lime_survey_post.csv", index=False)

## Tutorial Group Mapping


In [32]:
# Tutorial groups: resolve every roster entry to a uuid (by e-mail or by full
# name), keep only the uuid and the group column, and export the group column
# under the name "group".
tutorial_mapping = (
    pd.read_csv("data/00_in/tutorial_mapping.csv", sep=";")
    .assign(
        uuid=lambda roster: roster.apply(
            lambda entry: find_uuid_by_email_or_first_name_and_last_name(
                entry["E-MAIL"], entry["VORNAME"], entry["FAMILIENNAME"]
            ),
            axis=1,
        )
    )
    .loc[:, ["uuid", "GRUPPE"]]
    .rename(columns={"GRUPPE": "group"})
)
tutorial_mapping.to_csv(output_path + "tutorial_mapping.csv", index=False)

## Finalize User

In [33]:
# Publish only the pseudonyms of the fetched users.
# The rows are sorted by the random uuid so that the file order no longer
# mirrors the row order of the user query result; otherwise the n-th uuid
# could be matched by position to the n-th row of the identifying user table.
jhi_user[["uuid"]].sort_values("uuid").to_csv(output_path + "jhi_user.csv", index=False)