In [9]:
import pandas as pd
import ast
import re
import json


csv_name = "pcex_challenges_kcs"
json_name = "pcexChallenges"

topic_kcs = pd.read_csv("topics_kcs.csv")
topic_kcs["KCs"] = topic_kcs["KCs"].apply(lambda x: [kc.strip() for kc in x.split(";")])

course_model = []

all_previous = set()
all_topics = topic_kcs["topic_name"].tolist()
all_kcs = [set(kcs) for kcs in topic_kcs["KCs"]]

for i, (topic, kcs) in enumerate(zip(all_topics, all_kcs)):
    current = set(kcs) - all_previous
    past = set().union(*all_kcs[:i])
    future = set().union(*all_kcs[i+1:])

    course_model.append({
        "topic": topic,
        "Past": sorted(list(past)),
        "Current": sorted(list(current)),
        "Future": sorted(list(future))
    })

    all_previous.update(current)

course_model_df = pd.DataFrame(course_model)
course_model_df.head(5)



Unnamed: 0,topic,Past,Current,Future
0,variables & operators,[],"[AddExpression, AdditiveExpression, Arithmetic...","[AndExpression, ArrayDataType, ArrayElement, A..."
1,console I/O,"[AddExpression, AdditiveExpression, Arithmetic...","[JavaStandardLibraryClass, JavaStandardLibrary...","[AndExpression, ArrayDataType, ArrayElement, A..."
2,booleans,"[AddExpression, AdditiveExpression, Arithmetic...","[AndExpression, BooleanDataType, BooleanExpres...","[ArrayDataType, ArrayElement, ArrayInitializat..."
3,block & scope,"[AddExpression, AdditiveExpression, AndExpress...","[DefinitionStatement, Statement, StatementBloc...","[ArrayDataType, ArrayElement, ArrayInitializat..."
4,while loops,"[AddExpression, AdditiveExpression, AndExpress...","[AssignmentExpression, IterationStatement, Whi...","[ArrayDataType, ArrayElement, ArrayInitializat..."


In [10]:
topic_kcs.head(3)

Unnamed: 0,index,topic_name,KCs
0,1,variables & operators,"[Variable, SimpleVariable, SimpleDataTypeValue..."
1,2,console I/O,"[JavaStandardLibraryClass, java.lang.System, J..."
2,3,booleans,"[SimpleDataType, BooleanDataType, BooleanValue..."


In [11]:
course_model_dict = {
    row["topic"]: {
        "Past": row["Past"],
        "Current": row["Current"],
        "Future": row["Future"]
    }
    for _, row in course_model_df.iterrows()
}

print(course_model_dict)

{'variables & operators': {'Past': [], 'Current': ['AddExpression', 'AdditiveExpression', 'ArithmeticAssignmentExpression', 'Expression', 'MultiplicativeExpression', 'Operator', 'SimpleDataType', 'SimpleDataTypeValue', 'SimpleVariable', 'Variable'], 'Future': ['AndExpression', 'ArrayDataType', 'ArrayElement', 'ArrayInitializationStatement', 'ArrayInitializer', 'ArrayLength', 'AssignmentExpression', 'BooleanDataType', 'BooleanExpression', 'BooleanValue', 'ClassDataType', 'ControlFlow', 'DefinitionStatement', 'EqualExpression', 'ExceptionClass', 'ExceptionHandlingStatement', 'Expression', 'False', 'ForEachStatement', 'ForStatement', 'IterationStatement', 'JavaStandardLibraryClass', 'JavaStandardLibraryMethod', 'JavaStandardLibraryObject', 'Method', 'MethodDefinition', 'MethodInvocation', 'MethodSignature', 'MultiDimensionalArrayDataType', 'MultiDimensionalArrayInitializationStatement', 'MultiDimensionalArrayInitializer', 'MultiDimensionalArrayVariable', 'NotEqualExpression', 'NotExpressi

In [12]:
activities = pd.read_csv(f"mappings/{csv_name}.csv")

def parse_kcs(s):
    # Handle missing or already-parsed
    if isinstance(s, list):
        return s
    if pd.isna(s):
        return []

    s = str(s).strip()
    if not s:
        return []

    s_clean = re.sub(r'(\[|,\s*)nan(,|\])', r'\1None\2', s)

    try:
        value = ast.literal_eval(s_clean)
    except Exception as e:
        print("Could not parse kcs string:", repr(s))
        print("Error:", e)
        return []

    if isinstance(value, list):
        return [kc for kc in value if isinstance(kc, str)]
    else:
        return []

activities["kcs"] = activities["kcs"].apply(parse_kcs)

alpha, beta, gamma = 0.2, 1.0, -1.5

def compute_score(activity_kcs, topic_sets):
    kc_set = set(activity_kcs)
    past = kc_set.intersection(topic_sets["Past"])
    current = kc_set.intersection(topic_sets["Current"])
    future = kc_set.intersection(topic_sets["Future"])
    score = alpha * len(past) + beta * len(current) + gamma * len(future)
    return {
        "past_count": len(past),
        "current_count": len(current),
        "future_count": len(future),
        "score": score,
        "past_kcs": sorted(list(past)),
        "current_kcs": sorted(list(current)),
        "future_kcs": sorted(list(future))
    }

recommendations = []

for topic, topic_sets in course_model_dict.items():
    for _, row in activities.iterrows():
        result = compute_score(row["kcs"], topic_sets)
        recommendations.append({
            "topic": topic,
            "content_name": row["content_name"],
            "content_type": row["content_type"],
            **result
        })

recommendations_df = pd.DataFrame(recommendations)

recommendations_df["rank"] = recommendations_df.groupby("topic")["score"] \
                                                .rank(ascending=False, method="first")

recommendations_df = recommendations_df.sort_values(["topic", "rank"])

recommendations_df.head(3)

# recommendationzs_df.to_csv("test.csv")

Unnamed: 0,topic,content_name,content_type,past_count,current_count,future_count,score,past_kcs,current_kcs,future_kcs,rank
731,2 dimensional arrays (fundamental operations),JPrintMedalsRowColumnTotal,animated_example,3,1,0,1.6,"[ArrayElement, ArrayInitializationStatement, A...",[MultiDimensionalArrayDataType],[],1.0
701,2 dimensional arrays (fundamental operations),JArrays2dBasic2,animated_example,2,1,0,1.4,"[ArrayElement, ForStatement]",[MultiDimensionalArrayDataType],[],2.0
702,2 dimensional arrays (fundamental operations),JArrays2dBasic3,animated_example,2,1,0,1.4,"[ArrayElement, ForStatement]",[MultiDimensionalArrayDataType],[],3.0


In [13]:
recommendations_df = recommendations_df.drop(columns=[
    "past_count", "current_count", "past_kcs", "current_kcs", "future_kcs", "rank"
])

# Avoid division-by-zero if all scores are equal
def scale_to_stars(group):
    min_score = group["score"].min()
    max_score = group["score"].max()

    if max_score == min_score:
        group["star_score"] = 5
    else:
        group["star_score"] = 5 * (group["score"] - min_score) / (max_score - min_score)

    return group

recommendations_df = recommendations_df.groupby("topic", group_keys=False).apply(scale_to_stars)
recommendations_df["star_score"] = recommendations_df["star_score"].round(1)

recommendations_df.head(3)

  recommendations_df = recommendations_df.groupby("topic", group_keys=False).apply(scale_to_stars)


Unnamed: 0,topic,content_name,content_type,future_count,score,star_score
731,2 dimensional arrays (fundamental operations),JPrintMedalsRowColumnTotal,animated_example,0,1.6,5.0
701,2 dimensional arrays (fundamental operations),JArrays2dBasic2,animated_example,0,1.4,4.7
702,2 dimensional arrays (fundamental operations),JArrays2dBasic3,animated_example,0,1.4,4.7


In [14]:
# sorted_df = recommendations_df.sort_values("score", ascending=False).copy()

# best_topic_df = (
#     sorted_df
#     .sort_values("score", ascending=False)
#     .drop_duplicates(subset=["content_name"], keep="first")
# )

# top2_unique_df = (
#     best_topic_df
#     .sort_values(["topic", "score"], ascending=[True, False])
#     .groupby("topic")
#     .head(2)
#     .reset_index(drop=True)
# )

# print(f"{len(top2_unique_df)} unique activities assigned (max 2 per topic)")
# top2_unique_df[["topic", "content_name", "score"]].head(12)


In [15]:
# sorted_df = recommendations_df.sort_values("score", ascending=False).copy()

# best_topic_df = (
#     sorted_df
#     .drop_duplicates(subset=["content_name"], keep="first")
# )

# threshold = 1
# threshold_unique_df = best_topic_df[best_topic_df["score"] > threshold].reset_index(drop=True)

# print(f"{len(threshold_unique_df)} unique activities with score > {threshold}")
# threshold_unique_df[["topic", "content_name", "score"]].head(12)


In [16]:
# topic_kc_map = dict(zip(topic_kcs["topic_name"], topic_kcs["KCs"]))

# animated_map = (
#     recommendations_df
#     .groupby("topic")[["content_name", "score"]]
#     .apply(lambda g: [f"{row.content_name} - {row.score:.2f}" for row in g.itertuples()])
#     .to_dict()
# )

# topics_json = []
# for topic, kcs in topic_kc_map.items():
#     topics_json.append({
#         "topic": topic,
#         "animatedExamples": animated_map.get(topic, []),
#         "kcs": kcs
#     })

topic_groups = {}

for topic, group in recommendations_df.groupby("topic"):
    objects = []

    for _, row in group.iterrows():
        obj = {
            "name": row["content_name"],
            "rate": float(row["star_score"]),
            "futureKcs": row["future_count"] > 0
        }
        objects.append(obj)

    topic_groups[topic] = {
        "topic": topic,
        json_name: objects
    }

with open(f"{json_name}.json", "w") as f:
    json.dump(topic_groups, f, indent=2)

print(topic_groups)

{'2 dimensional arrays (fundamental operations)': {'topic': '2 dimensional arrays (fundamental operations)', 'pcexChallenges': [{'name': 'JPrintMedalsRowColumnTotal', 'rate': 5.0, 'futureKcs': False}, {'name': 'JArrays2dBasic2', 'rate': 4.7, 'futureKcs': False}, {'name': 'JArrays2dBasic3', 'rate': 4.7, 'futureKcs': False}, {'name': 'JSodaSurverySodaAvg', 'rate': 4.7, 'futureKcs': False}, {'name': 'JArraySwapAdjacentElements', 'rate': 4.0, 'futureKcs': False}, {'name': 'JArrayBasic3', 'rate': 3.6, 'futureKcs': False}, {'name': 'JArrayRotateLeftTwice', 'rate': 3.6, 'futureKcs': False}, {'name': 'JBooleanDryHot3', 'rate': 3.6, 'futureKcs': False}, {'name': 'JPhoneAge2', 'rate': 3.6, 'futureKcs': False}, {'name': 'JSearchArrayCountsEach', 'rate': 3.6, 'futureKcs': False}, {'name': 'JThreeBoolean2', 'rate': 3.6, 'futureKcs': False}, {'name': 'JThreeBoolean3', 'rate': 3.6, 'futureKcs': False}, {'name': 'JAdjacentConsecutives', 'rate': 3.3, 'futureKcs': False}, {'name': 'JAdjacentGreater', 'r