In [None]:
!pip install fuzzywuzzy

In [None]:
import json
from collections import OrderedDict
from pathlib import Path

import pandas as pd
import requests
from fuzzywuzzy import fuzz, process
from tqdm import tqdm

from bs4 import BeautifulSoup

In [None]:
# Parse the JSON input; later cells iterate over its keys as chapters.
# The context manager closes the file handle as soon as parsing is done
# instead of leaving it open until garbage collection.
data_path = Path("output.json")
with data_path.open() as data_file:
    data = json.load(data_file)

# Lookup table used by get_smart_content(), which reads its
# "component_name", "url" and "topic_name" columns.
df = pd.read_csv("map_concepts_activities_topics_Python_MasteryGrids_latest_course.csv")

In [None]:
def get_parsed_text(code, timeout=30):
    """Post a code snippet to the remote parser and return its first result entry.

    Args:
        code: Source code snippet, sent as the ``code`` form field together
            with ``mode="simple"``.
        timeout: Seconds to wait for the parser service. Without a timeout a
            stalled connection would block the notebook indefinitely.

    Returns:
        The first value of the ``"lines"`` mapping in the JSON response, or an
        empty list when the response is not valid JSON, has no usable
        ``"lines"`` mapping, or that mapping is empty.

    Raises:
        requests.RequestException: If the HTTP request itself fails or times
            out (the request is deliberately outside the ``try`` block).
    """
    payload = {"code": code, "mode": "simple"}
    response = requests.post(
        "http://acos.cs.hut.fi/python-parser", data=payload, timeout=timeout
    )
    try:
        lines = response.json()["lines"]
        # Only the first entry is used; the default covers an empty mapping.
        return next(iter(lines.values()), [])
    except (ValueError, KeyError, TypeError, AttributeError):
        # Malformed or unexpected response body. Catching only these keeps
        # KeyboardInterrupt / SystemExit from being silently swallowed.
        return []


def get_smart_content(code_types, keywords=None, threshold=70):
    """Collect smart-content rows whose component matches the given code types.

    Each entry of ``code_types`` is looked up in the ``component_name`` column
    of the module-level ``df``; the ``url`` and ``topic_name`` of every
    matching row are reported.

    Args:
        code_types: Iterable of component names to look up.
        keywords: Optional collection of keywords. When non-empty, a row is
            kept only if the best fuzzy match of its ``topic_name`` against
            ``keywords`` scores above ``threshold``. When ``None`` or empty,
            no filtering is applied.
        threshold: Exclusive lower bound on the fuzzy-match score
            (defaults to 70).

    Returns:
        ``{"Matched Smart Content": [...]}`` where each list item is a dict
        with ``url`` and ``topic_name`` keys, in lookup order.
    """
    output = []
    for code_type in code_types:
        matched_content = df[df["component_name"] == code_type]
        for _, row in matched_content.iterrows():
            if keywords:
                best_match = process.extractOne(row["topic_name"], keywords)
                # Skip rows with no match at all or a score at/below the cutoff.
                if not best_match or best_match[1] <= threshold:
                    continue
            output.append({"url": row["url"], "topic_name": row["topic_name"]})
    return {"Matched Smart Content": output}

In [None]:
# Build one summary record per chapter, keyed by the leading "-"-separated
# token of the chapter key.
chapterwise = {}
for chapter in tqdm(data):
    chapter_data = data[chapter]

    # Non-empty code segments of the chapter, then flattened into snippets.
    code = [
        section["code_segment"]
        for section in chapter_data["content"]
        if section["code_segment"]
    ]
    code_snippets = [segment["execcode"] for segments in code for segment in segments]

    keywords = chapter_data["concepts"]
    matched, matched_key = [], []
    for code_snippet in code_snippets:
        # One parser call per snippet; its result feeds both lookups.
        code_types = get_parsed_text(code_snippet)
        matched.append(get_smart_content(code_types))
        matched_key.append(get_smart_content(code_types, keywords))

    # Drops the last five characters of the key (presumably a file extension
    # such as ".html" -- TODO confirm against the input data).
    stem = chapter[:-5]
    number = stem.split("-")[0]

    chapterwise[number] = {
        "Chapter Name": stem.replace("-", " "),
        # NOTE(review): the reason two entries of "all_topics" are excluded
        # is not visible from this notebook.
        "Number of Sections": len(chapter_data["all_topics"]) - 2,
        "Number of New Topics": len(chapter_data["concepts"]),
        "Number of Code Snippets": len(code_snippets),
        "Matched Content Based on Code Snippets In the Chapter": sum(
            len(result["Matched Smart Content"]) for result in matched
        ),
        "Matched Content (Without Keywords)": matched,
        "Matched Content Filtered By Keywords": sum(
            len(result["Matched Smart Content"]) for result in matched_key
        ),
        "Matched Content (With Keywords)": matched_key,
    }

In [None]:
def _chapter_sort_key(item):
    """Order chapter ids numerically when they are decimal strings, else as text.

    A plain string sort would put "10" before "2"; decimal ids are therefore
    compared as integers, and any other id falls back to text ordering after
    the numeric ones.
    """
    chapter_id = item[0]
    if chapter_id.isdecimal():
        return (0, int(chapter_id), chapter_id)
    return (1, 0, chapter_id)


# Keys holding the full match lists; too verbose for the printed summary.
detail_keys = ("Matched Content (With Keywords)", "Matched Content (Without Keywords)")

chapterwise = OrderedDict(sorted(chapterwise.items(), key=_chapter_sort_key))
for ele in chapterwise:
    print("--------------------------")
    for key, value in chapterwise[ele].items():
        if key not in detail_keys:
            print(f"{key} : {value}")
    print("--------------------------\n")

# The context manager flushes and closes the output file deterministically.
with Path("ChapterwiseSmartContent.json").open("w") as out_file:
    json.dump(chapterwise, out_file, indent=2)

In [None]:
# Build one summary record per section that contains code, keyed by
# "<chapter name>_<section index>".
sectionwise = {}
for chapter in tqdm(data):
    chapter_name = chapter[:-5].replace("-", " ")
    for number, element in enumerate(data[chapter]["content"]):
        # Sections without code segments produce no record.
        if not element["code_segment"]:
            continue

        concepts = data[chapter]["concepts"]

        # Use the single concept that best matches the section title as the
        # keyword filter. extractOne returns None when there is nothing to
        # choose from; the empty list then means "no keyword filtering" in
        # get_smart_content, mirroring the chapter-level behaviour.
        best_concept = process.extractOne(element["Topic"], concepts)
        keywords = [best_concept[0]] if best_concept else []
        print(keywords, "\t", element["Topic"])

        code = element["code_segment"]
        code_snippets = [ele["execcode"] for ele in code]

        matched_key = []
        matched = []
        for code_snippet in code_snippets:
            # One parser call per snippet; its result feeds both lookups.
            code_types = get_parsed_text(code_snippet)
            matched.append(get_smart_content(code_types))
            matched_key.append(get_smart_content(code_types, keywords))

        sectionwise[f"{chapter_name}_{number}"] = {
            "Chapter Name": chapter_name,
            "Section Name": element["Topic"],
            # Previously keyed as f"{chapter_name}_{number} of New Topics",
            # a search-and-replace artefact that gave every section a
            # different key for the same field.
            "Number of New Topics": len(keywords),
            "Matched Content Based on Code Snippets In the Chapter": sum(
                len(result["Matched Smart Content"]) for result in matched
            ),
            "Matched Content (Without Keywords)": matched,
            "Matched Content Filtered By Keywords": sum(
                len(result["Matched Smart Content"]) for result in matched_key
            ),
            "Matched Content (With Keywords)": matched_key,
        }

In [None]:
def _section_sort_key(item):
    """Sort "<chapter name>_<index>" keys by chapter, then by numeric index.

    A plain string sort would put "X_10" before "X_2". The trailing index is
    compared as an integer, and a leading decimal token in the chapter name
    is compared numerically as well. Keys that do not follow the pattern fall
    back to text ordering.
    """
    name, sep, index = item[0].rpartition("_")
    if sep and index.isdecimal():
        position = int(index)
    else:
        name, position = item[0], -1
    lead = name.split(" ", 1)[0]
    chapter_rank = (0, int(lead)) if lead.isdecimal() else (1, 0)
    return (chapter_rank, name, position)


# Keys holding the full match lists; too verbose for the printed summary.
detail_keys = ("Matched Content (With Keywords)", "Matched Content (Without Keywords)")

sectionwise = OrderedDict(sorted(sectionwise.items(), key=_section_sort_key))
for ele in sectionwise:
    print("--------------------------")
    for key, value in sectionwise[ele].items():
        if key not in detail_keys:
            print(f"{key} : {value}")
    print("--------------------------\n")

# The context manager flushes and closes the output file deterministically.
with Path("SectionwiseSmartContent.json").open("w") as out_file:
    json.dump(sectionwise, out_file, indent=2)