In [None]:
import os

# Root folder of the project. Note that this value ends with a path separator.
working_directory = '/home/jacky_macpro/workspaces/project_2/'
# 'data' sub-folder of the project root. os.path.join does not append a trailing
# separator, so file paths under it should be built with os.path.join rather than
# plain string concatenation.
data_directory = os.path.join(working_directory, 'data')

In [None]:
import pandas as pd
import numpy as np


In [None]:
# Load the three competition tables from the data folder.
# data_directory is built with os.path.join and therefore has no trailing separator;
# concatenating a file name onto it with "+" would yield ".../datatopics.csv".
# Join the components explicitly so the files are looked up inside the folder.
topics = pd.read_csv(os.path.join(data_directory, 'topics.csv'))
correlations = pd.read_csv(os.path.join(data_directory, 'correlations.csv'))
content = pd.read_csv(os.path.join(data_directory, 'content.csv'))

In [None]:
def explore_curriculum_data(topics_df, content):
    """
    Perform exploratory data analysis on curriculum recommendation datasets.

    Neither input frame is modified: the text-length statistics are computed on
    standalone Series instead of being written back as new columns, so calling
    the function repeatedly reports the same shapes and missing-value counts.

    Parameters:
    topics_df (pd.DataFrame): Topics dataset; the 'level' and 'language'
        columns are read.
    content (pd.DataFrame): Content dataset; the 'language', 'kind', 'title'
        and 'description' columns are read.

    Returns:
    dict: Dictionary containing analysis results with the keys
        'topics_shape', 'content_shape', 'topics_missing', 'content_missing',
        'topic_level_counts', 'topic_language_counts',
        'content_language_counts', 'content_kind_counts',
        'content_title_stats', 'content_desc_stats' and
        'topics_to_content_ratio'.
    """
    analysis = {}

    # Basic dataset info
    analysis['topics_shape'] = topics_df.shape
    analysis['content_shape'] = content.shape

    # Check missing values
    analysis['topics_missing'] = topics_df.isnull().sum()
    analysis['content_missing'] = content.isnull().sum()

    # Analyze topics dataset
    analysis['topic_level_counts'] = topics_df['level'].value_counts()
    analysis['topic_language_counts'] = topics_df['language'].value_counts()

    # Analyze content dataset
    analysis['content_language_counts'] = content['language'].value_counts()
    analysis['content_kind_counts'] = content['kind'].value_counts()

    # Calculate text length statistics on local Series (the caller's frame is left
    # untouched). Missing titles stay NaN and are ignored by describe(); missing
    # descriptions count as length 0. The Series are renamed so the resulting
    # statistics carry the same names as before.
    title_length = content['title'].str.len().rename('title_length')
    description_length = (
        content['description'].fillna('').str.len().rename('description_length')
    )

    analysis['content_title_stats'] = title_length.describe()
    analysis['content_desc_stats'] = description_length.describe()

    # Check relationships: topics per content item for each language. Languages
    # present in only one of the two frames produce NaN, reported as 0.
    topics_per_language = topics_df.groupby('language').size()
    content_per_language = content.groupby('language').size()
    analysis['topics_to_content_ratio'] = (topics_per_language / content_per_language).fillna(0)

    return analysis

# Execute the exploratory analysis on the loaded frames
results = explore_curriculum_data(topics, content)

# Report the headline numbers: overall dimensions first
print("Dataset Dimensions:")
print(f"Topics dataset: {results['topics_shape'][0]:,} rows × {results['topics_shape'][1]} columns")
print(f"Content dataset: {results['content_shape'][0]:,} rows × {results['content_shape'][1]} columns\n")

# Then each summary table, printed under its heading
report_sections = (
    ("Topic Levels Distribution:", results['topic_level_counts']),
    ("\nTop 5 Languages by Topic Count:", results['topic_language_counts'].head()),
    ("\nTop 5 Content Types:", results['content_kind_counts'].head()),
    ("\nContent Title Length Statistics:", results['content_title_stats'].round(2)),
)
for heading, table in report_sections:
    print(heading)
    print(table)

In [None]:
# Print the (rows, columns) shape of the topics frame, then display its first 10 rows.
print(topics.shape)
topics.head(10)

In [None]:
# Print the (rows, columns) shape of the content frame, then display its first 10 rows.
print(content.shape)
content.head(10)

In [None]:
# Print the (rows, columns) shape of the correlations frame, then display its first rows.
print(correlations.shape)
correlations.head()

In [None]:
# A topic is treated as a root when it has no parent or is recorded as its own
# parent. The second case is exactly the state this cell leaves behind (see the
# fill near the end), so masking it keeps the ancestor columns unchanged if the
# cell is executed more than once.
is_root = topics["parent"].isna() | (topics["parent"] == topics["id"])
parent_ids = topics["parent"].mask(is_root)

# creating mapping dictionaries for parent and title
# Missing titles are stored as "" so that looking up an ancestor whose title is
# absent yields an empty string rather than NaN (a dict.get default only covers
# absent keys, not keys whose value is NaN).
title_map = topics.set_index("id")["title"].fillna("").to_dict()
parent_map = dict(zip(topics["id"], parent_ids))

# adding parent title column ("" when there is no parent)
topics["parent_title"] = parent_ids.map(title_map).fillna("")
print(topics.shape)

# adding grandparent id and title
topics["grandpa"] = parent_ids.map(parent_map)
topics["grandpa_title"] = topics["grandpa"].map(title_map).fillna("")
print(topics.shape)

# adding great-grandparent id and title
topics["ggrandpa"] = topics["grandpa"].map(parent_map)
topics["ggrandpa_title"] = topics["ggrandpa"].map(title_map).fillna("")
print(topics.shape)

# fill in missing parent values: root topics point at themselves.
# This runs after the lookups above so roots do not become their own ancestors.
topics["parent"] = topics["parent"].fillna(topics["id"])

sub_df = pd.read_csv(os.path.join(working_directory, "sample_submission.csv"))

# flag the topics that appear in the sample submission
topics["sub"] = topics["id"].isin(sub_df["topic_id"])
topics.shape


In [None]:
def extract_number(x, is_subtopic=False):
    """
    Split a title of the form "<chapter>:<rest>" into its chapter number and title.

    Parameters:
    x (str): Title text; everything before the first ":" is the chapter prefix.
    is_subtopic (bool): When True, only the part of the prefix before its first
        "." is kept as the chapter, and the returned title is rebuilt with that
        shortened prefix.

    Returns:
    tuple: (chapter as int, title). The title is `x` itself unless
        `is_subtopic` is True, in which case it is "<chapter>:<rest>".

    Raises:
    ValueError: If `x` contains no ":" or the chapter prefix is not an integer.
    """
    # str.index raises ValueError when there is no colon at all
    colon_at = x.index(":")
    prefix, remainder = x[:colon_at], x[colon_at + 1:]

    if not is_subtopic:
        return int(prefix), x

    # Keep only the leading component of a dotted prefix such as "3.2"
    major = prefix.partition(".")[0]
    return int(major), f"{major}:{remainder}"

def extract_chapters(df, is_subtopic):
    """
    Add a "chapter" column parsed from the leading number of each title.

    Every title is passed to `extract_number`. Titles that cannot be parsed keep
    their text (missing titles become "") and get no chapter. Parsing is
    best-effort: malformed titles are skipped, while unrelated exceptions and
    interrupts propagate.

    Parameters:
    df (pd.DataFrame): Frame with a "title" column. It is modified in place.
    is_subtopic (bool): Forwarded to `extract_number`.

    Returns:
    pd.DataFrame: The same `df` object, with "title" replaced by the titles
        returned from `extract_number` and a float "chapter" column holding the
        chapter number, or NaN where no chapter was found. A parsed chapter of 0
        is also reported as NaN, since 0 has always been the "no chapter" marker.
    """
    titles = df["title"].fillna("").values

    # Float array with NaN as the "no chapter" value. This avoids casting the
    # column to int and then writing None into it, which relies on an implicit
    # upcast of the integer column.
    chapters = np.full(len(titles), np.nan)
    new_titles = np.array(titles, dtype=object)

    for i, title in enumerate(titles):
        try:
            chapter, new_title = extract_number(title, is_subtopic)
            chapters[i] = chapter
        except (ValueError, AttributeError, OverflowError):
            # ValueError: no ":" or a non-numeric prefix; AttributeError: the
            # title is not a string; OverflowError: number too large for a float.
            continue
        new_titles[i] = new_title

    # Keep the historical convention that chapter 0 means "no chapter"
    chapters[chapters == 0] = np.nan

    df["chapter"] = chapters
    df["title"] = new_titles
    return df



In [None]:

# Combined text for each content item: "title | kind | description", with missing
# parts replaced by "". Built before extract_chapters rewrites the content titles.
content["t"] = content["title"].fillna("") + " | " + content["kind"].fillna("") + " | " + content["description"].fillna("")

topics = extract_chapters(topics, False)
content = extract_chapters(content, True)

# Combined text for each topic: its title followed by the parent, grandparent and
# great-grandparent titles, then the description. Every part is filled with ""
# first, because string concatenation with a single NaN would turn the whole
# row's text into NaN.
topics["t"] = (
    topics["title"].fillna("")
    + " @ " + topics["parent_title"].fillna("")
    + " @ " + topics["grandpa_title"].fillna("")
    + " @ " + topics["ggrandpa_title"].fillna("")
    + " | " + topics["description"].fillna("")
)
