In [1]:
#drive mounting for timestamping
from google.colab import drive
drive.mount('/content/drive')

working_directory = "/content/drive/MyDrive/Colab Notebooks/DAS 435 Jacky/project_2/"

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np

In [3]:
topics = pd.read_csv(working_directory + '/topics.csv')
correlations = pd.read_csv(working_directory + '/correlations.csv')
content = pd.read_csv(working_directory + '/content.csv')

In [4]:
submission = pd.read_csv(working_directory + '/sample_submission.csv')

In [5]:
def explore_curriculum_data(topics_df, content_df):
    """
    Perform exploratory data analysis on curriculum recommendation datasets

    Parameters:
    topics_df (pd.DataFrame): Topics dataset
    content_df (pd.DataFrame): Content dataset

    Returns:
    dict: Dictionary containing analysis results
    """
    analysis = {}

    # Basic dataset info
    analysis['topics_shape'] = topics_df.shape
    analysis['content_shape'] = content_df.shape

    # Check missing values
    analysis['topics_missing'] = topics_df.isnull().sum()
    analysis['content_missing'] = content_df.isnull().sum()

    # Analyze topics dataset
    analysis['topic_level_counts'] = topics_df['level'].value_counts()
    analysis['topic_language_counts'] = topics_df['language'].value_counts()

    # Analyze content dataset
    analysis['content_language_counts'] = content_df['language'].value_counts()
    analysis['content_kind_counts'] = content_df['kind'].value_counts()

    # Calculate text length statistics
    content_df['title_length'] = content_df['title'].str.len()
    content_df['description_length'] = content_df['description'].fillna('').str.len()

    analysis['content_title_stats'] = content_df['title_length'].describe()
    analysis['content_desc_stats'] = content_df['description_length'].describe()

    # Check relationships
    topics_per_language = topics_df.groupby('language').size()
    content_per_language = content_df.groupby('language').size()
    analysis['topics_to_content_ratio'] = (topics_per_language / content_per_language).fillna(0)

    return analysis

# Run the analysis
results = explore_curriculum_data(topics, content)

# Print key findings
print("Dataset Dimensions:")
print(f"Topics dataset: {results['topics_shape'][0]:,} rows × {results['topics_shape'][1]} columns")
print(f"Content dataset: {results['content_shape'][0]:,} rows × {results['content_shape'][1]} columns\n")

print("Topic Levels Distribution:")
print(results['topic_level_counts'])
print("\nTop 5 Languages by Topic Count:")
print(results['topic_language_counts'].head())
print("\nTop 5 Content Types:")
print(results['content_kind_counts'].head())
print("\nContent Title Length Statistics:")
print(results['content_title_stats'].round(2))

Dataset Dimensions:
Topics dataset: 76,972 rows × 9 columns
Content dataset: 154,047 rows × 8 columns

Topic Levels Distribution:
level
4     38810
3     14898
5      9744
6      6210
2      4874
1      1104
7      1028
0       171
8       119
9        12
10        2
Name: count, dtype: int64

Top 5 Languages by Topic Count:
language
en    36161
es    13910
pt     4177
ar     3701
fr     3701
Name: count, dtype: int64

Top 5 Content Types:
kind
video       61487
document    33873
html5       32563
exercise    25925
audio         199
Name: count, dtype: int64

Content Title Length Statistics:
count    154038.00
mean         33.83
std          18.11
min           1.00
25%          21.00
50%          31.00
75%          44.00
max         177.00
Name: title_length, dtype: float64


In [6]:
print(topics.shape)
topics.head(10)

(76972, 9)


Unnamed: 0,id,title,description,channel,category,level,language,parent,has_content
0,t_00004da3a1b2,Откриването на резисторите,"Изследване на материали, които предизвикват на...",000cf7,source,4,bg,t_16e29365b50d,True
1,t_000095e03056,Unit 3.3 Enlargements and Similarities,,b3f329,aligned,2,en,t_aa32fb6252dc,False
2,t_00068291e9a4,Entradas e saídas de uma função,Entenda um pouco mais sobre funções.,8e286a,source,4,pt,t_d14b6c2a2b70,True
3,t_00069b63a70a,Transcripts,,6e3ba4,source,3,en,t_4054df11a74e,True
4,t_0006d41a73a8,Графики на експоненциални функции (Алгебра 2 н...,Научи повече за графиките на сложните показате...,000cf7,source,4,bg,t_e2452e21d252,True
5,t_0008768bdee6,100 સુધીનો સરવાળો,37 અને 49 જેવી બે-અંકની સંખ્યાઓ ઉમેરતા શીખો.,5223e0,supplemental,4,gu,t_0da7a331d666,True
6,t_0008a1bd84ba,12. 20: Bird Reproduction,,ebc86c,supplemental,5,en,t_c44ac9711007,True
7,t_000c0c854f0b,Nepal,,f2e966,source,1,en,t_f9c4ef0d6290,False
8,t_000d1fb3f2f5,2.1.2 - Logarithms,,e77b55,aligned,5,en,t_b897d168db90,True
9,t_000feba42136,As vacinas,,8e286a,source,4,pt,t_dfc8ec591573,True


In [7]:
print(content.shape)
content.head(10)

(154047, 10)


Unnamed: 0,id,title,description,kind,text,language,copyright_holder,license,title_length,description_length
0,c_00002381196d,"Sumar números de varios dígitos: 48,029+233,930","Suma 48,029+233,930 mediante el algoritmo está...",video,,es,,,48.0,53
1,c_000087304a9e,Trovare i fattori di un numero,Sal trova i fattori di 120.\n\n,video,,it,,,30.0,29
2,c_0000ad142ddb,Sumar curvas de demanda,Cómo añadir curvas de demanda\n\n,video,,es,,,23.0,31
3,c_0000c03adc8d,Nado de aproximação,Neste vídeo você vai aprender o nado de aproxi...,document,\nNado de aproximação\nSaber nadar nas ondas ...,pt,Sikana Education,CC BY-NC-ND,19.0,118
4,c_00016694ea2a,geometry-m3-topic-a-overview.pdf,geometry-m3-topic-a-overview.pdf,document,Estándares Comunes del Estado de Nueva York\n\...,es,Engage NY,CC BY-NC-SA,32.0,32
5,c_00019840d110,5.12E: Regulation of the Calvin Cycle,,html5,LEARNING OBJECTIVES\n\nOutline the three major...,en,CSU and Merlot,CC BY-NC-SA,37.0,0
6,c_0001ec56e20f,Reflexionemos sobre lo que vemos y escuchamos,,document,Lección\n\n7\n\nReflexionemos sobre lo que\nve...,es,Publicado por el Lic. Edelberto Andino(edelber...,CC BY-NC-SA,45.0,0
7,c_00025aaa1533,अंग्रेजी ओके प्लीज 1.2,source_url=http://www.prathamopenschool.org/Co...,video,,mr,,,22.0,76
8,c_00027d03ca7d,4.E: Genomes and Chromosomes (Exercises),,html5,4.3\n\n(BPA) Answer the following questions wi...,en,CSU and Merlot,CC BY-NC-SA,40.0,0
9,c_000314eb850f,La banca 12: los bonos del tesoro (deuda pública),Introducción a la deuda y los fondos públicos....,video,,es,,,49.0,194


In [8]:
correlations.head()

Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,c_1108dd0c7a5d c_376c5a8eb028 c_5bc0e1e2cba0 c...
1,t_00068291e9a4,c_639ea2ef9c95 c_89ce9367be10 c_ac1672cdcd2c c...
2,t_00069b63a70a,c_11a1dc0bfb99
3,t_0006d41a73a8,c_0c6473c3480d c_1c57a1316568 c_5e375cf14c47 c...
4,t_0008768bdee6,c_34e1424229b4 c_7d1a964d66d5 c_aab93ee667f4


In [9]:
#print(submission.shape)
submission.head()

Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,c_1108dd0c7a5d c_376c5a8eb028 c_5bc0e1e2cba0 c...
1,t_00068291e9a4,c_639ea2ef9c95 c_89ce9367be10 c_ac1672cdcd2c c...
2,t_00069b63a70a,c_11a1dc0bfb99
3,t_0006d41a73a8,c_0c6473c3480d c_1c57a1316568 c_5e375cf14c47 c...
4,t_4054df11a74e,c_3695c5dc1df6 c_f2d184a98231


In [10]:
import os
import sys
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import AutoConfig, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

# Constant for defining sequence length
MAX_LEN = 128  # Adjust as required


class VecModel(nn.Module):
    """
    VecModel is a neural network wrapper for a pretrained transformer model,
    with optional batch normalization and linear projection layer for output vectors.

    Attributes:
    -----------
    backbone : torch.nn.Module
        The base transformer model loaded from Hugging Face's transformers library.
    has_top : bool
        Flag to determine if a batch normalization and linear projection layer should be applied.
    bn : torch.nn.BatchNorm1d (optional)
        Batch normalization layer for the output vector.
    top : torch.nn.Linear (optional)
        Linear layer to apply a projection to the output vector if `has_top` is True.
    """

    def __init__(self, model_name: str, size: int, has_top: bool = True):
        """
        Initializes the VecModel with a specified transformer model backbone,
        batch normalization, and projection layer.

        Parameters:
        -----------
        model_name : str
            Name of the pretrained model to be used as backbone.
        size : int
            Dimension of the output vector after the transformer model.
        has_top : bool, default=True
            Indicates if batch normalization and projection layer are included.
        """
        super(VecModel, self).__init__()
        config = AutoConfig.from_pretrained(model_name)
        self.backbone = AutoModel.from_config(config)
        self.has_top = has_top

        # Define batch normalization and projection layers if has_top is True
        if self.has_top:
            self.bn = nn.BatchNorm1d(size)
            self.top = nn.Linear(size, size)

    def forward(self, ids: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
        """
        Defines the forward pass through the model.

        Parameters:
        -----------
        ids : torch.Tensor
            Tensor containing token ids.
        mask : torch.Tensor
            Attention mask to indicate which tokens are padded.

        Returns:
        --------
        torch.Tensor
            Normalized output vector after optional batch normalization and projection.
        """
        # Pass input through the transformer model
        outputs = self.backbone(input_ids=ids, attention_mask=mask)[0]

        # Masking and mean pooling on the hidden states
        masked_output = (outputs[:, 1:MAX_LEN//2, :] * mask[:, 1:MAX_LEN//2, None])
        out = masked_output.mean(dim=1)

        # Apply batch normalization and projection if required
        if self.has_top:
            out = self.top(self.bn(out))

        return F.normalize(out, p=2, dim=1)

    def save(self, path: str):
        """
        Saves the model state to the specified file path.

        Parameters:
        -----------
        path : str
            File path where the model state should be saved.
        """
        torch.save(self.state_dict(), path)

    def load(self, path: str):
        """
        Loads the model state from the specified file path.

        Parameters:
        -----------
        path : str
            File path from which to load the model state.
        """
        self.load_state_dict(torch.load(path, map_location='cpu'))


In [11]:
#ignoring the topics without content
print('before dropping', topics.shape)
topics = topics[topics["has_content"]].reset_index(drop=True)
print('after dropping',topics.shape)

before dropping (76972, 9)
after dropping (61517, 9)


In [12]:
#creating mapping dictionaries for parent and title
title_map = topics.set_index("id")["title"].to_dict()
parent_map = topics.set_index("id")["parent"].to_dict()

#addign parent title columns
topics["parent_title"] = topics["parent"].apply(lambda x: title_map.get(x, ""))
print(topics.shape)

#addding grandpa title
topics["grandpa"] = topics["parent"].apply(lambda x: parent_map.get(x))
topics["grandpa_title"] = topics["grandpa"].apply(lambda x: title_map.get(x, ""))
print(topics.shape)

#adding great grandpa title
topics["ggrandpa"] = topics["grandpa"].apply(lambda x: parent_map.get(x))
topics["ggrandpa_title"] = topics["ggrandpa"].apply(lambda x: title_map.get(x, ""))
print(topics.shape)

#fill in missing parent values
topics["parent"] = topics["parent"].fillna(topics["id"])

topics["sub"] = topics["id"].isin(sub_df["topic_id"])
topics.shape


(61517, 10)
(61517, 12)
(61517, 14)


NameError: name 'sub_df' is not defined

In [None]:
topics['topic_description'] = topics['topic_description'] = topics.apply(
    lambda row: str(row['title']) + ' ' + row['description'] + ' ' + row['category']
    if pd.notna(row['description']) and row['description'] != np.nan else
    str(row['title']) + ' ' + row['category'], axis=1
)
topics = topics[['id', 'title', 'topic_description', 'parent', 'language', 'level']]
topics.head()

In [None]:
content['content_description'] = content.apply(
    lambda row: str(row['title']) + ' ' + str(row['description']) + ' ' + row['text'] + ' ' + row['kind']
    if pd.notna(row['text']) and row['text'] != np.nan else
    str(row['title']) + ' ' + str(row['description']) + ' ' + row['kind'], axis=1
)
content = content[['id', 'title', 'content_description', 'language']]
content.head()