In [None]:
import pandas as pd
import numpy as np
import json
import glob
import re
import ast
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
# Mount Google Drive inside the Colab runtime; every dataset path used below
# lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def load_dataset(folder_path):
    """Read every ``sample_*.json`` file in ``folder_path`` into a list.

    Files are visited in lexicographic path order and decoded as UTF-8.
    A one-line progress message with the file count is printed first.

    Args:
        folder_path: Directory that holds the ``sample_*.json`` files.

    Returns:
        A list with one parsed JSON document per file, in sorted path order
        (empty when no file matches).
    """
    sample_paths = glob.glob(f"{folder_path}/sample_*.json")
    sample_paths.sort()

    print(f"Loading {len(sample_paths)} files...")

    def _read_json(path):
        # One file handle at a time, closed as soon as it is parsed.
        with open(path, 'r', encoding='utf-8') as handle:
            return json.load(handle)

    return [_read_json(path) for path in sample_paths]

# Load every sample_*.json record from Drive and build one DataFrame row per
# record (columns come from the JSON keys).
data = load_dataset('/content/drive/MyDrive/code_classification_dataset')
df = pd.DataFrame(data)

In [None]:
def clean_text(text):
    """Normalise a LaTeX-flavoured problem statement into plain lowercase text.

    Steps, in order:
      1. strip ``$$$...$$$`` and ``$...$`` math delimiters, keeping the content;
      2. rewrite a handful of LaTeX commands into ASCII (dots, ``\\frac``,
         ``\\sum``, ``\\prod``, comparison operators, ``\\times``/``\\cdot``);
      3. unwrap ``\\cmd{arg}`` to ``arg`` and drop any remaining ``\\cmd``;
      4. replace every character outside ``a-zA-Z0-9``, whitespace and
         ``+ - * / = < > ! . ,`` with a space;
      5. collapse whitespace, strip, lowercase.

    Each command rule matches a whole command name only (it must not be
    followed by another letter). Without that guard ``\\leq`` became ``<=q``,
    ``\\geq`` became ``>=q``, ``\\left`` became ``<=ft`` and ``\\cdots`` became
    ``*s``. ``\\neq``/``\\ne`` map to ``!=``; the non-standard ``\\eq`` and
    ``\\equiv`` are no longer rewritten to ``!=`` and fall through to the
    generic command removal instead.

    Args:
        text: Raw text. Anything falsy or not a ``str`` yields ``""``.

    Returns:
        The cleaned, lowercased string (possibly empty).
    """
    if not text or not isinstance(text, str):
        return ""

    # Negative lookahead: the command name ends here, so a rule for a short
    # command never eats the prefix of a longer one.
    end = r'(?![a-zA-Z])'

    cleaned = text
    # Triple-dollar delimiters first; if the single-dollar rule runs first it
    # consumes the inner "$...$" and the triple rule can never match.
    cleaned = re.sub(r'\$\$\$([^\$]+?)\$\$\$', r' \1 ', cleaned)
    cleaned = re.sub(r'\$([^\$]+?)\$', r' \1 ', cleaned)
    # \dots, \ldots, \ddots and \cdots all become an ASCII ellipsis.
    cleaned = re.sub(r'\\[lcd]?dots' + end, '...', cleaned)
    cleaned = re.sub(r'\\frac\{([^}]+)\}\{([^}]+)\}', r'\1 divided by \2', cleaned)
    cleaned = re.sub(r'\\sum' + end, 'sum', cleaned)
    cleaned = re.sub(r'\\prod' + end, 'product', cleaned)
    # Longest alternative first so the full command name is consumed.
    cleaned = re.sub(r'\\(?:leq|le)' + end, '<=', cleaned)
    cleaned = re.sub(r'\\(?:geq|ge)' + end, '>=', cleaned)
    cleaned = re.sub(r'\\(?:neq|ne)' + end, '!=', cleaned)
    cleaned = re.sub(r'\\times' + end, '*', cleaned)
    cleaned = re.sub(r'\\cdot' + end, '*', cleaned)
    # \cmd{arg} -> arg, then drop whatever bare commands are left.
    cleaned = re.sub(r'\\[a-zA-Z]+\{([^}]+)\}', r'\1', cleaned)
    cleaned = re.sub(r'\\[a-zA-Z]+', '', cleaned)
    # Keep only alphanumerics, whitespace and a small set of operator/punctuation chars.
    cleaned = re.sub(r'[^a-zA-Z0-9\s\+\-\*\/\=\<\>\!\.\,]', ' ', cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    cleaned = cleaned.lower()

    return cleaned

In [None]:
# Normalise the problem statements: missing descriptions become "" before
# clean_text strips the LaTeX markup and lowercases the text.
df['description_clean'] = df['prob_desc_description'].fillna("").apply(clean_text)

In [None]:
# Run the same cleaner over the submitted source code.
# NOTE(review): clean_text replaces every character outside its allow-list with
# a space, so parentheses, brackets, colons and quotes are dropped from the
# code -- confirm this lossy normalisation is intended for the code feature.
df['code_clean'] = df['source_code'].fillna("").apply(clean_text)

In [None]:
# The eight target labels; any other tag on a problem is ignored when
# building tags_filtered below.
focus_tags = ['math', 'graphs', 'strings', 'number theory',
              'trees', 'geometry', 'games', 'probabilities']

def parse_tags(x):
    """Coerce a raw ``tags`` cell into something safe to iterate as tags.

    Args:
        x: A tag list, the string repr of one (e.g. ``"['math', 'dp']"``),
           or a missing value (``None`` / float NaN).

    Returns:
        * ``str`` input: the literal it encodes, as a list. A list/tuple/set
          literal is returned as a list, a bare non-empty string literal is
          wrapped in a one-element list, and anything unparsable or of
          another type gives ``[]``.
        * ``None`` or NaN: ``[]``, so callers can iterate the result without
          hitting ``TypeError`` on a float.
        * any other value (e.g. an actual list): returned unchanged.
    """
    if isinstance(x, str):
        try:
            parsed = ast.literal_eval(x)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # The error set literal_eval documents for malformed input.
            return []
        if isinstance(parsed, (list, tuple, set)):
            return list(parsed)
        if isinstance(parsed, str):
            # A lone quoted tag: wrap it so it is not iterated per character.
            return [parsed] if parsed else []
        return []
    # `x != x` is only true for NaN; avoids depending on numpy/pandas here.
    if x is None or (isinstance(x, float) and x != x):
        return []
    return x

# Parse the raw tag cells, then keep only the tags that belong to focus_tags.
df["tags_list"] = df["tags"].apply(parse_tags)
df["tags_filtered"] = df["tags_list"].apply(
    lambda tags: [t for t in tags if t in focus_tags]
)

# Keep only the problems that carry at least one focus tag and renumber rows.
df_focus = df[df["tags_filtered"].map(len) > 0].reset_index(drop=True)

In [None]:
# Reload df_focus from a CSV snapshot on Drive.
# NOTE(review): this replaces the df_focus built in the previous cell, and no
# visible cell writes df_focus.csv -- confirm where the snapshot comes from
# and whether it matches the pipeline above.
df_focus = pd.read_csv("/content/drive/MyDrive/code_classification_dataset/df_focus.csv")

In [None]:
# Display the reloaded frame as the cell output for a visual sanity check.
df_focus

Unnamed: 0,prob_desc_time_limit,prob_desc_sample_outputs,src_uid,prob_desc_notes,prob_desc_description,prob_desc_output_spec,prob_desc_input_spec,prob_desc_output_to,prob_desc_input_from,lang,...,prob_desc_sample_inputs,exec_outcome,source_code,prob_desc_created_at,tags,hidden_unit_tests,description_clean,code_clean,tags_list,tags_filtered
0,2 seconds,"[""2\n2 4\n3 3\n3 1""]",591372383cf3624f69793c41370022de,,"Numbers $$$1, 2, 3, \dots n$$$ (each integer f...","For each test case, in the first line, print t...",The first line contains one integer $$$t$$$ ($...,standard output,standard input,Python 3,...,"[""1\n4""]",PASSED,\ndef ii(): return int(input())\ndef mi(): ret...,1602407100,"['greedy', 'constructive algorithms', 'math', ...",,"numbers 1, 2, 3, ... n each integer from 1 to ...","def ii return int input def mi return map int,...","[constructive algorithms, data structures, gre...",['math']
1,3 seconds,"[""4\n10\n4\n0""]",afcd41492158e68095b01ff1e88c3dd4,"NoteIn the first test case of the example, the...","There are $$$n$$$ positive integers $$$a_1, a_...",For $$$t$$$ test cases print the answers in th...,The first line of the input contains one integ...,standard output,standard input,Python 3,...,"[""4\n6\n40 6 40 3 20 1\n1\n1024\n4\n2 4 8 16\n...",PASSED,a = int(input())\nfor i in range(a):\n f = ...,1576321500,"['number theory', 'greedy']",,"there are n positive integers a 1, a 2, ..., a...",a = int input for i in range a f = int input k...,"[greedy, number theory]",['number theory']
2,2 seconds,"[""5"", ""16"", ""18""]",e52ec2fa5bcf5d2027d57b0694b4e15a,NoteIn the first example it is possible to con...,You are given an undirected graph consisting o...,Print one integer — the minimum number of coin...,The first line contains two integers $$$n$$$ a...,standard output,standard input,Python 3,...,"[""3 2\n1 3 3\n2 3 5\n2 1 1"", ""4 0\n1 3 3 7"", ""...",PASSED,def read_nums():\n return [int(x) for x in ...,1545921300,"['dsu', 'greedy', 'graphs']",,you are given an undirected graph consisting o...,def read nums return int x for x in input .spl...,"[dsu, graphs, greedy]",['graphs']
3,1 second,"[""2\n5000 9\n1\n7 \n4\n800 70 6 9000 \n1\n1000...",cd2519f4a7888b2c292f05c64a9db13a,,A positive (strictly greater than zero) intege...,Print $$$t$$$ answers to the test cases. Each ...,The first line contains an integer $$$t$$$ ($$...,standard output,standard input,PyPy 3,...,"[""5\n5009\n7\n9876\n10000\n10""]",PASSED,t = int(input())\nfor i in range(t):\n canP...,1590154500,"['implementation', 'math']",,a positive strictly greater than zero integer ...,t = int input for i in range t canprintlength ...,"[implementation, math]",['math']
4,1 second,"[""1"", ""0""]",a34f2aa89fe0e78b495b20400d73acf1,NoteThe first test case corresponds to the tre...,You are given a tree with $$$n$$$ vertices. Yo...,Print a single integer — the minimum number o...,The first line contains an integer $$$n$$$ ($$...,standard output,standard input,PyPy 3,...,"[""6\n4 5\n2 6\n3 2\n1 2\n2 4"", ""4\n2 4\n4 1\n3...",PASSED,import sys\nfrom collections import defaultdic...,1593873900,"['graphs', 'constructive algorithms', 'graph m...",,you are given a tree with n vertices. you are ...,import sys from collections import defaultdict...,"[brute force, constructive algorithms, dfs and...","['graphs', 'trees']"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2673,2 seconds,"[""abac\nac\nbcdaf\nzzzzzz""]",ac77e2e6c86b5528b401debe9f68fc8e,NoteThe first test case is explained in the st...,Alice guesses the strings that Bob made for he...,Output $$$t$$$ answers to test cases. Each ans...,The first line contains a single positive inte...,standard output,standard input,Python 3,...,"[""4\nabbaac\nac\nbccddaaf\nzzzzzzzzzz""]",PASSED,x = int (input())\n\nz = [0 for i in range(x)]...,1592318100,"['implementation', 'strings']",,alice guesses the strings that bob made for he...,x = int input z = 0 for i in range x for y in ...,"[implementation, strings]",['strings']
2674,2 seconds,"[""YES"", ""NO""]",c659bdeda1c1da08cfc7f71367222332,Note First example: you can simply swap two l...,Some dwarves that are finishing the StUDY (Sta...,"Print ""YES"", if the dwarves belong to the same...",The first line contains the first dwarf's geno...,standard output,standard input,Python 3,...,"[""ab\nba"", ""aa\nab""]",PASSED,"p=input()\np=p.replace('',' ')\np=p.split()\nq...",1336145400,"['implementation', 'strings']",,some dwarves that are finishing the study stat...,"p=input p=p.replace , p=p.split q=input q=q.re...","[implementation, strings]",['strings']
2675,2 seconds,"[""2"", ""0"", ""3""]",c0c29565e465840103a4af884f951cda,NoteIn the first example two seconds are neede...,In the school computer room there are n server...,Print the minimum number of seconds required t...,The first line contains positive number n (1 ≤...,standard output,standard input,Python 2,...,"[""2\n1 6"", ""7\n10 11 10 11 10 11 11"", ""5\n1 2 ...",PASSED,"n = input()\nlst = map(float, raw_input().spli...",1450537200,"['implementation', 'math']",,in the school computer room there are n server...,"n = input lst = map float, raw input .split me...","[implementation, math]",['math']
2676,2 seconds,"[""2\n1\n-1\n0""]",942123e43a83d5a4cea95a6781064e28,"NoteIn the first test case, in the first move,...",You are given an array $$$a[0 \ldots n-1]$$$ o...,"For each test case, output a single integer — ...",The first line contains a single integer $$$t$...,standard output,standard input,Python 3,...,"[""4\n4\n3 2 7 6\n3\n3 2 6\n1\n7\n7\n4 9 2 1 18...",PASSED,t = int(input())\nfor i in range(t):\n n = in...,1592318100,"['greedy', 'math']",,you are given an array a 0 ... n-1 of length n...,t = int input for i in range t n = int input l...,"[greedy, math]",['math']


In [None]:
def parse_tags(x):
    """Turn a tag cell read back from CSV into a list of tag strings.

    Args:
        x: A list, the string repr of a list (e.g. ``"['math', 'dp']"``),
           a loosely formatted string such as ``"[math, dp]"``, or a
           missing value (``None`` / float NaN).

    Returns:
        A list of tags. Lists are returned unchanged. Missing values give
        ``[]`` rather than the bogus tags ``['nan']`` / ``['None']`` that the
        string fallback would produce. A list/tuple/set literal is returned
        as a list. Everything else is split on commas, with surrounding
        spaces, quotes and brackets stripped and empty pieces dropped.
    """
    if isinstance(x, list):
        return x
    # `x != x` is only true for NaN; missing cells must not be stringified.
    if x is None or (isinstance(x, float) and x != x):
        return []
    try:
        parsed = ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError):
        parsed = None
    if isinstance(parsed, (list, tuple, set)):
        return list(parsed)
    # Not a parsable collection literal (or a bare scalar/string literal):
    # fall back to comma splitting of the raw text.
    strip_chars = " '\"[]"
    pieces = (t.strip(strip_chars) for t in str(x).split(','))
    return [piece for piece in pieces if piece]

# Both tag columns come back from the CSV as strings, so parse each one into a
# real list before use. parse_tags returns lists unchanged, so this is also
# safe when the columns already hold lists.
df_focus['tags_filtered'] = df_focus['tags_filtered'].apply(parse_tags)
# Parse first, then de-duplicate and sort. Iterating the unparsed string would
# yield single characters instead of tags.
df_focus['tags_list'] = df_focus['tags_list'].apply(parse_tags).apply(
    lambda lst: sorted(set(t.strip() for t in lst if t.strip()))
)

# One binary indicator column per distinct tag found in tags_filtered.
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(df_focus['tags_filtered'])

print(Y.shape)
print(mlb.classes_)

In [None]:
# Text features for the classifier: the cleaned problem statement and the
# cleaned solution source.
feature_cols = ['description_clean', 'code_clean']
X = df_focus[feature_cols]

In [None]:
!pip install iterative-stratification

Collecting iterative-stratification
  Downloading iterative_stratification-0.1.9-py3-none-any.whl.metadata (1.3 kB)
Downloading iterative_stratification-0.1.9-py3-none-any.whl (8.5 kB)
Installing collected packages: iterative-stratification
Successfully installed iterative-stratification-0.1.9


In [None]:
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

# Single 80/20 shuffle split that stratifies on the multi-label matrix Y;
# the fixed random_state makes the split repeatable.
msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the first pair produced by split() is the only one.
train_idx, test_idx = next(msss.split(X, Y))

# NOTE(review): X_train / X_test are sliced from df_focus (all columns), not
# from X (feature columns only) -- confirm that is what downstream code expects.
X_train, X_test = df_focus.iloc[train_idx], df_focus.iloc[test_idx]
Y_train, Y_test = Y[train_idx], Y[test_idx]

In [None]:
# Independent copies of the train/test rows (all columns), selected with the
# indices produced by the stratified split cell above.
train_df = df_focus.iloc[train_idx].copy()
test_df = df_focus.iloc[test_idx].copy()

In [None]:
def label_proportions(Y):
    """Return, for each label column, the fraction of rows where it is set.

    Args:
        Y: 2-D indicator array exposing ``.sum(axis=0)`` and ``len()``;
           rows are samples, columns are labels.

    Returns:
        Column sums of ``Y`` divided by the number of rows.
    """
    n_samples = len(Y)
    positives_per_label = Y.sum(axis=0)
    return positives_per_label / n_samples

# Compare per-label frequencies across the full set and both splits; close
# values indicate the stratification preserved the label balance.
for _title, _labels in (
    ("Full proportions:", Y),
    ("Train proportions:", Y_train),
    ("Test proportions:", Y_test),
):
    print(_title, label_proportions(_labels))


Full proportions: [0.03920836 0.06198656 0.20238984 0.5257655  0.13069455 0.034354
 0.15758028 0.12098581]
Train proportions: [0.03912436 0.0619469  0.20214252 0.52445272 0.13041453 0.0344667
 0.15742897 0.12063344]
Test proportions: [0.03954802 0.06214689 0.20338983 0.53107345 0.13182674 0.03389831
 0.15819209 0.12241055]


In [None]:
# Persist both splits to Drive without the row index.
# NOTE(review): CSV stores list-valued columns (tags_list, tags_filtered) as
# text, so they presumably need re-parsing after reload -- verify in consumers.
train_df.to_csv("/content/drive/MyDrive/code_classification_dataset/train_df.csv", index=False)
test_df.to_csv("/content/drive/MyDrive/code_classification_dataset/test_df.csv", index=False)
