In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Notebook-wide plotting defaults: a 6x4 default figure size and seaborn's
# "whitegrid" style. Cells that need another size pass `figsize` explicitly.
plt.rcParams.update({"figure.figsize": (6, 4)})
sns.set(style="whitegrid")


In [None]:
# Raw-data directories for tasks A, B and C. The paths are relative, so they
# resolve against the directory the notebook is run from.
BASE_A, BASE_B, BASE_C = map(
    Path,
    ("../task_a/data/raw", "../task_b/data/raw", "../task_c/data/raw"),
)

def load_splits(base):
    """Load the train, validation and test splits stored under ``base``.

    Parameters
    ----------
    base : str or os.PathLike
        Directory that contains ``train.parquet``, ``validation.parquet`` and
        ``test.parquet``. A plain string is accepted as well as a ``Path``.

    Returns
    -------
    tuple
        ``(train, val, test)``: the results of ``pd.read_parquet`` for the
        three files, in that order.
    """
    # Normalise first so a plain-string directory works with the `/` joins below.
    base = Path(base)

    train = pd.read_parquet(base / "train.parquet")
    val = pd.read_parquet(base / "validation.parquet")
    test = pd.read_parquet(base / "test.parquet")
    return train, val, test


In [None]:
# Read the train / validation / test frames for each of the three tasks.
a_train, a_val, a_test = load_splits(BASE_A)
b_train, b_val, b_test = load_splits(BASE_B)
c_train, c_val, c_test = load_splits(BASE_C)

# Print one line per task with the shape of every split as a quick sanity check.
for task_header, task_splits in (
    ("Task A train:", (a_train, a_val, a_test)),
    ("Task B train:", (b_train, b_val, b_test)),
    ("Task C train:", (c_train, c_val, c_test)),
):
    train_shape, val_shape, test_shape = (split.shape for split in task_splits)
    print(task_header, train_shape, "val:", val_shape, "test:", test_shape)


In [None]:
# One bar chart per task showing how many training rows carry each label value.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

label_panels = [
    ("Task A – Train label distribution", a_train),
    ("Task B – Train label distribution", b_train),
    ("Task C – Train label distribution", c_train),
]

for ax, (panel_title, train_df) in zip(axes, label_panels):
    # sort_index() orders the bars by label value rather than by frequency.
    label_counts = train_df["label"].value_counts().sort_index()
    label_counts.plot(kind="bar", ax=ax)
    ax.set_title(panel_title)
    ax.set_xlabel("label")
    # The subplots do not share a y-axis, so each panel has its own scale and
    # gets its own y-label instead of only the left-most one.
    ax.set_ylabel("count")

plt.tight_layout()
plt.show()


In [None]:
def add_length_cols(df):
    """Return a copy of ``df`` with two size columns derived from ``code``.

    The input frame is not modified. Values in ``code`` are coerced with
    ``astype(str)`` before measuring, so a non-string entry is measured by its
    string representation.

    Added columns
    -------------
    n_chars : number of characters in the code string.
    n_lines : number of newline characters plus one (an empty string therefore
        counts as one line).
    """
    source_text = df["code"].astype(str)
    return df.assign(
        n_chars=source_text.str.len(),
        n_lines=source_text.str.count("\n") + 1,
    )

# Length-annotated copies of each task's training split.
a_len = add_length_cols(a_train)
b_len = add_length_cols(b_train)
c_len = add_length_cols(c_train)

# Summary statistics of both length columns, one table per task.
for name, df_ in zip(("Task A", "Task B", "Task C"), (a_len, b_len, c_len)):
    print(f"\n{name}")
    length_stats = df_[["n_chars", "n_lines"]].describe()
    print(length_stats.round(1))


In [None]:
# 3x2 grid: one row per task, character-length histogram on the left and
# line-count histogram on the right.
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(10, 10))

for row, (name, df_) in enumerate(
    [("Task A", a_len), ("Task B", b_len), ("Task C", c_len)]
):
    chars_ax, lines_ax = axes[row]

    sns.histplot(df_["n_chars"], bins=50, ax=chars_ax)
    chars_ax.set_title(f"{name} – char length")

    sns.histplot(df_["n_lines"], bins=50, ax=lines_ax)
    lines_ax.set_title(f"{name} – line count")

plt.tight_layout()
plt.show()
