1: DATA PREPROCESSING

In [2]:
# ================================
# DELIVERABLE 1: DATA PREPROCESSING
# ================================

import pandas as pd
import numpy as np

# Load dataset
df = pd.read_json("problems.jsonl", lines=True)

# Fill missing text fields
text_cols = ["title", "description", "input_description", "output_description"]
for col in text_cols:
    df[col] = df[col].fillna("")

# Ensure valid labels
df["problem_score"] = pd.to_numeric(df["problem_score"], errors="coerce")
df = df.dropna(subset=["problem_class", "problem_score"])

# Save statistics for inverse scaling (regression)
score_mean = df["problem_score"].mean()
score_std = df["problem_score"].std()

# Normalize regression target
df["problem_score_norm"] = (df["problem_score"] - score_mean) / score_std

print("Dataset loaded & preprocessed")


Dataset loaded & preprocessed


2: FEATURE EXTRACTION

In [4]:
# ==================================
# DELIVERABLE 2: FEATURE EXTRACTION
# ==================================

from sklearn.base import BaseEstimator, TransformerMixin

def combine_text(df):
    sample_io = df["sample_io"].apply(
        lambda x: " ".join(map(str, x)) if isinstance(x, list) else str(x)
    ) if "sample_io" in df.columns else ""

    return (
        df["title"] + " " +
        df["description"] + " " +
        df["input_description"] + " " +
        df["output_description"] + " " +
        sample_io
    )

class TextExtractor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return combine_text(X)

class HandcraftedTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        text = combine_text(X).str.lower()
        features = pd.DataFrame(index=text.index)

        features["char_len"] = text.str.len()
        features["word_count"] = text.str.split().apply(len)
        features["digit_count"] = text.str.count(r"\d")
        features["math_symbols"] = text.str.count(r"[\+\-\*/=%]")
        features["line_count"] = text.str.count(r"\n")

        keywords = [
            "graph", "tree", "dp", "dynamic", "greedy",
            "dfs", "bfs", "binary", "search", "sort",
            "mod", "prime", "gcd", "lcm", "array",
            "matrix", "string", "bitmask"
        ]

        for kw in keywords:
            features[f"kw_{kw}"] = text.str.count(rf"\b{kw}\b")

        return features.fillna(0)

print("Feature extractors ready")


Feature extractors ready


3: TRAIN–TEST SPLIT

In [6]:
# ================================
# DELIVERABLE 3: TRAIN TEST SPLIT
# ================================

from sklearn.model_selection import train_test_split

X_train, X_test, y_class_train, y_class_test, y_reg_train, y_reg_test = train_test_split(
    df,
    df["problem_class"],
    df["problem_score_norm"],
    test_size=0.2,
    random_state=42,
    stratify=df["problem_class"]
)

print("Train-test split completed")


Train-test split completed


4: Classification Model

In [8]:
# =================================
# DELIVERABLE 4: CLASSIFICATION
# =================================

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix

tfidf_pipeline = Pipeline([
    ("text", TextExtractor()),
    ("tfidf", TfidfVectorizer(
        max_features=50000,
        ngram_range=(1, 3),
        stop_words="english",
        min_df=3,
        max_df=0.9
    )),
    ("svd", TruncatedSVD(n_components=300, random_state=42))
])

handcrafted_pipeline = Pipeline([
    ("hc", HandcraftedTransformer()),
    ("scale", StandardScaler())
])

features = FeatureUnion([
    ("tfidf", tfidf_pipeline),
    ("handcrafted", handcrafted_pipeline)
])

clf_pipeline = Pipeline([
    ("features", features),
    ("model", ExtraTreesClassifier(
        n_estimators=600,
        class_weight="balanced",
        random_state=42,
        n_jobs=-1
    ))
])

clf_pipeline.fit(X_train, y_class_train)
y_pred_class = clf_pipeline.predict(X_test)

print("Classification Accuracy:", accuracy_score(y_class_test, y_pred_class))
print("Confusion Matrix:\n", confusion_matrix(y_class_test, y_pred_class))


Classification Accuracy: 0.5042527339003645
Confusion Matrix:
 [[ 24 114  15]
 [  9 376   4]
 [ 13 253  15]]


In [9]:
# ============================================
#(IMPROVED): CLASSIFICATION MODEL
# Random Forest (Accuracy Boosted)
# ============================================

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# ---- Improved TF-IDF ----
tfidf_pipeline = Pipeline([
    ("text", TextExtractor()),
    ("tfidf", TfidfVectorizer(
        max_features=40000,
        ngram_range=(1, 2),     # Less noise than (1,3)
        min_df=5,
        max_df=0.85,
        stop_words="english",
        sublinear_tf=True
    ))
])

# ---- Handcrafted features ----
handcrafted_pipeline = Pipeline([
    ("hc", HandcraftedTransformer()),
    ("scale", StandardScaler())
])

# ---- Feature Union ----
features = FeatureUnion([
    ("tfidf", tfidf_pipeline),
    ("handcrafted", handcrafted_pipeline)
])

# ---- Random Forest Classifier ----
rf_clf = Pipeline([
    ("features", features),
    ("model", RandomForestClassifier(
    n_estimators=600,
    max_depth=35,
    min_samples_split=4,
    min_samples_leaf=2,
    max_features="sqrt",
    class_weight="balanced",   # ✅ FIX
    random_state=42,
    n_jobs=-1
)
    )
])

# ---- Train ----
rf_clf.fit(X_train, y_class_train)

# ---- Evaluate ----
y_pred = rf_clf.predict(X_test)

print("Improved Classification Accuracy:", accuracy_score(y_class_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_class_test, y_pred))
print("\nClassification Report:\n", classification_report(y_class_test, y_pred))


Improved Classification Accuracy: 0.5309842041312273

Confusion Matrix:
 [[ 81  45  27]
 [ 43 303  43]
 [ 51 177  53]]

Classification Report:
               precision    recall  f1-score   support

        easy       0.46      0.53      0.49       153
        hard       0.58      0.78      0.66       389
      medium       0.43      0.19      0.26       281

    accuracy                           0.53       823
   macro avg       0.49      0.50      0.47       823
weighted avg       0.51      0.53      0.49       823



5: REGRESSION MODEL

In [11]:
# ===============================
# DELIVERABLE 5: REGRESSION (FIXED)
# ===============================

from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.pipeline import Pipeline
import joblib
import math

# Train regressor on ORIGINAL scores (NO normalization)
reg_pipeline = Pipeline([
    ("features", features),
    ("model", ExtraTreesRegressor(
        n_estimators=800,
        random_state=42,
        n_jobs=-1
    ))
])

# FIT
reg_pipeline.fit(X_train, y_reg_train)

# PREDICT
y_pred_reg = reg_pipeline.predict(X_test)

# EVALUATE directly (NO inverse transform)
mae = mean_absolute_error(y_reg_test, y_pred_reg)
rmse = math.sqrt(mean_squared_error(y_reg_test, y_pred_reg))

print(f"Regression MAE: {mae:.3f}")
print(f"Regression RMSE: {rmse:.3f}")



Regression MAE: 0.757
Regression RMSE: 0.922


6: MODEL SAVING: MODEL SAVING

In [13]:
# ===============================
# DELIVERABLE 6: MODEL SAVING
# ===============================

import pickle

with open("autojudge_classifier.pkl", "wb") as f:
    pickle.dump(clf_pipeline, f)

with open("autojudge_regressor.pkl", "wb") as f:
    pickle.dump(reg_pipeline, f)

print("Models saved successfully")


Models saved successfully
