In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
import re
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
import warnings

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

In [3]:
# ==========================================
# 1. CONFIGURATION & DATA LOADING
# ==========================================
# Relative locations of the input CSVs and of the submission file that the
# final cell writes.
TRAIN_PATH = "data/public/train.csv"
TEST_PATH = "data/public/test.csv"
SUBMISSION_PATH = "data/public/submission.csv"

# Leakage: Metrics accumulated AFTER publication
# load_and_filter drops every column listed here (when present) from both the
# train and the test frame.
LEAKAGE_COLS = [
    "fork_count",
    "views",
    "downloads",
    "comments_count",
    "notebook_usage",
    "medal",
    "is_featured",
    "is_trending",
    "engagement_rate",
    "virality_score",
    "quality_score",
]

# Irrelevant: All-null or ID metadata
# NOTE: "content_type" is listed here, yet load_and_filter filters rows on it
# before dropping these columns, so the row filter has to run first.
IRRELEVANT_COLS = [
    "usability_score",
    "file_format",
    "column_count",
    "row_count",
    "license_type",
    "content_type",
    "author_username",
]


def load_and_filter(path, is_train=True):
    """Read a CSV, keep only notebook rows, and strip disallowed columns.

    Parameters
    ----------
    path : str
        Location of the CSV file to read.
    is_train : bool, default True
        Accepted so train and test call sites look alike; this function does
        not consult it.

    Returns
    -------
    pandas.DataFrame or None
        The filtered frame, or ``None`` (after printing a message) when
        reading ``path`` raises ``FileNotFoundError``.
    """
    try:
        frame = pd.read_csv(path)
    except FileNotFoundError:
        print(f"File not found: {path}")
        return None

    # Restrict to notebooks only (as per problem description); the filter is
    # skipped when the file carries no content_type column.
    if "content_type" in frame.columns:
        notebook_rows = frame["content_type"] == "notebook"
        frame = frame[notebook_rows].copy()

    # Remove prohibited columns, ignoring any that this file does not have.
    prohibited = LEAKAGE_COLS + IRRELEVANT_COLS
    present = [name for name in prohibited if name in frame.columns]
    return frame.drop(columns=present)


# Load Data
print("Loading data...")
train_df = load_and_filter(TRAIN_PATH, is_train=True)
test_df = load_and_filter(TEST_PATH, is_train=False)

# load_and_filter returns None when a file is missing.  Raise instead of
# calling exit(): exit() ends a script with a *success* status (and stops the
# kernel when run interactively), which hides the failure from whoever runs
# the notebook.
if train_df is None or test_df is None:
    missing_inputs = [
        path
        for path, frame in ((TRAIN_PATH, train_df), (TEST_PATH, test_df))
        if frame is None
    ]
    raise FileNotFoundError(f"Required input file(s) not found: {missing_inputs}")

# Target Transformation: log(1 + y)
# This matches the evaluation metric: MAE of logs
y = np.log1p(train_df["upvotes"])
X = train_df.drop(columns=["upvotes"])
X_test = test_df.copy()

# Store IDs for submission (taken after the notebook-only row filter, so they
# line up row-for-row with X_test).
test_ids = X_test["content_id"]
X = X.drop(columns=["content_id"])
X_test = X_test.drop(columns=["content_id"])



Loading data...


# 2. FEATURE ENGINEERING

## A. Date Features

In [None]:
def process_dates(df):
    """Replace raw date columns with numeric calendar features.

    For each of ``created_date`` and ``last_updated`` that is present, the
    column is parsed day-first (unparseable values become missing) and is
    replaced by two features:

    * ``<col>_ts``  -- whole days elapsed since 1970-01-01
    * ``<col>_dow`` -- day of week as returned by ``Series.dt.dayofweek``

    Missing or unparseable dates produce NaN in both features, so downstream
    imputation can treat them as ordinary missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame.  It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame in which the date columns are swapped for the derived
        features (appended at the end, in processing order).
    """
    # Work on a copy so the caller's frame is never touched.
    df = df.copy()

    for col in ["created_date", "last_updated"]:
        if col not in df.columns:
            continue

        parsed = pd.to_datetime(df[col], dayfirst=True, errors="coerce")

        # Feature 1: Numerical Timestamp (days since the Unix epoch).
        # Timedelta arithmetic keeps NaT as NaN -- casting to int64 would turn
        # it into a huge negative sentinel -- and does not depend on the
        # storage resolution of the datetime values.
        epoch = pd.Timestamp(0, unit="s", tz=parsed.dt.tz)
        df[f"{col}_ts"] = (parsed - epoch).dt.days

        # Feature 2: Day of Week
        df[f"{col}_dow"] = parsed.dt.dayofweek

        df = df.drop(columns=[col])
    return df

# Swap the raw date columns for their derived numeric features in both the
# training and the test frame.
X = process_dates(X)
X_test = process_dates(X_test)

Processing text...
Processing tags...


## B. Interaction Terms

In [None]:
# Ratio of days_since_creation to (update_count + 1); the +1 keeps the
# denominator non-zero when update_count is 0.  The presence check looks at
# the training frame only.
if all(col in X.columns for col in ("days_since_creation", "update_count")):
    X["update_freq"] = X["days_since_creation"].div(X["update_count"].add(1))
    X_test["update_freq"] = X_test["days_since_creation"].div(X_test["update_count"].add(1))

## C. Text Features

In [None]:
# TF-IDF over titles: the vocabulary (capped at 100 terms, English stop words
# removed) is learned from the training titles only and then reused for test.
tfidf = TfidfVectorizer(stop_words="english", max_features=100)
title_train = tfidf.fit_transform(X["title"].fillna(""))
title_test = tfidf.transform(X_test["title"].fillna(""))

# Positional column names, one per vocabulary term.
title_cols = [f"title_{i}" for i in range(title_train.shape[1])]
title_train_df, title_test_df = (
    pd.DataFrame(matrix.toarray(), columns=title_cols, index=frame.index)
    for matrix, frame in ((title_train, X), (title_test, X_test))
)

# The raw text is no longer needed once it has been vectorised.
X = X.drop(columns="title")
X_test = X_test.drop(columns="title")


## D. Multi-Label Features

In [None]:
def process_multilabel(train, test, col_name, top_n=30):
    """Binarise a pipe-delimited multi-label column.

    Each cell of ``col_name`` is split on ``|`` into labels.  Labels are
    whitespace-stripped and empty tokens are discarded, so ``"a| b|"`` yields
    ``["a", "b"]`` rather than ``["a", " b", ""]``.  Missing cells yield no
    labels.  The label vocabulary is learned from ``train`` only.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that both contain ``col_name``.  Neither is modified.
    col_name : str
        Name of the pipe-delimited column.
    top_n : int, default 30
        When the training vocabulary has more than ``top_n`` labels, only the
        ``top_n`` labels that occur most often in ``train`` are kept.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Indicator frames for train and test with identical columns named
        ``f"{col_name}_{label}"`` and indexed like their inputs.
    """

    def _split_labels(series):
        # One list of cleaned labels per row.
        return (
            series.fillna("")
            .astype(str)
            .apply(lambda cell: [tok.strip() for tok in cell.split("|") if tok.strip()])
        )

    # 1. Split strings by pipe '|'
    train_split = _split_labels(train[col_name])
    test_split = _split_labels(test[col_name])

    # 2. Binarize (vocabulary comes from the training rows only)
    mlb = MultiLabelBinarizer(sparse_output=False)
    mlb.fit(train_split)
    encoded_cols = [f"{col_name}_{c}" for c in mlb.classes_]

    train_enc = pd.DataFrame(mlb.transform(train_split), columns=encoded_cols, index=train.index)
    test_enc = pd.DataFrame(mlb.transform(test_split), columns=encoded_cols, index=test.index)

    # 3. Keep only Top N most frequent to reduce noise/memory
    if train_enc.shape[1] > top_n:
        top_cols = train_enc.sum().sort_values(ascending=False).head(top_n).index
        return train_enc[top_cols], test_enc[top_cols]

    return train_enc, test_enc


print("Processing tags...")

# Encode both pipe-delimited columns (libraries keep their 50 most frequent
# labels, topics their 30 most frequent) ...
libs_train, libs_test = process_multilabel(X, X_test, "libraries_used", top_n=50)
topics_train, topics_test = process_multilabel(X, X_test, "all_topics", top_n=30)

# ... then retire the raw string columns from both frames.
X = X.drop(columns=["libraries_used", "all_topics"])
X_test = X_test.drop(columns=["libraries_used", "all_topics"])

## E. Categorical Encoding

In [None]:
# Ordinal encoding of the author tier: position in this list is the rank.
# Values that are not in the mapping become NaN.
tier_mapping = {
    tier: rank
    for rank, tier in enumerate(("Novice", "Contributor", "Expert", "Master", "Grandmaster"))
}
X["author_tier"] = X["author_tier"].map(tier_mapping)
X_test["author_tier"] = X_test["author_tier"].map(tier_mapping)

# One-hot encoding of the nominal columns.  Missing values get their own
# "Missing" category; categories unseen during fit encode as all zeros.
cat_cols = ["programming_language", "primary_topic"]
X[cat_cols] = X[cat_cols].fillna("Missing")
X_test[cat_cols] = X_test[cat_cols].fillna("Missing")

ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
ohe.fit(X[cat_cols])

ohe_train = pd.DataFrame(
    ohe.transform(X[cat_cols]),
    index=X.index,
    columns=ohe.get_feature_names_out(),
)
ohe_test = pd.DataFrame(
    ohe.transform(X_test[cat_cols]),
    index=X_test.index,
    columns=ohe.get_feature_names_out(),
)

# The source columns are replaced by their encoded counterparts.
X = X.drop(columns=cat_cols)
X_test = X_test.drop(columns=cat_cols)

## F. Numerical Imputation

In [7]:
# Median imputation fitted on the training rows only.
# keep_empty_features=True: without it SimpleImputer silently drops any column
# that is entirely NaN in train, so the output would have fewer columns than
# X.columns and the DataFrame construction below would fail.  Such columns are
# kept and filled with 0 instead.
imputer = SimpleImputer(strategy="median", keep_empty_features=True)
X_num = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)
X_test_num = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns, index=X_test.index)

# Assemble the final design matrices: imputed numerics followed by the title,
# library, topic and one-hot blocks (all share the row index of X / X_test).
X_final = pd.concat([X_num, title_train_df, libs_train, topics_train, ohe_train], axis=1)
X_test_final = pd.concat([X_test_num, title_test_df, libs_test, topics_test, ohe_test], axis=1)

# The model is fitted on X_final and applied to X_test_final, so both must
# expose the same features in the same order.
assert list(X_final.columns) == list(X_test_final.columns), "train/test feature mismatch"

# 3. MODEL TRAINING

In [None]:
# Base learner: histogram-based gradient boosting optimising absolute error.
# The target y is log1p(upvotes), so the loss is an MAE in log space.
xgb_reg = xgb.XGBRegressor(
    tree_method="hist",
    objective="reg:absoluteerror",
    n_jobs=-1,
    random_state=42,
)

# Discrete grid from which the random search samples its candidates.
param_dist = dict(
    n_estimators=[100, 300, 500],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 5, 7],
    subsample=[0.7, 0.8, 1.0],
    colsample_bytree=[0.7, 0.8, 1.0],
)

# 10 sampled candidates, each scored with 3-fold cross-validation.
search = RandomizedSearchCV(
    xgb_reg,
    param_dist,
    n_iter=10,
    cv=3,
    scoring="neg_mean_absolute_error",
    random_state=42,
    verbose=1,
)

print("Starting hyperparameter tuning...")
search.fit(X_final, y)

# The scorer is the *negated* MAE, so flip the sign to report an error value.
print(f"Best Log-MAE: {-search.best_score_:.4f}")
print(f"Best Params: {search.best_params_}")

# Estimator from the best-scoring candidate, used for the final predictions.
best_model = search.best_estimator_


Starting hyperparameter tuning...
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best Log-MAE: 0.9701
Best Params: {'subsample': 1.0, 'n_estimators': 300, 'max_depth': 3, 'learning_rate': 0.05, 'colsample_bytree': 0.8}


# 4. PREDICTION & SUBMISSION

In [None]:
# The model was trained on log1p(upvotes): invert with expm1 and clip at zero
# so that no negative upvote count is written out.
log_preds = best_model.predict(X_test_final)
preds = np.maximum(np.expm1(log_preds), 0)

# One row per retained test item, keyed by the ids captured before they were
# dropped from the feature frame.
submission = pd.DataFrame(
    {
        "content_id": test_ids,
        "upvotes": preds,
    }
)
submission.to_csv(SUBMISSION_PATH, index=False)

print(f"Submission saved to {SUBMISSION_PATH}")

Submission saved to data/public/submission.csv
