In [3]:
import polars as pl

from src.data import get_dataframes

_ = pl.Config.set_tbl_rows(20)
_ = pl.Config.set_fmt_str_lengths(100)

df_train, df_test = get_dataframes()


In [4]:
df_train = pl.concat(
    [
        df_train,
        df_train.select(
            "id",
            pl.col("project_b").alias("project_a"),
            pl.col("project_a").alias("project_b"),
            pl.col("weight_b").alias("weight_a"),
            pl.col("weight_a").alias("weight_b"),
        ),
    ]
)

In [5]:
from src.data import get_projects_info

projects = (
    pl.concat(
        [
            df_train.get_column("project_a"),
            df_train.get_column("project_b"),
            df_test.get_column("project_a"),
            df_test.get_column("project_b"),
        ]
    )
    .unique()
    .to_list()
)

df_projects = get_projects_info(projects)

In [6]:
from src.features import add_github_projects_data, extract_ratio_features

df_train_full = add_github_projects_data(df_train, df_projects)
df_train_full = extract_ratio_features(df_train_full)

df_test_full = add_github_projects_data(df_test, df_projects)
df_test_full = extract_ratio_features(df_test_full)

In [7]:
from src.features import (
    extract_activity_features,
    extract_temporal_features,
)

df_train_full = extract_temporal_features(df_train_full)
df_train_full = extract_activity_features(df_train_full)

df_test_full = extract_temporal_features(df_test_full)
df_test_full = extract_activity_features(df_test_full)


In [8]:
features = [
    "is_private",
    "has_homepage",
    "size",
    "stars",
    "watchers",
    "has_projects",
    "has_pages",
    "has_wiki",
    "has_discussions",
    "forks",
    "is_archived",
    "is_disabled",
    "open_issues",
    "subscribers_count",
    "is_private_b",
    "has_homepage_b",
    "size_b",
    "stars_b",
    "watchers_b",
    "has_projects_b",
    "has_pages_b",
    "has_wiki_b",
    "has_discussions_b",
    "forks_b",
    "is_archived_b",
    "is_disabled_b",
    "open_issues_b",
    "subscribers_count_b",
    "stars_ratio",
    "watchers_ratio",
    "forks_ratio",
    "size_ratio",
    # "subscribers_count_ratio",
    "stars_decay",
    "stars_decay_b",
    "forks_decay",
    "forks_decay_b",
    "age_days",
    "age_days_b",
    "days_since_update",
    "days_since_update_b",
    "log_stars",
    "log_stars_b",
    "log_watchers",
    "log_watchers_b",
    "log_forks",
    "log_forks_b",
]

X = df_train_full.select(features).to_numpy()

y = df_train_full.get_column("weight_a").to_numpy()


In [9]:
import lightgbm as lgb
import numpy as np
from sklearn.model_selection import KFold

lgb_train_data = lgb.Dataset(X, label=y)

# Define parameters
params = {
    "objective": "regression",
    "metric": "mse",
    "force_col_wise": True,
    "num_leaves": 100,
}

# Perform 5-fold cross validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []

for train_idx, val_idx in kf.split(X):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # Create training and validation datasets
    train_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_val, label=y_val)

    # Train model
    model = lgb.train(params, train_data, valid_sets=[val_data])

    # Make predictions and calculate MSE
    y_pred = model.predict(X_val)
    mse = np.mean((y_val - y_pred) ** 2)
    cv_scores.append(mse)

# Calculate mean and std of MSE scores
cv_scores = np.array(cv_scores)
mean_mse = cv_scores.mean()
std_mse = cv_scores.std()

print(f"Cross-validation MSE: {mean_mse:.4f} (+/- {std_mse:.4f})")

[LightGBM] [Info] Total Bins 3770
[LightGBM] [Info] Number of data points in the train set: 3819, number of used features: 42
[LightGBM] [Info] Start training from score 0.495267
[LightGBM] [Info] Total Bins 3770
[LightGBM] [Info] Number of data points in the train set: 3819, number of used features: 42
[LightGBM] [Info] Start training from score 0.503440
[LightGBM] [Info] Total Bins 3770
[LightGBM] [Info] Number of data points in the train set: 3819, number of used features: 42
[LightGBM] [Info] Start training from score 0.494855
[LightGBM] [Info] Total Bins 3770
[LightGBM] [Info] Number of data points in the train set: 3819, number of used features: 42
[LightGBM] [Info] Start training from score 0.505230
[LightGBM] [Info] Total Bins 3770
[LightGBM] [Info] Number of data points in the train set: 3820, number of used features: 42
[LightGBM] [Info] Start training from score 0.501208
Cross-validation MSE: 0.0190 (+/- 0.0009)


In [10]:
# Train model on the entire dataset
model = lgb.train(
    params,
    lgb_train_data,
)

[LightGBM] [Info] Total Bins 3770
[LightGBM] [Info] Number of data points in the train set: 4774, number of used features: 42
[LightGBM] [Info] Start training from score 0.500000


In [11]:
X_test = df_test_full.select(features).to_numpy()

lgb_test_data = lgb.Dataset(X_test)

test_predictions = model.predict(X_test)
test_predictions = pl.Series(test_predictions).round(6).clip(0)

In [12]:
importance = model.feature_importance()

feature_importance = pl.DataFrame({"feature": features, "importance": importance}).sort(
    "importance", descending=True
)

feature_importance.plot.bar(x="importance", y="feature")


In [14]:
import datetime

df_test.select(pl.col("id"), pl.Series(test_predictions).alias("pred")).write_csv(
    f"../data/submissions/submission_{datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}-mse_{mean_mse:.6f}.csv"
)
