# Exploration

A small exploration of the datasets to learn more about the data and the structure of the challenge.

## Data

In [1]:
import polars as pl

_ = pl.Config.set_tbl_rows(20)
_ = pl.Config.set_fmt_str_lengths(100)

df_train = pl.read_csv("https://raw.githubusercontent.com/deepfunding/mini-contest/refs/heads/main/dataset.csv")
df_test = pl.read_csv("https://raw.githubusercontent.com/deepfunding/mini-contest/refs/heads/main/test.csv")

print(f"Train rows: {len(df_train)}")
print(f"Test rows: {len(df_test)}")


Train rows: 2387
Test rows: 1023


In [2]:
df_test.sample(5)

id,project_a,project_b
i64,str,str
8,"""https://github.com/prettier-solidity/prettier-plugin-solidity""","""https://github.com/bluealloy/revm"""
2972,"""https://github.com/formatjs/formatjs""","""https://github.com/emotion-js/emotion"""
3327,"""https://github.com/safe-global/safe-smart-account""","""https://github.com/ethers-io/ethers.js"""
3369,"""https://github.com/libp2p/go-libp2p""","""https://github.com/bluealloy/revm"""
2837,"""https://github.com/streetsidesoftware/cspell""","""https://github.com/google/flatbuffers"""


In [3]:
df_train = df_train.with_columns(
    pl.col("project_a").str.split("github.com/").list.last().alias("project_a"),
    pl.col("project_b").str.split("github.com/").list.last().alias("project_b"),
)

df_test = df_test.with_columns(
    pl.col("project_a").str.split("github.com/").list.last().alias("project_a"),
    pl.col("project_b").str.split("github.com/").list.last().alias("project_b"),
)

In [4]:
projects = pl.concat([
    df_train.get_column("project_a"),
    df_train.get_column("project_b"),
    df_test.get_column("project_a"),
    df_test.get_column("project_b")
]).unique().to_list()

print(f"Total projects: {len(projects)}")

Total projects: 117


In [5]:
from typing import Dict
import httpx
import os


def get_repository_info(repository_id: str, client: httpx.Client) -> Dict:
    """
    Fetch repository information from GitHub API for a given repo URL.

    Args:
        repo_url: GitHub repository URL
        client: httpx.Client instance to use for requests

    Returns:
        Dict containing repository information or empty dict if request fails
    """
    api_url = f"https://api.github.com/repos/{repository_id}"

    headers = {
        "Accept": "application/vnd.github+json",
        "Authorization": f"Bearer {os.getenv('GITHUB_TOKEN')}",
        "X-GitHub-Api-Version": "2022-11-28",
    }

    try:
        response = client.get(api_url, headers=headers)
        response.raise_for_status()
        return response.json()
    except httpx.HTTPError:
        print(f"Error fetching data for {repository_id}")
        print(response.text)
        return {}


In [6]:
from tqdm import tqdm

data = []

with httpx.Client(
    transport=httpx.HTTPTransport(retries=5, verify=False),
    follow_redirects=True,
    limits=httpx.Limits(max_keepalive_connections=5, max_connections=10),
) as client:
    for project_id in tqdm(projects, desc="Fetching project information"):
        info = get_repository_info(project_id, client)
        if info:
            data.append(info)

df_projects = pl.DataFrame(data)

Fetching project information: 100%|██████████| 117/117 [00:32<00:00,  3.59it/s]


Let's pick only a subset of the columns to start with.

In [7]:
df_projects_data = df_projects.select(
    pl.col("full_name").str.to_lowercase().alias("project_id"),
    pl.col("stargazers_count").alias("stars"),
    pl.col("watchers_count").alias("watchers"),
    pl.col("forks_count").alias("forks"),
    pl.col("open_issues_count").alias("open_issues"),
    pl.col("size"),
    pl.col("language"),
    pl.col("description"),
)

In [8]:
df_train_data = (df_train
    .join(
    df_projects_data, left_on="project_a", right_on="project_id", how="left")
    .join(df_projects_data, left_on="project_b", right_on="project_id", how="left", suffix="_b")
)

In [9]:
df_train_data = df_train_data.with_columns(
    (pl.col("stars") / (pl.col("stars") + pl.col("stars_b"))).alias("stars_ratio"),
    (pl.col("watchers") / (pl.col("watchers") + pl.col("watchers_b"))).alias("watchers_ratio"),
    (pl.col("forks") / (pl.col("forks") + pl.col("forks_b"))).alias("forks_ratio"),
    (pl.col("size") / (pl.col("size") + pl.col("size_b"))).alias("size_ratio")
)

We can see how well each ratio does in the "train" dataset.

In [10]:
from sklearn.metrics import mean_squared_error

print(f"Stars MSE: {mean_squared_error(df_train_data['weight_a'], df_train_data['stars_ratio'])}")
print(f"Watchers MSE: {mean_squared_error(df_train_data['weight_a'], df_train_data['watchers_ratio'])}")
print(f"Forks MSE: {mean_squared_error(df_train_data['weight_a'], df_train_data['forks_ratio'])}")
print(f"Size MSE: {mean_squared_error(df_train_data['weight_a'], df_train_data['size_ratio'])}")

Stars MSE: 0.19531818587976274
Watchers MSE: 0.19531818587976274
Forks MSE: 0.1873469186282249
Size MSE: 0.24181621061342762


Do the same for the test dataset.

In [11]:
df_test_data = df_test.join(df_projects_data, left_on="project_a", right_on="project_id", how="left")
df_test_data = df_test_data.join(df_projects_data, left_on="project_b", right_on="project_id", how="left", suffix="_b")
df_test_data = df_test_data.with_columns(
    (pl.col("stars") / (pl.col("stars") + pl.col("stars_b"))).alias("stars_ratio"),
    (pl.col("watchers") / (pl.col("watchers") + pl.col("watchers_b"))).alias("watchers_ratio"),
    (pl.col("forks") / (pl.col("forks") + pl.col("forks_b"))).alias("forks_ratio"),
    (pl.col("size") / (pl.col("size") + pl.col("size_b"))).alias("size_ratio"),
)

Generate a submission file.

In [13]:
weight_column = "stars_ratio"
y_pred = df_test_data.select(pl.col("id"), pl.col(weight_column).alias("pred"))

In [14]:
y_pred.write_csv('../data/submissions/baseline-stars-ratio-2024-12-25-10-28.csv')