The first two notebook code cells were run in Google Colab after uploading my cleaned dataset to Google Drive, in order to split the data into train, validation, and test sets. The rest of the code was run locally in VS Code and then uploaded.

In [None]:
import os
import pandas as pd
from git import Repo
from pathlib import Path
import ast
from tqdm import tqdm

# === PATH SETUP ===
# Every artifact of the scraping run lives directly under `save_dir`.
save_dir = "/Users/paulmitchell/Downloads/Gen_AI_Homework_2"
csv_path, clone_dir, processed_log, output_csv, index_file = (
    os.path.join(save_dir, leaf)
    for leaf in (
        "results.csv",              # input: list of repositories
        "cloned_repos",             # directory holding the local clones
        "processed_repos.txt",      # one processed repo URL per line
        "raw_methods_dataset.csv",  # output: extracted samples
        "last_repo_index.txt",      # resume checkpoint
    )
)
os.makedirs(clone_dir, exist_ok=True)

# === LOAD REPOS FROM CSV ===
# The "name" column is turned into GitHub URLs; rows without a name are dropped.
df = pd.read_csv(csv_path)
repo_names = df["name"].dropna().tolist()
repo_urls = [f"https://github.com/{repo}" for repo in repo_names]
print(f"📦 Loaded {len(repo_urls)} total repositories to process.")

📦 Loaded 7677 total repositories to process.


In [None]:
# Manually overwrite the resume checkpoint: first line is the repo index,
# second line is the number of methods collected so far.
index_file = "/Users/paulmitchell/Downloads/Gen_AI_Homework_2/last_repo_index.txt"
repo_index, method_count = 4268, 108324

with open(index_file, mode="w") as checkpoint:
    checkpoint.write(f"{repo_index}\n{method_count}")

print("✅ Updated resume checkpoint.")

✅ Updated resume checkpoint.


In [None]:
import os
import pandas as pd
import ast
import subprocess
from pathlib import Path
from tqdm import tqdm

# === PARAMETERS ===
# Caps: overall sample count, functions taken per file, samples taken per repo.
max_total, max_per_file, max_ifs_per_repo = 120_000, 5, 100

index_file = os.path.join(save_dir, "last_repo_index.txt")
output_csv = os.path.join(save_dir, "raw_methods_dataset.csv")
processed_log = os.path.join(save_dir, "processed_repos.txt")
csv_path = output_csv

# === AUTO-VALIDATE resume state ===
# Compare the row count of the dataset CSV with the count stored on the second
# line of the index file; this only reports, it never corrects anything.
if os.path.exists(csv_path) and os.path.exists(index_file):
    df = pd.read_csv(csv_path)
    with open(index_file, "r") as f:
        lines = f.read().splitlines()
    saved_count = 0 if len(lines) <= 1 else int(lines[1])
    actual_count = len(df)
    if actual_count == saved_count:
        print(f"✅ Verified: {actual_count} methods match the index file.")
    else:
        print(f"⚠️ Mismatch: index file says {saved_count}, but CSV has {actual_count} rows.")

# === LOAD DATASET IF EXISTS ===
# Previously saved samples are reloaded as a list of row dicts.
if not os.path.exists(output_csv):
    dataset = []
else:
    print("📂 Resuming from existing dataset...")
    df_existing = pd.read_csv(output_csv)
    dataset = df_existing.to_dict(orient="records")
    print(f"🔁 Loaded {len(dataset)} existing samples.")

# === LOAD PROCESSED REPOS ===
# One repo URL per line; used by the main loop to skip finished repos.
if not os.path.exists(processed_log):
    processed = set()
else:
    with open(processed_log, "r") as f:
        processed = set(f.read().splitlines())

# === INITIAL VALUES ===
start_index = 0
current_total = 0

# === LOAD INDEX AND METHOD COUNT IF EXISTS ===
# Line 1 of the index file is the next repo index, line 2 the running total.
if os.path.exists(index_file):
    with open(index_file, "r") as f:
        lines = f.read().splitlines()
    if lines:
        start_index = int(lines[0])
    if len(lines) > 1:
        current_total = int(lines[1])
    print(f"⏩ Resuming from repo #{start_index + 1} with {current_total} methods")

# === SAVE INDEX HELPER ===
def save_progress(index_file_path, repo_index, method_count):
    """Overwrite the checkpoint file with the repo index and the method count.

    The file ends up with two lines: ``repo_index`` on the first and
    ``method_count`` on the second (no trailing newline).
    """
    Path(index_file_path).write_text(f"{repo_index}\n{method_count}")

# === CLONE REPO WITH TIMEOUT ===
def clone_repo(repo_url, dest_dir=clone_dir, timeout=60):
    """Shallow-clone ``repo_url`` under ``dest_dir`` and return the checkout path.

    Args:
        repo_url: Repository URL without the ``.git`` suffix, e.g.
            ``https://github.com/owner/name``.
        dest_dir: Directory that holds one sub-directory per cloned repo.
        timeout: Seconds to wait for ``git clone`` before giving up.

    Returns:
        The path of the local checkout (an existing directory is reused
        without cloning again), or ``None`` if the clone timed out or
        ``git`` exited with a non-zero status.
    """
    import shutil  # local import: only needed to clean up failed clones

    url_parts = repo_url.strip("/").split("/")
    repo_name = url_parts[-1]
    # Qualify the directory with the owner so that two repos sharing a name
    # (owner_a/utils vs owner_b/utils) do not resolve to the same checkout.
    if len(url_parts) >= 2 and url_parts[-2]:
        dir_name = f"{url_parts[-2]}__{repo_name}"
    else:
        dir_name = repo_name
    target_path = os.path.join(dest_dir, dir_name)
    if os.path.exists(target_path):
        return target_path
    try:
        subprocess.run(
            # Only the working tree is read afterwards, so history is not needed.
            ["git", "clone", "--depth", "1", repo_url + ".git", target_path],
            check=True,
            timeout=timeout,
            stdin=subprocess.DEVNULL,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            # Fail immediately instead of waiting on a credential prompt when
            # the repository has been deleted or made private.
            env={**os.environ, "GIT_TERMINAL_PROMPT": "0"},
        )
    except subprocess.TimeoutExpired:
        print(f"⏱️ Timed out cloning {repo_url}")
        # Remove the partial checkout; otherwise the exists() shortcut above
        # would hand it back as a complete clone on the next call.
        shutil.rmtree(target_path, ignore_errors=True)
        return None
    except subprocess.CalledProcessError:
        print(f"❌ Failed to clone {repo_url}")
        shutil.rmtree(target_path, ignore_errors=True)
        return None
    return target_path

# === EXTRACT METHODS WITH `if` ===
def extract_if_methods_from_file(file_path, max_per_file=5):
    """Return function nodes from ``file_path`` whose body contains an ``if``.

    The file is read as UTF-8 (undecodable bytes are ignored) and parsed with
    ``ast``. Any error while reading or parsing yields an empty list. Only
    ``ast.FunctionDef`` nodes are considered; the traversal does not descend
    into a function once it has been reached, so nested functions are only
    seen as part of their enclosing function. At most ``max_per_file`` nodes
    are returned, in source order.
    """
    try:
        with open(file_path, "r", encoding="utf-8", errors="ignore") as handle:
            tree = ast.parse(handle.read())
    except Exception:
        return []

    def contains_if(func_node):
        # True when any node inside the function (itself included) is an `if`.
        for inner in ast.walk(func_node):
            if isinstance(inner, ast.If):
                return True
        return False

    selected = []
    pending = [tree]
    while pending:
        current = pending.pop()
        if isinstance(current, ast.FunctionDef):
            if contains_if(current):
                selected.append(current)
            continue  # never look inside a function for further functions
        # Push children reversed so they are popped in their original order.
        pending.extend(reversed(list(ast.iter_child_nodes(current))))

    return selected[:max_per_file]

# === MASK FIRST `if` LINE ===
def mask_first_if(node: ast.FunctionDef):
    """Mask the first ``if`` header of a function and return its condition.

    The function is rendered with ``ast.unparse``; the first line that starts
    a real ``if`` statement is replaced by the literal line ``<mask>:`` (the
    original indentation of that line is not kept).

    Args:
        node: Function definition to render and mask.

    Returns:
        ``(masked_source, condition)`` where ``condition`` is the text between
        ``if`` and the trailing colon, or ``(None, None)`` when the node cannot
        be rendered or no ``if`` header line is found.
    """
    try:
        source = ast.unparse(node)
    except Exception:
        # Best effort: a node that cannot be rendered is skipped by the caller.
        return None, None

    lines = source.splitlines()

    # A purely textual scan also matches text that merely looks like an `if`
    # header, e.g. a line inside a multi-line docstring. Re-parse the rendered
    # source and only accept lines on which an actual `if` statement starts.
    try:
        if_rows = {
            inner.lineno - 1
            for inner in ast.walk(ast.parse(source))
            if isinstance(inner, ast.If)
        }
    except Exception:
        if_rows = None  # rendered source did not re-parse: scan every line

    for i, line in enumerate(lines):
        if if_rows is not None and i not in if_rows:
            continue
        stripped = line.strip()
        # `elif` headers are also ast.If nodes; startswith("if ") rejects them.
        if stripped.startswith("if ") and stripped.endswith(":"):
            condition = stripped[3:-1].strip()
            lines[i] = "<mask>:"
            return "\n".join(lines), condition
    return None, None

# === MAIN LOOP ===
# For every not-yet-processed repo: clone it, pull functions that contain an
# `if` out of its .py files, mask the first `if`, and append the samples to
# `dataset`. Progress (processed log, checkpoint, CSV) is saved after each repo.
#
# The loop iterates over the tqdm wrapper itself so the bar advances with each
# repo; a bar that is only constructed and never iterated stays frozen at
# `initial`.
repo_bar = tqdm(repo_urls[start_index:], initial=start_index, total=len(repo_urls), dynamic_ncols=True)

for repo_index, repo_url in enumerate(repo_bar, start=start_index):
    # When resuming a run that already reached the cap, do not clone anything.
    if current_total >= max_total:
        break

    if repo_url in processed:
        continue

    repo_bar.set_description(f"🧪 Repo {repo_index + 1}/{len(repo_urls)} | {current_total}/120k methods")

    # Repos that fail to clone or contain no Python files are skipped without
    # being logged as processed.
    repo_path = clone_repo(repo_url)
    if not repo_path:
        continue

    py_files = list(Path(repo_path).rglob("*.py"))
    if not py_files:
        continue

    repo_ifs_collected = 0
    for file_path in py_files:
        methods = extract_if_methods_from_file(file_path, max_per_file=max_per_file)
        for method in methods:
            if repo_ifs_collected >= max_ifs_per_repo:
                break

            masked_func, target_block = mask_first_if(method)
            if masked_func and target_block:
                # Whitespace-separated word count of the masked source.
                token_count = len(masked_func.split())
                dataset.append({
                    "cleaned_method": masked_func,
                    "target_block": target_block,
                    "tokens_in_method": token_count
                })
                current_total += 1
                repo_ifs_collected += 1

                if current_total >= max_total:
                    break
        # Leave the file loop as soon as either cap has been reached.
        if current_total >= max_total or repo_ifs_collected >= max_ifs_per_repo:
            break

    if repo_ifs_collected == 0:
        print(f"⚠️ No if-methods found in {repo_url}")

    # ✅ Log repo and progress
    with open(processed_log, "a") as f:
        f.write(repo_url + "\n")

    # The checkpoint stores the index of the next repo to visit.
    save_progress(index_file, repo_index + 1, current_total)
    pd.DataFrame(dataset).to_csv(output_csv, index=False)

    if current_total >= max_total:
        break

# Breaking out of the loop does not finalize the bar, so close it explicitly.
repo_bar.close()

# === FINAL SAVE ===
pd.DataFrame(dataset).to_csv(output_csv, index=False)
print(f"\n✅ Done! Saved {len(dataset)} samples to {output_csv}")


✅ Verified: 16895 methods match the index file.
📂 Resuming from existing dataset...
🔁 Loaded 16895 existing samples.
⏩ Resuming from repo #693 with 16895 methods


🧪 Repo 694/7677 | 16895/120k methods:   9%|▉         | 692/7677 [13:17:44<?, ?it/s]
🧪 Repo 695/7677 | 16895/120k methods:   9%|▉         | 692/7677 [01:00<?, ?it/s]

⏱️ Timed out cloning https://github.com/macarthur-lab/clinvar


🧪 Repo 716/7677 | 17542/120k methods:   9%|▉         | 692/7677 [03:16<?, ?it/s]

⏱️ Timed out cloning https://github.com/ghadjeres/deepbach


🧪 Repo 748/7677 | 18346/120k methods:   9%|▉         | 692/7677 [05:05<?, ?it/s]

⏱️ Timed out cloning https://github.com/snwh/paper-icon-theme


🧪 Repo 795/7677 | 19612/120k methods:   9%|▉         | 692/7677 [09:02<?, ?it/s]

⏱️ Timed out cloning https://github.com/neuropsychology/neurokit.py


🧪 Repo 832/7677 | 20741/120k methods:   9%|▉         | 692/7677 [11:21<?, ?it/s]

⏱️ Timed out cloning https://github.com/tvwenger/maxfield


🧪 Repo 845/7677 | 21027/120k methods:   9%|▉         | 692/7677 [12:36<?, ?it/s]

⏱️ Timed out cloning https://github.com/raghakot/keras-vis


🧪 Repo 862/7677 | 21429/120k methods:   9%|▉         | 692/7677 [13:59<?, ?it/s]

⏱️ Timed out cloning https://github.com/picovoice/cheetah


🧪 Repo 876/7677 | 21722/120k methods:   9%|▉         | 692/7677 [15:37<?, ?it/s]

⏱️ Timed out cloning https://github.com/picovoice/leopard


🧪 Repo 927/7677 | 22963/120k methods:   9%|▉         | 692/7677 [19:56<?, ?it/s]

⏱️ Timed out cloning https://github.com/geometalab/osmdeepod


🧪 Repo 968/7677 | 24076/120k methods:   9%|▉         | 692/7677 [23:54<?, ?it/s]

⏱️ Timed out cloning https://github.com/microsoft/mssql-scripter


🧪 Repo 1003/7677 | 24940/120k methods:   9%|▉         | 692/7677 [26:19<?, ?it/s]

⏱️ Timed out cloning https://github.com/sideeffects/gamedevelopmenttoolset


🧪 Repo 1063/7677 | 26482/120k methods:   9%|▉         | 692/7677 [30:12<?, ?it/s]

⏱️ Timed out cloning https://github.com/linnabrown/run_dbcan


🧪 Repo 1069/7677 | 26570/120k methods:   9%|▉         | 692/7677 [31:16<?, ?it/s]

⏱️ Timed out cloning https://github.com/adamspannbauer/python_video_stab


🧪 Repo 1111/7677 | 27474/120k methods:   9%|▉         | 692/7677 [32:36<?, ?it/s]

⚠️ No if-methods found in https://github.com/hopsoft/docker-graphite-statsd


🧪 Repo 1130/7677 | 27967/120k methods:   9%|▉         | 692/7677 [34:05<?, ?it/s]

⏱️ Timed out cloning https://github.com/spiderclub/weibospider


🧪 Repo 1144/7677 | 28381/120k methods:   9%|▉         | 692/7677 [36:32<?, ?it/s]

⏱️ Timed out cloning https://github.com/chiefenne/pyaero


🧪 Repo 1159/7677 | 28569/120k methods:   9%|▉         | 692/7677 [38:34<?, ?it/s]

⏱️ Timed out cloning https://github.com/theislab/dca


🧪 Repo 1189/7677 | 29485/120k methods:   9%|▉         | 692/7677 [40:31<?, ?it/s]

⚠️ No if-methods found in https://github.com/anthcourtney/ansible-role-cis-amazon-linux


🧪 Repo 1199/7677 | 29668/120k methods:   9%|▉         | 692/7677 [41:30<?, ?it/s]

⚠️ No if-methods found in https://github.com/codingo/nosqlmap


🧪 Repo 1244/7677 | 30834/120k methods:   9%|▉         | 692/7677 [44:32<?, ?it/s]

⏱️ Timed out cloning https://github.com/srinivas11789/pcapxray


🧪 Repo 1273/7677 | 31707/120k methods:   9%|▉         | 692/7677 [48:38<?, ?it/s]

⏱️ Timed out cloning https://github.com/ezaquarii/vpn-at-home


🧪 Repo 1292/7677 | 32240/120k methods:   9%|▉         | 692/7677 [49:19<?, ?it/s]

⚠️ No if-methods found in https://github.com/migueldemoura/myazo


🧪 Repo 1336/7677 | 33151/120k methods:   9%|▉         | 692/7677 [52:55<?, ?it/s]

⏱️ Timed out cloning https://github.com/aertslab/scenicprotocol


🧪 Repo 1352/7677 | 33659/120k methods:   9%|▉         | 692/7677 [53:49<?, ?it/s]

⚠️ No if-methods found in https://github.com/georgefilipkin/pulsemixer


🧪 Repo 1358/7677 | 33779/120k methods:   9%|▉         | 692/7677 [54:31<?, ?it/s]

⚠️ No if-methods found in https://github.com/silviolleite/django-pwa


🧪 Repo 1381/7677 | 34299/120k methods:   9%|▉         | 692/7677 [55:57<?, ?it/s]

⚠️ No if-methods found in https://github.com/snowplow-referer-parser/referer-parser


🧪 Repo 1436/7677 | 35655/120k methods:   9%|▉         | 692/7677 [1:01:08<?, ?it/s]

⏱️ Timed out cloning https://github.com/beyretb/animalai-olympics


🧪 Repo 1451/7677 | 36055/120k methods:   9%|▉         | 692/7677 [1:02:47<?, ?it/s]

⏱️ Timed out cloning https://github.com/cellprofiler/python-bioformats


🧪 Repo 1452/7677 | 36055/120k methods:   9%|▉         | 692/7677 [1:03:47<?, ?it/s]

⏱️ Timed out cloning https://github.com/jomjol/water-meter-system-complete


🧪 Repo 1453/7677 | 36055/120k methods:   9%|▉         | 692/7677 [1:04:47<?, ?it/s]

⏱️ Timed out cloning https://github.com/aws/sagemaker-pytorch-inference-toolkit


🧪 Repo 1462/7677 | 36341/120k methods:   9%|▉         | 692/7677 [1:06:18<?, ?it/s]

⏱️ Timed out cloning https://github.com/djrrb/bungee


🧪 Repo 1486/7677 | 36780/120k methods:   9%|▉         | 692/7677 [1:07:55<?, ?it/s]

⚠️ No if-methods found in https://github.com/devicetree-org/devicetree-specification


🧪 Repo 1489/7677 | 36892/120k methods:   9%|▉         | 692/7677 [1:08:11<?, ?it/s]

⚠️ No if-methods found in https://github.com/pirate-crew/iptv


🧪 Repo 1491/7677 | 36892/120k methods:   9%|▉         | 692/7677 [1:09:12<?, ?it/s]

⏱️ Timed out cloning https://github.com/benmaier/netwulf


🧪 Repo 1511/7677 | 37361/120k methods:   9%|▉         | 692/7677 [1:13:22<?, ?it/s]

⏱️ Timed out cloning https://github.com/chrissimpkins/codeface


🧪 Repo 1562/7677 | 38827/120k methods:   9%|▉         | 692/7677 [1:19:00<?, ?it/s]

⏱️ Timed out cloning https://github.com/harvitronix/reinforcement-learning-car


🧪 Repo 1573/7677 | 39084/120k methods:   9%|▉         | 692/7677 [1:20:52<?, ?it/s]

⏱️ Timed out cloning https://github.com/adamewing/bamsurgeon


🧪 Repo 1580/7677 | 39220/120k methods:   9%|▉         | 692/7677 [1:20:58<?, ?it/s]

⚠️ No if-methods found in https://github.com/custom-components/weatheralerts


🧪 Repo 1603/7677 | 39930/120k methods:   9%|▉         | 692/7677 [1:25:08<?, ?it/s]

⏱️ Timed out cloning https://github.com/openworm/openworm


🧪 Repo 1618/7677 | 40319/120k methods:   9%|▉         | 692/7677 [1:26:21<?, ?it/s]

⚠️ No if-methods found in https://github.com/jasonmcintosh/rabbitmq-zabbix


🧪 Repo 1651/7677 | 41041/120k methods:   9%|▉         | 692/7677 [1:31:37<?, ?it/s]

⏱️ Timed out cloning https://github.com/sofianehamlaoui/lockdoor-framework


🧪 Repo 1660/7677 | 41324/120k methods:   9%|▉         | 692/7677 [1:32:43<?, ?it/s]

⚠️ No if-methods found in https://github.com/pubref/rules_protobuf


🧪 Repo 1668/7677 | 41540/120k methods:   9%|▉         | 692/7677 [1:33:54<?, ?it/s]

⏱️ Timed out cloning https://github.com/juglab/n2v


🧪 Repo 1688/7677 | 41991/120k methods:   9%|▉         | 692/7677 [1:35:35<?, ?it/s]

⏱️ Timed out cloning https://github.com/titipata/pubmed_parser


🧪 Repo 1696/7677 | 42140/120k methods:   9%|▉         | 692/7677 [1:36:54<?, ?it/s]

⏱️ Timed out cloning https://github.com/misslav/librequake


🧪 Repo 1723/7677 | 42819/120k methods:   9%|▉         | 692/7677 [1:41:22<?, ?it/s]

⏱️ Timed out cloning https://github.com/murali-group/beeline


🧪 Repo 1737/7677 | 43249/120k methods:   9%|▉         | 692/7677 [1:43:34<?, ?it/s]

⏱️ Timed out cloning https://github.com/maptiler/epsg.io


🧪 Repo 1786/7677 | 44417/120k methods:   9%|▉         | 692/7677 [1:48:41<?, ?it/s]

⏱️ Timed out cloning https://github.com/igvteam/igv-reports


🧪 Repo 1796/7677 | 44628/120k methods:   9%|▉         | 692/7677 [1:50:44<?, ?it/s]

⏱️ Timed out cloning https://github.com/kanjivg/kanjivg


🧪 Repo 1801/7677 | 44724/120k methods:   9%|▉         | 692/7677 [1:50:50<?, ?it/s]

⚠️ No if-methods found in https://github.com/open-security-group-osg/hiddeneyereborn


🧪 Repo 1827/7677 | 45237/120k methods:   9%|▉         | 692/7677 [1:53:47<?, ?it/s]

⏱️ Timed out cloning https://github.com/epistasislab/pmlb


🧪 Repo 1832/7677 | 45350/120k methods:   9%|▉         | 692/7677 [1:54:58<?, ?it/s]

⏱️ Timed out cloning https://github.com/rcbyron/hey-athena-client


🧪 Repo 1861/7677 | 46017/120k methods:   9%|▉         | 692/7677 [1:56:42<?, ?it/s]

⏱️ Timed out cloning https://github.com/luminosoinsight/wordfreq


🧪 Repo 1875/7677 | 46333/120k methods:   9%|▉         | 692/7677 [1:58:13<?, ?it/s]

⏱️ Timed out cloning https://github.com/afagarap/gru-svm


🧪 Repo 1896/7677 | 46972/120k methods:   9%|▉         | 692/7677 [1:59:38<?, ?it/s]

⚠️ No if-methods found in https://github.com/beeware/beeware


🧪 Repo 1900/7677 | 46991/120k methods:   9%|▉         | 692/7677 [1:59:52<?, ?it/s]

❌ Failed to clone https://github.com/dreamnettech/dreampower


🧪 Repo 1902/7677 | 47001/120k methods:   9%|▉         | 692/7677 [2:00:53<?, ?it/s]

⏱️ Timed out cloning https://github.com/jenskutilek/sudo-font


🧪 Repo 1959/7677 | 48390/120k methods:   9%|▉         | 692/7677 [2:06:29<?, ?it/s]

⚠️ No if-methods found in https://github.com/stunkymonkey/nautilus-open-any-terminal


🧪 Repo 1964/7677 | 48458/120k methods:   9%|▉         | 692/7677 [2:07:33<?, ?it/s]

⏱️ Timed out cloning https://github.com/larsenwork/monoid


🧪 Repo 2034/7677 | 50237/120k methods:   9%|▉         | 692/7677 [2:13:49<?, ?it/s]

⏱️ Timed out cloning https://github.com/tategallery/collection


🧪 Repo 2080/7677 | 51441/120k methods:   9%|▉         | 692/7677 [2:16:59<?, ?it/s]

⏱️ Timed out cloning https://github.com/undercasetype/fraunces


🧪 Repo 2096/7677 | 51813/120k methods:   9%|▉         | 692/7677 [2:19:46<?, ?it/s]

⏱️ Timed out cloning https://github.com/jofrhwld/fave


🧪 Repo 2101/7677 | 51903/120k methods:   9%|▉         | 692/7677 [2:20:55<?, ?it/s]

⏱️ Timed out cloning https://github.com/huangjunye/qpong


🧪 Repo 2159/7677 | 53425/120k methods:   9%|▉         | 692/7677 [2:25:38<?, ?it/s]

⏱️ Timed out cloning https://github.com/nyu-mlab/iot-inspector-client


🧪 Repo 2163/7677 | 53582/120k methods:   9%|▉         | 692/7677 [2:25:54<?, ?it/s]

⚠️ No if-methods found in https://github.com/qiwihui/reinforcement-learning-an-introduction-chinese


🧪 Repo 2169/7677 | 53746/120k methods:   9%|▉         | 692/7677 [2:26:25<?, ?it/s]

⚠️ No if-methods found in https://github.com/tomsquest/docker-radicale


🧪 Repo 2204/7677 | 54524/120k methods:   9%|▉         | 692/7677 [2:30:59<?, ?it/s]

⏱️ Timed out cloning https://github.com/minaskar/zeus


🧪 Repo 2220/7677 | 54955/120k methods:   9%|▉         | 692/7677 [2:32:46<?, ?it/s]

❌ Failed to clone https://github.com/gangverk/flask-swagger


🧪 Repo 2225/7677 | 55041/120k methods:   9%|▉         | 692/7677 [2:34:02<?, ?it/s]

⏱️ Timed out cloning https://github.com/jackmckew/pandas_alive


🧪 Repo 2238/7677 | 55248/120k methods:   9%|▉         | 692/7677 [2:35:22<?, ?it/s]

⏱️ Timed out cloning https://github.com/bruuuuuuce/pkuautosubmit


🧪 Repo 2259/7677 | 55846/120k methods:   9%|▉         | 692/7677 [2:37:40<?, ?it/s]

⏱️ Timed out cloning https://github.com/ryanlayer/samplot


🧪 Repo 2330/7677 | 57740/120k methods:   9%|▉         | 692/7677 [2:45:11<?, ?it/s]

⏱️ Timed out cloning https://github.com/afagarap/malware-classification


🧪 Repo 2345/7677 | 58061/120k methods:   9%|▉         | 692/7677 [2:45:48<?, ?it/s]

⚠️ No if-methods found in https://github.com/derv82/wifite


🧪 Repo 2356/7677 | 58289/120k methods:   9%|▉         | 692/7677 [2:47:48<?, ?it/s]

⏱️ Timed out cloning https://github.com/franck-dernoncourt/neuroner


🧪 Repo 2360/7677 | 58421/120k methods:   9%|▉         | 692/7677 [2:49:48<?, ?it/s]

⏱️ Timed out cloning https://github.com/natethegreate/hent-ai


🧪 Repo 2362/7677 | 58430/120k methods:   9%|▉         | 692/7677 [2:50:49<?, ?it/s]

⏱️ Timed out cloning https://github.com/alexmyg/andropytool


🧪 Repo 2374/7677 | 58761/120k methods:   9%|▉         | 692/7677 [2:53:04<?, ?it/s]

⏱️ Timed out cloning https://github.com/residentmario/geoplot


🧪 Repo 2386/7677 | 59097/120k methods:   9%|▉         | 692/7677 [2:54:49<?, ?it/s]

⏱️ Timed out cloning https://github.com/rimochan/librian


🧪 Repo 2404/7677 | 59581/120k methods:   9%|▉         | 692/7677 [2:57:22<?, ?it/s]

⏱️ Timed out cloning https://github.com/jni/skan


🧪 Repo 2424/7677 | 60085/120k methods:   9%|▉         | 692/7677 [2:59:47<?, ?it/s]

⏱️ Timed out cloning https://github.com/mlbvn/d2l-vn


🧪 Repo 2449/7677 | 60640/120k methods:   9%|▉         | 692/7677 [3:01:55<?, ?it/s]

⏱️ Timed out cloning https://github.com/cdanielmachado/carveme


🧪 Repo 2465/7677 | 61050/120k methods:   9%|▉         | 692/7677 [3:04:44<?, ?it/s]

⏱️ Timed out cloning https://github.com/ehendrix23/tesla_dashcam


🧪 Repo 2468/7677 | 61096/120k methods:   9%|▉         | 692/7677 [3:06:01<?, ?it/s]

⏱️ Timed out cloning https://github.com/simonarvin/eyeloop


🧪 Repo 2493/7677 | 61862/120k methods:   9%|▉         | 692/7677 [3:07:45<?, ?it/s]

⚠️ No if-methods found in https://github.com/kkrypt0nn/python-discord-bot-template


🧪 Repo 2517/7677 | 62369/120k methods:   9%|▉         | 692/7677 [3:09:58<?, ?it/s]

⏱️ Timed out cloning https://github.com/unitedstates/images


🧪 Repo 2541/7677 | 63020/120k methods:   9%|▉         | 692/7677 [3:11:50<?, ?it/s]

⏱️ Timed out cloning https://github.com/araffin/rl-baselines-zoo


🧪 Repo 2559/7677 | 63372/120k methods:   9%|▉         | 692/7677 [3:13:36<?, ?it/s]

⚠️ No if-methods found in https://github.com/jarun/pdd


🧪 Repo 2573/7677 | 63620/120k methods:   9%|▉         | 692/7677 [3:15:04<?, ?it/s]

⏱️ Timed out cloning https://github.com/koriavinash1/deepbrainseg


🧪 Repo 2595/7677 | 64074/120k methods:   9%|▉         | 692/7677 [3:18:36<?, ?it/s]

⏱️ Timed out cloning https://github.com/krishnaswamylab/phate


🧪 Repo 2611/7677 | 64608/120k methods:   9%|▉         | 692/7677 [3:19:52<?, ?it/s]

⏱️ Timed out cloning https://github.com/alleninstitute/deepinterpolation


🧪 Repo 2623/7677 | 64990/120k methods:   9%|▉         | 692/7677 [3:21:59<?, ?it/s]

⏱️ Timed out cloning https://github.com/bcgsc/nanosim


🧪 Repo 2629/7677 | 65059/120k methods:   9%|▉         | 692/7677 [3:23:05<?, ?it/s]

⏱️ Timed out cloning https://github.com/hsvgbkhgbv/sqddpg


🧪 Repo 2632/7677 | 65082/120k methods:   9%|▉         | 692/7677 [3:24:07<?, ?it/s]

⏱️ Timed out cloning https://github.com/kaist-maclab/pytsmod


🧪 Repo 2633/7677 | 65082/120k methods:   9%|▉         | 692/7677 [3:25:07<?, ?it/s]

⏱️ Timed out cloning https://github.com/kth/devops-course


🧪 Repo 2654/7677 | 65679/120k methods:   9%|▉         | 692/7677 [3:28:02<?, ?it/s]

⏱️ Timed out cloning https://github.com/rivermont/spidy


🧪 Repo 2655/7677 | 65726/120k methods:   9%|▉         | 692/7677 [3:28:05<?, ?it/s]

⚠️ No if-methods found in https://github.com/rdflib/rdflib-jsonld


🧪 Repo 2674/7677 | 66497/120k methods:   9%|▉         | 692/7677 [3:30:18<?, ?it/s]

❌ Failed to clone https://github.com/shijl0925/python-sonarqube-api


🧪 Repo 2679/7677 | 66640/120k methods:   9%|▉         | 692/7677 [3:31:22<?, ?it/s]

⏱️ Timed out cloning https://github.com/broadinstitute/abc-enhancer-gene-prediction


🧪 Repo 2696/7677 | 66939/120k methods:   9%|▉         | 692/7677 [3:34:23<?, ?it/s]

⏱️ Timed out cloning https://github.com/nfstream/nfstream


🧪 Repo 2758/7677 | 68378/120k methods:   9%|▉         | 692/7677 [3:39:37<?, ?it/s]

⏱️ Timed out cloning https://github.com/picovoice/picovoice


🧪 Repo 2759/7677 | 68378/120k methods:   9%|▉         | 692/7677 [3:40:37<?, ?it/s]

⏱️ Timed out cloning https://github.com/donnnno/arcticons


🧪 Repo 2761/7677 | 68389/120k methods:   9%|▉         | 692/7677 [3:41:41<?, ?it/s]

⏱️ Timed out cloning https://github.com/mctorch/mctorch


🧪 Repo 2764/7677 | 68462/120k methods:   9%|▉         | 692/7677 [3:42:55<?, ?it/s]

⏱️ Timed out cloning https://github.com/bab2min/kiwipiepy


🧪 Repo 2766/7677 | 68529/120k methods:   9%|▉         | 692/7677 [3:44:06<?, ?it/s]

⏱️ Timed out cloning https://github.com/trelau/pyocct


🧪 Repo 2768/7677 | 68533/120k methods:   9%|▉         | 692/7677 [3:45:24<?, ?it/s]

⏱️ Timed out cloning https://github.com/belval/textrecognitiondatagenerator


🧪 Repo 2781/7677 | 68914/120k methods:   9%|▉         | 692/7677 [3:47:29<?, ?it/s]

⏱️ Timed out cloning https://github.com/layumi/university1652-baseline


🧪 Repo 2817/7677 | 69593/120k methods:   9%|▉         | 692/7677 [3:50:03<?, ?it/s]

⏱️ Timed out cloning https://github.com/tensorlayer/srgan


🧪 Repo 2831/7677 | 69880/120k methods:   9%|▉         | 692/7677 [3:52:02<?, ?it/s]

⏱️ Timed out cloning https://github.com/bmw-innovationlab/bmw-yolov4-training-automation
⚠️ No if-methods found in https://github.com/chilcote/outset


🧪 Repo 2849/7677 | 70284/120k methods:   9%|▉         | 692/7677 [3:53:15<?, ?it/s]

⚠️ No if-methods found in https://github.com/etherchina/solidity-doc-cn


🧪 Repo 2862/7677 | 70645/120k methods:   9%|▉         | 692/7677 [3:55:37<?, ?it/s]

⏱️ Timed out cloning https://github.com/theislab/scgen


🧪 Repo 2863/7677 | 70645/120k methods:   9%|▉         | 692/7677 [3:56:37<?, ?it/s]

⏱️ Timed out cloning https://github.com/tum-ens/urbs


🧪 Repo 2871/7677 | 70781/120k methods:   9%|▉         | 692/7677 [3:57:46<?, ?it/s]

⏱️ Timed out cloning https://github.com/sayonari/twitchtransfreenext


🧪 Repo 2946/7677 | 73163/120k methods:   9%|▉         | 692/7677 [4:06:57<?, ?it/s]

⏱️ Timed out cloning https://github.com/cruizperez/microbeannotator


🧪 Repo 2965/7677 | 73584/120k methods:   9%|▉         | 692/7677 [4:09:53<?, ?it/s]

⏱️ Timed out cloning https://github.com/kk7nc/rmdl


🧪 Repo 2975/7677 | 73818/120k methods:   9%|▉         | 692/7677 [4:11:36<?, ?it/s]

⏱️ Timed out cloning https://github.com/albertogeniola/meross-homeassistant


🧪 Repo 2983/7677 | 73960/120k methods:   9%|▉         | 692/7677 [4:12:50<?, ?it/s]

⏱️ Timed out cloning https://github.com/justin-tan/high-fidelity-generative-compression


🧪 Repo 2984/7677 | 73960/120k methods:   9%|▉         | 692/7677 [4:13:50<?, ?it/s]

⏱️ Timed out cloning https://github.com/brainglobe/brainrender


🧪 Repo 2996/7677 | 74223/120k methods:   9%|▉         | 692/7677 [4:14:45<?, ?it/s]

⚠️ No if-methods found in https://github.com/tiangolo/meinheld-gunicorn-docker


🧪 Repo 3003/7677 | 74378/120k methods:   9%|▉         | 692/7677 [4:15:54<?, ?it/s]

⏱️ Timed out cloning https://github.com/gepetto/example-robot-data


🧪 Repo 3077/7677 | 76701/120k methods:   9%|▉         | 692/7677 [4:22:57<?, ?it/s]

⚠️ No if-methods found in https://github.com/microsoft/ai


🧪 Repo 3079/7677 | 76701/120k methods:   9%|▉         | 692/7677 [4:24:20<?, ?it/s]

⏱️ Timed out cloning https://github.com/nasa-jpl/osr-rover-code


🧪 Repo 3101/7677 | 77276/120k methods:   9%|▉         | 692/7677 [4:27:36<?, ?it/s]

⏱️ Timed out cloning https://github.com/garrettj403/scienceplots


🧪 Repo 3115/7677 | 77770/120k methods:   9%|▉         | 692/7677 [4:29:56<?, ?it/s]

⏱️ Timed out cloning https://github.com/insightsoftwareconsortium/itkwidgets


🧪 Repo 3159/7677 | 78766/120k methods:   9%|▉         | 692/7677 [4:36:16<?, ?it/s]

⏱️ Timed out cloning https://github.com/flatironinstitute/deepfri


🧪 Repo 3183/7677 | 79383/120k methods:   9%|▉         | 692/7677 [4:37:55<?, ?it/s]

⚠️ No if-methods found in https://github.com/cthru/hitrava


🧪 Repo 3185/7677 | 79383/120k methods:   9%|▉         | 692/7677 [4:38:57<?, ?it/s]

⏱️ Timed out cloning https://github.com/ultralytics/xview-yolov3


🧪 Repo 3188/7677 | 79428/120k methods:   9%|▉         | 692/7677 [4:39:02<?, ?it/s]

⚠️ No if-methods found in https://github.com/tiangolo/uvicorn-gunicorn-starlette-docker


🧪 Repo 3209/7677 | 80086/120k methods:   9%|▉         | 692/7677 [4:41:15<?, ?it/s]

⏱️ Timed out cloning https://github.com/misaogura/flashtorch


🧪 Repo 3245/7677 | 80939/120k methods:   9%|▉         | 692/7677 [4:46:25<?, ?it/s]

⏱️ Timed out cloning https://github.com/open-numbers/ddf--gapminder--systema_globalis


🧪 Repo 3274/7677 | 81659/120k methods:   9%|▉         | 692/7677 [4:49:10<?, ?it/s]

⏱️ Timed out cloning https://github.com/ibm/max-object-detector


🧪 Repo 3286/7677 | 81955/120k methods:   9%|▉         | 692/7677 [4:52:19<?, ?it/s]

⏱️ Timed out cloning https://github.com/a-slide/pycoqc


🧪 Repo 3292/7677 | 82176/120k methods:   9%|▉         | 692/7677 [4:53:54<?, ?it/s]

⏱️ Timed out cloning https://github.com/haaleo/swarmlib


🧪 Repo 3298/7677 | 82211/120k methods:   9%|▉         | 692/7677 [4:55:03<?, ?it/s]

⏱️ Timed out cloning https://github.com/keisen/tf-keras-vis


🧪 Repo 3308/7677 | 82501/120k methods:   9%|▉         | 692/7677 [4:56:59<?, ?it/s]

⏱️ Timed out cloning https://github.com/sysdiglabs/promcat-resources


🧪 Repo 3311/7677 | 82530/120k methods:   9%|▉         | 692/7677 [4:58:02<?, ?it/s]

⏱️ Timed out cloning https://github.com/magedsaeed/farasapy


🧪 Repo 3323/7677 | 82761/120k methods:   9%|▉         | 692/7677 [5:01:29<?, ?it/s]

⏱️ Timed out cloning https://github.com/freedoom/freedoom


🧪 Repo 3327/7677 | 82805/120k methods:   9%|▉         | 692/7677 [5:03:21<?, ?it/s]

⏱️ Timed out cloning https://github.com/inveniosoftware/invenio


🧪 Repo 3335/7677 | 83059/120k methods:   9%|▉         | 692/7677 [5:04:37<?, ?it/s]

⏱️ Timed out cloning https://github.com/letmaik/rawpy


🧪 Repo 3337/7677 | 83096/120k methods:   9%|▉         | 692/7677 [5:05:53<?, ?it/s]

⏱️ Timed out cloning https://github.com/alexander-akhmetov/python-telegram


🧪 Repo 3339/7677 | 83101/120k methods:   9%|▉         | 692/7677 [5:06:54<?, ?it/s]

⏱️ Timed out cloning https://github.com/aresdevo/animaide


🧪 Repo 3372/7677 | 83880/120k methods:   9%|▉         | 692/7677 [5:10:29<?, ?it/s]

⏱️ Timed out cloning https://github.com/brainglobe/brainreg


🧪 Repo 3375/7677 | 83910/120k methods:   9%|▉         | 692/7677 [5:11:33<?, ?it/s]

⏱️ Timed out cloning https://github.com/ageitgey/face_recognition


🧪 Repo 3376/7677 | 83910/120k methods:   9%|▉         | 692/7677 [5:12:33<?, ?it/s]

⏱️ Timed out cloning https://github.com/matterport/mask_rcnn


🧪 Repo 3378/7677 | 83927/120k methods:   9%|▉         | 692/7677 [5:13:34<?, ?it/s]

⏱️ Timed out cloning https://github.com/ytisf/thezoo


🧪 Repo 3380/7677 | 83976/120k methods:   9%|▉         | 692/7677 [5:14:36<?, ?it/s]

⏱️ Timed out cloning https://github.com/jiangxufeng/v2rayl


🧪 Repo 3383/7677 | 84079/120k methods:   9%|▉         | 692/7677 [5:14:48<?, ?it/s]

⚠️ No if-methods found in https://github.com/tiangolo/uvicorn-gunicorn-fastapi-docker


🧪 Repo 3390/7677 | 84213/120k methods:   9%|▉         | 692/7677 [5:15:57<?, ?it/s]

⏱️ Timed out cloning https://github.com/qlab/qlib


🧪 Repo 3391/7677 | 84224/120k methods:   9%|▉         | 692/7677 [5:16:25<?, ?it/s]

⚠️ No if-methods found in https://github.com/tiangolo/uwsgi-nginx-docker


🧪 Repo 3397/7677 | 84307/120k methods:   9%|▉         | 692/7677 [5:17:12<?, ?it/s]

⚠️ No if-methods found in https://github.com/tiangolo/meinheld-gunicorn-flask-docker


🧪 Repo 3402/7677 | 84380/120k methods:   9%|▉         | 692/7677 [5:17:19<?, ?it/s]

⚠️ No if-methods found in https://github.com/tiangolo/uvicorn-gunicorn-docker


🧪 Repo 3415/7677 | 84842/120k methods:   9%|▉         | 692/7677 [5:18:45<?, ?it/s]

⏱️ Timed out cloning https://github.com/victordomingos/optimize-images


🧪 Repo 3424/7677 | 84982/120k methods:   9%|▉         | 692/7677 [5:20:39<?, ?it/s]

⏱️ Timed out cloning https://github.com/kubeinit/kubeinit


🧪 Repo 3442/7677 | 85382/120k methods:   9%|▉         | 692/7677 [5:22:25<?, ?it/s]

⏱️ Timed out cloning https://github.com/vanheeringen-lab/seq2science


🧪 Repo 3454/7677 | 85704/120k methods:   9%|▉         | 692/7677 [5:24:33<?, ?it/s]

⏱️ Timed out cloning https://github.com/flareteam/flare-game


🧪 Repo 3482/7677 | 86724/120k methods:   9%|▉         | 692/7677 [5:29:14<?, ?it/s]

⏱️ Timed out cloning https://github.com/bitextor/bitextor


🧪 Repo 3511/7677 | 87576/120k methods:   9%|▉         | 692/7677 [5:36:05<?, ?it/s]

⏱️ Timed out cloning https://github.com/uberi/speech_recognition


🧪 Repo 3527/7677 | 88057/120k methods:   9%|▉         | 692/7677 [5:39:01<?, ?it/s]

⏱️ Timed out cloning https://github.com/kinwaicheuk/nnaudio


🧪 Repo 3540/7677 | 88313/120k methods:   9%|▉         | 692/7677 [5:42:07<?, ?it/s]

⏱️ Timed out cloning https://github.com/natumbri/mopidy-youtube


🧪 Repo 3545/7677 | 88429/120k methods:   9%|▉         | 692/7677 [5:43:13<?, ?it/s]

⏱️ Timed out cloning https://github.com/sumologic/sumologic-aws-lambda


🧪 Repo 3584/7677 | 89578/120k methods:   9%|▉         | 692/7677 [5:48:40<?, ?it/s]

⏱️ Timed out cloning https://github.com/joshiemoore/snakeware


🧪 Repo 3593/7677 | 89886/120k methods:   9%|▉         | 692/7677 [5:49:04<?, ?it/s]

⚠️ No if-methods found in https://github.com/graphite-project/docker-graphite-statsd


🧪 Repo 3608/7677 | 90228/120k methods:   9%|▉         | 692/7677 [5:51:05<?, ?it/s]

⏱️ Timed out cloning https://github.com/mrminimal64/timezonefinder


🧪 Repo 3620/7677 | 90623/120k methods:   9%|▉         | 692/7677 [5:52:40<?, ?it/s]

⏱️ Timed out cloning https://github.com/datadesk/census-data-downloader


🧪 Repo 3628/7677 | 90802/120k methods:   9%|▉         | 692/7677 [5:53:51<?, ?it/s]

⏱️ Timed out cloning https://github.com/google-research/open-covid-19-data


🧪 Repo 3659/7677 | 91679/120k methods:   9%|▉         | 692/7677 [5:57:21<?, ?it/s]

⏱️ Timed out cloning https://github.com/joelibaceta/video-to-ascii


🧪 Repo 3667/7677 | 91958/120k methods:   9%|▉         | 692/7677 [5:59:04<?, ?it/s]

⏱️ Timed out cloning https://github.com/obsidianforensics/hindsight


🧪 Repo 3681/7677 | 92248/120k methods:   9%|▉         | 692/7677 [6:01:02<?, ?it/s]

⏱️ Timed out cloning https://github.com/microsoft/vscode-tools-for-ai


🧪 Repo 3687/7677 | 92429/120k methods:   9%|▉         | 692/7677 [6:01:10<?, ?it/s]

⚠️ No if-methods found in https://github.com/dsaidgovsg/airflow-pipeline


🧪 Repo 3706/7677 | 92898/120k methods:   9%|▉         | 692/7677 [6:02:50<?, ?it/s]

⏱️ Timed out cloning https://github.com/donalffons/opencascade.js


🧪 Repo 3713/7677 | 93084/120k methods:   9%|▉         | 692/7677 [6:03:08<?, ?it/s]

⚠️ No if-methods found in https://github.com/jarun/googler


🧪 Repo 3716/7677 | 93091/120k methods:   9%|▉         | 692/7677 [6:03:23<?, ?it/s]

⚠️ No if-methods found in https://github.com/jarun/ddgr


🧪 Repo 3727/7677 | 93306/120k methods:   9%|▉         | 692/7677 [6:04:13<?, ?it/s]

⚠️ No if-methods found in https://github.com/jarun/imgp


🧪 Repo 3772/7677 | 94433/120k methods:   9%|▉         | 692/7677 [6:09:47<?, ?it/s]

⏱️ Timed out cloning https://github.com/amueller/word_cloud


🧪 Repo 3788/7677 | 95009/120k methods:   9%|▉         | 692/7677 [6:11:52<?, ?it/s]

⏱️ Timed out cloning https://github.com/psyop/cryptomatte


🧪 Repo 3795/7677 | 95129/120k methods:   9%|▉         | 692/7677 [6:13:04<?, ?it/s]

⏱️ Timed out cloning https://github.com/machawk1/wail


🧪 Repo 3843/7677 | 96486/120k methods:   9%|▉         | 692/7677 [6:17:22<?, ?it/s]

⏱️ Timed out cloning https://github.com/mocobeta/janome


🧪 Repo 3851/7677 | 96677/120k methods:   9%|▉         | 692/7677 [6:18:43<?, ?it/s]

⏱️ Timed out cloning https://github.com/fritzing/fritzing-parts


🧪 Repo 3905/7677 | 97941/120k methods:   9%|▉         | 692/7677 [6:23:51<?, ?it/s]

⏱️ Timed out cloning https://github.com/catharsisfonts/ysabeau


🧪 Repo 3914/7677 | 98132/120k methods:   9%|▉         | 692/7677 [6:25:24<?, ?it/s]

⏱️ Timed out cloning https://github.com/frappe/helm


🧪 Repo 3920/7677 | 98247/120k methods:   9%|▉         | 692/7677 [6:26:36<?, ?it/s]

⏱️ Timed out cloning https://github.com/chatopera/synonyms


🧪 Repo 3936/7677 | 98671/120k methods:   9%|▉         | 692/7677 [6:28:15<?, ?it/s]

⏱️ Timed out cloning https://github.com/emilianavt/openseeface


🧪 Repo 3984/7677 | 100027/120k methods:   9%|▉         | 692/7677 [6:34:27<?, ?it/s]

⏱️ Timed out cloning https://github.com/linkedai/flip


🧪 Repo 4009/7677 | 100876/120k methods:   9%|▉         | 692/7677 [6:36:39<?, ?it/s]

⏱️ Timed out cloning https://github.com/a-maliarov/amazoncaptcha


🧪 Repo 4023/7677 | 101241/120k methods:   9%|▉         | 692/7677 [6:39:26<?, ?it/s]

⏱️ Timed out cloning https://github.com/cleardusk/3ddfa


🧪 Repo 4031/7677 | 101559/120k methods:   9%|▉         | 692/7677 [6:42:05<?, ?it/s]

⏱️ Timed out cloning https://github.com/otrf/mordor


🧪 Repo 4052/7677 | 102134/120k methods:   9%|▉         | 692/7677 [6:43:43<?, ?it/s]

⚠️ No if-methods found in https://github.com/scikit-hep/scikit-hep


🧪 Repo 4059/7677 | 102279/120k methods:   9%|▉         | 692/7677 [6:45:09<?, ?it/s]

⏱️ Timed out cloning https://github.com/alexandrovlab/sigprofilerextractor


🧪 Repo 4068/7677 | 102441/120k methods:   9%|▉         | 692/7677 [6:46:24<?, ?it/s]

⏱️ Timed out cloning https://github.com/i-tu/hasklig


🧪 Repo 4076/7677 | 102590/120k methods:   9%|▉         | 692/7677 [6:47:53<?, ?it/s]

⏱️ Timed out cloning https://github.com/superdesk/superdesk


🧪 Repo 4077/7677 | 102590/120k methods:   9%|▉         | 692/7677 [6:48:53<?, ?it/s]

⏱️ Timed out cloning https://github.com/kivy/kivy-ios


🧪 Repo 4082/7677 | 102688/120k methods:   9%|▉         | 692/7677 [6:49:59<?, ?it/s]

⏱️ Timed out cloning https://github.com/oca/connector


🧪 Repo 4105/7677 | 103522/120k methods:   9%|▉         | 692/7677 [6:53:24<?, ?it/s]

⏱️ Timed out cloning https://github.com/yunyang1994/tensorflow-yolov3


🧪 Repo 4109/7677 | 103663/120k methods:   9%|▉         | 692/7677 [6:54:29<?, ?it/s]

⏱️ Timed out cloning https://github.com/contextlab/hypertools


🧪 Repo 4110/7677 | 103709/120k methods:   9%|▉         | 692/7677 [6:54:32<?, ?it/s]

⚠️ No if-methods found in https://github.com/seisman/how-to-write-makefile


🧪 Repo 4130/7677 | 104155/120k methods:   9%|▉         | 692/7677 [6:58:55<?, ?it/s]

⏱️ Timed out cloning https://github.com/geekalexis/fastmot


🧪 Repo 4137/7677 | 104366/120k methods:   9%|▉         | 692/7677 [7:00:52<?, ?it/s]

⏱️ Timed out cloning https://github.com/weecology/deepforest


🧪 Repo 4140/7677 | 104481/120k methods:   9%|▉         | 692/7677 [7:02:00<?, ?it/s]

⏱️ Timed out cloning https://github.com/spiralgenetics/truvari


🧪 Repo 4148/7677 | 104658/120k methods:   9%|▉         | 692/7677 [7:03:18<?, ?it/s]

⏱️ Timed out cloning https://github.com/bernhard-42/jupyter-cadquery


🧪 Repo 4154/7677 | 104760/120k methods:   9%|▉         | 692/7677 [7:04:47<?, ?it/s]

⏱️ Timed out cloning https://github.com/marksgraham/oct-converter


🧪 Repo 4166/7677 | 105147/120k methods:   9%|▉         | 692/7677 [7:06:50<?, ?it/s]

⏱️ Timed out cloning https://github.com/barrust/pyspellchecker


🧪 Repo 4171/7677 | 105427/120k methods:   9%|▉         | 692/7677 [7:08:12<?, ?it/s]

⏱️ Timed out cloning https://github.com/opensimulationinterface/open-simulation-interface


🧪 Repo 4185/7677 | 105794/120k methods:   9%|▉         | 692/7677 [7:10:40<?, ?it/s]

⏱️ Timed out cloning https://github.com/artyshko/smd


🧪 Repo 4189/7677 | 105904/120k methods:   9%|▉         | 692/7677 [7:11:45<?, ?it/s]

⏱️ Timed out cloning https://github.com/cl0udg0d/szhe_scan


🧪 Repo 4223/7677 | 106819/120k methods:   9%|▉         | 692/7677 [7:17:24<?, ?it/s]

⏱️ Timed out cloning https://github.com/neo23x0/loki


🧪 Repo 4227/7677 | 106944/120k methods:   9%|▉         | 692/7677 [7:18:38<?, ?it/s]

⏱️ Timed out cloning https://github.com/yaronzz/tidal-media-downloader


🧪 Repo 4229/7677 | 106968/120k methods:   9%|▉         | 692/7677 [7:19:52<?, ?it/s]

⏱️ Timed out cloning https://github.com/ddangelov/top2vec


🧪 Repo 4231/7677 | 106978/120k methods:   9%|▉         | 692/7677 [7:20:55<?, ?it/s]

⏱️ Timed out cloning https://github.com/pavlin-policar/opentsne


🧪 Repo 4255/7677 | 107825/120k methods:   9%|▉         | 692/7677 [7:24:40<?, ?it/s]

⏱️ Timed out cloning https://github.com/hgjazhgj/fgo-py


🧪 Repo 4262/7677 | 108108/120k methods:   9%|▉         | 692/7677 [7:26:33<?, ?it/s]

⏱️ Timed out cloning https://github.com/typenetwork/roboto-flex


🧪 Repo 4268/7677 | 108324/120k methods:   9%|▉         | 692/7677 [7:27:18<?, ?it/s]

KeyboardInterrupt: 

In [None]:
import pandas as pd

# Source (raw extraction) and destination (cleaned) CSV files.
input_path = "/Users/paulmitchell/Downloads/Gen AI 520/Hw2/raw_methods_dataset.csv"
output_path = "/Users/paulmitchell/Downloads/Gen AI 520/Hw2/cleaned_dataset.csv"

df = pd.read_csv(input_path)
print(f"Raw rows: {len(df)}")

# Single cleaning pipeline, applied in this order:
#   1. discard rows missing either the masked method or its target,
#   2. keep methods whose token count lies in the inclusive range 10..300,
#   3. keep only the first occurrence of each distinct masked method.
df = (
    df.dropna(subset=["cleaned_method", "target_block"])
    .loc[lambda frame: frame["tokens_in_method"].between(10, 300)]
    .drop_duplicates(subset=["cleaned_method"])
)

print(f"Cleaned rows: {len(df)}")
df.to_csv(output_path, index=False)
print(f"Cleaned dataset saved to: {output_path}")


📦 Raw rows: 108324
🧼 Cleaned rows: 98614
✅ Cleaned dataset saved to: /Users/paulmitchell/Downloads/Gen AI 520/Hw2/cleaned_dataset.csv


In [None]:
import pandas as pd

# Point this at either the cleaned or the sampled CSV to spot-check it.
df = pd.read_csv("/Users/paulmitchell/Downloads/Gen AI 520/Hw2/cleaned_dataset.csv")

# (printed caption, source column) pairs, in display order.
preview_fields = (
    ("Cleaned Method:\n", "cleaned_method"),
    ("Target Block:", "target_block"),
    ("Token Count:", "tokens_in_method"),
)

# Draw 3 rows at random (unseeded, so each run shows different rows)
# and print the fields of interest for each one.
picked = df.sample(n=3)
for row_label, record in picked.iterrows():
    print(f"\n🔍 Sample #{row_label}")
    for caption, column in preview_fields:
        print(caption, record[column])


🔍 Sample #74249
Cleaned Method:
 def report(self):
    results_by_status = collections.defaultdict(list)
    for result in self.results:
<mask>:
            results_by_status['successful'].append(result)
        elif result['failure'] or result['error']:
            results_by_status['failed'].append(result)
        elif result['interrupted']:
            results_by_status['interrupted'].append(result)
        else:
            results_by_status['unknown'].append(result)
    if self.options.summary_mode:
        self.report_failures(results_by_status['failed'])
    self.report_stats(len(self.test_case_classes), **results_by_status)
    if len(self.results) == 0:
        return False
    else:
        return len(results_by_status['failed']) + len(results_by_status['interrupted']) + len(results_by_status['unknown']) == 0
Target Block: result['success']
Token Count: 41

🔍 Sample #75137
Cleaned Method:
 def history_paths() -> list:
    """
    Get valid pathes to history from HISTORIES va

In [None]:
import pandas as pd

# Path to cleaned dataset in Drive
full_path = '/content/drive/MyDrive/Gen AI Homework 2/dataset/cleaned_dataset.csv'

# Split sizes: 50k train + 5k val + 5k test (60k rows total), per the assignment.
N_TRAIN = 50_000
N_VAL = 5_000
N_TEST = 5_000
N_TOTAL = N_TRAIN + N_VAL + N_TEST

# Load the full cleaned dataset
df = pd.read_csv(full_path)
print(f"Loaded dataset with {len(df)} rows")

# .iloc slices that run past the end of a frame are silently truncated rather
# than raising, so a short input would produce undersized (or empty) val/test
# files while the cell appears to succeed. Fail loudly instead.
if len(df) < N_TOTAL:
    raise ValueError(
        f"Need at least {N_TOTAL} rows for the train/val/test split, "
        f"but {full_path} has only {len(df)}"
    )

# Keep the first N_TOTAL rows and renumber them from 0.
# NOTE: the split is purely positional -- rows are not shuffled, so each
# split is a contiguous run of the CSV in its stored order.
df = df.iloc[:N_TOTAL].reset_index(drop=True)

# Carve the contiguous, non-overlapping splits.
train_df = df.iloc[:N_TRAIN]
val_df = df.iloc[N_TRAIN:N_TRAIN + N_VAL]
test_df = df.iloc[N_TRAIN + N_VAL:N_TOTAL]

# Sanity check: every split has exactly the requested size.
assert (len(train_df), len(val_df), len(test_df)) == (N_TRAIN, N_VAL, N_TEST)

# Save splits to new CSVs
save_dir = '/content/drive/MyDrive/Gen AI Homework 2/dataset'
train_df.to_csv(f'{save_dir}/train.csv', index=False)
val_df.to_csv(f'{save_dir}/val.csv', index=False)
test_df.to_csv(f'{save_dir}/test.csv', index=False)

print("Saved:")
print("train.csv:", len(train_df))
print("val.csv:", len(val_df))
print("test.csv:", len(test_df))