In [126]:
%load_ext autoreload
%autoreload 2

%pwd
%matplotlib inline

import pandas as pd
from tqdm.notebook import tqdm

pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.expand_frame_repr", False)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [127]:
import pathlib, sys
import logging

DATA_FOLDER = pathlib.Path("/nfs/data/students/bsparks/mdti4py-dataset-pool/cdt4py/")
assert DATA_FOLDER.is_dir()

logger = logging.getLogger(name=__name__)
logger.handlers.clear()
logger.setLevel(level=logging.DEBUG)

handler = logging.StreamHandler(stream=sys.stdout)
handler.setFormatter(fmt=logging.Formatter('%(asctime)s | %(levelname)s : %(message)s'))
logger.addHandler(handler)

logger.info("Hello World")

2023-09-07 12:05:36,084 | INFO : Hello World


In [128]:
from typet5.data import GitRepo
from scripts.infer.structure import CrossDomainTypes4Py

dataset = CrossDomainTypes4Py(dataset_root=DATA_FOLDER)
logger.debug(dataset)



2023-09-07 12:05:39,456 | DEBUG : CrossDomainTypes4Py @ /nfs/data/students/bsparks/mdti4py-dataset-pool/cdt4py


In [129]:
from scripts.infer.structure import AuthorRepo
from typet5.data import GitRepo


class CDT4PyRepo(GitRepo):
    def __init__(self, author_repo: AuthorRepo) -> None:
        super().__init__(
            author=author_repo.author,
            name=author_repo.repo,
            url=None,
            stars=-1,
            forks=-1
        )
        
    def authorname(self) -> str:
        return f"{self.author}/{self.name}"
        

class CDT4PyFlaskRepo(CDT4PyRepo):
    def repo_dir(self, repos_dir: pathlib.Path) -> pathlib.Path:
        return repos_dir / "flask" / self.authorname()
    
class CDT4PyNumpyRepo(CDT4PyRepo):
    def repo_dir(self, repos_dir: pathlib.Path) -> pathlib.Path:
        return repos_dir / "numpy"  / self.authorname()

In [130]:
cdt4py_author_repos: dict[AuthorRepo, pathlib.Path] = {
    dataset.author_repo(repository): repository
    for repository in dataset.project_iter()
}
display(len(cdt4py_author_repos))

6706

# Remove BetterTypes4Py, Typilus and Type4Py datasets from CrossDomainTypes4Py

In [131]:
import pathlib
dataset_path = pathlib.Path("/nfs/data/students/bsparks/mdti4py-dataset-pool/cdt4py")

In [40]:
import pandas as pd
import shutil, os

src_mt4py_dataset = pathlib.Path("/nfs/data/students/bsparks/many-types-4-py-dataset")
dest_mt4py_dataset = pathlib.Path("/nfs/data/students/bsparks/mdti4py-datasets/many-types-4-py-dataset")

mt4py_split_df = pd.read_csv(src_mt4py_dataset / "data/dataset_split.csv", names=["split", "file"])
copied = 0
for file in tqdm(mt4py_split_df[mt4py_split_df.split == "train"].file):
    if not os.path.exists(src_fpath := os.path.join(str(src_mt4py_dataset), file)):
        continue
    
    dest_fpath = os.path.join(str(dest_mt4py_dataset), file)
    
    os.makedirs(os.path.dirname(dest_fpath), exist_ok=True)
    shutil.copy(src_fpath, dest_fpath)

    copied += 1

print(f"Copied {copied} files")

  0%|          | 0/211442 [00:00<?, ?it/s]

Copied 149528 files


In [None]:
!ls /nfs/data/students/bsparks/mdti4py-datasets/many-types-4-py-dataset/repos

In [41]:
!ls /nfs/data/students/bsparks/mdti4py-datasets

many-types-4-py-dataset  tt5-train  typilus


In [None]:
!pip install cd4py
!pip install "typing_extensions==4.5.0"

Number of source code files: 2,604,611                                                                                
[I 2023-09-07 14:03:50.700 ServerApp] Saving file at /experiments/00datasets/cdt4py.ipynb                            │Total number of tokens: 1,268,703,455                                                                                 
[I 2023-09-07 14:05:50.926 ServerApp] Saving file at /experiments/00datasets/cdt4py.ipynb                            │100%|█████████████████████████████████████████████████████████████████████| 2604611/2604611 [05:56<00:00, 7315.22it/s]
[W 2023-09-07 17:52:42.791 ServerApp] WebSocket ping timeout after 119980 ms.                                        │100%|█████████████████████████████████████████████████████████████████████| 2604611/2604611 [12:04<00:00, 3596.45it/s]
[I 2023-09-07 17:52:47.792 ServerApp] Starting buffering for 86de0c3c-afd7-4dac-bf0d-24991e3d37cb:2196c57f-60da-4220-│***********************Vectorize pre-processed source code files using TF-IDF**********************                   
8563-671aa53816b7                                                                                                    │100%|█████████████████████████████████████████████████████████████████████| 2604611/2604611 [21:26<00:00, 2023.87it/s]
[I 2023-09-08 17:42:51.424 ServerApp] 302 GET / (@127.0.0.1) 0.92ms                                                  │**************************Building KNN index and finding nearest neighbors*************************                   
[W 2023-09-08 17:42:53.855 LabApp] Could not determine jupyterlab build status without nodejs                        │100%|█████████████████████████████████████████████████████████████████████| 2604611/2604611 [05:55<00:00, 7336.19it/s]
[I 2023-09-08 17:42:54.006 ServerApp] Connecting to kernel cf8cb073-e5b3-4c1c-b385-c6ccc37efb55.                     │100%|██████████████████████████████████████████████████████████████████████| 2604611/2604611 [50:09<00:00, 865.38it/s]
[I 2023-09-08 17:42:54.058 ServerApp] Connecting to kernel 9fc37da9-cbb6-4566-928d-99f2fee6a601.                     │*******************************Finding exact and near duplicate files******************************                   
[I 2023-09-08 17:42:54.097 ServerApp] Connecting to kernel 86de0c3c-afd7-4dac-bf0d-24991e3d37cb.                     │100%|█████████████████████████████████████████████████████████████████████| 2604611/2604611 [07:46<00:00, 5580.31it/s]
[I 2023-09-08 17:42:54.332 ServerApp] Starting buffering for cf8cb073-e5b3-4c1c-b385-c6ccc37efb55:fc69da8c-75c8-4c77-│*********************Report duplication stats & saving detected duplicate files********************                   
9f70-ba2e4aa42f86                                                                                                    │Number of duplicated files: 2,417,250 (92.81%)                                                                        
[I 2023-09-08 17:42:54.333 ServerApp] Starting buffering for 9fc37da9-cbb6-4566-928d-99f2fee6a601:c7f7880b-5671-4bba-│Number of detected clusters: 184,937                                                                                  
be2d-b039572025d3                                                                                                    │Avg. number of files per clones: 13.07                                                                                
[I 2023-09-08 17:42:54.334 ServerApp] Starting buffering for 86de0c3c-afd7-4dac-bf0d-24991e3d37cb:3b942345-73e7-49ea-│Median number of files per clones: 4.00                                                                               
81d8-45e8e8e83e92                                                                                                    │Duplication ratio: 85.71%                                                                                             

In [36]:
!cd4py --help

usage: cd4py [-h] --p P --od OD --ot OT [--d D] [--th TH] [--k K] [--tr TR]

Code De-Duplication for Python

options:
  -h, --help  show this help message and exit
  --p P       Path to Python projects
  --od OD     Output folder to store detected duplicate files.
  --ot OT     Output folder to store tokenized files.
  --d D       Dimension of TF-IDF vectors [default: 2048].
  --th TH     Threshold to identify duplicate files [default: 0.95].
  --k K       Number of nearest neighbor [default: 10].
  --tr TR     Number trees to build the index. More trees gives higher
              precision but slower [default: 20].


In [None]:
%%time
#!cd4py --p /nfs/data/students/bsparks/mdti4py-dataset-pool --ot /nfs/data/students/bsparks/mdti4py-dataset-pool-toks --od /nfs/data/students/bsparks/mdti4py-datasets-dedup

# Run in bash shell

*********************************Tokenizing Python source code files*********************************
100%|████████████████████████████████████████████| 4/4 [00:00<00:00, 423.72it/s]
Error tokenizing /nfs/data/students/bsparks/mdti4py-dataset-pool/many-types-4-py-dataset/repos/visit-dav/visit-deps/windowsbuild/MSVC2017/python/3.7.7/Lib/test/bad_coding2.py because encoding problem for '/nfs/data/students/bsparks/mdti4py-dataset-pool/many-types-4-py-dataset/repos/visit-dav/visit-deps/windowsbuild/MSVC2017/python/3.7.7/Lib/test/bad_coding2.py': utf-8
Error tokenizing /nfs/data/students/bsparks/mdti4py-dataset-pool/many-types-4-py-dataset/repos/visit-dav/visit-deps/windowsbuild/MSVC2017/python/3.7.7/Lib/test/bad_coding.py because unknown encoding for '/nfs/data/students/bsparks/mdti4py-dataset-pool/many-types-4-py-dataset/repos/visit-dav/visit-deps/windowsbuild/MSVC2017/python/3.7.7/Lib/test/bad_coding.py': uft-8
Error tokenizing /nfs/data/students/bsparks/mdti4py-dataset-pool/many-types-4

In [134]:
from dpu_utils.utils.dataloading import load_jsonl_gz
from libcst import codemod

import json, random, collections
import pprint

# Get size of each project
projects_by_size = collections.Counter({
    project: len(codemod.gather_files([project]))
    for project in tqdm(dataset.project_iter(), total=len(cdt4py_author_repos))
})

0it [00:00, ?it/s]

In [207]:
# Collect only cdt4py files
cdt4py_clusters = list[dict]()
for cluster in load_jsonl_gz("/nfs/data/students/bsparks/mdti4py-datasets-dedup/duplicates.jsonl.gz"):
    no_sites = list()
    for file in cluster:
        if all(forbidden not in file for forbidden in ["site-packages", "Lib", "lib", "Scripts"]):
            no_sites.append(file)
    
    pathed = map(pathlib.Path, no_sites)
    # print(pathed)
    cdt4py_files = list(filter(lambda p: p.is_relative_to(cdt4py_path), pathed))
    if not cdt4py_files:
        continue

    segmented_files = collections.defaultdict[tuple, list](list)
    for fpath in cdt4py_files:
        from_dataset = fpath.relative_to(cdt4py_path)
        category, author, user, *_ = from_dataset.parts
        key = (category, author, user)
        segmented_files[key].append(fpath)
    
    cdt4py_clusters.append(segmented_files)

In [208]:
#print(next(iter(projects_by_size)))
cdt4py_clusters_wip = cdt4py_clusters[:]
cd4py_dedupped = list[pathlib.Path]()

for project, _ in tqdm(projects_by_size.most_common(), total=len(projects_by_size)):
    from_dataset = project.relative_to(cdt4py_path)
    category, author, user, *_ = from_dataset.parts
    key = (category, author, user)

    redundant = set[int]()
    for i, cdt4py_cluster in enumerate(cdt4py_clusters_wip):
        if cluster_files := cdt4py_cluster.get(key):
            redundant.add(i)
            cd4py_dedupped.append(random.choice(cluster_files).relative_to(cdt4py_path))

    if redundant:
        cdt4py_clusters_wip = [cluster for i, cluster in enumerate(cdt4py_clusters_wip) if i not in redundant]

  0%|          | 0/8140 [00:00<?, ?it/s]

In [209]:
cd4py_dedupped = list(map(str, cd4py_dedupped))
print(cd4py_dedupped[:3])

files_by_segment = (
    pd.Series(cd4py_dedupped, name="file").str \
    .split(pat=os.sep, n=3, expand=True) \
    .rename(columns=dict(enumerate(["category", "user", "repository", "file"])))
)
files = pd.concat([
    files_by_segment[["category", "user", "repository"]].apply(os.sep.join, axis=1).rename("prefix"),
    files_by_segment["file"]
], axis="columns")
    
display(files.head(n=20))

['numpy/rayhaneHamoumi/arduino/python/DistanceSensor_Python.py', 'numpy/Peipeixuan/muxing-crowdfunding/mainapp/models.py', 'numpy/Peipeixuan/muxing-crowdfunding/proj02/settings.py']


Unnamed: 0,prefix,file
0,numpy/rayhaneHamoumi/arduino,python/DistanceSensor_Python.py
1,numpy/Peipeixuan/muxing-crowdfunding,mainapp/models.py
2,numpy/Peipeixuan/muxing-crowdfunding,proj02/settings.py
3,numpy/Peipeixuan/muxing-crowdfunding,mainapp/migrations/0005_auto_20200829_1456.py
4,numpy/Peipeixuan/muxing-crowdfunding,mainapp/migrations/0004_auto_20200829_1442.py
5,numpy/Peipeixuan/muxing-crowdfunding,manage.py
6,numpy/Peipeixuan/muxing-crowdfunding,mainapp/migrations/0002_auto_20200824_1438.py
7,flask/kichappa/QC,mpl_toolkits/axisartist/axisline_style.py
8,flask/kichappa/QC,pygments/lexers/ncl.py
9,flask/kichappa/QC,pygments/styles/colorful.py


In [210]:
print(files["prefix"].nunique())
print(files["prefix"].value_counts().head())
print(files["prefix"].value_counts().sum())

3582
prefix
flask/kichappa/QC                  12593
numpy/sviete/AIS-home-assistant     4672
flask/kfserving/kfserving           2800
flask/gyhd/python_study             2296
flask/brycepg/pylint-corpus         2040
Name: count, dtype: int64
129369


In [185]:
from sklearn.model_selection import train_test_split

weighted = files["prefix"].value_counts(normalize=True).cumsum()
validation_repositories, test_repositories = weighted[weighted < 0.8], weighted[weighted >= 0.8]

validation_split = files[files["prefix"].isin(validation_repositories.index)]
test_split = files[files["prefix"].isin(test_repositories.index)]

validation_files = validation_split.apply(os.sep.join, axis=1)
test_files = test_split.apply(os.sep.join, axis=1)

print(validation_files.shape[0] + test_files.shape[0])

156096


In [184]:
with (pathlib.Path("/nfs/data/students/bsparks/mdti4py-datasets-dedup") / "deduplicated.json").open("w") as f:
    json.dump(cdt4py_dedupped, f)

with (pathlib.Path("/nfs/data/students/bsparks/mdti4py-datasets-dedup") / "validation.json").open("w") as f:
    json.dump(validation_files.tolist(), f)

with (pathlib.Path("/nfs/data/students/bsparks/mdti4py-datasets-dedup") / "test.json").open("w") as f:
    json.dump(test_files.tolist(), f)

In [None]:
test_set = dataset.test_set()
print(len(test_set))

In [None]:
new_repos = []
for cdt4py_repo in (CDT4PyFlaskRepo, CDT4PyNumpyRepo):
    downloaded_repos = []
    for test_repo in tqdm(test_set, desc=f"{cdt4py_repo.__qualname__}"):
        repo = cdt4py_repo(dataset.author_repo(test_repo))
        if repo.repo_dir(dataset.dataset_root).is_dir():
            downloaded_repos.append(repo)

    # for r in tqdm.tqdm(downloaded_repos, desc=str(cdt4py_repo)):
    #    r.read_last_update(DATA_FOLDER)
    
    new_domain_repos = [r for r in downloaded_repos if "typeshed" not in r.name and "stub" not in r.name]
    logger.info(f"{cdt4py_repo.__qualname__}: {len(new_domain_repos)} / {len(downloaded_repos)} are not related to stubbing")
    
    new_repos.extend(new_domain_repos)

In [None]:
#loc_limit = 50000

acceptable_repos = []
all_repos = []
for rep in tqdm(new_repos):
    try:
        loc = rep.count_lines_of_code(DATA_FOLDER)
        # if loc < loc_limit:
        acceptable_repos.append(rep)
    except UnicodeDecodeError:
        # nothing we can do
        logger.warning(f"{rep.authorname()} does not pass due to encoding error")
    except Exception as e:
        logger.warning(f"{rep.authorname()} does not pass", exc_info=True)

    else:
        all_repos.append(rep)

In [None]:
print(
    f"{len(acceptable_repos)}/{len(all_repos)} repos pass readability checks."
)

In [None]:
test_repos = acceptable_repos[:]

In [None]:
# Bin by lines of code
import pandas as pd

def loc_binning(repos: list[GitRepo], title: str, kloc = loc_limit // 1000) -> None:
    LOC_BINS = [loc * 1000 for loc in range(0, kloc + 1, 4)]
    repo_loc = pd.DataFrame(
        [(repo.authorname(), repo.lines_of_code) for repo in repos],
        columns=["Repository", "Lines of Code"]
    )
    repo_loc.plot.hist(ylabel="Frequency", bins=LOC_BINS, title=title)

In [None]:
print(len(all_repos))
loc_binning(all_repos, title="All Test Repositories", kloc = int(1e6))

In [None]:
print(len(test_repos))
loc_binning(test_repos, title="Sub 50kLOC Test Repositories")

In [None]:
import operator

def count_repo_annots(repo: GitRepo) -> tuple[GitRepo, dict] | None:
    try:
        annotations = repo.collect_annotations(DATA_FOLDER)
        if repo.n_type_annots / rep.lines_of_code > 0.05:
            return repo, annotations
    except Exception:
        logger.warning(f"Failed to count annotations for {repo.name}")
        return None

In [None]:
from concurrent.futures import ProcessPoolExecutor, as_completed

with ProcessPoolExecutor(max_workers=8) as executor:
    fs = [executor.submit(count_repo_annots, r) for r in small_repos]
    repo2annotations = [f.result() for f in tqdm(as_completed(fs), total=len(fs))]

repo2annotations: list[tuple[CDT4PyRepo, dict]] = [r for r in repo2annotations if r is not None]
useful_repos: list[CDT4PyRepo] = list(map(operator.itemgetter(0), repo2annotations))

logger.info(
    f"{len(useful_repos)}/{len(small_repos)} repos are parsable, have enough portions of type annotations"
)

del repo2annotations

In [None]:
# Bin by relative annotation count
def type_slots_filled(repos: list[GitRepo], title: str) -> None:
    anno_df = pd.DataFrame(
        [(repo.authorname(), repo.n_type_annots / repo.n_type_places * 100) for repo in repos],
        columns=["Repository", "Annotation Frequency"]
    )
    bins = [x for x in range(0, 100 + 1, 5)]
    anno_df.plot.hist(ylabel="Frequency", bins=bins, title=title)

In [None]:
len(useful_repos)

In [None]:
type_slots_filled(useful_repos, title="Percentile of Type Slots Filled")