In [1]:
import numpy as np
import pandas as pd
import time, json, random
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN, KMeans
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import pairwise_distances
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from scipy import stats
import matplotlib.pyplot as plt

RANDOM_STATE = 3


In [2]:
BORROW_CHECKER_TOKENS = json.load(open("./borrow_checker.json", "r")).keys()
DS = pd.read_csv("./borrow_only.csv", sep=',', header=[0])
X_data = DS.drop(columns=['file', 'scope'])
X_data = X_data.drop_duplicates()
X_data = X_data.drop(columns=['proj', 'commit', 'desc'])
for token in BORROW_CHECKER_TOKENS:
    if(X_data[token].max() != 0):
        X_data[token] = X_data[token] / X_data[token].max()
X_data = X_data[X_data["UseTree"] == 0]
X_data = X_data[X_data["ItemUse"] == 0]
X_data = X_data[X_data["UseGlob"] == 0]
X_data = X_data[X_data["UseList"] == 0]
X_data = X_data[X_data["UsePath"] == 0]
X_data = X_data[X_data["ItemForeignMod"] == 0]
X_data = X_data[X_data["ItemMod"] == 0]
print(len(X_data))


27143


In [3]:
dbscan_data = []
Z = [5,7,9]
E = [0.01, 0.001, 0.0001]
for z in Z:
    for e in E:
        db = DBSCAN(eps=e, min_samples=z).fit(X_data)
        core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
        core_samples_mask[db.core_sample_indices_] = True
        labels = db.labels_
        dbscan_labels = pd.Series(labels, name="label")

        n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
        n_noise_ = list(labels).count(-1)

        print("Epsilon", e)
        print("Z", z)
        print("Estimated number of clusters: %d" % n_clusters_)
        print("Estimated number of noise points: %d" % n_noise_)
        dbscan_data.append((z, e, n_clusters_, n_noise_))
print(dbscan_data)

Epsilon 0.01
Z 5
Estimated number of clusters: 160
Estimated number of noise points: 18755
Epsilon 0.001
Z 5
Estimated number of clusters: 102
Estimated number of noise points: 25909
Epsilon 0.0001
Z 5
Estimated number of clusters: 61
Estimated number of noise points: 26711
Epsilon 0.01
Z 7
Estimated number of clusters: 77
Estimated number of noise points: 19324
Epsilon 0.001
Z 7
Estimated number of clusters: 47
Estimated number of noise points: 26212
Epsilon 0.0001
Z 7
Estimated number of clusters: 23
Estimated number of noise points: 26910
Epsilon 0.01
Z 9
Estimated number of clusters: 51
Estimated number of noise points: 19642
Epsilon 0.001
Z 9
Estimated number of clusters: 26
Estimated number of noise points: 26381
Epsilon 0.0001
Z 9
Estimated number of clusters: 11
Estimated number of noise points: 27001
[(5, 0.01, 160, 18755), (5, 0.001, 102, 25909), (5, 0.0001, 61, 26711), (7, 0.01, 77, 19324), (7, 0.001, 47, 26212), (7, 0.0001, 23, 26910), (9, 0.01, 51, 19642), (9, 0.001, 26, 2

In [4]:
Z = 5
e = 0.001
db = DBSCAN(eps=e, min_samples=Z).fit(X_data)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_
dbscan_labels = pd.Series(labels, name="label")

n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

print("Epsilon", e)
print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)

Epsilon 0.001
Estimated number of clusters: 102
Estimated number of noise points: 25909


In [5]:
X_df = pd.DataFrame(data=X_data)
dbscan_labels = pd.Series(labels, name="label", index=X_df.index)
full_dbscan_data = pd.concat([X_df, dbscan_labels], axis = 1)
full_dbscan_clusters = []
for label in range(n_clusters_):
    cluster = full_dbscan_data[full_dbscan_data["label"] == label]
    projects = set()
    for index1, row in cluster.iterrows():
        projects.add(DS.loc[index1, 'proj'])
    if len(projects) > 1:
        full_dbscan_clusters.append(cluster)
print("Clusters with cross-project bugs:",len(full_dbscan_clusters))

Clusters with cross-project bugs: 49


In [6]:
## Sampling
full_dbscan_clusters.sort(key=len, reverse=True)
DPs = []
for ID in range(1, len(full_dbscan_clusters)):
    samples = full_dbscan_clusters[ID-1]
    projs = set()
    for index1, row in samples.iterrows():
        DPs.append(row)
m = 100
sample = None
for i in range(1000):
    random.seed(RANDOM_STATE+i)
    df = pd.DataFrame(DPs)
    df_sample = pd.DataFrame(random.sample(DPs, k=30))
    sample_error = abs((df.mean(axis = 0) - df_sample.mean(axis = 0)).mean())
    if m > sample_error:
        m = sample_error
        sample = df_sample
print(m)
for index1, row in sample.iterrows():
    print("https://github.com/"+DS.loc[index1, 'proj']+"/commit/"+DS.loc[index1, 'commit']+" "+DS.loc[index1, 'file']+" "+str(DS.loc[index1, 'scope']))

1.8224230098277807e-05
https://github.com/tauri-apps/tauri/commit/85674d4fe6a2453ce969d6a75b6528f3c89bf1d1 ios_bundle.rs 1-Fn
https://github.com/meilisearch/meilisearch/commit/a45cc4b6189c2eb40758b9749c3d1c8b19b88d40 sum_of_words_position.rs 0-Fn
https://github.com/alacritty/alacritty/commit/5f7885749c4d7e48869b1fc0be4d430601cdbbfa grid.rs 13-Impl
https://github.com/tokio-rs/tokio/commit/8198ef38814c45f9dc02fcbf826225b5cf32a6bb harness.rs 1-Impl
https://github.com/starship/starship/commit/af43aeefba1cc12044f05a09a8b6f0ae309a556c time.rs 1-Fn
https://github.com/swc-project/swc/commit/9ba68c68639916b48b79ad831e3cf69d7b0c8051 typescript.rs 6-Fn
https://github.com/swc-project/swc/commit/716bfe05b298c750c08e56264485e48072539369 typescript.rs 1-Impl
https://github.com/AppFlowy-IO/AppFlowy/commit/420b8ca05dcc27a50158bbddb08ec78b05cea0c4 grid.rs 19-Impl
https://github.com/yewstack/yew/commit/c9deba05f13eac459f00198f878f272fc369516e mod.rs 0-Fn
https://github.com/AppFlowy-IO/AppFlowy/commit/7cc

In [8]:
full_dbscan_clusters.sort(key=len, reverse=True)
ID = 0
for cluster in full_dbscan_clusters:
    ID += 1
    print(len(cluster), "ID:", ID)
    samples = cluster.sample(min(50, len(cluster)), random_state=RANDOM_STATE)
    common_set = set()
    for index1, row in samples.iterrows():
        print("https://github.com/", end="")
        print(DS.loc[index1, 'proj'], end="/commit/")
        print(DS.loc[index1, 'commit'], end=" ")
        print(DS.loc[index1, 'file'], end=" ")
        print(DS.loc[index1, 'scope'])        
        print(DS.loc[index1, 'desc'])        
        row = row[:len(row)-1]
        row = row[row != 0]
        nonTerminals = set(row.to_dict().keys())
        print(nonTerminals)
        print("------")
        if len(common_set) == 0:
            common_set = nonTerminals
        else:
            common_set = common_set & nonTerminals
    for item in sorted(list(common_set)):
        print(item)
    print("=======\n")


168 ID: 1
https://github.com/meilisearch/meilisearch/commit/dc9ca2ebc99406a9422bdcbb1460d422f5b8523f documents_deletion.rs 1-Fn
&
{'ExprPath', 'PathSegment', 'ItemFn', 'Ident', 'ExprReference', 'mutability', 'ExprForLoop', 'Path', 'Expr'}
------
https://github.com/xi-editor/xi-editor/commit/7cd12f4ef6fb94cc78c521a853bbca8821482454 event_context.rs 1-Impl
mut-&
{'ExprPath', 'PathSegment', 'ImplItemMethod', 'Ident', 'ExprReference', 'mutability', 'ExprMethodCall', 'ExprClosure', 'Path', 'ItemImpl'}
------
https://github.com/AppFlowy-IO/AppFlowy/commit/bba8f8ae018401c461788c98f39a6d4370adfb62 workspace_sql.rs 1-Impl
&
{'ExprPath', 'ExprCall', 'PathSegment', 'ImplItemMethod', 'Ident', 'Local', 'ExprReference', 'mutability', 'Some', 'Path', 'ItemImpl'}
------
https://github.com/starship/starship/commit/af43aeefba1cc12044f05a09a8b6f0ae309a556c time.rs 1-Fn
&
{'ExprPath', 'ExprCall', 'PathSegment', 'ItemFn', 'Ident', 'ExprReference', 'mutability', 'Path', 'ExprIf', 'Expr'}
------
https://gith