In [None]:
import psycopg2
import pandas as pd
import pandas.io.sql as pdsql
import os

# Connection settings are read from the environment so that credentials do not
# have to live in the notebook; every fallback reproduces the value that used to
# be hard-coded here, so an unconfigured local setup behaves as before.
# TODO: change the password of user `sa` to `sa` (it is empty by default, but an
# empty password is not allowed by psycopg2).
conn = psycopg2.connect(
    dbname=os.environ.get("BCB_DB_NAME", "bcb"),
    user=os.environ.get("BCB_DB_USER", "sa"),
    password=os.environ.get("BCB_DB_PASSWORD", "sa"),
    host=os.environ.get("BCB_DB_HOST", "localhost"),
    port=os.environ.get("BCB_DB_PORT", "5435"),
)

In [None]:
# Load every row of the CLONES table into a DataFrame.
cur = conn.cursor()
query = 'SELECT FUNCTION_ID_ONE, FUNCTION_ID_TWO, FUNCTIONALITY_ID, SYNTACTIC_TYPE, SIMILARITY_LINE FROM CLONES'
# To look at a single syntactic type only, a WHERE SYNTACTIC_TYPE=<n> clause
# can be appended to the query above.
clone_pairs = pd.read_sql_query(sql=query, con=conn)
clone_pairs.head()

In [None]:
def compute_type(row):
    """Return the clone type (as an ``int``) for one clone-pair row.

    ``row`` must support lookup by the keys ``"syntactic_type"`` and
    ``"similarity_line"``. Rows whose syntactic type is not 3 keep that type;
    rows of syntactic type 3 are reported as type 4 when their line similarity
    is below 0.5 and as type 3 otherwise.
    """
    syntactic_type = row["syntactic_type"]
    # Only syntactic type 3 is split further by similarity.
    if not (syntactic_type == 3):
        return int(syntactic_type)
    return 4 if row["similarity_line"] < 0.5 else 3
    
# Derive the clone type of every pair row by row and store it as an integer column.
clone_pairs["type"] = clone_pairs.apply(compute_type, axis=1).astype(int)

clone_pairs.head()

In [None]:
# Sorted array of the distinct functionality ids that occur among the clone pairs.
function_types = (
    clone_pairs["functionality_id"]
    .drop_duplicates()
    .sort_values()
    .to_numpy()
)
print(function_types)

In [None]:
# Count the clone pairs of every (functionality, type) combination with a single
# grouped pass over the frame instead of re-filtering the whole frame once per
# combination. The result maps (functionality_id, type) -> number of pairs.
pair_counts = clone_pairs.groupby(["functionality_id", "type"]).size().to_dict()

# One record per functionality and per type 1..4; combinations that never occur
# are kept with a count of 0 so the later pivot yields a complete table.
tvals = [
    (ft, t, int(pair_counts.get((ft, t), 0)))
    for ft in function_types
    for t in range(1, 5)
]

# Naming the columns at construction also works when there are no records at all.
clones_by_tf = pd.DataFrame(tvals, columns=["functionality", "type", "count"])
clones_by_tf.head()

In [None]:
# matplotlib has to be imported in this cell: plt.show() is called below, and no
# earlier cell imports it, so a fresh "Restart & Run All" would otherwise stop
# here with a NameError.
import matplotlib.pyplot as plt

# Overview of the long-format counts on a logarithmic x axis.
clones_by_tf.plot.barh(y="count",  x="functionality", label="type", logx=True, stacked=True, figsize=(50,50))

# Wide table: one row per functionality, one column per clone type.
cbtf_pivot = clones_by_tf.pivot(index="functionality", columns="type", values="count")
print(cbtf_pivot)

# The same stacked chart is drawn twice with different x-axis upper bounds:
# first the wide range, then a much narrower one.
for upper_bound in (1500000, 50000):
    ax = cbtf_pivot.plot.barh(stacked=True, logx=False, grid=True, width=0.95, figsize=(6,15))
    ax.set_xlim(0, upper_bound)
    plt.show()

In [None]:
def get_unique_code_ids(ds):
    """Collect the distinct snippet ids that take part in clone pairs of each type.

    Parameters
    ----------
    ds : pandas.DataFrame
        Must provide the columns ``function_id_one``, ``function_id_two`` and
        ``type``. Each ``type`` value is converted with ``int()`` and used as an
        index into the result list, so it is expected to be 1, 2, 3 or 4.

    Returns
    -------
    list
        Five entries: index 0 is ``None`` (unused), indices 1 to 4 hold the set
        of ids that appear on either side of a pair of that type.

    Side effects
    ------------
    Prints the number of distinct ids per type, followed by the number of ids
    seen in type-4 pairs that do not also appear in type-1, type-1/2 and
    type-1/2/3 pairs respectively.
    """
    code_ids_by_ct = [None, set(), set(), set(), set()]

    # Walk only the three required columns in lockstep. Compared with
    # iterrows() this avoids building a Series for every row and keeps the ids
    # in their own dtype instead of the row-wide common dtype.
    for id1, id2, ctype in zip(ds["function_id_one"], ds["function_id_two"], ds["type"]):
        ids_of_type = code_ids_by_ct[int(ctype)]
        ids_of_type.add(id1)
        ids_of_type.add(id2)

    for i in range(1, 5):
        print("T-%s: %s different snippets" % (i, len(code_ids_by_ct[i])))

    t1_ids, t2_ids, t3_ids, t4_ids = code_ids_by_ct[1:]
    print("T4 - T1 snippets: %s different snippets" % len(t4_ids - t1_ids))
    print("T4 - T1+2 snippets: %s different snippets" % len(t4_ids - t2_ids - t1_ids))
    print("T4 - T1+2+3 snippets: %s different snippets" % len(t4_ids - t3_ids - t2_ids - t1_ids))
    return code_ids_by_ct
# Gather the distinct snippet ids per clone type over all clone pairs; the call
# also prints the per-type summary.
cids_by_ct = get_unique_code_ids(clone_pairs)


In [None]:
# Report how many clone pairs fall into each of the four clone types.
pair_types = clone_pairs["type"]
for clone_type in (1, 2, 3, 4):
    pair_total = int((pair_types == clone_type).sum())
    print("Number of T-%s clone pairs: %s" % (clone_type, pair_total))

In [None]:
# Load every row of the FALSE_POSITIVES table (the non-clone pairs) into a DataFrame.
cur = conn.cursor()
query = 'SELECT FUNCTION_ID_ONE, FUNCTION_ID_TWO, FUNCTIONALITY_ID, SYNTACTIC_TYPE, SIMILARITY_LINE FROM FALSE_POSITIVES'
# To look at a single syntactic type only, a WHERE SYNTACTIC_TYPE=<n> clause
# can be appended to the query above.
non_clones = pd.read_sql_query(sql=query, con=conn)
non_clones.head()

In [None]:
# Recompute the sorted array of distinct functionality ids; it is taken from
# the clone pairs only, not from the non-clone pairs.
function_types = (
    clone_pairs["functionality_id"]
    .drop_duplicates()
    .sort_values()
    .to_numpy()
)
print(function_types)

In [None]:
# For every functionality, count the snippets that occur only in clone pairs,
# only in non-clone pairs, or in both, as (functionality, label, count) records.
counts_by_ft = []
for ft in function_types:
    clone_rows = clone_pairs.loc[clone_pairs["functionality_id"] == ft]
    non_clone_rows = non_clones.loc[non_clones["functionality_id"] == ft]

    # A snippet counts for a table if it appears on either side of a pair.
    clone_ids = set(clone_rows["function_id_one"]) | set(clone_rows["function_id_two"])
    non_clone_ids = set(non_clone_rows["function_id_one"]) | set(non_clone_rows["function_id_two"])

    n_shared = len(clone_ids & non_clone_ids)
    counts_by_ft.extend([
        (ft, "non-clones snippets", len(non_clone_ids) - n_shared),
        (ft, "shared snippets", n_shared),
        (ft, "clones snippets", len(clone_ids) - n_shared),
    ])

In [None]:
import matplotlib.pyplot as plt
#g = exp1_results.groupby(["visualization", "algorithm"]).mean()
#g.plot.barh(figsize=(15,8))
#plt.title("", fontweight="bold")
#ax = plt.gca()
#ax.yaxis.grid(True)
#ax.set_yticks([x*0.1 for x in range(10)])
#plt.show()

# Long-format table of the per-functionality snippet counts. The columns are
# named at construction, so no intermediate integer-labelled frame is created
# and an empty record list yields an empty frame instead of a length-mismatch error.
ft_snip_counts = pd.DataFrame(counts_by_ft, columns=["functionality", "type", "count"])
ft_snip_counts.head(5)




In [None]:
import matplotlib.pyplot as plt
# Wide table: one row per functionality, one column per snippet category,
# with the columns put into the order in which the bars are stacked.
tfsc_pivot = ft_snip_counts.pivot(index="functionality", columns="type", values="count")
tfsc_pivot = tfsc_pivot.loc[:, ['clones snippets', 'shared snippets', 'non-clones snippets']]
print(tfsc_pivot)

# Draw the stacked chart twice: once with the automatic x range and once
# limited to 0..1750.
for x_limits in (None, (0, 1750)):
    ax = tfsc_pivot.plot.barh(stacked=True, logx=False, grid=True, width=0.95, figsize=(6,15))
    if x_limits is not None:
        ax.set_xlim(*x_limits)
    plt.show()

In [None]:
# Show the column index of the pivoted snippet-count table.
tfsc_pivot.columns