In [1]:
import numpy as np
import igraph as ig
import pandas as pd
from numpy.linalg import norm

def l1_row(row):
    """Scale ``row`` so that the absolute values of its entries sum to 1.

    A row whose L1 norm is zero is handed back unchanged, which avoids a
    division by zero.
    """
    total = norm(row, ord=1)
    # Nothing to rescale when every entry is zero.
    return row if total == 0 else row / total

def norm_rows(df):
    """Apply ``l1_row`` to every row of ``df`` and return the result."""
    return df.apply(l1_row, axis='columns')

# Project metadata. The row order of this file fixes the order of
# `proj_ids` and `labels`, which are used to label everything built below.
proj_meta = pd.read_csv(
    'data/cargo-dependencies-meta.csv',
    delimiter=',',
    dtype={'ID': 'Int64'},
).dropna()
proj_ids = proj_meta['ID'].tolist()
labels = proj_meta['Name'].tolist()
# Accounts are not part of the label list yet:
# acc = ['a1', 'a2', 'a3']
# labels = np.concatenate((proj,acc), axis=0)

proj_meta.head()

Unnamed: 0,ID,Name,Platform
0,30742,acacia,Cargo
1,30745,aio,Cargo
2,30746,advapi32-sys,Cargo
3,30747,alfred,Cargo
4,30748,algebloat,Cargo


In [2]:
# Dependency edge list with FROM_ID / TO_ID columns.
deps = pd.read_csv('data/cargo-dependencies.csv', delimiter=',')
deps.head()

Unnamed: 0,FROM_ID,TO_ID
0,30742,31187
1,30742,31296
2,30742,31295
3,30742,31085
4,30742,428702


In [7]:
def get_contribs(csv_path, proj_names):
    """Load the contribution records that belong to the given projects.

    Reads ``csv_path``, drops rows with missing values, keeps the rows whose
    ``Name`` is in ``proj_names`` and merges them with the module-level
    ``proj_meta`` frame (pandas joins on the columns both frames share).
    """
    records = pd.read_csv(csv_path, delimiter=",").dropna()
    is_known_project = records['Name'].isin(proj_names)
    return records[is_known_project].merge(proj_meta)

# Contribution records restricted to the projects listed in `labels`.
contrib = get_contribs('data/gh-contributions.csv', labels)
# Distinct maintainer identifiers; this list labels the contributor columns below.
contributers = contrib['maintainer'].unique().tolist()
contrib.head()

  if (yield from self.run_code(code, result)):


Unnamed: 0,maintainer,repo,contributions,Name,ID,Platform
0,fe08d3c717adf2ae63592e4c9aec6e3e404d8e3e@davem...,Asus-T100/kernel,53029,kernel,1748045,Cargo
1,fe08d3c717adf2ae63592e4c9aec6e3e404d8e3e@davem...,julianjmaurer/kernel,52489,kernel,1748045,Cargo
2,69652caca27c8b940640ad396ab71f93cacec34f@linux...,Asus-T100/kernel,42306,kernel,1748045,Cargo
3,fe08d3c717adf2ae63592e4c9aec6e3e404d8e3e@davem...,markgross/kernel,42034,kernel,1748045,Cargo
4,69652caca27c8b940640ad396ab71f93cacec34f@linux...,julianjmaurer/kernel,41961,kernel,1748045,Cargo


In [4]:
# create the contributions matrix
def adj_from_contrib(contrib, contributers, all_ids):
    """Build the project x contributor matrix of contribution counts.

    Parameters
    ----------
    contrib : pandas.DataFrame
        Must provide the columns ``ID``, ``maintainer`` and ``contributions``.
    contributers : list
        Column labels of the result, in the order given.
    all_ids : list
        Row labels of the result, in the order given.

    Returns
    -------
    pandas.DataFrame
        Zero-initialised frame indexed by ``all_ids`` with one column per
        entry of ``contributers``; each cell holds the ``contributions``
        value of the matching ``contrib`` row.

    Rows of ``contrib`` whose ``ID`` or ``maintainer`` is not one of the
    requested labels are skipped: assigning them through ``.at`` would
    silently append a NaN-filled row/column and change the matrix shape.
    """
    adj = pd.DataFrame(0, columns=contributers, index=all_ids)
    known_ids = set(all_ids)
    known_contributers = set(contributers)
    # Plain column iteration avoids building a Series per row (iterrows).
    triples = zip(contrib['ID'], contrib['maintainer'], contrib['contributions'])
    for proj_id, maintainer, count in triples:
        if proj_id not in known_ids or maintainer not in known_contributers:
            continue
        # NOTE(review): a repeated (ID, maintainer) pair overwrites the
        # earlier value, i.e. the last row wins -- confirm this is intended
        # rather than summing.
        adj.at[proj_id, maintainer] = count
    return adj

adj_contrib = adj_from_contrib(contrib, contributers, proj_ids)

# Optionally persist the contribution matrix (row labels are written too):
# adj_contrib.to_csv('data/cargo-contrib-adj.csv')

adj_contrib.head()

Unnamed: 0,fe08d3c717adf2ae63592e4c9aec6e3e404d8e3e@davemloft.net,69652caca27c8b940640ad396ab71f93cacec34f@linux-foundation.org,4645f7897fd33786a2ee1264d590b3c400559d85@linuxfoundation.org,3697fa5d1b229f59097c7d14eeabccb8d36f99e1@localhost,4645f7897fd33786a2ee1264d590b3c400559d85@suse.de,2a53bac7a5d324865ef46ec4c38b2c0fba1456b4@tuxdriver.com,ad86ba2154032c9f55743a190faa2459a9d61d42@redhat.com,9dbbbf0688fedc85ad4da37637f1a64b8c718ee2@elte.hu,010521127f513270fe503d86ab8316ac5147f4b7@codeaurora.org,69652caca27c8b940640ad396ab71f93cacec34f@g5.osdl.org,...,2424f7e375ac81b78b3d01033b1a56d81924e5ae@gmail.com,b1c1d8736f20db3fb6c1c66bb1455ed43909f0d8@daiyang.de,504d327c2b1839a7d142d2c4d04ab89ddd38de1c@dacpac.com,8713c753354f70c8ab3f92819c83c11ddc65fcae@cdmckay.org,a17fed27eaa842282862ff7c1b9c8395a26ac320@mikepedersen.dk,251e43ad0c4bcd54011ba199850121588b94175e@gmail.com,f06251451de5eebca2cd34a4fd7936af28099c5e@users.noreply.github.com,d9f6a08f89e436fe15af44a948d5dd38dac9ea58@gmail.com,9a34a276a8450e86198f6412ebcc40b1fe8156af@neo9.fr,7478dba6682c631c29bb5d8db3571688fdd8aa67@shike2.com
30742,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30745,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30746,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30747,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30748,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# ignore for now: create the maintainers matrix (for now empty)
# adj_maintain = pd.DataFrame(0, columns=contributers, index=proj_ids)

In [5]:
# create the dependency matrix
def adj_via_from_to(from_to, all_ids):
    """Build a square 0/1 adjacency matrix from an edge list.

    Parameters
    ----------
    from_to : pandas.DataFrame
        Edge list with the columns ``FROM_ID`` and ``TO_ID``.
    all_ids : list
        Node identifiers; used, in this order, as both row and column labels.

    Returns
    -------
    pandas.DataFrame
        ``len(all_ids)`` x ``len(all_ids)`` frame with a 1 at
        ``[FROM_ID, TO_ID]`` for every edge whose two endpoints are both in
        ``all_ids`` and 0 elsewhere.

    Edges with an unknown endpoint are skipped. Checking only ``FROM_ID``
    is not enough: assigning to an unknown ``TO_ID`` through ``.at`` appends
    a new NaN-filled column, so the matrix would stop being square.
    """
    adj = pd.DataFrame(0, columns=all_ids, index=all_ids)
    # Build the lookup once; the set gives constant-time membership tests.
    known_ids = set(all_ids)
    for from_id, to_id in zip(from_to['FROM_ID'], from_to['TO_ID']):
        if from_id in known_ids and to_id in known_ids:
            adj.at[from_id, to_id] = 1
    return adj

adjdf = adj_via_from_to(deps, proj_ids)

# Optionally persist the dependency matrix (row labels are written too):
# adjdf.to_csv('data/cargo-dep-adj.csv')

adjdf.head()

Unnamed: 0,30742,30745,30746,30747,30748,30750,30751,30753,30754,30755,...,3469060,3394154,3405396,3480649,3483337,3483781,3483812,2683436,31398,2682138
30742,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30745,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30746,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30747,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30748,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# create the adjacency matrix from projects and contributions

# ---- parameters that can be tweaked:
# Edge weights of G.
## edges leaving a project: d scales the dependency matrix,
## m the maintainer matrix, c the contribution matrix
d, m, c = 4.0 / 7.0, 2.0 / 7.0, 1.0 / 7.0
## edges leaving an account: m_ scales the transposed maintainer matrix,
## c_ the transposed contribution matrix
m_, c_ = 3.0 / 5.0, 2.0 / 5.0

#  ----------------------------------

# for now without maintainers
def adj_p_and_acc(D, C, M=0):
    """Combine project and account matrices into one row-normalised matrix.

    Parameters
    ----------
    D : pandas.DataFrame
        Project x project dependency matrix.
    C : pandas.DataFrame
        Project x account contribution matrix; its column labels define the
        accounts.
    M : optional
        Placeholder for the maintainer matrix; currently unused.

    Returns
    -------
    pandas.DataFrame
        Block matrix ``[[p_p, p_a], [a_p, 0]]`` passed through ``norm_rows``,
        where ``p_p = d * norm_rows(D)``, ``p_a = c * norm_rows(C)`` and
        ``a_p = c_ * norm_rows(C.T)``. The weights ``d``, ``c`` and ``c_`` are
        read from module level.
    """
    # The account labels come from C itself rather than from a module-level
    # list, so the zero block always lines up with p_a and a_p.
    accounts = C.columns

    p_p = d * norm_rows(D)
    # with maintainers: p_a = m * norm_rows(M) + c * norm_rows(C)
    p_a = c * norm_rows(C)
    # with maintainers:
    # a_p = (m_ * M.transpose()).multiply(norm_rows(C.transpose())) + (c_ * norm_rows(C.transpose()))
    a_p = c_ * norm_rows(C.transpose())
    # Accounts have no edges to other accounts.
    a_a = pd.DataFrame(0, columns=accounts, index=accounts)

    # Stack the two column blocks, then place them side by side.
    to_projects = pd.concat([p_p, a_p], axis=0)
    to_accounts = pd.concat([p_a, a_a], axis=0)
    return norm_rows(pd.concat([to_projects, to_accounts], axis=1))

# use adj matrices from memory
# deps_adj = adj_via_from_to(deps, proj_ids)
# use adj matrices from csv
# index_col=0 restores the row labels. This assumes the files were produced by
# the plain `to_csv(path)` calls earlier in the notebook, which write the index
# as the first column; without it that column would be read as matrix data.
deps_adj = pd.read_csv('data/cargo-dep-adj.csv', delimiter=',', index_col=0)
# Header labels come back as strings; cast them to the index dtype so the
# project columns match the project IDs used as row labels.
deps_adj.columns = deps_adj.columns.astype(deps_adj.index.dtype)
contrib_adj = pd.read_csv('data/cargo-contrib-adj.csv', delimiter=',', index_col=0)

# adj_all = adj_p_and_acc(adjdf, adj_contrib, adj_maintain)
adj_all = adj_p_and_acc(deps_adj, contrib_adj)
adj_all.head()

In [None]:
# Write the combined adjacency matrix to CSV; no `index` argument is passed,
# so the row labels are written along with the data.
adj_all.to_csv('data/cargo-all-adj.csv')

In [6]:
# ---- parameters that can be tweaked:

# Originally labelled "damping factors". In this notebook `e_proj` is the
# per-project entry of the `reset` vector passed to personalized_pagerank;
# `e_account` only appears in a commented-out line so far.
e_proj = 0.9
e_account = 0.1

# number of iterations, passed as `niter` to personalized_pagerank
niter = 1000

# -----------------------------------

def g_from_adj(adjdf):
    """Create an igraph graph from the adjacency DataFrame ``adjdf``.

    Every non-zero cell becomes an edge whose ``weight`` attribute is the
    cell value; vertices are labelled with the module-level ``labels`` list.
    """
    matrix = adjdf.values
    # Edge structure only: True wherever the matrix has a non-zero entry.
    has_edge = matrix.astype(bool).tolist()
    graph = ig.Graph.Adjacency(has_edge)
    # NOTE(review): relies on igraph creating edges in the same order in
    # which nonzero() enumerates the cells -- confirm.
    graph.es['weight'] = matrix[matrix.nonzero()]
    graph.vs['label'] = labels
    return graph

def page_rank(g):
    """Compute personalized PageRank scores for the vertices of ``g``.

    Parameters
    ----------
    g : igraph.Graph
        Graph whose vertices correspond, in order, to the module-level
        ``proj_ids`` / ``labels`` lists.

    Returns
    -------
    pandas.DataFrame
        Columns ``ID``, ``Name`` and ``page_rank``, one row per project.

    The reset vector gives every project the weight ``e_proj``; ``niter`` is
    forwarded unchanged. If the graph carries a ``weight`` edge attribute
    (``g_from_adj`` sets one) it is passed on as the edge weights, because
    igraph does not pick up edge attributes on its own.
    """
    # with accounts: reset = [e_proj] * len(proj) + [e_account] * len(acc)
    reset = [e_proj] * len(proj_ids)
    # Fall back to an unweighted run when the graph has no 'weight' attribute.
    weights = 'weight' if 'weight' in g.edge_attributes() else None
    scores = g.personalized_pagerank(reset=reset, weights=weights, niter=niter)
    return pd.DataFrame({'ID': proj_ids, 'Name': labels, 'page_rank': scores})

# Graph built from the in-memory dependency matrix `adjdf`
# (pass adjdf_from_csv instead to use the stored data).
g = g_from_adj(adjdf)

# Note: this rebinds the name `page_rank` from the function to its result.
page_rank = page_rank(g)
page_rank_top100 = page_rank.sort_values(by='page_rank', ascending=False).head(100)

In [9]:
# Persist the ranking, highest score first, without the row index.
(
    page_rank
    .sort_values(by='page_rank', ascending=False)
    .to_csv('data/cargo-pagerank.csv', index=False)
)

In [20]:
# Display the 100 rows with the highest page_rank score.
page_rank_top100

Unnamed: 0,ID,Name,page_rank
3563,1369785,num-traits,0.059415
1005,428702,rand,0.035209
518,31533,winapi,0.033656
425,31384,serde,0.026141
242,31115,libc,0.022697
4550,1629953,serde_derive,0.020821
16285,3414109,rustc-std-workspace-core,0.019508
396,31342,rustc-serialize,0.014597
7998,2353652,proc-macro2,0.013991
1542,783030,clippy,0.013728


In [None]:
# Layout and visual weights for plotting the graph.
layout = g.layout("large")
# Vertex size: PageRank score scaled by 100.
vertex_weights = [score * 100 for score in page_rank['page_rank'].tolist()]
adjmat = adjdf.values
# Edge width: non-zero adjacency entries scaled by 10.
edge_weights = [weight * 10 for weight in adjmat[adjmat.nonzero()]]
#ig.plot(g, vertex_size=vertex_weights, vertex_color=(["blue"] * len(proj_ids)), vertex_label_dist=1, edge_width=edge_weights, layout=layout)