In [6]:
import numpy as np
import igraph as ig
import pandas as pd
from numpy.linalg import norm

def l1_row(row):
    """Scale ``row`` so that the absolute values of its entries sum to 1.

    A row whose L1 norm is exactly 0 is handed back unchanged (same object),
    which avoids a division by zero.
    """
    total = norm(row, ord=1)
    return row if total == 0 else row / total

def norm_rows(df):
    """L1-normalise every row of ``df``.

    Each row is divided by the sum of the absolute values of its entries.
    Rows that sum to 0 are left as zeros instead of being divided by zero.
    The result keeps the index and columns of ``df``.

    The division is done in one vectorised step rather than by calling a
    Python function once per row, which matters for the large matrices
    built in this notebook.
    """
    # skipna=False: a NaN in a row makes the whole row NaN, as a plain
    # L1 norm followed by a division would.
    row_sums = df.abs().sum(axis=1, skipna=False)
    # Dividing an all-zero row by 1 keeps it at zero.
    return df.div(row_sums.replace(0, 1), axis=0)

# Project metadata. The row order of this table is the ordering used for
# project IDs and labels everywhere below. Rows with any missing value are dropped.
proj_meta = pd.read_csv(
    'data/cargo-dependencies-meta.csv',
    sep=',',
    dtype={'ID': 'Int64'},
).dropna()
proj_ids = proj_meta['ID'].tolist()
labels = proj_meta['Name'].tolist()
# acc = ['a1', 'a2', 'a3']
# labels = np.concatenate((proj,acc), axis=0)

proj_meta.head()

Unnamed: 0,ID,Name,Platform
0,30742,acacia,Cargo
1,30745,aio,Cargo
2,30746,advapi32-sys,Cargo
3,30747,alfred,Cargo
4,30748,algebloat,Cargo


In [4]:
# Dependency edge list with FROM_ID / TO_ID columns.
deps = pd.read_csv('data/cargo-dependencies.csv', sep=',')
deps.head()

Unnamed: 0,FROM_ID,TO_ID
0,30742,31187
1,30742,31296
2,30742,31295
3,30742,31085
4,30742,428702


In [22]:
# Contribution records; rows with any missing value are dropped.
raw_contrib = pd.read_csv('data/gh-contributions.csv', sep=",").dropna()
# Keep only the records whose 'Name' equals one of the known project names.
rust_contrib = raw_contrib.loc[raw_contrib['Name'].isin(labels)]
rust_contrib.head()

Unnamed: 0,maintainer,repo,contributions,Name
260,fe08d3c717adf2ae63592e4c9aec6e3e404d8e3e@davem...,Asus-T100/kernel,53029,kernel
283,fe08d3c717adf2ae63592e4c9aec6e3e404d8e3e@davem...,julianjmaurer/kernel,52489,kernel
308,fe08d3c717adf2ae63592e4c9aec6e3e404d8e3e@davem...,jigpu/input,51297,input
400,fe08d3c717adf2ae63592e4c9aec6e3e404d8e3e@davem...,Wenzel/kvm,48096,kvm
1081,69652caca27c8b940640ad396ab71f93cacec34f@linux...,Asus-T100/kernel,42306,kernel


In [26]:
# Distinct maintainer identifiers, in order of first appearance; this list
# is used as the account axis of the matrices built below.
contributers = rust_contrib['maintainer'].unique().tolist()
# Attach the project ID / Platform to every contribution record.
# The join key is spelled out so that a column accidentally shared by both
# tables can never silently become part of the join condition.
contrib = rust_contrib.merge(proj_meta, on='Name', how='inner')
contrib.head()

Unnamed: 0,maintainer,repo,contributions,Name,ID,Platform
0,fe08d3c717adf2ae63592e4c9aec6e3e404d8e3e@davem...,Asus-T100/kernel,53029,kernel,1748045,Cargo
1,fe08d3c717adf2ae63592e4c9aec6e3e404d8e3e@davem...,julianjmaurer/kernel,52489,kernel,1748045,Cargo
2,69652caca27c8b940640ad396ab71f93cacec34f@linux...,Asus-T100/kernel,42306,kernel,1748045,Cargo
3,fe08d3c717adf2ae63592e4c9aec6e3e404d8e3e@davem...,markgross/kernel,42034,kernel,1748045,Cargo
4,69652caca27c8b940640ad396ab71f93cacec34f@linux...,julianjmaurer/kernel,41961,kernel,1748045,Cargo


In [29]:
# create the contributions matrix
def adj_from_contrib(contrib, contributers, all_ids):
    """Build the project x contributor matrix of contribution counts.

    Parameters
    ----------
    contrib : DataFrame with 'ID', 'maintainer' and 'contributions' columns.
    contributers : list of maintainer identifiers (matrix columns).
    all_ids : list of project IDs (matrix rows).

    Returns
    -------
    DataFrame indexed by ``all_ids`` with columns ``contributers``; cells with
    no matching record are 0.

    Records whose ID or maintainer is not one of the requested labels are
    skipped. Assigning them through ``.at`` would silently append a new
    row/column and change the shape of the matrix.
    """
    adj = pd.DataFrame(0, columns=contributers, index=all_ids)
    known_ids = set(all_ids)
    known_accounts = set(contributers)
    records = zip(contrib['ID'], contrib['maintainer'], contrib['contributions'])
    for proj_id, account, count in records:
        if proj_id in known_ids and account in known_accounts:
            # NOTE(review): when the same (ID, maintainer) pair occurs more
            # than once the last record wins - confirm whether the counts
            # should be aggregated instead.
            adj.at[proj_id, account] = count
    return adj

# Contribution counts laid out as projects (rows) x contributors (columns).
adj_contrib = adj_from_contrib(
    contrib=contrib,
    contributers=contributers,
    all_ids=proj_ids,
)
adj_contrib.head()

Unnamed: 0,fe08d3c717adf2ae63592e4c9aec6e3e404d8e3e@davemloft.net,69652caca27c8b940640ad396ab71f93cacec34f@linux-foundation.org,8081031fe20c252106f58a6c5b2ce840596f0a83@gnu.org,4645f7897fd33786a2ee1264d590b3c400559d85@linuxfoundation.org,746aced590a184ef4169166e753f7c211e32e5fc@tensorflow.org,a6723cc3f76163bf7adb636a73ac3b0ceb3e6b9b@pobox.com,0ba4af815980827fd28782804f80675b937ef955@gmx.at,e7aaeebdf2ba19688320140447f4e2358a6055d6@nondot.org,3697fa5d1b229f59097c7d14eeabccb8d36f99e1@localhost,7b7a306ffc9c5a3f3ff25b7f8591088725a44427@odileeds.org,...,4f1ea4f09db2aaafb0a92c0b9e57751121ed6647@xiala.net,9e1f59042135645134cacd36170d31d95d24c465@redhat.com,a28d868c774dff7f8f368eb3f092f513ba819c6a@coreos.com,80b24d51bbcd1ba9a2ce6b5fe97e28c44f9ff374@arandomurl.com,55107e193e648a27778fa98736b2e8e24b3cd6e1@cygnus,949b85f5a39b28e610920959d99aa52020044ae1@google.com,2e8175c327a6ef2d99764f30d3e9e7982997195e@kiste.(none),e48e2233792641fdb3ce137e343feb7922da11a6@mawhrin.net,fba2c9f764caacc5f58f566a6af2a5167419c18f@gmail.com,7478dba6682c631c29bb5d8db3571688fdd8aa67@shike2.com
30742,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30745,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30746,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30747,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30748,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
# Maintainer matrix (projects x contributors); for now it is filled with zeros.
adj_maintain = pd.DataFrame(data=0, index=proj_ids, columns=contributers)

In [7]:
# create the dependency matrix
def adj_via_from_to(from_to, all_ids):
    """Build a square 0/1 adjacency matrix from an edge list.

    Parameters
    ----------
    from_to : DataFrame with 'FROM_ID' and 'TO_ID' columns, one edge per row.
    all_ids : list of node IDs; used for both the rows and the columns.

    Returns
    -------
    DataFrame indexed and labelled by ``all_ids`` where cell (FROM_ID, TO_ID)
    is 1 for every listed edge and 0 otherwise.

    Edges with either endpoint outside ``all_ids`` are skipped. Writing an
    unknown TO_ID through ``.at`` would append a column and make the matrix
    non-square.
    """
    adj = pd.DataFrame(0, columns=all_ids, index=all_ids)
    # Build the lookup once instead of materialising the index for every edge.
    known_ids = set(all_ids)
    for src, dst in zip(from_to['FROM_ID'], from_to['TO_ID']):
        if src in known_ids and dst in known_ids:
            adj.at[src, dst] = 1
    return adj

# Project -> project dependency matrix over the known project IDs.
adjdf = adj_via_from_to(from_to=deps, all_ids=proj_ids)
adjdf.head()

Unnamed: 0,30742,30745,30746,30747,30748,30750,30751,30753,30754,30755,...,3469060,3394154,3405396,3480649,3483337,3483781,3483812,2683436,31398,2682138
30742,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30745,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30746,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30747,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30748,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
# create the adjacency matrix from projects and contributions

# ---- parameters that can be tweaked:
# weight variables of edges of G
## project -> project (d) and project -> account (m, c)
d = 4.0 / 7.0  # scales the row-normalised dependency matrix D
m = 2.0 / 7.0  # scales the row-normalised maintainer matrix M
c = 1.0 / 7.0  # scales the row-normalised contribution matrix C
## account -> project
m_ = 3.0 / 5.0  # scales the transposed maintainer matrix
c_ = 2.0 / 5.0  # scales the row-normalised transposed contribution matrix

# damping factors
e_proj = 0.9  # reset weight given to every project vertex in page_rank()
e_account = 0.1  # only referenced from commented-out code in the visible cells

# number of iterations
niter = 1000  # forwarded as `niter` to personalized_pagerank

#  ----------------------------------

def adj_p_and_acc(D, C, M):
    """Assemble the combined project/account adjacency matrix.

    Parameters
    ----------
    D : project x project dependency matrix.
    C : project x account contribution matrix.
    M : project x account maintainer matrix.

    The result is a block matrix whose project rows hold
    ``[d * norm(D) | m * norm(M) + c * norm(C)]`` and whose account rows hold
    ``[account -> project weights | zeros]``. Every row of the assembled
    matrix is L1-normalised once more before it is returned.

    Reads the module-level ``contributers`` list and the weights
    ``d``, ``m``, ``c``, ``m_`` and ``c_``.

    NOTE(review): the recorded run of this cell ended in a MemoryError; the
    dense account x account zero block built here is a likely contributor -
    confirm before scaling up.
    """
    accounts = contributers
    n_accounts = np.size(accounts)
    acc_to_acc = pd.DataFrame(
        np.zeros((n_accounts, n_accounts)), columns=accounts, index=accounts
    )

    # row-normalised contributions seen from the account side
    contrib_by_account = norm_rows(C.transpose())

    proj_to_proj = d * norm_rows(D)
    proj_to_acc = m * norm_rows(M) + c * norm_rows(C)
    acc_to_proj = (m_ * M.transpose()).multiply(contrib_by_account) + (c_ * contrib_by_account)

    # stack the two block columns, then place them side by side
    left_blocks = pd.concat([proj_to_proj, acc_to_proj], axis=0)
    right_blocks = pd.concat([proj_to_acc, acc_to_acc], axis=0)
    return norm_rows(pd.concat([left_blocks, right_blocks], axis=1))

# Combined matrix over projects and accounts.
adj_all = adj_p_and_acc(D=adjdf, C=adj_contrib, M=adj_maintain)
adj_all.head()

MemoryError: 

In [None]:
#Write adjacency matrix data to csv
# adjdf.to_csv('data/cargo-adjdf.csv')
# adjdf_from_csv = pd.read_csv('data/cargo-adjdf.csv', delimiter=',');

In [6]:
def g_from_adj(adjdf):
    """Build an igraph graph from an adjacency DataFrame.

    Every non-zero cell becomes an edge. The non-zero cell values are stored
    in the edge attribute 'weight' and the module-level ``labels`` list in the
    vertex attribute 'label'.
    """
    weights = adjdf.values
    # the graph structure only needs to know which cells are non-zero
    graph = ig.Graph.Adjacency(weights.astype(bool).tolist())
    # assumes igraph creates the edges in the same order in which
    # nonzero() reports the cells - TODO confirm
    graph.es['weight'] = weights[weights.nonzero()]
    graph.vs['label'] = labels
    return graph

def page_rank(g):
    """Run personalized PageRank on ``g`` and tabulate the scores.

    Every project gets the same reset weight ``e_proj``; ``niter`` is passed
    through to igraph. Returns a DataFrame with the columns 'ID', 'Name' and
    'page_rank', in the order of the module-level ``proj_ids`` / ``labels``.

    NOTE(review): the edge attribute 'weight' is not handed to
    personalized_pagerank here - confirm whether it should be.
    """
    # reset = [e_proj] * len(proj) + [e_account] * len(acc)
    reset_weights = [e_proj for _ in proj_ids]
    scores = g.personalized_pagerank(reset=reset_weights, niter=niter)
    return pd.DataFrame({'ID': proj_ids, 'Name': labels, 'page_rank': scores})

# Graph input: pass adjdf for the in-memory matrix,
# or adjdf_from_csv for the copy stored on disk.
g = g_from_adj(adjdf)

# From here on the name `page_rank` refers to the result table, not the function;
# later cells read it as a DataFrame.
page_rank = page_rank(g)
page_rank_top100 = (
    page_rank
    .sort_values(by='page_rank', ascending=False)
    .head(100)
)

In [9]:
# Write the full ranking to CSV, highest score first, without the index column.
(
    page_rank
    .sort_values(by='page_rank', ascending=False)
    .to_csv('data/cargo-pagerank.csv', index=False)
)

In [20]:
# Display the 100 highest-scoring projects.
page_rank_top100

Unnamed: 0,ID,Name,page_rank
3563,1369785,num-traits,0.059415
1005,428702,rand,0.035209
518,31533,winapi,0.033656
425,31384,serde,0.026141
242,31115,libc,0.022697
4550,1629953,serde_derive,0.020821
16285,3414109,rustc-std-workspace-core,0.019508
396,31342,rustc-serialize,0.014597
7998,2353652,proc-macro2,0.013991
1542,783030,clippy,0.013728


In [None]:
# Inputs for the (currently disabled) graph plot.
layout = g.layout("large")
# vertex size: PageRank score scaled by 100
vertex_weights = [score * 100 for score in page_rank['page_rank'].tolist()]
adjmat = adjdf.values
# edge width: non-zero adjacency value scaled by 10
edge_weights = [cell * 10 for cell in adjmat[adjmat.nonzero()]]
#ig.plot(g, vertex_size=vertex_weights, vertex_color=(["blue"] * len(proj_ids)), vertex_label_dist=1, edge_width=edge_weights, layout=layout)