In [1]:
import pandas as pd
import re

In [2]:
df = pd.read_csv("../assets/cran_packages_dropped.csv")
df.head()

Unnamed: 0,Package,Version,Depends,Imports,LinkingTo,Suggests,License,MD5sum,NeedsCompilation,Repository,count
0,A3,1.0.0,"R (>= 2.15.0), xtable, pbapply",,,"randomForest, e1071",GPL (>= 2),027ebdd8affce8f0effaecfcd5f5ade2,no,https://cran.rstudio.com/src/contrib,116853
1,AalenJohansen,1.0,,,,"knitr, rmarkdown",GPL (>= 2),d7eb2a6275daa6af43bf8a980398b312,no,https://cran.rstudio.com/src/contrib,3356
2,AATtools,0.0.2,R (>= 3.6.0),"magrittr, dplyr, doParallel, foreach",,,GPL-3,bc59207786e9bc49167fd7d8af246b1c,no,https://cran.rstudio.com/src/contrib,20756
3,ABACUS,1.0.0,R (>= 3.1.0),"ggplot2 (>= 3.1.0), shiny (>= 1.3.1),",,"rmarkdown (>= 1.13), knitr (>= 1.22)",GPL-3,50c54c4da09307cb95a70aaaa54b9fbd,no,https://cran.rstudio.com/src/contrib,27700
4,abasequence,0.1.0,,,,,GPL-3,1392d909eb0f65be94fd4160a371ae21,no,https://cran.rstudio.com/src/contrib,1856


In [3]:
df = df[["Package", "Imports", "Suggests"]]
df.head()

Unnamed: 0,Package,Imports,Suggests
0,A3,,"randomForest, e1071"
1,AalenJohansen,,"knitr, rmarkdown"
2,AATtools,"magrittr, dplyr, doParallel, foreach",
3,ABACUS,"ggplot2 (>= 3.1.0), shiny (>= 1.3.1),","rmarkdown (>= 1.13), knitr (>= 1.22)"
4,abasequence,,


In [4]:
df.tail()

Unnamed: 0,Package,Imports,Suggests
20097,zTree,plyr (>= 1.0),
20098,ztype,"magrittr, rvest, stats, stringr, utils, dplyr,...",
20099,ZVCV,"Rcpp (>= 0.11.0), glmnet, abind, mvtnorm, stat...","partitions, ggplot2, ggthemes"
20100,zyp,,
20101,zzlite,"httr, jsonlite","httptest, roxygen2, testthat, knitr, rmarkdown"


In [5]:
def rem_brackets(string):
    if isinstance(string, str):
        return re.sub(r'\([^()]*\)', '', string)
    return string


df[['Imports', 'Suggests']] = df[['Imports', 'Suggests']].map(rem_brackets)

In [6]:
packs = {}
for _, row in df.iterrows():
    package_name = row['Package']
    imports = row['Imports'] if pd.notnull(row['Imports']) else []
    suggests = row['Suggests'] if pd.notnull(row['Suggests']) else []
    packs[package_name.lower()] = {'imports': imports, 'suggests': suggests}

packs["abacus"]

{'imports': 'ggplot2 , shiny ,', 'suggests': 'rmarkdown , knitr '}

In [7]:
len(packs)

20102

In [8]:
known = set()

for package in packs:
    known.add(package.lower())

len(known)

20102

In [9]:
for package in packs:
    if packs[package]['imports']:
        imports = set([ele.strip().lower() for ele in (packs[package]["imports"]).split(",") if ele.strip().lower() in known])

        if "" in imports:
            imports.remove("")

    if packs[package]['suggests']:
        suggests = set([ele.strip().lower() for ele in (packs[package]["suggests"]).split(",") if ele.strip().lower() in known])

        if "" in suggests:
            suggests.remove("")
    
    packs[package]["imports"] = list(imports)
    packs[package]["suggests"] = list(suggests)

In [10]:
packs["abacus"]

{'imports': ['shiny', 'ggplot2'], 'suggests': ['knitr', 'rmarkdown']}

In [11]:
totality = set()
known = set()

for package in packs:
    totality.add(package.lower())
    known.add(package.lower())
    
    totality.update(packs[package]["imports"])
    totality.update(packs[package]["suggests"])

len(totality)

20114

In [12]:
unknown = totality.difference(known)

In [13]:
for package in packs:
    if packs[package]['imports']:
        imports = [ele for ele in (packs[package]["imports"]) if ele in known]

    if packs[package]['suggests']:
        suggests = [ele for ele in (packs[package]["suggests"]) if ele in known]

    packs[package]["imports"] = list(imports)
    packs[package]["suggests"] = list(suggests)

In [14]:
totality = set()
known = set()

for package in packs:
    totality.add(package.lower())
    known.add(package.lower())
    
    totality.update(packs[package]["imports"])
    totality.update(packs[package]["suggests"])

len(totality)

20102

In [15]:
index = {}
label = {}

i = 1

for package in packs:
    index[package.lower()] = i
    label[i] = package.lower()
    i += 1

print(index["abacus"])
label[4]

4


'abacus'

In [16]:
dependecy = {}

for package in packs:
    dependecy[index[package.lower()]] = {}

    strong = [index[ele.lower()] for ele in packs[package]["imports"]]
    weak = [index[ele.lower()] for ele in packs[package]["suggests"]]

    dependecy[index[package.lower()]]["strong"] = strong
    dependecy[index[package.lower()]]["weak"] = weak

dependecy[4]

{'strong': [16523, 6644], 'weak': [8780, 15103]}

In [17]:
label[16523]

'shiny'

In [18]:
backlinks = {}

for dp in dependecy:
    for ele in dependecy[dp]["strong"]:
        backlinks[ele] = backlinks.get(ele, []) + [dp]

    for ele in dependecy[dp]["weak"]:
        backlinks[ele] = backlinks.get(ele, []) + [dp]

len(backlinks)

6009

In [20]:
backlinks[6644]

[4,
 5,
 6,
 7,
 8,
 9,
 16,
 17,
 22,
 23,
 24,
 27,
 34,
 40,
 42,
 43,
 48,
 51,
 52,
 57,
 85,
 88,
 89,
 90,
 92,
 95,
 96,
 97,
 99,
 100,
 102,
 118,
 119,
 125,
 135,
 136,
 150,
 158,
 163,
 164,
 165,
 166,
 172,
 173,
 176,
 177,
 178,
 191,
 192,
 194,
 195,
 196,
 206,
 207,
 208,
 209,
 210,
 211,
 213,
 217,
 218,
 219,
 220,
 221,
 222,
 223,
 224,
 224,
 226,
 227,
 228,
 229,
 232,
 237,
 243,
 257,
 258,
 258,
 259,
 259,
 260,
 261,
 264,
 265,
 266,
 267,
 268,
 270,
 276,
 280,
 283,
 289,
 293,
 294,
 297,
 299,
 300,
 312,
 313,
 314,
 317,
 318,
 329,
 334,
 335,
 336,
 337,
 347,
 349,
 350,
 351,
 357,
 364,
 365,
 366,
 367,
 386,
 387,
 388,
 391,
 392,
 393,
 394,
 396,
 397,
 410,
 413,
 415,
 416,
 417,
 419,
 420,
 421,
 422,
 423,
 424,
 425,
 430,
 431,
 432,
 438,
 439,
 440,
 444,
 445,
 448,
 449,
 450,
 451,
 454,
 457,
 463,
 464,
 465,
 466,
 467,
 471,
 472,
 476,
 483,
 484,
 485,
 486,
 495,
 496,
 497,
 497,
 498,
 507,
 511,
 514,
 517,
 51

In [21]:
import json

json.dump(backlinks, open('./backlinks.json', 'w'))

In [18]:
json.dump(dependecy, open("./dependency.json", "w"))
json.dump(index, open("./indices.json", "w"))
json.dump(label, open("./labels.json", "w"))