# Information about the current dataset

This worksheet analyses some basic properties of the dataset we use.
We highlight numbers referred to in the paper by quoting the corresponding sentences from Section 5.1, which explains our experimental setup.

## Setup

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import utils

In [2]:
selected_builds = utils.load_selected_builds()

## Number of Builds

The number of crates we managed to successfully build:

In [3]:
len(selected_builds)

510

In [4]:
selected_builds

Unnamed: 0,build,package,version,crate,crate_hash,edition,crate_types
0,1,hostname,0.3.1,hostname,7d24e7bf196f8468,2015,rlib
1,4,rustc_version,0.2.3,rustc_version,e7a6ad26a2154927,2015,rlib
2,6,tinyvec,1.0.1,tinyvec,71ef93374e1c0780,2018,rlib
3,7,md5,0.7.0,md5,52b68823a0dc0c42,2015,rlib
4,10,object,0.21.1,object,3f7d31cb03159107,2018,rlib
...,...,...,...,...,...,...,...
505,2118,owning_ref,0.4.1,owning_ref,35803e692910b68,2015,rlib
506,2121,ring,0.17.0-alpha.4,ring,57d7c3f120e595e4,2018,rlib
507,2125,difference,2.0.0,difference,1604b5f77078947f,2015,rlib
508,2127,pretty_assertions,0.6.1,pretty_assertions,f3ce7e520d654cc8,2018,rlib


In [5]:
selected_builds.at[1,'package']

'rustc_version'

In [6]:
#for name in selected_builds.crate:
#    if 'core' in name:
#        print("Found:", name)

for name in selected_builds.crate:
    if name == 'ppv_lite86':
        print("Found:", name)

for hash in selected_builds.crate_hash:
    if hash == '8f7f8b0411c06b17':
        print("Found hash:", hash)
        
for i, s in selected_builds.iterrows():
    if 'core' in s.crate:
        print("")
        print("Found this series:")
        print(s)

Found: ppv_lite86
Found hash: 8f7f8b0411c06b17

Found this series:
build                        52
package         core-foundation
version                   0.9.1
crate           core_foundation
crate_hash     fd9e7bfc17ad8030
edition                    2015
crate_types                rlib
Name: 31, dtype: object

Found this series:
build                           53
package        core-foundation-sys
version                      0.8.1
crate          core_foundation_sys
crate_hash        3ad5e02c9fe93a19
edition                       2015
crate_types                   rlib
Name: 32, dtype: object

Found this series:
build                        96
package              rayon-core
version                   1.8.1
crate                rayon_core
crate_hash     d7dd626bcc745a48
edition                    2018
crate_types                rlib
Name: 54, dtype: object

Found this series:
build                       102
package        parking_lot_core
version                   0.8.0
crate       

In [7]:
selected_builds.query("crate=='core_arch'").crate

Series([], Name: crate, dtype: object)

## Crate Names

The most popular crate names are:

In [8]:
selected_builds.crate.value_counts()[:10]

probe1            4
mdbook            2
open              2
sha1              2
pulldown_cmark    2
semver_parser     2
gcc_shim          2
bindgen           2
rustversion       1
native_tls        1
Name: crate, dtype: int64

The crate names `build_script_<file>` indicate that the crate is a build script; `<file>` is the name of the actual build script without the `.rs` suffix. We have omited the build scripts from our analysis; that is why they are not shown in `selected_builds`.

The crate names that often repeat such as `main`, `example`, `test`, and `demo` typically belong to binaries as can be seen from the following frequence table:

In [9]:
selected_builds.query("crate_types=='bin'").crate.value_counts()

gcc_shim           2
cat                1
mdbook             1
bindgen            1
xml_analyze        1
exit               1
semver_parser      1
open               1
process_cert       1
handlebars_cli     1
pulldown_cmark     1
docopt_wordlist    1
Name: crate, dtype: int64

## Crate Types

We first check what crate types exist:

In [10]:
pd.unique(selected_builds.crate_types)

array(['rlib', 'proc-macro', 'bin'], dtype=object)

Count how many crates of each type we have:

In [11]:
import utils
crate_type_counts = utils.count_builds_per_type(selected_builds)
print(crate_type_counts)

{'rlib': 472, 'proc-macro': 25, 'bin': 13}


The three builds with missing type are Rust files [generated](https://gitlab.com/tspiteri/rug/-/blob/834cbefbce178a67bd97cc93ab1f88f262bd6126/build.rs#L27-29) by the `rug` package build.rs file to check what features are supported by the Rust compiler.

In [12]:
selected_builds.query("crate_types == ''")

Unnamed: 0,build,package,version,crate,crate_hash,edition,crate_types


## Editions

In [13]:
selected_builds.edition.value_counts()

2018    256
2015    254
Name: edition, dtype: int64

## `-sys` packages

Check what portion of `-sys` crates have matching non`-sys` crates:

In [14]:
# Validation: check that for each `-sys` crate there is one without the suffix.
def check_sys_crates():
    packages = utils.load_original_crates_list()
    assert len(packages[packages.isnull().any(axis=1)]) == 0
    all_crate_names = set(packages['package'])
    not_found = []
    found = []
    for crate_name in all_crate_names:
        if crate_name.endswith('-sys'):
            if crate_name[:-4] not in all_crate_names:
                not_found.append(crate_name)
            else:
                found.append(crate_name)
    print("Total: ", (len(found)+len(not_found)))
    print("Percent of -sys crates that have matching non-sys crates: {:.2f}%".format(
        100*len(found) / (len(found)+len(not_found))))
check_sys_crates()

Total:  21
Percent of -sys crates that have matching non-sys crates: 42.86%


In [15]:
# Validation: check that for each `-sys` crate there is one wit the same prefix.
def check_sys_crates_relaxed():
    packages = utils.load_original_crates_list()
    assert len(packages[packages.isnull().any(axis=1)]) == 0
    all_crate_names = set(packages['package'])
    not_found = []
    found = []
    for crate_name in all_crate_names:
        if crate_name.endswith('-sys'):
            base_name = crate_name[:-4]
            # Quadratic... well
            if any(
                map(
                    lambda x: (
                        x != crate_name
                        and not x.endswith('-sys')
                        and x.startswith(base_name)
                    ),
                    all_crate_names
                )
            ):
                found.append(crate_name)
            else:
                not_found.append(crate_name)
    print("Total: ", (len(found)+len(not_found)))
    print("Percent of -sys crates that have relaxed-matching non-sys crates: {:.2f}%".format(
        100*len(found) / (len(found)+len(not_found))))
check_sys_crates_relaxed()

Total:  21
Percent of -sys crates that have relaxed-matching non-sys crates: 61.90%


# Basics about Packages

> We evaluated our queries on a dataset that comprises the most-recent version (as of 2020-01-14) of all 34445 packages published on central Rust repository crates.io. The implementation of a package can be composed of multiple crates, one of which is usually primary and determines the name of the package.
> We excluded 5459 packages (16%) whose most recent version did not successfully compile.
> For packages with conditional compilation features, we used the default flags specified in the manifest.
> In cases when a package failed to compile with the default flags, but succeeded with different ones (when compiled as a dependency of another package) we selected a random build for analysis.
> As a result, our dataset consists of 31867 crates.

In [16]:
print("Number of original packages:", len(utils.load_original_crates_list()))
print("Number of compiling packages:", len(selected_builds.package.unique()),
      len(selected_builds.package.unique()) / len(utils.load_original_crates_list()))
print("Number of non-compiling packages:",
      len(utils.load_original_crates_list()) - len(selected_builds.package.unique()),
     (len(utils.load_original_crates_list()) - len(selected_builds.package.unique())) / len(utils.load_original_crates_list()))
print("Number of crates:", len(selected_builds))
print("Number of unique crate names (don't use this number):", len(selected_builds.crate.unique()))

Number of original packages: 500
Number of compiling packages: 488 0.976
Number of non-compiling packages: 12 0.024
Number of crates: 510
Number of unique crate names (don't use this number): 500


> Most of these crates are compiled to Rust libraries, namely 76%, or binaries, namely 20%.
The other crates are procedural macros (4%).

In [17]:
print("Crates")
print(selected_builds.crate_types.value_counts())
print()
print("Libs:",
      sum(selected_builds.crate_types.str.contains("lib")),
      sum(selected_builds.crate_types.str.contains("lib")) / len(selected_builds)
)
print("Bin:",
      sum(selected_builds.crate_types.str.contains("bin")),
      sum(selected_builds.crate_types.str.contains("bin")) / len(selected_builds)
)
print("proc-macro:",
      sum(selected_builds.crate_types.str.contains("proc-macro")),
      sum(selected_builds.crate_types.str.contains("proc-macro")) / len(selected_builds)
)
others = (
    ~selected_builds.crate_types.str.contains("bin")
    & ~selected_builds.crate_types.str.contains("proc-macro")
    & ~selected_builds.crate_types.str.contains("lib")
)
print("Others:",
      sum(others),
      sum(others) / len(selected_builds)
)

Crates
rlib          472
proc-macro     25
bin            13
Name: crate_types, dtype: int64

Libs: 472 0.9254901960784314
Bin: 13 0.025490196078431372
proc-macro: 25 0.049019607843137254
Others: 0 0.0


## Search for specific crates

In [18]:
from whitelists import *
print("sys crates:", len([
    x
    for x in selected_builds.crate
    if x.lower().replace("-", "_").endswith("_sys")
]))
print("sys crates (with manual):", len(get_sys_crate_names(selected_builds.crate.unique())))

sys crates: 19
sys crates (with manual): 32


In [19]:
print("hardware crates", len(hardware_crate_names))

hardware crates 10


In [20]:
from top_crates import *
print("top_500_crates:", len(set(top_500_crates)))

def norm(x):
    return x.lower().replace("-", "_")

top_500_compiling_crates = set(map(norm, top_500_crates)) & set(map(norm, selected_builds.package.unique()))
top_500_crates_set = set(top_500_crates)
top_500_mask = selected_builds.package.map(lambda x: x.replace("-", "_") in top_500_crates_set)
top_500_builds = selected_builds[top_500_mask][
    ~selected_builds[top_500_mask]["package"].duplicated()
]

print("top_500_packages compiling:", len(top_500_compiling_crates))

print("top_500_crates compiling:", len(top_500_builds))

print("Crates")
print(selected_builds.crate_types.value_counts())
print()
print("Libs:",
      sum(top_500_builds.crate_types.str.contains("rlib")),
      sum(top_500_builds.crate_types.str.contains("rlib")) / len(selected_builds)
)
print("Bin:",
      sum(top_500_builds.crate_types.str.contains("bin")),
      sum(top_500_builds.crate_types.str.contains("bin")) / len(selected_builds)
)
print("proc-macro:",
      sum(top_500_builds.crate_types.str.contains("proc-macro")),
      sum(top_500_builds.crate_types.str.contains("proc-macro")) / len(selected_builds)
)
others = (
    ~top_500_builds.crate_types.str.contains("bin")
    & ~top_500_builds.crate_types.str.contains("proc-macro")
    & ~top_500_builds.crate_types.str.contains("rlib")
)
print("Others:",
      sum(others),
      sum(others) / len(top_500_builds)
)

top_500_crates: 500
top_500_packages compiling: 461
top_500_crates compiling: 461
Crates
rlib          472
proc-macro     25
bin            13
Name: crate_types, dtype: int64

Libs: 440 0.8627450980392157
Bin: 3 0.0058823529411764705
proc-macro: 18 0.03529411764705882
Others: 0 0.0
