In [12]:
import json
import os
import pickle
import subprocess
import tarfile
import time
import timeit
from collections import Counter, defaultdict
from typing import Dict, Iterator, List, Tuple

import magic
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
from yara_scanner import YaraScanner

%load_ext memory_profiler

The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler


In [3]:
# Initialize the YARA scanner and load the PyPI malware checks rules.
# The relative rule path is resolved against the notebook's current working directory.
pypi_malware_checks_rule_path = os.path.abspath("../scanners/pypi-malware-checks/setup_py_rules.yara")
scanner = YaraScanner()
scanner.track_yara_file(pypi_malware_checks_rule_path)
# Last expression of the cell, so its return value is what the notebook displays
scanner.load_rules()

True

In [10]:
# Measure how long the PyPI malware checks take on each package directory
def calculate_runtimes(package_releases_path,
                       rules_path="../scanners/pypi-malware-checks/setup_py_rules.yara"):
    """Time the external ``scan`` command on every package directory.

    Args:
        package_releases_path: Directory whose immediate sub-directories are
            the packages to scan. Plain files in it are ignored.
        rules_path: YARA rule file passed to ``scan -y``. Defaults to the
            PyPI malware checks rules, relative to the working directory.

    Returns:
        Dict mapping each sub-directory name to the elapsed time, in seconds,
        of its ``scan`` invocation.
    """
    # DirEntry.name is the final path component on every platform, so the
    # package name no longer depends on "/" being the path separator.
    with os.scandir(package_releases_path) as entries:
        package_dirs = [(entry.name, entry.path) for entry in entries if entry.is_dir()]

    runtimes = {}
    for package_name, package_dir in tqdm(package_dirs):
        # perf_counter is monotonic, so a system clock adjustment during a
        # long run cannot distort (or make negative) a measured duration.
        start = time.perf_counter()
        # The exit status is deliberately ignored: only the elapsed time matters here.
        subprocess.call(
            ["scan", "-r", package_dir, "-y", rules_path],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.STDOUT)
        runtimes[package_name] = time.perf_counter() - start
    return runtimes


# Other cells in this notebook invoke the helper as `computing_runtimes`;
# keep that name bound so they also work on a freshly restarted kernel.
computing_runtimes = calculate_runtimes

In [109]:
# Scan the Python files of every package with the PyPI malware checks rules
def scanning_packages(package_releases_path, yara_scanner=None) -> Dict:
    """Run the YARA scanner over every ``.py`` file below a dataset directory.

    Args:
        package_releases_path: Dataset root; each immediate sub-directory is
            treated as one package.
        yara_scanner: Object exposing ``scan(path)`` and ``scan_results``.
            Defaults to the notebook-level ``scanner``.

    Returns:
        A ``defaultdict(list)`` keyed by package name. Each entry is either a
        result taken from ``scan_results`` or, for a file without alerts, the
        placeholder ``{"target": <file name>, "rule": ""}``.
    """
    if yara_scanner is None:
        # Fall back to the scanner created at the top of the notebook
        yara_scanner = scanner

    results = defaultdict(list)
    for subdir, _dirs, files in tqdm(os.walk(package_releases_path)):
        # Derive the package from the path relative to the dataset root, so
        # the result does not depend on where the repository is checked out.
        relative_dir = os.path.relpath(subdir, package_releases_path)
        if relative_dir == os.curdir:
            # Files lying directly in the dataset root belong to no package
            continue
        package_name = relative_dir.split(os.sep)[0]

        for file in files:
            # Only scan Python files to reduce noise; some packages have none at all
            if not str(file).endswith(".py"):
                continue
            filepath = os.path.join(subdir, file)
            try:
                yara_scanner.scan(filepath)
            except Exception as e:
                # Best effort: some .py files contain binary data the scanner
                # cannot handle, so report the file and move on.
                print(package_name, filepath, e)
                continue

            scan_results = yara_scanner.scan_results
            if scan_results:
                results[package_name].extend(scan_results)
            else:
                # No alerts: record the file anyway so it is still counted.
                # The target here is the bare file name, not the full path.
                results[package_name].append({"target": file, 'rule': ""})
    return results


# Other cells in this notebook invoke the helper as `scanning_releases`;
# keep that name bound so they also work on a freshly restarted kernel.
scanning_releases = scanning_packages

In [110]:
# Flatten the scan results into one row per recorded entry, flagging triggered rules
def parse_analysis_results(
        packages_scanning_results: Dict[str, List[dict]]) -> Iterator[Tuple[str, str, str, int]]:
    """Yield ``(package, target, rule, has_rule)`` rows from the scan results.

    Args:
        packages_scanning_results: Mapping of package name to the list of
            entries recorded for it; each entry needs a ``"target"`` and a
            ``"rule"`` key.

    Yields:
        One tuple per entry. ``has_rule`` is 1 when the entry's rule is
        truthy; otherwise the rule is reported as ``""`` and ``has_rule`` is 0.
    """
    for package, files in packages_scanning_results.items():
        for file_ in files:
            rule = file_['rule']
            if rule:
                yield (package, file_["target"], rule, 1)
            else:
                yield (package, file_["target"], "", 0)

In [111]:
# Flatten the scan results into one row per matched subpattern
def collect_subpatterns(
        packages_scanning_results: Dict[str, List[dict]]) -> Iterator[Tuple[str, str, str, str]]:
    """Yield ``(package, target, rule, subpattern)`` rows from the scan results.

    Args:
        packages_scanning_results: Mapping of package name to the list of
            entries recorded for it. Entries without a ``"strings"`` key
            (or with an empty/None value) contribute no rows.

    Yields:
        One tuple per element of an entry's ``"strings"`` sequence, carrying
        that element's second item as the subpattern.
    """
    for package, files in packages_scanning_results.items():
        for file_ in files:
            # `or ()` also tolerates an explicit None instead of raising on iteration
            for line in file_.get('strings') or ():
                # line[1] is the subpattern; line[0] and line[2] are presumably the
                # match position and matched content -- TODO confirm against yara_scanner
                yield (package, file_["target"], file_['rule'], line[1])

## Analyzing Malicious packages

In [7]:
# Path to the malicious packages dataset
malicious_packages_path = os.path.abspath("../dataset/malicious-packages/")

In [13]:
malicious_running_times = computing_runtimes(malicious_packages_path)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 168/168 [02:34<00:00,  1.09it/s]


In [14]:
malicious_packages_running_times_df = pd.DataFrame(list(malicious_running_times.items()), columns=['package', 'running_time'])

In [15]:
malicious_packages_running_times_df.to_csv("../results/running_times/pypi-malware-checks/malicious-packages.csv")

In [16]:
malicious_packages_running_times_df["running_time"].describe()

count    168.000000
mean       0.917025
std        1.588213
min        0.283062
25%        0.525685
50%        0.602091
75%        0.766196
max       18.749159
Name: running_time, dtype: float64

In [None]:
# Re-scanning takes a while, so it is opt-in: set RESCAN_MALICIOUS_PACKAGES to True to
# rebuild the results instead of relying on the pickled ones loaded in the next cell
RESCAN_MALICIOUS_PACKAGES = False
if RESCAN_MALICIOUS_PACKAGES:
    malicious_packages_scanning_results = scanning_packages(malicious_packages_path)

In [83]:
# Loading the existing result in case we do not want to rescan the packages
with open(os.path.abspath(os.path.join("..", "results", "pypi-malware-checks", "malicious_packages.pkl")), 'rb') as fp:
    malicious_packages_scanning_results = pickle.load(fp)

### Triggered rules and subpatterns in all Python files in malicious packages

In [84]:
# Collect the alerts and transform them into a DataFrame
malicious_packages_rules = parse_analysis_results(malicious_packages_scanning_results)
malicious_packages_rules_df = pd.DataFrame(malicious_packages_rules, columns=["package", "target", "rule", "has_rule"])

In [85]:
# Number of rules per package
malicious_packages_rules_groupby = malicious_packages_rules_df.groupby('package')['has_rule']
print(f"Total number of rules: {malicious_packages_rules_groupby.sum().sum()}")
malicious_packages_rules_groupby.sum().describe()

Total number of rules: 850


count    168.000000
mean       5.059524
std       12.944702
min        0.000000
25%        1.000000
50%        2.000000
75%        4.000000
max      118.000000
Name: has_rule, dtype: float64

In [86]:
# packages that triggered at least one alert
malicious_packages_has_rules_df = malicious_packages_rules_df[malicious_packages_rules_df['has_rule'] != 0]

In [87]:
# Number of rules per package
malicious_packages_has_rules_groupby = malicious_packages_has_rules_df.groupby('package')['has_rule']
print(f"Total number of rules: {malicious_packages_has_rules_groupby.sum().sum()}")
malicious_packages_has_rules_groupby.sum().describe()

Total number of rules: 850


count    144.000000
mean       5.902778
std       13.808541
min        1.000000
25%        1.000000
50%        2.000000
75%        4.000000
max      118.000000
Name: has_rule, dtype: float64

In [74]:
# Collect the subpatterns of the alerts and transform them into a DataFrame
malicious_packages_subpatterns = collect_subpatterns(malicious_packages_scanning_results)
malicious_packages_subpatterns_df = pd.DataFrame(malicious_packages_subpatterns, columns=["package", "target", "rule", "subpattern"])

In [75]:
# Number of subpatterns per package
malicious_packages_subpatterns_groupby = malicious_packages_subpatterns_df.groupby('package')['subpattern']
print(f"Total number of subpatterns: {malicious_packages_subpatterns_groupby.count().sum()}")
malicious_packages_subpatterns_groupby.count().describe()

Total number of subpatterns: 8295


count     144.000000
mean       57.604167
std       258.067813
min         1.000000
25%         4.750000
50%        14.500000
75%        53.000000
max      3054.000000
Name: subpattern, dtype: float64

### Triggered rules and subpatterns in all setup.py files in malicious packages

In [88]:
# Selecting only setup.py files.
# regex=False matches the literal text 'setup.py'; by default the pattern is a regular
# expression in which '.' matches any character. This is still a substring match, so a
# target such as 'test_setup.py' is selected as well.
malicious_packages_rules_setup_df = malicious_packages_rules_df[malicious_packages_rules_df['target'].str.contains('setup.py', regex=False)]

In [89]:
# Number of rules per package
malicious_packages_rules_setup_groupby = malicious_packages_rules_setup_df.groupby('package')['has_rule']
print(f"Total number of rules: {malicious_packages_rules_setup_groupby.sum().sum()}")
malicious_packages_rules_setup_groupby.sum().describe()

Total number of rules: 175


count    163.000000
mean       1.073620
std        1.057357
min        0.000000
25%        0.000000
50%        1.000000
75%        2.000000
max        3.000000
Name: has_rule, dtype: float64

In [90]:
# Packages whose setup.py triggered at least one alert
malicious_packages_setup_has_rules_df = malicious_packages_rules_setup_df[malicious_packages_rules_setup_df['has_rule'] != 0]

In [91]:
# Number of rules per package
malicious_packages_setup_has_rules_groupby = malicious_packages_setup_has_rules_df.groupby('package')['has_rule']
print(f"Total number of rules: {malicious_packages_setup_has_rules_groupby.sum().sum()}")
malicious_packages_setup_has_rules_groupby.sum().describe()

Total number of rules: 175


count    99.000000
mean      1.767677
std       0.780218
min       1.000000
25%       1.000000
50%       2.000000
75%       2.000000
max       3.000000
Name: has_rule, dtype: float64

In [25]:
# Selecting only setup.py files; regex=False matches the literal text 'setup.py'
# (by default the pattern is a regular expression in which '.' matches any character)
malicious_packages_subpatterns_setup_df = malicious_packages_subpatterns_df[malicious_packages_subpatterns_df['target'].str.contains('setup.py', regex=False)]

In [26]:
# Number of subpatterns per package
malicious_packages_subpatterns_setup_groupby = malicious_packages_subpatterns_setup_df.groupby('package')['subpattern']
print(f"Total number of subpatterns: {malicious_packages_subpatterns_setup_groupby.count().sum()}")
malicious_packages_subpatterns_setup_groupby.count().describe()

Total number of subpatterns: 1191


count    99.000000
mean     12.030303
std      12.465016
min       1.000000
25%       3.000000
50%       6.000000
75%      15.500000
max      62.000000
Name: subpattern, dtype: float64

### Runtime

In [None]:
malicious_packages_running_times = computing_runtimes(malicious_packages_path)

In [None]:
pd.DataFrame(list(malicious_packages_running_times.values())).describe()

## Analyzing Popular packages

In [17]:
# Path to the popular packages dataset
popular_packages_path = os.path.abspath("../dataset/popular-packages")

In [18]:
popular_running_times = computing_runtimes(popular_packages_path)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1430/1430 [3:58:55<00:00, 10.03s/it]


In [19]:
popular_packages_running_times_df = pd.DataFrame(list(popular_running_times.items()), columns=['package', 'running_time'])

In [20]:
popular_packages_running_times_df.to_csv("../results/running_times/pypi-malware-checks/popular-packages.csv")

In [21]:
popular_packages_running_times_df["running_time"].describe()

count    1430.000000
mean       10.020938
std        53.222977
min         0.332454
25%         0.955504
50%         1.876288
75%         4.753420
max      1570.409981
Name: running_time, dtype: float64

In [28]:
popular_packages_running_times_df.loc[popular_packages_running_times_df['running_time'].idxmax()]

package             ansible
running_time    1570.409981
Name: 36, dtype: object

In [114]:
# Loading the existing result in case we do not want to rescan the packages
with open(os.path.abspath(os.path.join("..", "results", "pypi-malware-checks", "popular_packages.pkl")), 'rb') as fp:
    popular_packages_scanning_results = pickle.load(fp)

### Triggered rules and subpatterns in all Python files in popular packages

In [115]:
# Collect the alerts and transform them into a DataFrame
popular_packages_rules = parse_analysis_results(popular_packages_scanning_results)
popular_packages_rules_df = pd.DataFrame(popular_packages_rules, columns=["package", "target", "rule", "has_rule"])

In [116]:
# Number of rules per package
popular_packages_rules_groupby = popular_packages_rules_df.groupby('package')['has_rule']
print(f"Total number of rules: {popular_packages_rules_groupby.sum().sum()}")
popular_packages_rules_groupby.sum().describe()

Total number of rules: 51283


count    1430.000000
mean       35.862238
std       163.910398
min         0.000000
25%         3.000000
50%         8.000000
75%        27.000000
max      5377.000000
Name: has_rule, dtype: float64

In [117]:
# popular packages that have at least one alert
popular_packages_has_rules_df = popular_packages_rules_df[popular_packages_rules_df['has_rule'] != 0]

In [118]:
# Number of rules per package
popular_packages_has_rules_groupby = popular_packages_has_rules_df.groupby('package')['has_rule']
print(f"Total number of rules: {popular_packages_has_rules_groupby.sum().sum()}")
popular_packages_has_rules_groupby.sum().describe()

Total number of rules: 51283


count    1347.000000
mean       38.072012
std       168.639064
min         1.000000
25%         3.000000
50%        10.000000
75%        29.000000
max      5377.000000
Name: has_rule, dtype: float64

In [36]:
# Collect the subpatterns of the alerts and transform them into a DataFrame
popular_packages_subpatterns = collect_subpatterns(popular_packages_scanning_results)
popular_packages_subpatterns_df = pd.DataFrame(popular_packages_subpatterns, columns=["package", "target", "rule", "subpattern"])

In [37]:
# Number of subpatterns per package
popular_packages_subpatterns_groupby = popular_packages_subpatterns_df.groupby('package')['subpattern']
print(f"Total number of subpatterns: {popular_packages_subpatterns_groupby.count().sum()}")
popular_packages_subpatterns_groupby.count().describe()

Total number of subpatterns: 607854


count     1347.000000
mean       451.265033
std       1795.839013
min          1.000000
25%         12.000000
50%         60.000000
75%        265.500000
max      44798.000000
Name: subpattern, dtype: float64

### Triggered rules and subpatterns in all setup.py files in popular packages

In [119]:
# Selecting only setup.py files.
# regex=False matches the literal text 'setup.py'; by default the pattern is a regular
# expression in which '.' matches any character. This is still a substring match, so a
# target such as 'test_setup.py' is selected as well.
popular_packages_rules_setup_df = popular_packages_rules_df[popular_packages_rules_df['target'].str.contains('setup.py', regex=False)]

In [120]:
# Number of rules per package
popular_packages_rules_setup_groupby = popular_packages_rules_setup_df.groupby('package')['has_rule']
print(f"Total number of rules: {popular_packages_rules_setup_groupby.sum().sum()}")
popular_packages_rules_setup_groupby.sum().describe()

Total number of rules: 768


count    1320.000000
mean        0.581818
std         1.568541
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max        45.000000
Name: has_rule, dtype: float64

In [121]:
# Popular packages whose setup.py files triggered at least one alert
popular_packages_setup_has_rules_df = popular_packages_rules_setup_df[popular_packages_rules_setup_df['has_rule'] != 0]

In [122]:
# Number of rules per package
popular_packages_setup_has_rules_groupby = popular_packages_setup_has_rules_df.groupby('package')['has_rule']
print(f"Total number of rules: {popular_packages_setup_has_rules_groupby.sum().sum()}")
popular_packages_setup_has_rules_groupby.sum().describe()

Total number of rules: 768


count    474.000000
mean       1.620253
std        2.274802
min        1.000000
25%        1.000000
50%        1.000000
75%        2.000000
max       45.000000
Name: has_rule, dtype: float64

In [123]:
# Ratio of true positives to false positives
# FIXME: `get_tpr_fpr` is not defined in any cell visible in this notebook, so this
# cell raises NameError until the helper is defined or imported.
thresholds = [1, 2, 3, 4, 5]
scores = popular_packages_setup_has_rules_groupby.sum().to_list()
thesholds_tpr_fpr_ratio = []
for t in thresholds:
    tp, fp = get_tpr_fpr(scores, t)
    print(t, tp, fp)

NameError: name 'get_tpr_fpr' is not defined

In [42]:
# Selecting only setup.py files; regex=False matches the literal text 'setup.py'
# (by default the pattern is a regular expression in which '.' matches any character)
popular_packages_subpatterns_setup_df = popular_packages_subpatterns_df[popular_packages_subpatterns_df['target'].str.contains('setup.py', regex=False)]

In [43]:
# Number of subpatterns per package
popular_packages_subpatterns_setup_groupby = popular_packages_subpatterns_setup_df.groupby('package')['subpattern']
print(f"Total number of subpatterns: {popular_packages_subpatterns_setup_groupby.count().sum()}")
popular_packages_subpatterns_setup_groupby.count().describe()

Total number of subpatterns: 14219


count    474.000000
mean      29.997890
std       66.334806
min        1.000000
25%        1.000000
50%        3.000000
75%       22.750000
max      610.000000
Name: subpattern, dtype: float64

### Running time

In [44]:
popular_packages_running_times = computing_runtimes(popular_packages_path)

NameError: name 'popular_packages_path' is not defined

In [None]:
pd.DataFrame(list(popular_packages_running_times.values())).describe()

In [None]:
# Saving the existing result
with open(os.path.abspath("../results/pypi-malware-checks/popular_packages_scanning_results_pypi_malwarechecks.pkl"), 'wb') as fp:
    pickle.dump(popular_packages_scanning_results, fp, protocol=pickle.HIGHEST_PROTOCOL)

### Plotting results

In [None]:
# Number of alerts per rule
print(malicious_rules_num_alerts_df.groupby('rule')['num_alerts'].sum())
print(f"Total: {malicious_rules_num_alerts_df.groupby('rule')['num_alerts'].sum().sum()}")

In [None]:
ax = malicious_rules_num_alerts_df.num_alerts.groupby(malicious_rules_num_alerts_df.package).nunique().hist(bins=10);
ax.set_xlabel('num packages');
ax.set_ylabel('num alerts');
ax.set_xticks(range(1, 30, 2), minor=False)
plt.show()

In [None]:
# Number of unique packages having a rule
malicious_rules_num_alerts_df.groupby('rule')['package'].nunique()

In [None]:
# Loading the existing result
with open(os.path.abspath("../results/popular_packages_scanning_results_pypi_malwarechecks.pkl"), 'rb') as fp:
    popular_packages_scanning_results = pickle.load(fp)

In [None]:
# Selecting only setup.py files
malicious_packages_alerts_setup_df = malicious_packages_alerts_df[malicious_packages_alerts_df['target'].str.contains('setup.py')]

In [None]:
# Number of rules per package
malicious_packages_rules_setup = malicious_packages_alerts_setup_df.groupby('package')['rule']
print(f"Total number of rules: {malicious_packages_rules_setup.nunique().sum()}")
malicious_packages_rules_setup.nunique().describe()

In [None]:
# Number of subpatterns per package
malicious_packages_subpatterns_setup = malicious_packages_alerts_setup_df.groupby('package')['subpattern']
print(f"Total number of subpatterns in setup.py files: {malicious_packages_subpatterns_setup.count().sum()}")
malicious_packages_subpatterns_setup.count().describe()

In [None]:
# Number of alerts per rule
print(malicious_rules_num_alerts_setup_df.groupby('rule')['num_alerts'].sum())
print(f"Total: {malicious_rules_num_alerts_setup_df.groupby('rule')['num_alerts'].sum().sum()}")

In [None]:
# Number of unique packages having a rule
print(malicious_rules_num_alerts_setup_df.groupby('rule')['package'].nunique())
print(f"Total: {malicious_rules_num_alerts_setup_df.groupby('rule')['package'].nunique().sum()}")

In [None]:
# Saving the existing result
with open(os.path.abspath("../results/malicious_packages_scanning_results_pypi_malwarechecks.pkl"), 'wb') as fp:
    pickle.dump(malicious_packages_scanning_results, fp, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Collect the number of alerts associated with the packages and transform the result to dataframe for analysis
popular_rules_num_alerts = collect_rules_num_alerts(popular_packages_scanning_results)
popular_rules_num_alerts_df = pd.DataFrame(popular_rules_num_alerts, columns=["package", "target", "rule", "num_alerts"])

In [None]:
# Descriptive statistics of number of alerts
popular_rules_num_alerts_df.num_alerts.describe()

In [None]:
# Number of alerts per rule
print(popular_rules_num_alerts_df.groupby('rule')['num_alerts'].sum())
print(f"Total: {popular_rules_num_alerts_df.groupby('rule')['num_alerts'].sum().sum()}")

In [None]:
ax = popular_rules_num_alerts_df.num_alerts.groupby(popular_rules_num_alerts_df.package).nunique().hist(bins=20);
ax.set_xlabel('num packages');
ax.set_ylabel('num alerts');
ax.set_xticks(range(1, 100, 5), minor=False)
plt.show()

In [None]:
# Number of unique packages having a rule
print(popular_rules_num_alerts_df.groupby('rule')['package'].nunique())
print(f"Total: {popular_rules_num_alerts_df.groupby('rule')['package'].nunique().sum()}")

In [None]:
# Selecting only setup.py files
popular_rules_num_alerts_setup_df = popular_rules_num_alerts_df[popular_rules_num_alerts_df['target'].str.contains('setup.py')]

In [None]:
# Number of alerts per rule
print(popular_rules_num_alerts_setup_df.groupby('rule')['num_alerts'].sum())
print(f"Total: {popular_rules_num_alerts_setup_df.groupby('rule')['num_alerts'].sum().sum()}")

In [None]:
# Number of unique packages having a rule
popular_rules_num_alerts_setup_df.groupby('rule')['package'].nunique()

## Analyzing Random packages

In [22]:
random_packages_path = os.path.abspath("../dataset/random-packages/")

In [23]:
random_running_times = computing_runtimes(random_packages_path)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 986/986 [30:43<00:00,  1.87s/it]


In [24]:
random_packages_running_times_df = pd.DataFrame(list(random_running_times.items()), columns=['package', 'running_time'])

In [25]:
random_packages_running_times_df.to_csv("../results/running_times/pypi-malware-checks/random-packages.csv")

In [26]:
random_packages_running_times_df["running_time"].describe()

count    986.000000
mean       1.866525
std       10.444509
min        0.285383
25%        0.539373
50%        0.690816
75%        1.085275
max      243.675172
Name: running_time, dtype: float64

In [None]:
# Re-scanning takes a while, so it is opt-in: set RESCAN_RANDOM_PACKAGES to True to
# rebuild the results instead of relying on the pickled ones loaded further below
RESCAN_RANDOM_PACKAGES = False
if RESCAN_RANDOM_PACKAGES:
    random_packages_scanning_results = scanning_packages(random_packages_path)

In [None]:
# NOTE(review): `random_pypi_packages` is not assigned in any cell visible in this notebook,
# and `ceremony_scanning_results` is not read by any visible cell -- presumably a stale
# experiment; confirm before running.
ceremony_scanning_results = scanning_releases(random_pypi_packages)

### Triggered rules and subpatterns in all Python files in random packages

In [102]:
# Loading the existing result
with open(os.path.abspath(os.path.join("..", "results", "pypi-malware-checks", "random_packages.pkl")), 'rb') as fp:
    random_packages_scanning_results = pickle.load(fp)

In [103]:
# Collect the alerts and transform them into a DataFrame
random_packages_rules = parse_analysis_results(random_packages_scanning_results)
random_packages_rules_df = pd.DataFrame(random_packages_rules, columns=["package", "target", "rule", "has_rule"])

In [104]:
# Number of rules per package
random_packages_rules_groupby = random_packages_rules_df.groupby('package')['has_rule']
print(f"Total number of rules: {random_packages_rules_groupby.sum().sum()}")
random_packages_rules_groupby.sum().describe()

Total number of rules: 5460


count    986.000000
mean       5.537525
std       31.223431
min        0.000000
25%        0.000000
50%        1.000000
75%        4.000000
max      728.000000
Name: has_rule, dtype: float64

In [107]:
# packages that have at least one alert
random_packages_has_rules_df = random_packages_rules_df[random_packages_rules_df['has_rule'] != 0]

In [108]:
# Number of rules per package
random_packages_has_rules_groupby = random_packages_has_rules_df.groupby('package')['has_rule']
print(f"Total number of rules: {random_packages_has_rules_groupby.sum().sum()}")
random_packages_has_rules_groupby.sum().describe()

Total number of rules: 5460


count    668.000000
mean       8.173653
std       37.657930
min        1.000000
25%        1.000000
50%        2.000000
75%        6.000000
max      728.000000
Name: has_rule, dtype: float64

In [49]:
# Collect the subpatterns of the alerts and transform them into a DataFrame
random_packages_subpatterns = collect_subpatterns(random_packages_scanning_results)
random_packages_subpatterns_df = pd.DataFrame(random_packages_subpatterns, columns=["package", "target", "rule", "subpattern"])

In [50]:
# Number of subpatterns per package
random_packages_subpatterns_groupby = random_packages_subpatterns_df.groupby('package')['subpattern']
print(f"Total number of subpatterns: {random_packages_subpatterns_groupby.count().sum()}")
random_packages_subpatterns_groupby.count().describe()

Total number of subpatterns: 57108


count     668.000000
mean       85.491018
std       476.756346
min         1.000000
25%         3.000000
50%         9.000000
75%        41.000000
max      9466.000000
Name: subpattern, dtype: float64

### Triggered rules and subpatterns in all setup.py files in random packages

In [51]:
# Selecting only setup.py files.
# regex=False matches the literal text 'setup.py'; by default the pattern is a regular
# expression in which '.' matches any character. This is still a substring match, so a
# target such as 'test_setup.py' is selected as well.
random_packages_rules_setup_df = random_packages_rules_df[random_packages_rules_df['target'].str.contains('setup.py', regex=False)]

In [52]:
# Number of rules per package
random_packages_rules_setup_groupby = random_packages_rules_setup_df.groupby('package')['has_rule']
print(f"Total number of rules: {random_packages_rules_setup_groupby.sum().sum()}")
random_packages_rules_setup_groupby.sum().describe()

Total number of rules: 197


count    868.000000
mean       0.226959
std        0.633812
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max        9.000000
Name: has_rule, dtype: float64

In [53]:
random_packages_setup_has_rules_df = random_packages_rules_setup_df[random_packages_rules_setup_df['has_rule'] != 0]

In [54]:
# Number of rules per package
random_packages_rules_setup_has_rules_groupby = random_packages_setup_has_rules_df.groupby('package')['has_rule']
print(f"Total number of rules: {random_packages_rules_setup_has_rules_groupby.sum().sum()}")
random_packages_rules_setup_has_rules_groupby.sum().describe()

Total number of rules: 197


count    147.000000
mean       1.340136
std        0.939955
min        1.000000
25%        1.000000
50%        1.000000
75%        1.000000
max        9.000000
Name: has_rule, dtype: float64

In [None]:
# Selecting only setup.py files; regex=False matches the literal text 'setup.py'
# (by default the pattern is a regular expression in which '.' matches any character)
random_packages_subpatterns_setup_df = random_packages_subpatterns_df[random_packages_subpatterns_df['target'].str.contains('setup.py', regex=False)]

In [None]:
# Number of subpatterns per package
random_packages_subpatterns_setup_groupby = random_packages_subpatterns_setup_df.groupby('package')['subpattern']
print(f"Total number of subpatterns: {random_packages_subpatterns_setup_groupby.count().sum()}")
random_packages_subpatterns_setup_groupby.count().describe()

### Running time

In [None]:
# Uses the names this notebook actually defines: the `calculate_runtimes` helper and
# `random_packages_path`. This re-times every random package, which is slow.
random_packages_running_times = calculate_runtimes(random_packages_path)

In [None]:
pd.DataFrame(list(random_packages_running_times.values())).describe()

In [None]:
# Saving the raw result for faster analysis next time
with open(os.path.abspath("../results/pypi-malware-checks/random_packages_scanning_results_pypi_malwarechecks.pkl"), 'wb') as fp:
    pickle.dump(random_packages_scanning_results, fp, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Collect the number of alerts associated with the packages and transform the result to dataframe for analysis
random_rules_num_alerts = collect_rules_num_alerts(random_packages_scanning_results)
random_rules_num_alerts_df = pd.DataFrame(random_rules_num_alerts, columns=["package", "target", "rule", "num_alerts"])

In [None]:
# Descriptive statistics of number of alerts
random_rules_num_alerts_df.num_alerts.describe()

In [None]:
# Number of alerts per rule
print(random_rules_num_alerts_df.groupby('rule')['num_alerts'].sum())
print(f"Total: {random_rules_num_alerts_df.groupby('rule')['num_alerts'].sum().sum()}")

In [None]:
ax = random_rules_num_alerts_df.num_alerts.groupby(random_rules_num_alerts_df.package).nunique().hist(bins=20);
ax.set_xlabel('num packages');
ax.set_ylabel('num alerts');
ax.set_xticks(range(1, 100, 5), minor=False)
plt.show()

In [None]:
# Number of unique packages having a rule
print(random_rules_num_alerts_df.groupby('rule')['package'].nunique())
print(f"Total: {random_rules_num_alerts_df.groupby('rule')['package'].nunique().sum()}")

In [None]:
# Selecting only setup.py files
random_rules_num_alerts_setup_df = random_rules_num_alerts_df[random_rules_num_alerts_df['target'].str.contains('setup.py')]

In [None]:
# Number of alerts per rule
print(random_rules_num_alerts_setup_df.groupby('rule')['num_alerts'].sum())
print(f"Total: {random_rules_num_alerts_setup_df.groupby('rule')['num_alerts'].sum().sum()}")

In [None]:
# Number of unique packages having a rule
random_rules_num_alerts_setup_df.groupby('rule')['package'].nunique()

### ERRORS, NEED TO INVESTIGATE
/home/lyvd/bad-snakes/dataset/random-packages/clifton-jasmine.tar.gz [Errno 13] Permission denied: '/Users'
/home/lyvd/bad-snakes/dataset/random-packages/cexprtk-0.4.0-cp310-cp310-macosx_12_0_x86_64.whl

/home/lyvd/bad-snakes/dataset/popular-packages/sphinx-autoapi-1.8.4.tar.gz [Errno 13] Permission denied: '/home/lyvd/bad-snakes/dataset/popular-packages/sphinx-autoapi-1.8.4/tests/dotnetexample/example/Identity/.git/objects/pack/pack-fca9d88bcb5aeae361d4776558a15acd16ccab84.idx'
/home/lyvd/bad-snakes/dataset/popular-packages/tensorflow_gpu-2.9.1-cp310-cp310-win_amd64.whl File is not a zip file
/home/lyvd/bad-snakes/dataset/popular-packages/ansible-6.0.0/ansible_collections/f5networks/f5_modules/tests/unit/modules/network/f5/fixtures/fake_policy.tar.gz file could not be opened successfully:
- method gz: ReadError('invalid header')
- method bz2: ReadError('not a bzip2 file')
- method xz: ReadError('not an lzma file')
- method tar: ReadError('invalid header')
/home/lyvd/bad-snakes/dataset/popular-packages/twine-4.0.1/tests/fixtures/malformed.tar.gz Compressed file ended before the end-of-stream marker was reached

