In [1]:
# Importing necessary libraries
import os
import pickle
from typing import Dict
from typing import Iterator
from typing import List
from typing import Tuple

import pandas as pd

In [2]:
# Collecting number of rules of the packages
def parse_analysis_result(packages_scanning_results: Dict[str, List[dict]]) -> Iterator[Tuple[str, str, str, int]]:
    """Flatten per-package scanning results into one row per scanned entry.

    This is a generator: rows are produced lazily and can be consumed only once.

    Args:
        packages_scanning_results: Mapping from package name to an iterable of
            entries. Each entry must support ``entry['rule']`` and
            ``entry['target']`` (a missing key raises ``KeyError``).

    Yields:
        ``(package, target, rule, has_rule)`` tuples. When the entry's rule is
        truthy, ``rule`` is passed through unchanged and ``has_rule`` is 1;
        otherwise ``rule`` is the empty string and ``has_rule`` is 0.
        NOTE(review): the rule value is assumed to be a string identifier —
        confirm against the scanner output.
    """
    for package, files in packages_scanning_results.items():
        for file_ in files:
            rule = file_['rule']
            if rule:
                # The entry triggered a rule: flag it with 1.
                yield (package, file_["target"], rule, 1)
            else:
                # No rule for this entry: normalise the rule to "" and flag it with 0.
                yield (package, file_["target"], "", 0)

## Analyzing Malicious packages

In [3]:
# Load the previously saved scanning results for the malicious packages.
# NOTE(review): pickle.load can execute arbitrary code; only load result files
# produced by a trusted run of the scanner.
with open(
    os.path.abspath(os.path.join("results", "malicious_packages.pkl")),
    mode="rb",
) as fp:
    malicious_packages_scanning_results = pickle.load(fp)

### Triggered rules and subpatterns in all Python files in malicious packages

In [4]:
# Collect the alerts and transform them into a dataframe.
# parse_analysis_result returns a generator; the DataFrame constructor consumes it.
malicious_packages_rules = parse_analysis_result(malicious_packages_scanning_results)
malicious_packages_rules_df = pd.DataFrame(
    data=malicious_packages_rules,
    columns=["package", "target", "rule", "has_rule"],
)

In [5]:
# Number of rules per package: sum the 0/1 `has_rule` flag within each package.
malicious_packages_rules_groupby = malicious_packages_rules_df.groupby("package")["has_rule"]
malicious_packages_rules_per_package = malicious_packages_rules_groupby.sum()
print(f"Total number of rules: {malicious_packages_rules_per_package.sum()}")
# Summary statistics of the per-package counts (displayed as the cell output).
malicious_packages_rules_per_package.describe()

Total number of rules: 850


count    168.000000
mean       5.059524
std       12.944702
min        0.000000
25%        1.000000
50%        2.000000
75%        4.000000
max      118.000000
Name: has_rule, dtype: float64

### Triggered rules and subpatterns in all setup.py files in malicious packages

In [6]:
# Selecting only setup.py files.
# `str.contains` interprets its pattern as a regular expression by default, so
# the "." in 'setup.py' would match any character (e.g. "setup_py");
# regex=False makes it a literal substring match.
# NOTE(review): this is still a substring test, so targets such as
# "test_setup.py" or "setup.pyc" are also selected — confirm that is intended.
malicious_packages_rules_setup_df = malicious_packages_rules_df[
    malicious_packages_rules_df['target'].str.contains('setup.py', regex=False)
]

In [7]:
# Number of rules per package, restricted to the setup.py rows:
# sum the 0/1 `has_rule` flag within each package.
malicious_packages_rules_setup_groupby = malicious_packages_rules_setup_df.groupby("package")["has_rule"]
malicious_packages_rules_setup_per_package = malicious_packages_rules_setup_groupby.sum()
print(f"Total number of rules: {malicious_packages_rules_setup_per_package.sum()}")
# Summary statistics of the per-package counts (displayed as the cell output).
malicious_packages_rules_setup_per_package.describe()

Total number of rules: 175


count    163.000000
mean       1.073620
std        1.057357
min        0.000000
25%        0.000000
50%        1.000000
75%        2.000000
max        3.000000
Name: has_rule, dtype: float64

In [8]:
# Keep only the setup.py rows with at least one alert (`has_rule` is non-zero).
malicious_packages_setup_has_rules_df = malicious_packages_rules_setup_df.loc[
    malicious_packages_rules_setup_df["has_rule"].ne(0)
]

In [9]:
# Number of rules per package, over the setup.py rows that triggered a rule:
# sum the `has_rule` flag within each package.
malicious_packages_setup_has_rules_groupby = malicious_packages_setup_has_rules_df.groupby("package")["has_rule"]
malicious_packages_setup_has_rules_per_package = malicious_packages_setup_has_rules_groupby.sum()
print(f"Total number of rules: {malicious_packages_setup_has_rules_per_package.sum()}")
# Summary statistics of the per-package counts (displayed as the cell output).
malicious_packages_setup_has_rules_per_package.describe()

Total number of rules: 175


count    99.000000
mean      1.767677
std       0.780218
min       1.000000
25%       1.000000
50%       2.000000
75%       2.000000
max       3.000000
Name: has_rule, dtype: float64

## Analyzing Popular packages

In [10]:
# Load the previously saved scanning results for the popular packages, so the
# packages do not have to be rescanned.
# NOTE(review): pickle.load can execute arbitrary code; only load result files
# produced by a trusted run of the scanner.
with open(
    os.path.abspath(os.path.join("results", "popular_packages.pkl")),
    mode="rb",
) as fp:
    popular_packages_scanning_results = pickle.load(fp)

### Triggered rules and subpatterns in all Python files in popular packages

In [11]:
# Collect the alerts and transform them into a dataframe.
# parse_analysis_result returns a generator; the DataFrame constructor consumes it.
popular_packages_rules = parse_analysis_result(popular_packages_scanning_results)
popular_packages_rules_df = pd.DataFrame(
    data=popular_packages_rules,
    columns=["package", "target", "rule", "has_rule"],
)

In [12]:
# Number of rules per package: sum the 0/1 `has_rule` flag within each package.
popular_packages_rules_groupby = popular_packages_rules_df.groupby("package")["has_rule"]
popular_packages_rules_per_package = popular_packages_rules_groupby.sum()
print(f"Total number of rules: {popular_packages_rules_per_package.sum()}")
# Summary statistics of the per-package counts (displayed as the cell output).
popular_packages_rules_per_package.describe()

Total number of rules: 51283


count    1430.000000
mean       35.862238
std       163.910398
min         0.000000
25%         3.000000
50%         8.000000
75%        27.000000
max      5377.000000
Name: has_rule, dtype: float64

### Triggered rules in all setup.py files in popular packages

In [13]:
# Selecting only setup.py files.
# `str.contains` interprets its pattern as a regular expression by default, so
# the "." in 'setup.py' would match any character (e.g. "setup_py");
# regex=False makes it a literal substring match.
# NOTE(review): this is still a substring test, so targets such as
# "test_setup.py" or "setup.pyc" are also selected — confirm that is intended.
popular_packages_rules_setup_df = popular_packages_rules_df[
    popular_packages_rules_df['target'].str.contains('setup.py', regex=False)
]

In [14]:
# Number of rules per package, restricted to the setup.py rows:
# sum the 0/1 `has_rule` flag within each package.
popular_packages_rules_setup_groupby = popular_packages_rules_setup_df.groupby("package")["has_rule"]
popular_packages_rules_setup_per_package = popular_packages_rules_setup_groupby.sum()
print(f"Total number of rules: {popular_packages_rules_setup_per_package.sum()}")
# Summary statistics of the per-package counts (displayed as the cell output).
popular_packages_rules_setup_per_package.describe()

Total number of rules: 768


count    1320.000000
mean        0.581818
std         1.568541
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max        45.000000
Name: has_rule, dtype: float64

In [15]:
# Keep only the setup.py rows of popular packages with at least one alert
# (`has_rule` is non-zero).
popular_packages_setup_has_rules_df = popular_packages_rules_setup_df.loc[
    popular_packages_rules_setup_df["has_rule"].ne(0)
]

In [16]:
# Number of rules per package, over the setup.py rows that triggered a rule:
# sum the `has_rule` flag within each package.
popular_packages_setup_has_rules_groupby = popular_packages_setup_has_rules_df.groupby("package")["has_rule"]
popular_packages_setup_has_rules_per_package = popular_packages_setup_has_rules_groupby.sum()
print(f"Total number of rules: {popular_packages_setup_has_rules_per_package.sum()}")
# Summary statistics of the per-package counts (displayed as the cell output).
popular_packages_setup_has_rules_per_package.describe()

Total number of rules: 768


count    474.000000
mean       1.620253
std        2.274802
min        1.000000
25%        1.000000
50%        1.000000
75%        2.000000
max       45.000000
Name: has_rule, dtype: float64

## Analyzing Random packages

### Triggered rules in all Python files in random packages

In [17]:
# Load the previously saved scanning results for the random packages.
# NOTE(review): pickle.load can execute arbitrary code; only load result files
# produced by a trusted run of the scanner.
with open(
    os.path.abspath(os.path.join("results", "random_packages.pkl")),
    mode="rb",
) as fp:
    random_packages_scanning_results = pickle.load(fp)

In [18]:
# Collect the alerts and transform them into a dataframe.
# parse_analysis_result returns a generator; the DataFrame constructor consumes it.
random_packages_rules = parse_analysis_result(random_packages_scanning_results)
random_packages_rules_df = pd.DataFrame(
    data=random_packages_rules,
    columns=["package", "target", "rule", "has_rule"],
)

In [19]:
# Number of rules per package: sum the 0/1 `has_rule` flag within each package.
random_packages_rules_groupby = random_packages_rules_df.groupby("package")["has_rule"]
random_packages_rules_per_package = random_packages_rules_groupby.sum()
print(f"Total number of rules: {random_packages_rules_per_package.sum()}")
# Summary statistics of the per-package counts (displayed as the cell output).
random_packages_rules_per_package.describe()

Total number of rules: 5460


count    986.000000
mean       5.537525
std       31.223431
min        0.000000
25%        0.000000
50%        1.000000
75%        4.000000
max      728.000000
Name: has_rule, dtype: float64

### Triggered rules in all setup.py files in random packages

In [20]:
# Selecting only setup.py files.
# `str.contains` interprets its pattern as a regular expression by default, so
# the "." in 'setup.py' would match any character (e.g. "setup_py");
# regex=False makes it a literal substring match.
# NOTE(review): this is still a substring test, so targets such as
# "test_setup.py" or "setup.pyc" are also selected — confirm that is intended.
random_packages_rules_setup_df = random_packages_rules_df[
    random_packages_rules_df['target'].str.contains('setup.py', regex=False)
]

In [21]:
# Number of rules per package, restricted to the setup.py rows:
# sum the 0/1 `has_rule` flag within each package.
random_packages_rules_setup_groupby = random_packages_rules_setup_df.groupby("package")["has_rule"]
random_packages_rules_setup_per_package = random_packages_rules_setup_groupby.sum()
print(f"Total number of rules: {random_packages_rules_setup_per_package.sum()}")
# Summary statistics of the per-package counts (displayed as the cell output).
random_packages_rules_setup_per_package.describe()

Total number of rules: 197


count    868.000000
mean       0.226959
std        0.633812
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max        9.000000
Name: has_rule, dtype: float64

In [22]:
# Keep only the setup.py rows of random packages with at least one alert
# (`has_rule` is non-zero).
random_packages_setup_has_rules_df = random_packages_rules_setup_df.loc[
    random_packages_rules_setup_df["has_rule"].ne(0)
]

In [23]:
# Number of rules per package, over the setup.py rows that triggered a rule:
# sum the `has_rule` flag within each package.
random_packages_rules_setup_has_rules_groupby = random_packages_setup_has_rules_df.groupby("package")["has_rule"]
random_packages_rules_setup_has_rules_per_package = random_packages_rules_setup_has_rules_groupby.sum()
print(f"Total number of rules: {random_packages_rules_setup_has_rules_per_package.sum()}")
# Summary statistics of the per-package counts (displayed as the cell output).
random_packages_rules_setup_has_rules_per_package.describe()

Total number of rules: 197


count    147.000000
mean       1.340136
std        0.939955
min        1.000000
25%        1.000000
50%        1.000000
75%        1.000000
max        9.000000
Name: has_rule, dtype: float64