In [11]:
import os
import pandas as pd
import pickle
from typing import Dict, List

In [7]:
# Collecting subpatterns of the alerts of the packages
def collect_subpatterns(packages_scanning_results: Dict[str, List[dict]]) -> List[list]:
    """Flatten per-package scanning results into one row per matched subpattern.

    Args:
        packages_scanning_results: Mapping of package name to an iterable of
            alert records. Each record is read through the keys ``"target"``,
            ``"rule"`` and ``"strings"``; ``"strings"`` holds the matched
            subpatterns, each of which is indexed at positions 0, 1 and 2.

    Returns:
        A list of rows of the form
        ``[package, target, rule, match[0], match[1], match[2]]``, in the
        iteration order of the input. Records whose ``"strings"`` value is
        falsy (empty or ``None``) contribute no rows.

    Raises:
        KeyError: If a record has no ``"strings"`` key, or has matches but
            lacks ``"target"`` or ``"rule"``.
    """
    packages_subpatterns = []
    for package, files in packages_scanning_results.items():
        for file_ in files:
            # A falsy "strings" value simply yields no rows for this record.
            for match in file_["strings"] or ():
                packages_subpatterns.append(
                    [package, file_["target"], file_["rule"], match[0], match[1], match[2]]
                )
    return packages_subpatterns

In [8]:
# Load the existing scanning results for the malicious packages.
# The relative path is resolved against the current working directory.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load this file if it comes from a trusted source.
with open(os.path.abspath("../results/malicious_packages_scanning_results_pypi_malwarechecks.pkl"), 'rb') as fp:
    malicious_packages_scanning_results = pickle.load(fp)

In [12]:
# Flatten the malicious-package scanning results into one row per matched subpattern.
malicious_subpatterns = collect_subpatterns(
    packages_scanning_results=malicious_packages_scanning_results,
)

In [15]:
# Tabulate the flattened rows: one DataFrame row per matched subpattern.
# NOTE(review): some "line_number" values are 0, which suggests these may be
# match offsets rather than line numbers — confirm against the scanner output.
malicious_subpatterns_df = pd.DataFrame(
    data=malicious_subpatterns,
    columns=[
        "package",
        "target",
        "rule",
        "line_number",
        "subpattern",
        "line_content",
    ],
)

In [16]:
# Preview the first rows of the malicious-package subpattern table.
malicious_subpatterns_df.head()

Unnamed: 0,package,target,rule,line_number,subpattern,line_content
0,important-package,/home/lyvd/bad-snakes/dataset/malicious-packag...,deserialization_in_setup,63,$import_base64,b'import base64'
1,important-package,/home/lyvd/bad-snakes/dataset/malicious-packag...,metaprogramming_in_setup,613,$dir_call,b'dir()'
2,important-package,/home/lyvd/bad-snakes/dataset/malicious-packag...,networking_in_setup,382,$from_socket_import,b'from socket import'
3,important-package,/home/lyvd/bad-snakes/dataset/malicious-packag...,networking_in_setup,0,$from_urllib_import,b'from urllib import'
4,important-package,/home/lyvd/bad-snakes/dataset/malicious-packag...,networking_in_setup,49,$import_socket,b'import socket'


In [23]:
# Ten most frequent subpatterns matched in the malicious packages
malicious_subpatterns_df["subpattern"].value_counts().head(10)

$assign_alias_os            2693
$alias_system               1051
$getattr_call                983
$alias_Popen                 780
$dir_call                    638
$assign_alias_subprocess     627
$bare_system                 354
$alias_check_call            320
$obfuscation_chr             309
$import_urllib_sub           293
Name: subpattern, dtype: int64

In [24]:
# Absolute path to the popular packages dataset (resolved from the current working directory).
# NOTE(review): this variable is not referenced by any other cell visible here — confirm it is still needed.
popular_packages_path = os.path.abspath("../dataset/popular-packages")

In [25]:
# Load the existing scanning results for the popular packages.
# The relative path is resolved against the current working directory.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load this file if it comes from a trusted source.
with open(os.path.abspath("../results/popular_packages_scanning_results_pypi_malwarechecks.pkl"), 'rb') as fp:
    popular_packages_scanning_results = pickle.load(fp)

In [26]:
# Flatten the popular-package scanning results into one row per matched subpattern.
popular_subpatterns = collect_subpatterns(
    packages_scanning_results=popular_packages_scanning_results,
)

In [27]:
# Tabulate the flattened rows: one DataFrame row per matched subpattern.
popular_subpatterns_df = pd.DataFrame(
    data=popular_subpatterns,
    columns=[
        "package",
        "target",
        "rule",
        "line_number",
        "subpattern",
        "line_content",
    ],
)

In [28]:
# Ten most frequent subpatterns matched in the popular packages
popular_subpatterns_df["subpattern"].value_counts().head(10)

$assign_alias_os            56271
$dir_call                   55383
$getattr_call               26103
$alias_exec                 19810
$assign_alias_subprocess    18646
$alias_run                  16493
$bare_run                   14841
$alias_call                 14079
$vars_call                  13335
$bare_exec                  13228
Name: subpattern, dtype: int64

In [30]:
# Load the existing scanning results for the random packages.
# The relative path is resolved against the current working directory.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load this file if it comes from a trusted source.
with open(os.path.abspath("../results/random_packages_scanning_results_pypi_malwarechecks.pkl"), 'rb') as fp:
    random_packages_scanning_results = pickle.load(fp)

In [31]:
# Flatten the random-package scanning results into one row per matched subpattern.
random_subpatterns = collect_subpatterns(
    packages_scanning_results=random_packages_scanning_results,
)

In [32]:
# Tabulate the flattened rows: one DataFrame row per matched subpattern.
random_subpatterns_df = pd.DataFrame(
    data=random_subpatterns,
    columns=[
        "package",
        "target",
        "rule",
        "line_number",
        "subpattern",
        "line_content",
    ],
)

In [33]:
# Ten most frequent subpatterns matched in the random packages
random_subpatterns_df["subpattern"].value_counts().head(10)

$dir_call                   8378
$assign_alias_os            7753
$alias_system               6001
$bare_system                5375
$alias_exec                 2716
$getattr_call               2485
$assign_alias_subprocess    2338
$alias_run                  1630
$alias_Popen                1621
$bare_exec                  1569
Name: subpattern, dtype: int64