In [None]:
import json
import os
import pandas as pd
import pickle
import subprocess
import time
from tqdm import tqdm
from functools import lru_cache
import tempfile
import shutil
from pathlib import Path

In [None]:
def _copy_python_sources(package_dir, dest_dir):
    """Copy every ``.py`` file under ``package_dir`` into ``dest_dir``, keeping its relative path."""
    for dirpath, _, filenames in os.walk(package_dir):
        for filename in filenames:
            if Path(filename).suffix != '.py':
                continue
            source = os.path.join(dirpath, filename)
            # Keeping the relative path stops same-named files in different folders
            # (e.g. several __init__.py) from overwriting one another in the copy.
            target = os.path.join(dest_dir, os.path.relpath(source, package_dir))
            os.makedirs(os.path.dirname(target), exist_ok=True)
            shutil.copy2(source, target)


def scanning_packages(package_releases_path, results_dir, running_times=None):
    """Run Bandit on every package directory and record how long each run takes.

    Each immediate sub-directory of ``package_releases_path`` is treated as one
    package. Its ``.py`` files are copied into a temporary directory, Bandit is
    run recursively on that copy, and the JSON report is written to
    ``<results_dir>/<package name>.json``. Only the Bandit invocation is timed,
    not the copy.

    Args:
        package_releases_path: Directory whose sub-directories are the packages.
        results_dir: Directory receiving one JSON report per package; created if
            it does not exist.
        running_times: Optional ``{package name: seconds}`` dict from an earlier
            run. Packages already present in it are skipped and new timings are
            added to it in place. A fresh dict is used when omitted.

    Returns:
        dict: Package name mapped to the seconds its Bandit run took.
    """
    if running_times is None:
        running_times = {}
    os.makedirs(results_dir, exist_ok=True)

    package_dirs = [entry.path for entry in os.scandir(package_releases_path) if entry.is_dir()]
    for package_dir in tqdm(package_dirs):
        # The directory's own name is the package name, whatever the depth of the dataset path.
        package_name = os.path.basename(os.path.normpath(package_dir))
        if package_name in running_times:
            continue
        print(package_name)
        with tempfile.TemporaryDirectory() as temp_dir:
            _copy_python_sources(package_dir, temp_dir)
            report_path = f"{os.path.join(results_dir, package_name)}.json"
            start_time = time.perf_counter()
            # The exit status is intentionally not checked.
            # NOTE(review): Bandit is expected to exit non-zero when it reports issues - confirm.
            subprocess.run(["bandit", "-r", temp_dir, "-f", "json", "-o", report_path])
            running_times[package_name] = time.perf_counter() - start_time
    return running_times

In [None]:
def parse_analysis_results(results_dir):
    """Collect every finding from the JSON reports stored under ``results_dir``.

    The directory tree is walked recursively. Only files ending in ``.json`` are
    read; the file name without that extension is used as the package name.

    Args:
        results_dir: Directory containing one ``<package name>.json`` report per package.

    Returns:
        list: One ``[package_name, filename, issue_text]`` entry for each item in
        a report's ``"results"`` list.

    Raises:
        json.JSONDecodeError: If a ``.json`` file does not contain valid JSON.
        KeyError: If a report has no ``"results"`` key, or a result lacks
            ``"filename"`` / ``"issue_text"``.
    """
    results = []
    for root, _, files in os.walk(results_dir):
        # Sorted so the order of the returned rows does not depend on the file system.
        for file in sorted(files):
            package_name, extension = os.path.splitext(file)
            if extension != ".json":
                # Stray files (e.g. .gitkeep, .DS_Store) are not reports and cannot be parsed.
                continue
            with open(os.path.join(root, file), "rb") as f:
                # Undecodable bytes are dropped rather than aborting the whole analysis.
                data = json.loads(f.read().decode("utf-8", errors='ignore'))
            for result in data["results"]:
                results.append([package_name, result["filename"], result['issue_text']])
    return results

In [None]:
def get_tp_fp(num_alerts_list, threshold):
    """Split alert counts into those above the threshold and the rest.

    Args:
        num_alerts_list: Alert counts, one per package.
        threshold: Cut-off; a count must be strictly greater to be counted as ``tp``.

    Returns:
        tuple: ``(tp, fp)`` where ``tp`` is the number of counts greater than
        ``threshold`` and ``fp`` is the number of remaining counts.
    """
    above_threshold = sum(1 for num_alerts in num_alerts_list if num_alerts > threshold)
    return (above_threshold, len(num_alerts_list) - above_threshold)

## Malicious packages

In [None]:
# Absolute paths of the malicious-packages dataset and of the directory for its Bandit results.
malicious_packages_path, malicious_results_dir = map(
    os.path.abspath,
    ("../dataset/malicious-packages/", "../results/bandit4mal/malicious/"),
)

In [None]:
# Scan the malicious packages; the returned value is read as a
# {package: running time} mapping by the next cell.
malicious_packages_running_time = scanning_packages(
    package_releases_path=malicious_packages_path,
    results_dir=malicious_results_dir,
)

In [None]:
# One row per package with its recorded running time, followed by summary statistics.
malicious_packages_running_time_df = pd.DataFrame(
    data=list(malicious_packages_running_time.items()),
    columns=['package', 'running_time'],
)
malicious_packages_running_time_df.loc[:, "running_time"].describe()

In [None]:
# Write the malicious-package running times to CSV.
malicious_packages_running_time_df.to_csv(
    path_or_buf="../results/running_times/bandit4mal/malicious-packages.csv",
)

In [None]:
# Parse the Bandit reports of the malicious packages. The scan itself already ran
# in the scanning_packages(...) cell above, so it is not repeated here; parsing
# uses parse_analysis_results, the parser this notebook defines.
malicious_results_path = os.path.abspath("../results/bandit4mal/malicious/")
malicious_results = parse_analysis_results(malicious_results_path)

### Triggered rules in all Python files in malicious packages

In [None]:
# Tabulate the parsed findings for analysis.
malicious_results_df = pd.DataFrame(
    data=malicious_results,
    columns=["package", "target", "rule"],
)

In [None]:
# Rules triggered per package: overall total first, then the per-package distribution.
malicious_packages_rules_groupby = malicious_results_df.groupby('package')['rule']
malicious_rules_per_package = malicious_packages_rules_groupby.count()
print(f"Total number of rules: {malicious_rules_per_package.sum()}")
malicious_rules_per_package.describe()

### Triggered rules in all setup.py files in malicious packages

In [None]:
# Keep only the findings whose target mentions setup.py. regex=False makes the
# match literal; treated as a regular expression, the "." would match any character.
malicious_packages_rules_setup_df = malicious_results_df[
    malicious_results_df['target'].str.contains('setup.py', regex=False)
]

In [None]:
# Rules triggered per package in setup.py files: total first, then the distribution.
malicious_packages_rules_setup_groupby = malicious_packages_rules_setup_df.groupby('package')['rule']
malicious_setup_rules_per_package = malicious_packages_rules_setup_groupby.count()
print(f"Total number of rules: {malicious_setup_rules_per_package.sum()}")
malicious_setup_rules_per_package.describe()

In [None]:
# Write the parsed malicious-package findings to disk as a pickle.
malicious_pickle_path = os.path.abspath("../results/malicious_packages_scanning_results_bandit4mal.pkl")
with open(malicious_pickle_path, mode='wb') as pickle_file:
    pickle.dump(malicious_results, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

## Popular packages

In [None]:
# Absolute paths of the popular-packages dataset and of the directory for its Bandit results.
popular_packages_path, popular_results_dir = map(
    os.path.abspath,
    ("../dataset/popular-packages/", "../results/bandit4mal/popular/"),
)

In [None]:
# Scanning popular packages and measuring the running time.
# scanning_packages is the scanner this notebook defines (the same call the
# malicious and random sections use). The result is bound to
# popular_packages_running_times, the name the DataFrame cell below reads.
popular_packages_running_times = scanning_packages(popular_packages_path, popular_results_dir)

In [None]:
# One row per popular package with its recorded running time.
popular_packages_running_times_df = pd.DataFrame(
    data=list(popular_packages_running_times.items()),
    columns=['package', 'running_time'],
)

In [None]:
# Write the popular-package running times to CSV.
popular_packages_running_times_df.to_csv(
    path_or_buf="../results/running_times/bandit4mal/popular-packages.csv",
)

In [None]:
# Summary statistics of the popular-package running times.
popular_packages_running_times_df.loc[:, "running_time"].describe()

In [None]:
# Parse the Bandit reports written for the popular packages.
popular_results_path = os.path.abspath("../results/bandit4mal/popular/")
popular_results = parse_analysis_results(results_dir=popular_results_path)

### Triggered rules in all Python files in popular packages

In [None]:
# Tabulate the parsed findings for analysis.
popular_results_df = pd.DataFrame(
    data=popular_results,
    columns=["package", "target", "rule"],
)

In [None]:
# Rules triggered per package: overall total first, then the per-package distribution.
popular_packages_rules_groupby = popular_results_df.groupby('package')['rule']
popular_rules_per_package = popular_packages_rules_groupby.count()
print(f"Total number of rules: {popular_rules_per_package.sum()}")
popular_rules_per_package.describe()

In [None]:
# For each threshold, print: threshold, packages above it (tp), remaining packages (fp).
thresholds = [1, 5, 10, 15, 20, 25, 30]
scores = popular_packages_rules_groupby.count().to_list()
thesholds_tpr_fpr_ratio = []  # left empty by this cell
for threshold in thresholds:
    print(threshold, *get_tp_fp(scores, threshold))

### Triggered rules in all setup.py files in popular packages

In [None]:
# Keep only the findings whose target mentions setup.py. regex=False makes the
# match literal; treated as a regular expression, the "." would match any character.
popular_packages_rules_setup_df = popular_results_df[
    popular_results_df['target'].str.contains('setup.py', regex=False)
]

In [None]:
# Rules triggered per package in setup.py files: total first, then the distribution.
popular_packages_rules_setup_groupby = popular_packages_rules_setup_df.groupby('package')['rule']
popular_setup_rules_per_package = popular_packages_rules_setup_groupby.count()
print(f"Total number of rules: {popular_setup_rules_per_package.sum()}")
popular_setup_rules_per_package.describe()

In [None]:
# For each threshold, print: threshold, packages above it (tp), remaining packages (fp).
# The scores come from the setup.py groupby, not the all-files one, so this cell
# reports on the setup.py findings its section heading describes.
thresholds = [1, 2, 3, 4, 5]
scores = popular_packages_rules_setup_groupby.count().to_list()
thesholds_tpr_fpr_ratio = []
for t in thresholds:
    tp, fp = get_tp_fp(scores, t)
    print(t, tp, fp)

In [None]:
# Write the parsed popular-package findings to disk as a pickle.
popular_pickle_path = os.path.abspath("../results/bandit4mal/popular-packages.pkl")
with open(popular_pickle_path, mode='wb') as pickle_file:
    pickle.dump(popular_results, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

## Random packages

In [None]:
# Absolute paths of the random-packages dataset and of the directory for its Bandit results.
random_packages_path, random_results_dir = map(
    os.path.abspath,
    ("../dataset/random-packages/", "../results/bandit4mal/random/"),
)

In [None]:
# Scan the random packages; the returned value is read as a
# {package: running time} mapping by the next cell.
random_packages_running_time = scanning_packages(
    package_releases_path=random_packages_path,
    results_dir=random_results_dir,
)

In [None]:
# One row per package with its recorded running time, followed by summary statistics.
random_packages_running_times_df = pd.DataFrame(
    data=list(random_packages_running_time.items()),
    columns=['package', 'running_time'],
)
random_packages_running_times_df.loc[:, "running_time"].describe()

In [None]:
# Write the random-package running times to CSV.
random_packages_running_times_df.to_csv(
    path_or_buf="../results/running_times/bandit4mal/random-packages.csv",
)

In [None]:
# Parse the Bandit reports written for the random packages.
random_results_path = os.path.abspath("../results/bandit4mal/random/")
random_results = parse_analysis_results(results_dir=random_results_path)

### Triggered rules in all Python files in random packages

In [None]:
# Tabulate the parsed findings for analysis.
random_results_df = pd.DataFrame(
    data=random_results,
    columns=["package", "target", "rule"],
)

In [None]:
# Rules triggered per package: overall total first, then the per-package distribution.
random_packages_rules_groupby = random_results_df.groupby('package')['rule']
random_rules_per_package = random_packages_rules_groupby.count()
print(f"Total number of rules: {random_rules_per_package.sum()}")
random_rules_per_package.describe()

In [None]:
# Ratio of true positives to false positives, per threshold.
# get_tp_fp is the helper this notebook defines; it returns the two counts.
thresholds = [1, 5, 10, 15, 20, 25, 30]
scores = random_packages_rules_groupby.count().to_list()
thesholds_tpr_fpr_ratio = []
for t in thresholds:
    tp, fp = get_tp_fp(scores, t)
    # fp == 0 means no package is at or below the threshold; avoid dividing by zero.
    ratio = round(tp / fp, 2) if fp else float("inf")
    print(t, ratio)

### Triggered rules in all setup.py files in random packages

In [None]:
# Selecting only the findings whose target mentions setup.py. regex=False makes the
# match literal; treated as a regular expression, the "." would match any character.
random_packages_rules_setup_df = random_results_df[
    random_results_df['target'].str.contains('setup.py', regex=False)
]

In [None]:
# Rules triggered per package in setup.py files: total first, then the distribution.
random_packages_rules_setup_groupby = random_packages_rules_setup_df.groupby('package')['rule']
random_setup_rules_per_package = random_packages_rules_setup_groupby.count()
print(f"Total number of rules: {random_setup_rules_per_package.sum()}")
random_setup_rules_per_package.describe()

In [None]:
# For each threshold, print: threshold, packages above it (tp), remaining packages (fp).
thresholds = [1, 2, 3, 4, 5]
scores = random_packages_rules_setup_groupby.count().to_list()
thesholds_tpr_fpr_ratio = []  # left empty by this cell
for threshold in thresholds:
    print(threshold, *get_tp_fp(scores, threshold))

In [None]:
# Write the parsed random-package findings to disk as a pickle.
random_pickle_path = os.path.abspath("../results/bandit4mal/random-packages.pkl")
with open(random_pickle_path, mode='wb') as pickle_file:
    pickle.dump(random_results, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)