In [200]:
# pip install treeinterpreter
!pip install lime

Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: lime
  Building wheel for lime (setup.py) ... [?25ldone
[?25h  Created wheel for lime: filename=lime-0.2.0.1-py3-none-any.whl size=283913 sha256=07993a0c6ffb700bd489c2415218ad518da07a6bf510262c07eddc7e982ef3a3
  Stored in directory: /root/.cache/pip/wheels/85/fa/a3/9c2d44c9f3cd77cf4e533b58900b2bf4487f2a17e8ec212a3d
Successfully built lime
Installing collected packages: lime
Successfully installed lime-0.2.0.1
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [5]:
#token Extraction

# Sample JavaScript API keywords (this list can be expanded)
JS_API_KEYWORDS = {"document", "window", "console", "Array", "Object", "Function", "setTimeout", "clearInterval"}

def tokenize_js(js_code):
    """Tokenize JavaScript code using esprima."""
    tokens = esprima.tokenize(js_code)
    return [token.value for token in tokens]

def filter_tokens(js_codes, frequency_threshold=50):
    """Filter tokens based on frequency threshold and JS API keywords."""
    all_tokens = []
    
    # Tokenize all scripts
    for code in js_codes:
        all_tokens.extend(tokenize_js(code))
    
    # Compute token frequencies
    token_counts = Counter(all_tokens)
    
    # Filter tokens
    filtered_tokens = {token for token, count in token_counts.items() if count >= frequency_threshold or token in JS_API_KEYWORDS}
    
    return filtered_tokens

# Example usage
js_scripts = [
    "console.log('Hello, World!'); var x = 10; function test() { return x; }",
    "window.alert('Test'); let y = 20; Object.keys({a:1, b:2});"
]

filtered_tokens = filter_tokens(js_scripts)
print(filtered_tokens)


{'Object', 'console', 'window'}


In [40]:
#AST feature extraction

class JSASTFeatureExtractor:
    def __init__(self):
        self.js_apis = {
            'console', 'log', 'localStorage', 'setItem',
            'test', 'x', 'y', 'key', 'value',  # Identifiers from your code
            'document', 'window', 'Array', 'String'  # Common JS APIs
        }
    
    def extract_features(self, js_code):
        try:
            ast = esprima.parseScript(js_code)
            features = []
            self._traverse_ast(ast, None, features)
            return self._filter_features(features)
        except Exception as e:
            print(f"Error parsing JavaScript: {str(e)}")
            return []
    
    def _traverse_ast(self, node, parent, features):
        # Process current node
        if hasattr(node, 'type'):
            node_type = node.type
            # Capture identifiers and literals with their parent context
            if node_type == 'Identifier':
                if parent and hasattr(parent, 'type'):
                    features.append(f"{parent.type}:{node.name}")
            elif node_type == 'Literal':
                if parent and hasattr(parent, 'type'):
                    features.append(f"{parent.type}:{node.value}")
            
            # Recursively process child nodes
            for child in self._get_child_nodes(node):
                self._traverse_ast(child, node, features)
    
    def _get_child_nodes(self, node):
        children = []
        if not hasattr(node, '__dict__'):
            return children
            
        # Handle different node structures
        for attr, value in vars(node).items():
            if attr.startswith('_') or attr in ['type', 'loc', 'range']:
                continue
                
            if isinstance(value, list):
                children.extend([item for item in value if hasattr(item, 'type')])
            elif hasattr(value, 'type'):
                children.append(value)
        
        return children
    
    def _filter_features(self, features):
        return [f for f in features if any(api in f.split(':')[1] for api in self.js_apis)]
    
    def get_feature_vector(self, js_code):
        features = self.extract_features(js_code)
        feature_counts = defaultdict(int)
        for feature in features:
            feature_counts[feature] += 1
        return dict(feature_counts)

# Test with your code
extractor = JSASTFeatureExtractor()
js_code = """
    function test() {
        var x = 10;
        var y = 20;
        console.log(x + y);
        if (x > 5) {
            console.log('X is greater than 5');
        } else {
            console.log('X is not greater than 5');
        }
        while (y > 0) {
            y--;
        }
    }
    """

# print("All features (unfiltered):")
ast = esprima.parseScript(js_code)
features = []
extractor._traverse_ast(ast, None, features)
# for f in features:
#     print(f)

# print("\nFiltered features:")
filtered = extractor._filter_features(features)
# for f in filtered:
#     print(f)

print("\nFeature vector:", extractor.get_feature_vector(js_code))


Feature vector: {'FunctionDeclaration:test': 1, 'VariableDeclarator:x': 1, 'VariableDeclarator:y': 1, 'MemberExpression:console': 3, 'MemberExpression:log': 3, 'BinaryExpression:x': 2, 'BinaryExpression:y': 2, 'UpdateExpression:y': 1}


In [44]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import esprima
from collections import Counter, defaultdict

# Define the AST feature extractor
class JSASTFeatureExtractor:
    def __init__(self):
        self.js_apis = {
            'console', 'log', 'localStorage', 'setItem',
            'document', 'window', 'Array', 'String'
        }

    def extract_features(self, js_code):
        """Extract features from the provided JavaScript code."""
        try:
            ast = esprima.parseScript(js_code)
            features = []
            self._traverse_ast(ast.body, features)
            return self._filter_features(features)
        except Exception as e:
            print(f"Error parsing JavaScript: {str(e)}")
            return []

    def _traverse_ast(self, nodes, features):
        """Traverse the AST to create features based on statements."""
        for node in nodes:
            if hasattr(node, 'type'):
                node_type = node.type
                if node_type in ['Identifier', 'Literal']:
                    statement = self._get_statement(node)
                    features.append(statement)
                # Additional handling for control flow constructs
                if node_type == 'IfStatement':
                    features.append(f"If: {self._get_statement(node.test)}")
                    self._traverse_ast([node.consequent], features)
                    if node.alternate:
                        self._traverse_ast([node.alternate], features)

    def _get_statement(self, node):
        """Get a string representation of a statement for the features."""
        if node.type == 'Identifier':
            return f"Identifier: {node.name}"
        elif node.type == 'Literal':
            return f"Literal: {node.value}"
        return "Unknown Statement"

    def _filter_features(self, features):
        """Filter features to include only those matching JavaScript APIs."""
        return [f for f in features if any(api in f for api in self.js_apis)]

# Token feature extraction function
def extract_token_features(js_code):
    """Tokenize JavaScript code and filter tokens based on frequency."""
    tokens = esprima.tokenize(js_code)
    token_counts = Counter(token.value for token in tokens)

    # Filter tokens based on frequency (keeping top 20%)
    frequency_threshold = 0.2 * len(token_counts)  # Change this if needed
    filtered_tokens = {token for token, count in token_counts.items() if count >= frequency_threshold}

    return filtered_tokens

# Function to extract combined features
def extract_combined_features(js_code):
    """Extract combined features from both AST and token features."""
    ast_extractor = JSASTFeatureExtractor()
    ast_features = ast_extractor.extract_features(js_code)
    print(ast_features)
    
    # Extract token features
    token_features = extract_token_features(js_code)

    # Combine features (here we use counts)
    combined_features = Counter(ast_features + list(token_features))
    
    # Convert combined features to a fixed-size vector
    feature_vector = np.zeros(256)  # Define a fixed size for the feature vector
    for i, feature in enumerate(combined_features):
        if i < 256:  # Limit to 256 dimensions
            feature_vector[i] = combined_features[feature]
    
    return feature_vector

# Sample JavaScript code snippets (replace with your dataset)
js_code_samples = [
    "console.log('Hello, World!'); var x = 10;",
    "window.alert('Test'); let y = 20; Object.keys({a:1, b:2});",
    "var a = b + c; if (a > 10) { console.log(a); }"
]

# Assume we have the following labels for the above snippets
labels = [0, 1, 0]  # 0 = benign, 1 = malicious (example labels)

# Prepare feature vectors
feature_vectors = []
for code in js_code_samples:
    feature_vector = extract_combined_features(code)
    feature_vectors.append(feature_vector)

# Convert to NumPy array for machine learning
X = np.array(feature_vectors)

# Step 3: Prepare the Dataset
y = np.array(labels)  # Labels for the training

# Step 4: Model Training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Step 5: Model Evaluation
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


[]
[]
[]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1

    accuracy                           1.00         1
   macro avg       1.00      1.00      1.00         1
weighted avg       1.00      1.00      1.00         1



In [201]:
####################################
# ML model with deppfpd features
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support
from sklearn.metrics import accuracy_score, precision_score, recall_score
import pandas as pd

import joblib
import json

# import shap
from lime.lime_text import LimeTextExplainer

In [2]:
with open('DeepFPD/DeepFPD/exp_n/fusion/train_data_styx/ast.npy','rb') as f:
    ast_data=np.load(f,allow_pickle=True)

In [338]:
with open('DeepFPD-Code/train/process_ast2/ast.npy','rb') as f:
    ast_with_wasm=np.load(f,allow_pickle=True)

In [4]:
#feature importance

# Load test AST sequences and true labels from the same file
test_data = ast_data

# Separate sequences and labels
test_labels = []
test_sequences = []

for entry in test_data:
    label, seq = entry
    test_labels.append(label)
    test_sequences.append(' '.join(map(str, seq)))

# Load the saved vectorizer and model
vectorizer = CountVectorizer(max_features=5000)
X = vectorizer.fit_transform(test_sequences).toarray()
labels = np.array(test_labels)

# 10-Fold Stratified Cross-Validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Train final model on all data for feature importance analysis
clf_bow = RandomForestClassifier(n_estimators=100, random_state=42)
clf_bow.fit(X, labels)
importances = clf_bow.feature_importances_
feature_names = vectorizer.get_feature_names_out()
sorted_idx = np.argsort(importances)[::-1]

# Load your original vocab (token -> index)
with open('DeepFPD/DeepFPD/exp_n/fusion/train_data_styx/vocab.json', 'r') as f:
    vocab = json.load(f)

# Reverse it to get index -> token
index_to_token = {v: k for k, v in vocab.items()}

print("\nTop 20 Important AST tuples (BoW features):")
# for idx in sorted_idx[:20]:
#     print(f"{feature_names[idx]}: {importances[idx]:.4f}")

for idx in sorted_idx[:20]:
    token = feature_names[idx]
    if token.isdigit():
        token_int = int(token)
        readable = index_to_token.get(token_int + 1, f"[Unknown token {token_int}]")  # +1 to account for 1-based vocab
    else:
        readable = token
    print(f"{readable}: {importances[idx]:.4f}")


Top 20 Important AST tuples (BoW features):
MemberExpression:screen: 0.0214
MemberExpression:fillText: 0.0192
CallExpression:canvas: 0.0186
MemberExpression:language: 0.0145
MemberExpression:localStorage: 0.0118
MemberExpression:appName: 0.0116
MemberExpression:platform: 0.0110
MemberExpression:fillStyle: 0.0106
MemberExpression:colorDepth: 0.0105
MemberExpression:fillRect: 0.0089
MemberExpression:openDatabase: 0.0086
MemberExpression:getContext: 0.0084
MemberExpression:plugins: 0.0083
MemberExpression:getTimezoneOffset: 0.0079
MemberExpression:textBaseline: 0.0076
MemberExpression:charCodeAt: 0.0074
MemberExpression:font: 0.0071
MemberExpression:cpuClass: 0.0068
MemberExpression:getItem: 0.0068
MemberExpression:attachShader: 0.0066


In [337]:
# #shap
# # import shap
# # Load test AST sequences and true labels from the same file
# test_data = ast_data

# # Separate sequences and labels
# test_labels = []
# test_sequences = []

# for entry in test_data:
#     label, seq = entry
#     test_labels.append(label)
#     test_sequences.append(' '.join(map(str, seq)))

# # Load the saved vectorizer and model
# vectorizer = CountVectorizer(max_features=5000)
# X = vectorizer.fit_transform(test_sequences).toarray()
# labels = np.array(test_labels)

# # 10-Fold Stratified Cross-Validation
# skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# # Train final model on all data for feature importance and SHAP analysis
# print('Starting Training')
# clf_bow = RandomForestClassifier(n_estimators=100, random_state=42)
# print('debug print')
# clf_bow.fit(X, labels)
# # importances = clf_bow.feature_importances_
# # feature_names = vectorizer.get_feature_names_out()
# # sorted_idx = np.argsort(importances)[::-1]

# # print("\nTop 20 Important AST tuples (BoW features):")
# # for idx in sorted_idx[:20]:
# #     print(f"{feature_names[idx]}: {importances[idx]:.4f}")

# # SHAP Analysis
# print("\nGenerating SHAP summary plot...")
# explainer = shap.TreeExplainer(clf_bow)
# shap_values = explainer.shap_values(X)

# # Plot summary for class 1 (fingerprinting class)
# shap.summary_plot(shap_values[1], X, feature_names=feature_names, show=False)
# plt.tight_layout()
# plt.savefig("shap_summary_class1.png")
# print("SHAP summary plot saved as 'shap_summary_class1.png'")


In [134]:
#bow based

# Prepare labels and AST sequences
labels = []
ast_sequences = []

for entry in ast_data:
    label, seq = entry
    labels.append(label)
    ast_sequences.append(' '.join(map(str, seq)))  # convert numeric tokens to strings

labels = np.array(labels)

# Choose vectorization method (Bag-of-Words or TF-IDF)
vectorizer = CountVectorizer(max_features=5000)  # Use TfidfVectorizer for TF-IDF
X = vectorizer.fit_transform(ast_sequences).toarray()

# 10-Fold Stratified Cross-Validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

accuracies = []
precisions = [] 
recalls = []
fold = 1

for train_index, test_index in skf.split(X, labels):
    print(f"\nFold {fold}")
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = labels[train_index], labels[test_index]

    clf_bow = RandomForestClassifier(n_estimators=100, random_state=42)
    clf_bow.fit(X_train, y_train)

    y_pred = clf_bow.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)

    precision, recall, _, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
    precisions.append(precision)
    recalls.append(recall)

    print(classification_report(y_test, y_pred))
    print(f"Fold {fold} Accuracy: {accuracy:.4f}")
    fold += 1

print(f"\nAverage Accuracy across 10 folds: {np.mean(accuracies):.4f} ± {np.std(accuracies):.4f}")
print(f"Average Precision across 10 folds: {np.mean(precisions):.4f}")
print(f"Average Recall across 10 folds: {np.mean(recalls):.4f}")


Fold 1
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       546
           1       0.99      0.87      0.92        76

    accuracy                           0.98       622
   macro avg       0.98      0.93      0.96       622
weighted avg       0.98      0.98      0.98       622

Fold 1 Accuracy: 0.9823

Fold 2
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       546
           1       1.00      0.88      0.94        76

    accuracy                           0.99       622
   macro avg       0.99      0.94      0.96       622
weighted avg       0.99      0.99      0.99       622

Fold 2 Accuracy: 0.9855

Fold 3
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       546
           1       0.98      0.82      0.89        76

    accuracy                           0.98       622
   macro avg       0.98      0.91      0.94       622
we

In [339]:
#bow based (ast with wasm)

# Prepare labels and AST sequences
labels_wasm = []
ast_sequences_wasm = []

for entry in ast_with_wasm:
    label, seq = entry
    labels_wasm.append(label)
    ast_sequences_wasm.append(' '.join(map(str, seq)))  # convert numeric tokens to strings

labels_wasm = np.array(labels_wasm)

# Choose vectorization method (Bag-of-Words or TF-IDF)
vectorizer_wasm = CountVectorizer(max_features=5000)  # Use TfidfVectorizer for TF-IDF
X = vectorizer_wasm.fit_transform(ast_sequences_wasm).toarray()

# 10-Fold Stratified Cross-Validation
skf_wasm = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

accuracies = []
precisions = [] 
recalls = []
fold = 1

for train_index, test_index in skf_wasm.split(X, labels_wasm):
    print(f"\nFold {fold}")
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = labels_wasm[train_index], labels_wasm[test_index]

    clf_bow_wasm = RandomForestClassifier(n_estimators=100, random_state=42)
    clf_bow_wasm.fit(X_train, y_train)

    y_pred = clf_bow_wasm.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)

    precision, recall, _, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
    precisions.append(precision)
    recalls.append(recall)

    print(classification_report(y_test, y_pred))
    print(f"Fold {fold} Accuracy: {accuracy:.4f}")
    fold += 1

print(f"\nAverage Accuracy across 10 folds: {np.mean(accuracies):.4f} ± {np.std(accuracies):.4f}")
print(f"Average Precision across 10 folds: {np.mean(precisions):.4f}")
print(f"Average Recall across 10 folds: {np.mean(recalls):.4f}")


Fold 1
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       941
           1       0.98      0.73      0.84       157

    accuracy                           0.96      1098
   macro avg       0.97      0.87      0.91      1098
weighted avg       0.96      0.96      0.96      1098

Fold 1 Accuracy: 0.9599

Fold 2
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       941
           1       0.98      0.70      0.82       157

    accuracy                           0.96      1098
   macro avg       0.97      0.85      0.90      1098
weighted avg       0.96      0.96      0.95      1098

Fold 2 Accuracy: 0.9554

Fold 3
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       941
           1       0.97      0.73      0.83       157

    accuracy                           0.96      1098
   macro avg       0.96      0.86      0.90      1098
we

In [108]:
#embedding based feature prep

# Prepare labels and AST sequences
labels = []
ast_sequences = []

for entry in ast_data:
    label, seq = entry
    labels.append(label)
    ast_sequences.append(seq)

labels = np.array(labels)

# Define embedding dimensions
vocab_size = max(max(seq) for seq in ast_sequences) + 1
embed_dim = 64

# Randomly initialized embeddings (you can replace this with trained embeddings)
embedding_matrix = np.random.rand(vocab_size, embed_dim)

# Function to compute average embeddings for sequences
def average_embedding(seq, embedding_matrix):
    embeddings = embedding_matrix[seq]
    return np.mean(embeddings, axis=0)

# Compute average embeddings for all sequences
X = np.array([average_embedding(seq, embedding_matrix) for seq in ast_sequences])

# Standardize features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# 10-Fold Stratified Cross-Validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

accuracies = []
recalls = []
precisions = []
fold = 1

for train_index, test_index in skf.split(X, labels):
    print(f"\nFold {fold}")
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = labels[train_index], labels[test_index]

    clf_em = RandomForestClassifier(n_estimators=100, random_state=42)
    clf_em.fit(X_train, y_train)

    y_pred = clf_em.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)

    precision, recall, _, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
    precisions.append(precision)
    recalls.append(recall)

    print(classification_report(y_test, y_pred))
    print(f"Fold {fold} Accuracy: {accuracy:.4f}")
    fold += 1

print(f"\nAverage Accuracy across 10 folds: {np.mean(accuracies):.4f} ± {np.std(accuracies):.4f}")
print(f"Average Precision across 10 folds: {np.mean(precisions):.4f}")
print(f"Average Recall across 10 folds: {np.mean(recalls):.4f}")



Fold 1
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       546
           1       0.98      0.64      0.78        76

    accuracy                           0.95       622
   macro avg       0.97      0.82      0.88       622
weighted avg       0.96      0.95      0.95       622

Fold 1 Accuracy: 0.9550

Fold 2
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       546
           1       0.98      0.67      0.80        76

    accuracy                           0.96       622
   macro avg       0.97      0.83      0.89       622
weighted avg       0.96      0.96      0.95       622

Fold 2 Accuracy: 0.9582

Fold 3
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       546
           1       0.98      0.62      0.76        76

    accuracy                           0.95       622
   macro avg       0.96      0.81      0.87       622
we

In [114]:
#save models and necessary data
joblib.dump(clf_bow, 'rf_bow_model.joblib')
joblib.dump(vectorizer, 'bow_vectorizer.joblib')

joblib.dump(clf_em, 'rf__em_model.joblib')
np.save('embedding_matrix.npy', embedding_matrix)
joblib.dump(scaler, 'scaler.joblib')

['scaler.joblib']

In [289]:
# test data
with open('DeepFPD-Code/test/process_ast2/ast.npy','rb') as f:
    test_data_ = np.load(f,allow_pickle=True)
with open('DeepFPD-Code/wasm_test/process_ast2/ast.npy','rb') as f:
    wasm_test_data_ = np.load(f,allow_pickle=True)

In [290]:
# Separate sequences and labels
test_labels = []
test_sequences = []

for entry in test_data_:
    label, seq = entry
    test_labels.append(label)
    test_sequences.append(' '.join(map(str, seq)))

# vectorizer = joblib.load('bow_vectorizer.joblib')
# Transform test sequences into BoW vectors
X_test_bow = vectorizer.transform(test_sequences).toarray()

# # Load your trained model
# clf_bow = joblib.load('rf_bow_model.joblib')

# Make predictions on test data
predictions = clf_bow.predict(X_test_bow)

# View predictions
# Calculate accuracy
accuracy = accuracy_score(test_labels, predictions)
precision = precision_score(test_labels, predictions, average='binary')
recall = recall_score(test_labels, predictions, average='binary')

# print(predictions)
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
predictions

Accuracy: 97.81%
Precision: 86.67%
Recall: 72.22%


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [291]:
#test data prep for bow model

# Separate sequences and labels
test_labels = []
test_sequences = []

# for entry in test_data_:
for entry in wasm_test_data_:
    label, seq = entry
    test_labels.append(label)
    test_sequences.append(' '.join(map(str, seq)))

# vectorizer = joblib.load('bow_vectorizer.joblib')
# Transform test sequences into BoW vectors
X_test_bow = vectorizer.transform(test_sequences).toarray()

# # Load your trained model
# clf_bow = joblib.load('rf_bow_model.joblib')

# Make predictions on test data
predictions = clf_bow.predict(X_test_bow)

# View predictions
# Calculate accuracy
accuracy = accuracy_score(test_labels, predictions)
precision = precision_score(test_labels, predictions, average='binary')
recall = recall_score(test_labels, predictions, average='binary')

# print(predictions)
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
predictions

Accuracy: 96.55%
Precision: 88.89%
Recall: 44.44%


array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [340]:
# Separate sequences and labels
test_labels = []
test_sequences = []

for entry in test_data_:
    label, seq = entry
    test_labels.append(label)
    test_sequences.append(' '.join(map(str, seq)))

# vectorizer = joblib.load('bow_vectorizer.joblib')
# Transform test sequences into BoW vectors
X_test_bow = vectorizer.transform(test_sequences).toarray()

# # Load your trained model
# clf_bow = joblib.load('rf_bow_model.joblib')

# Make predictions on test data
predictions = clf_bow_wasm.predict(X_test_bow)

# View predictions
# Calculate accuracy
accuracy = accuracy_score(test_labels, predictions)
precision = precision_score(test_labels, predictions, average='binary')
recall = recall_score(test_labels, predictions, average='binary')

# print(predictions)
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
predictions

Accuracy: 83.07%
Precision: 2.63%
Recall: 5.56%


array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [341]:
#test data prep for bow model

# Separate sequences and labels
test_labels = []
test_sequences = []

# for entry in test_data_:
for entry in wasm_test_data_:
    label, seq = entry
    test_labels.append(label)
    test_sequences.append(' '.join(map(str, seq)))

# vectorizer = joblib.load('bow_vectorizer.joblib')
# Transform test sequences into BoW vectors
X_test_bow = vectorizer.transform(test_sequences).toarray()

# # Load your trained model
# clf_bow = joblib.load('rf_bow_model.joblib')

# Make predictions on test data
predictions = clf_bow_wasm.predict(X_test_bow)

# View predictions
# Calculate accuracy
accuracy = accuracy_score(test_labels, predictions)
precision = precision_score(test_labels, predictions, average='binary')
recall = recall_score(test_labels, predictions, average='binary')

# print(predictions)
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
predictions

Accuracy: 83.39%
Precision: 2.70%
Recall: 5.56%


array([0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [292]:
test_labels

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [267]:
preds = []

for idx in range(31):
    from treeinterpreter import treeinterpreter as ti
    i = idx  # or any index
    X_single = X_test_bow[i].reshape(1, -1)
    prediction, bias, contributions = ti.predict(clf_bow, X_single)
    feature_names = vectorizer.get_feature_names_out()
    # Select class 1 contributions
    class_idx = 1
    class_contribs = contributions[0][:, class_idx]
    
    # Sort by absolute value of contributions
    top_idx = np.argsort(class_contribs)[::-1]
    # top_idx = np.argsort(np.abs(class_contribs))[::-1]
    
    preds.append(prediction[0])
    # print(prediction[0])

stacked = np.stack(preds)  # shape will be (6, 2)
mean_values = np.mean(stacked, axis=0)

print("Mean confidence of non-fp:", mean_values[0])
print("Mean confidence of fp:", mean_values[1])

# js
# Mean confidence of non-fp: 0.2825127649001083
# Mean confidence of fp: 0.7174872350998915

Mean confidence of non-fp: 0.4260881796850773
Mean confidence of fp: 0.5739118203149227


In [230]:
idx = 3
from treeinterpreter import treeinterpreter as ti
i = idx  # or any index
X_single = X_test_bow[i].reshape(1, -1)
prediction, bias, contributions = ti.predict(clf_bow, X_single)
feature_names = vectorizer.get_feature_names_out()
# Select class 1 contributions
class_idx = 1
class_contribs = contributions[0][:, class_idx]

# Sort by absolute value of contributions
top_idx = np.argsort(class_contribs)[::-1]
# top_idx = np.argsort(np.abs(class_contribs))[::-1]

print(f"\nPrediction: {prediction[0]}")
print(f"Bias (base value): {bias[0]}")
# print(f"Contributions: {len(contributions[0])}")
print("Top contributing features to class 1:")

# for idx in top_idx[:10]:
#     print(f"{feature_names[idx]}: {class_contribs[idx]:.4f}")

pi = []

for idx in top_idx[0:20]:
    token = feature_names[idx]
    if token.isdigit():
        token_int = int(token)
        readable = index_to_token.get(token_int + 1, f"[Unknown token {token_int}]")  # +1 to account for 1-based vocab
    else:
        readable = token
    print(f"{readable}: {importances[idx]:.4f}")


Prediction: [0.5565 0.4435]
Bias (base value): [0.87774811 0.12225189]
Top contributing features to class 1:
MemberExpression:screen: 0.0214
MemberExpression:suffixes: 0.0032
MemberExpression:getTimezoneOffset: 0.0079
CallExpression:webgl: 0.0022
MemberExpression:pixelDepth: 0.0007
MemberExpression:getChannelData: 0.0008
MemberExpression:charging: 0.0021
MemberExpression:arrayBuffer: 0.0002
MemberExpression:localStorage: 0.0118
MemberExpression:getBattery: 0.0016
MemberExpression:filename: 0.0008
MemberExpression:OfflineAudioContext: 0.0009
MemberExpression:Number: 0.0027
MemberExpression:referrer: 0.0053
MemberExpression:eventStatus: 0.0001
ArrayExpression:Marlett: 0.0001
Property:hardwareConcurrency: 0.0001
ArrayExpression:monospace: 0.0008
MemberExpression:visibility: 0.0002
SequenceExpression:triangle: 0.0003


In [None]:
# scraped data - 100 random samples- all fp

#only js
# Accuracy: 78.79%
# Precision: 100.00%
# Recall: 78.79%

#wasm+js
# Accuracy: 81.82%
# Precision: 100.00%
# Recall: 81.82%

In [None]:
#scraped data - set 2

#only js
# Accuracy: 85.19%
# Precision: 100.00%
# Recall: 85.19%

#wasm+js
# Accuracy: 88.89%
# Precision: 100.00%
# Recall: 88.89%

In [None]:
#DeepFPD Data

# only JS
# Accuracy: 81.40%
# Precision: 92.31%
# Recall: 63.16%

#wasm+js
# Accuracy: 79.07%
# Precision: 91.67%
# Recall: 57.89%

In [145]:
# # test data prep for embedding model
# # test_sequences = []
# # for seq in test_data:
# #     test_sequences.append(seq)

# # Load saved embedding_matrix and scaler
# embedding_matrix = np.load('embedding_matrix.npy')
# scaler = joblib.load('scaler.joblib')

# # Compute embeddings for test data
# X_test = np.array([average_embedding(test_sequences, embedding_matrix) for seq in test_sequences])

# # Standardize using trained scaler
# X_test = scaler.transform(X_test)

In [296]:
with open('persistent_data/conversion/combined_converted/js/js_sample_01_20250528_131610.json','r') as f:
    js_sample_1 = json.load(f)
with open('persistent_data/conversion/combined_converted/wasm/wasm_sample_01_20250528_131610.json','r') as f:
    wasm_sample_1 = json.load(f)

In [336]:
print(wasm_sample_1[1].keys())
# print(wasm_sample_1[1]['461']['content'])
# len(wasm_sample_1)

dict_keys(['457', '461', '463', '465', '467', '469', '472', '474', '477', '478', '487'])


In [326]:
with open('DeepFPD-Code/train/process_token2/script_with_content.json','rb') as f:
    deepFPD_train = json.load(f)

In [331]:
# list(deepFPD_train.keys())[0]
deepFPD_train['002018445aafff40aed85143d61e8f90fc8859cc7df4f2ace93331cdf94b855e']

{'label': '0',
 'content': '(window.webpackJsonp_N_E=window.webpackJsonp_N_E||[]).push([[20],{ZS0p:function(t,n,e){"use strict";e.r(n);var r=e("q1tI"),o=e("tseg"),i=e.n(o);n.default=function(t){var n=t.facebookPixelId;return Object(r.useEffect)((function(){n&&(i.a.init(n),i.a.pageView())}),[n]),null}},tseg:function(t,n,e){window,t.exports=function(t){var n={};function e(r){if(n[r])return n[r].exports;var o=n[r]={i:r,l:!1,exports:{}};return t[r].call(o.exports,o,o.exports,e),o.l=!0,o.exports}return e.m=t,e.c=n,e.d=function(t,n,r){e.o(t,n)||Object.defineProperty(t,n,{enumerable:!0,get:r})},e.r=function(t){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(t,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(t,"__esModule",{value:!0})},e.t=function(t,n){if(1&n&&(t=e(t)),8&n)return t;if(4&n&&"object"==typeof t&&t&&t.__esModule)return t;var r=Object.create(null);if(e.r(r),Object.defineProperty(r,"default",{enumerable:!0,value:t}),2&n&&"string"!=typeof t)for(var o 

In [324]:
from tqdm import tqdm
count = 0

for d in tqdm(wasm_sample_1):
    for item in list(d.values()):
        count+=1
        code = item['content']
        with open(f'persistent_data/conversion/scripts_for_fp_inspector/wasm/wasm_{count}.js','w') as f:
            f.write(code)

100%|██████████| 10/10 [00:00<00:00, 209.11it/s]
