In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler

from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

# scale back and remmove negative values
# apply gradient dexcent attack on benign sample -> scale back + delta + modify sample by adding bytes

from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv('PDFMalware2022.csv').dropna()
# data.columns

# Features recognized by pdfid
pdfid_columns = ['obj', 'endobj', 'stream', 'endstream', 'xref', 'trailer', 'startxref', 'header',
                 'pages', 'isEncrypted', 'ObjStm', 'JS', 'Javascript', 'AA', 'OpenAction',
                 'Acroform', 'JBIG2Decode', 'RichMedia', 'launch', 'embedded files', 'XFA', 'Colors',
                 'Class']

print(data['endstream'].value_counts())

endstream
2      1966
3      1304
1      1008
-1      519
9       360
       ... 
201       1
494       1
222       1
128       1
411       1
Name: count, Length: 241, dtype: int64


In [3]:
new_labels = {'Malicious': 1, 'Benign': 0}
data['Class'] = data['Class'].map(new_labels)


new_labels = {'1(1)': '1', '2(1)': '2', '3(1)': '3', '29(2)': '29', '34(2)': '34', '2(2)': '2', '>': '0', '53(1)': '53', '5(1)': '5', '12(2)': '2', '53(2)': '53', '-1': '0', 
              '(most': '0', '_Pro_Rodeo_Pix_': '0', "_Pro_Rodeo_Pix_'": '0', 'pdfid.py': '0', 'pdfHeader)': '0', 'bytes[endHeader]': '0', 'list': '0', 'unclear': '0', 'Yes': '1', 'No': '0'}

for col in data.drop(columns=['Class']).columns:
    data[col] = data[col].replace(new_labels)

data.head()

Unnamed: 0,Fine name,pdfsize,metadata size,pages,xref Length,title characters,isEncrypted,embedded files,images,text,...,AA,OpenAction,Acroform,JBIG2Decode,RichMedia,launch,EmbeddedFile,XFA,Colors,Class
0,aedaf3c5428a2e3ba600c44b96ad78dfdf8ed76e7df129...,8.0,180.0,1.0,11.0,0.0,0.0,0.0,0,0,...,0,1,0,0,0,0,0,0,0.0,1
1,fe767fb2584a10c010626263ea950643ac25f6ca24628f...,15.0,224.0,0.0,20.0,7.0,0.0,0.0,0,0,...,0,0,1,0,0,0,8,1,0.0,1
2,544c5223ee301affad514b6fa585b3191625aba0a7222b...,4.0,468.0,2.0,13.0,16.0,0.0,0.0,0,1,...,0,1,0,0,0,0,0,0,0.0,1
3,669772e626deccb9cfb7eb6a61e13d248d0ea08f1abe15...,17.0,250.0,1.0,15.0,0.0,0.0,0.0,0,0,...,0,1,1,0,0,0,0,0,0.0,1
4,e434c884f45a691b0bf33d765f61794007eb0b8bb9f590...,7.0,252.0,3.0,16.0,45.0,0.0,0.0,0,1,...,0,1,0,0,0,0,0,0,0.0,1


In [4]:
data = data[pdfid_columns]
class_data = data.drop(columns=['header', 'Class'])

x_train, x_test, y_train, y_test = train_test_split(class_data, data['Class'], test_size=0.2, random_state=77)
x_train.shape

(8018, 21)

In [5]:
# Normalize features
normalizer = RobustScaler()
x_train = normalizer.fit_transform(x_train)
x_test = normalizer.transform(x_test)

pd.DataFrame(x_train, columns=class_data.columns).head()

Unnamed: 0,obj,endobj,stream,endstream,xref,trailer,startxref,pages,isEncrypted,ObjStm,...,Javascript,AA,OpenAction,Acroform,JBIG2Decode,RichMedia,launch,embedded files,XFA,Colors
0,0.410256,0.435897,0.1875,0.1875,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.153846,1.179487,3.1875,3.1875,-0.5,-1.0,2.0,1.0,0.0,11.0,...,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.179487,-0.153846,-0.0625,-0.0625,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.230769,4.25641,3.875,3.875,0.0,0.0,0.0,3.0,0.0,0.0,...,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,26.282051,26.307692,5.5,5.5,3.5,7.0,4.0,3.0,0.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0


In [6]:
def evasion_gradient_descent(x0, gradient, t, epsilon, max_iter):

    m = 0
    x_m = x0
    for i in range(max_iter):

        m += 1
        x_m = x_m - t * gradient
        # print(f"Iteration {i}...")
        if np.linalg.norm(x_m - x0) < epsilon:
            break

    x = x_m
    return x_m

## SVM

In [7]:
C_values = [1, 0.8, 0.5, 0.3, 0.1]
scores = []

for c in C_values:
    svc = SVC(kernel='linear', C=c)
    svc.fit(x_train, y_train)
    y_pred = svc.predict(x_test)
    f1 = f1_score(y_test, y_pred)
    scores.append(f1)
    print(f"C = {c}, F1: {f1}")

best_c = C_values[scores.index(max(scores))]
best_c

C = 1, F1: 0.9630642954856361
C = 0.8, F1: 0.9630642954856361
C = 0.5, F1: 0.9625912408759124
C = 0.3, F1: 0.9625912408759124
C = 0.1, F1: 0.9625912408759124


1

In [8]:
svc = SVC(kernel='linear', C=1)
svc.fit(x_train, y_train)

y_pred = svc.predict(x_test)

svm_accuracy = accuracy_score(y_test, y_pred)
svm_precision = precision_score(y_test, y_pred)
svm_recall = recall_score(y_test, y_pred)
svm_f1 = f1_score(y_test, y_pred)

print(f"""SVM scores
      accuracy score: {svm_accuracy}
      precision score: {svm_precision}
      recall score: {svm_recall}
      f1 score: {svm_f1}\n""")
print(f"Confusion matrix:\n{confusion_matrix(y_test, y_pred)}")

SVM scores
      accuracy score: 0.9596009975062344
      precision score: 0.9924812030075187
      recall score: 0.9353410097431355
      f1 score: 0.9630642954856361

Confusion matrix:
[[ 868    8]
 [  73 1056]]


In [9]:
gamma_values = [1, 1/1000, 1/x_train.shape[0], 1/10**6]
scores = []

for g in gamma_values:
     svc_rbf = SVC(kernel='rbf', gamma=g)
     svc_rbf.fit(x_train, y_train)
     y_pred = svc_rbf.predict(x_test)
     f1 = f1_score(y_test, y_pred)
     scores.append(f1)
     print(f"Gamma = {g}, F1: {f1}")

best_gamma = gamma_values[scores.index(max(scores))]

Gamma = 1, F1: 0.954398894518655
Gamma = 0.001, F1: 0.953551912568306
Gamma = 0.0001247193813918683, F1: 0.9264069264069265
Gamma = 1e-06, F1: 0.7255723960012899


In [10]:
svc_rbf = SVC(kernel='rbf', gamma=best_gamma)
svc_rbf.fit(x_train, y_train)

y_pred = svc_rbf.predict(x_test)

svm_rbf_accuracy = accuracy_score(y_test, y_pred)
svm_rbf_precision = precision_score(y_test, y_pred)
svm_rbf_recall = recall_score(y_test, y_pred)
svm_rbf_f1 = f1_score(y_test, y_pred)

print(f"""\nSVM with rbf kernel scores
      accuracy score: {svm_rbf_accuracy}
      precision score: {svm_rbf_precision}
      recall score: {svm_rbf_recall}
      f1 score: {svm_rbf_f1}\n""")
print(f"Confusion matrix:\n{confusion_matrix(y_test, y_pred)}")


SVM with rbf kernel scores
      accuracy score: 0.9506234413965087
      precision score: 0.9942418426103646
      recall score: 0.9176262178919398
      f1 score: 0.954398894518655

Confusion matrix:
[[ 870    6]
 [  93 1036]]


In [11]:
def rbf_kernel(x, x_i, gamma=0.0001):
    return np.exp(- gamma * np.pow(np.linalg.norm(x - x_i), 2))

def poly_kernel(x, x_i, d=3, c=1):
    return np.pow((x @ x_i) + c, d)

def gradient_rbf_kernel(x, x_i, gamma):
    return -2 * gamma * np.exp(- gamma * np.pow(np.linalg.norm(x - x_i), 2)) * (x - x_i)

def gradient_poly_kernel(x, x_i, d, c):
    return d * np.pow(d * (x @ x_i + c), d - 1) * x_i


def svm_gradient(weights, feature_vector=None, support_vectors=None, kernel='linear', gamma=0.001, d=3, c=0):
    if kernel == 'linear':
        return weights
    
    delta_g = np.zeros(feature_vector.shape)
    kernel_gradient = np.zeros(feature_vector.shape)
    for i in range(support_vectors.shape[0]):
        w_i = weights[i]
        x_i = support_vectors[i]
        if kernel == 'rbf':
            kernel_gradient = gradient_rbf_kernel(feature_vector, x_i, gamma)
        elif kernel == 'poly':
            kernel_gradient = gradient_poly_kernel(feature_vector, x_i, d, c)
        # print(f"delta_g: {delta_g.shape}, weights: {weights.shape}, kernel_gradient: {kernel_gradient.shape}")
        delta_g = delta_g + w_i * kernel_gradient
    return delta_g

In [12]:
weights_linear = svc.coef_[0]
bias_linear = svc.intercept_
gradient_linear = weights_linear
print(f"Linear weights:\n{gradient_linear}")

malicious_samples = class_data[data['Class'] == 1]
x_malicious_samples = malicious_samples.to_numpy().astype(int)
modified = np.zeros(x_malicious_samples.shape)

x_malicious_samples = normalizer.fit_transform(x_malicious_samples)
learning_rates = [0.001]
epsilons = [0.0001]
iterations = 100
accuracy = 0


for lr in learning_rates:

    for e in epsilons:

        modified = np.zeros(x_malicious_samples.shape)
        for i in range (len(x_malicious_samples)):
            # grad_i = svm_gradient(weights=weights_rbf,feature_vector=x,support_vectors=support_vectors_rbf,kernel='rbf',gamma=gamma)
            # grad_i = svm_gradient(weights=weights_poly,feature_vector=x,support_vectors=support_vectors_poly,kernel='poly',d=d,c=c)
            grad_i = gradient_linear
            modified[i] = evasion_gradient_descent(
                x0=x_malicious_samples[i],
                gradient=grad_i,
                t=lr,
                epsilon=epsilons,
                max_iter=iterations
                )
            
        modified_samples = pd.DataFrame(modified, columns=class_data.columns)
        cl = svc_rbf.predict(modified_samples)
        modified_samples['Class'] = cl
        accuracy = len(modified_samples[modified_samples['Class'] == 0]) / len(x_malicious_samples)
        print(f"Accuracy: {accuracy} with learning rat = {lr} and epsilon = {e}")


modified_samples = pd.DataFrame(modified, columns=class_data.columns)
cl = svc_rbf.predict(modified_samples)
modified_samples['Class'] = cl
# print(modified_samples)

accuracy = len(modified_samples[modified_samples['Class'] == 0]) / len(x_malicious_samples)
print(f"Accuracy: {accuracy}")

print("Delta: ")
delta_df = x_malicious_samples - modified

delta_df = pd.DataFrame(delta_df, columns=class_data.columns)
delta_df.iloc[0]

Linear weights:
[ 1.20419194e+00 -1.70017908e+00 -3.14167524e+00  2.72451988e+00
 -2.51478689e+00 -2.41749827e-02  5.76831206e-01  1.17556174e-05
 -2.82271303e+00 -3.22466802e-01 -5.13676922e-01  1.13145207e+00
 -1.16124337e+00  1.08493474e+00  3.12448060e-01 -5.49863649e-01
  1.59158033e+00  5.26677184e+00  2.55856914e-02  3.03331031e+00
  1.65653273e-02]




Accuracy: 0.9614761476147615 with learning rat = 0.001 and epsilon = 0.0001




Accuracy: 0.9614761476147615
Delta: 


obj               0.120419
endobj           -0.170018
stream           -0.314168
endstream         0.272452
xref             -0.251479
trailer          -0.002417
startxref         0.057683
pages             0.000001
isEncrypted      -0.282271
ObjStm           -0.032247
JS               -0.051368
Javascript        0.113145
AA               -0.116124
OpenAction        0.108493
Acroform          0.031245
JBIG2Decode      -0.054986
RichMedia         0.159158
launch            0.526677
embedded files    0.002559
XFA               0.303331
Colors            0.001657
Name: 0, dtype: float64

In [13]:
scaled_back = normalizer.inverse_transform(modified_samples.drop(columns=['Class']))

modified_df = pd.DataFrame(scaled_back, columns=class_data.columns).astype(int)
modified_df['Class'] = modified_samples['Class']

modified_df

Unnamed: 0,obj,endobj,stream,endstream,xref,trailer,startxref,pages,isEncrypted,ObjStm,...,AA,OpenAction,Acroform,JBIG2Decode,RichMedia,launch,embedded files,XFA,Colors,Class
0,9,11,3,2,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,18,20,9,8,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,11,13,3,2,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,13,15,2,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,14,16,4,3,1,1,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5550,8,10,3,2,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5551,7,8,2,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5552,7,9,2,1,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5553,8,10,3,2,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
modified_df.describe()

Unnamed: 0,obj,endobj,stream,endstream,xref,trailer,startxref,pages,isEncrypted,ObjStm,...,AA,OpenAction,Acroform,JBIG2Decode,RichMedia,launch,embedded files,XFA,Colors,Class
count,5555.0,5555.0,5555.0,5555.0,5555.0,5555.0,5555.0,5555.0,5555.0,5555.0,...,5555.0,5555.0,5555.0,5555.0,5555.0,5555.0,5555.0,5555.0,5555.0,5555.0
mean,14.909271,20.19856,4.79676,5.080648,0.842844,1.019802,0.159316,1.164536,0.008821,0.340594,...,0.070567,0.008281,0.012781,0.005221,0.00288,0.00072,-0.048065,0.00396,1.386499,0.038524
std,29.75335,264.875058,11.744393,90.159262,1.342486,1.351044,1.350789,12.393686,0.124846,8.365498,...,0.56074,0.101855,0.184028,0.102935,0.078188,0.026827,0.260939,0.070892,11.106607,0.192475
min,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,-1.0,0.0
25%,7.0,8.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,8.0,10.0,2.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,14.0,16.0,3.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,817.0,19633.0,377.0,6667.0,46.0,46.0,67.0,594.0,4.0,600.0,...,23.0,3.0,7.0,6.0,3.0,1.0,4.0,3.0,147.0,1.0


In [15]:
delta_scaled = normalizer.inverse_transform([delta_df.iloc[0]]).astype(int)

delta_scaled

array([[9, 7, 1, 2, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0]])

In [16]:
malicious_sample = class_data[data['Class'] == 1].sample(n=1).astype(int)
malicious_mod = malicious_sample - delta_scaled

pred = svc.predict(malicious_mod)
print(f"Predicted class: {pred}")
malicious_mod

Predicted class: [0]




Unnamed: 0,obj,endobj,stream,endstream,xref,trailer,startxref,pages,isEncrypted,ObjStm,...,Javascript,AA,OpenAction,Acroform,JBIG2Decode,RichMedia,launch,embedded files,XFA,Colors
9901,0,2,2,1,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


## MLP

In [17]:
x_train_mlp, x_test_mlp, y_train_mlp, y_test_mlp = train_test_split(class_data, data['Class'], test_size=0.2, random_state=77)

In [18]:
# Normalize features
scaler = Normalizer()
x_train_mlp = scaler.fit_transform(x_train_mlp)
x_test_mlp = scaler.transform(x_test_mlp)

pd.DataFrame(x_train, columns=class_data.columns).head()

Unnamed: 0,obj,endobj,stream,endstream,xref,trailer,startxref,pages,isEncrypted,ObjStm,...,Javascript,AA,OpenAction,Acroform,JBIG2Decode,RichMedia,launch,embedded files,XFA,Colors
0,0.410256,0.435897,0.1875,0.1875,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.153846,1.179487,3.1875,3.1875,-0.5,-1.0,2.0,1.0,0.0,11.0,...,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.179487,-0.153846,-0.0625,-0.0625,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.230769,4.25641,3.875,3.875,0.0,0.0,0.0,3.0,0.0,0.0,...,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,26.282051,26.307692,5.5,5.5,3.5,7.0,4.0,3.0,0.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0


In [19]:
hidden_layers = (128,)
activation = 'tanh'
solver = 'adam'

mlp = MLPClassifier(hidden_layer_sizes=hidden_layers, activation=activation, solver=solver)
mlp.fit(x_train_mlp, y_train_mlp)

y_pred = mlp.predict(x_test_mlp)

mlp_accuracy = accuracy_score(y_test_mlp, y_pred)
mlp_precision = precision_score(y_test_mlp, y_pred)
mlp_recall = recall_score(y_test_mlp, y_pred)
mlp_f1 = f1_score(y_test_mlp, y_pred)

print(f"""MLP scores
      accuracy score: {mlp_accuracy}
      precision score: {mlp_precision}
      recall score: {mlp_recall}
      f1 score: {mlp_recall}\n""")
print(f"Confusion matrix:\n{confusion_matrix(y_test_mlp, y_pred)}")



MLP scores
      accuracy score: 0.9650872817955112
      precision score: 0.9871205151793928
      recall score: 0.9503985828166519
      f1 score: 0.9503985828166519

Confusion matrix:
[[ 862   14]
 [  56 1073]]


In [20]:
def tanh(z):
    return np.tanh(z)

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def relu(z):
    return max(0, z)

def tanh_derivative(z):
    return 1 - np.pow(tanh(z), 2)

def sigmoid_derivative(z):
    return 1 - sigmoid(z)

def relu_derivative(z):
    return z>=0


def mlp_discriminant(x, hidden_weights, hidden_bias, output_weights, output_bias):
    g = sigmoid(output_weights.T @ sigmoid(hidden_weights.T @ x + hidden_bias) + output_bias)
    return g


def mlp_gradient(x, hidden_weights, hidden_bias, output_weights, output_bias):
    g = mlp_discriminant(x, hidden_weights, hidden_bias, output_weights, output_bias)
    delta_k = sigmoid(hidden_weights.T @ x + hidden_bias)

    delta_g = np.zeros(x.shape)
    for i in range(x.shape[0]):
        delta_g[i] = g * (1 - np.pow(g, 1)) * (output_weights.T @ (delta_k * (1 - np.pow(delta_k, 1)) * hidden_weights[i]))
        # delta_g[i] = g * (1 - np.pow(g, 2)) * (output_weights.T @ (delta_k * (1 - np.pow(delta_k, 2)) * hidden_weights[i]))

    return delta_g

weights = mlp.coefs_

bias = mlp.intercepts_
output_weights = np.array(weights[-1])
output_bias = np.array(bias[-1])
hidden_weights = np.array(weights[0])
hidden_bias = np.array(bias[0])

In [21]:
malicious_samples = class_data[data['Class'] == 1]
x_malicious_samples = malicious_samples.to_numpy().astype(int)
modified = np.zeros(x_malicious_samples.shape)

norms = np.linalg.norm(x_malicious_samples, axis=1, keepdims=True)
mean_norms = np.mean(norms)
x_malicious_samples = scaler.fit_transform(x_malicious_samples)

for i in range (len(x_malicious_samples)):
    mlp_gr = mlp_gradient(x_malicious_samples[i], hidden_weights, hidden_bias, output_weights, output_bias)
    modified[i] = evasion_gradient_descent(
        x0=x_malicious_samples[i],
        gradient=mlp_gr,
        t=0.03,
        epsilon=0.00001,
        max_iter=500
        )

modified_samples = pd.DataFrame(modified, columns=class_data.columns)
cl = mlp.predict(modified_samples)
modified_samples['Class'] = cl
modified_samples.describe()

accuracy = len(modified_samples[modified_samples['Class'] == 0]) / len(x_malicious_samples)
print(f"Accuracy: {accuracy}")

print("Delta: ")
delta_df = x_malicious_samples - modified

delta_df = pd.DataFrame(delta_df, columns=class_data.columns)
delta_df.iloc[0]

  delta_g[i] = g * (1 - np.pow(g, 1)) * (output_weights.T @ (delta_k * (1 - np.pow(delta_k, 1)) * hidden_weights[i]))


Accuracy: 0.9823582358235824
Delta: 




obj               -1.259683
endobj            -1.345396
stream             0.009141
endstream         -1.505205
xref              -7.894383
trailer            0.014515
startxref          8.173189
pages             -1.266876
isEncrypted       -1.955416
ObjStm            -1.152940
JS                 5.930538
Javascript        12.984535
AA                -1.809699
OpenAction        13.581785
Acroform          -2.463193
JBIG2Decode       -6.313746
RichMedia         24.661394
launch             1.169097
embedded files     3.530648
XFA               48.500344
Colors             1.453817
Name: 0, dtype: float64

In [22]:
scaled_back = modified_samples.drop(columns=['Class']) * norms

modified_df = pd.DataFrame(scaled_back, columns=class_data.columns).astype(int)
modified_df['Class'] = modified_samples['Class']

modified_df

Unnamed: 0,obj,endobj,stream,endstream,xref,trailer,startxref,pages,isEncrypted,ObjStm,...,AA,OpenAction,Acroform,JBIG2Decode,RichMedia,launch,embedded files,XFA,Colors,Class
0,28,30,2,25,119,0,-121,20,29,17,...,27,-202,36,94,-369,-17,-52,-727,-21,0
1,94,100,8,99,477,0,-491,76,117,69,...,108,-819,149,380,-1487,-70,-212,-2923,-87,0
2,26,27,2,20,92,0,-93,16,22,13,...,20,-156,28,73,-285,-13,-40,-561,-16,0
3,21,21,1,10,47,0,-47,8,11,6,...,10,-79,15,37,-146,-6,-20,-287,-8,0
4,50,53,3,46,225,0,-231,39,55,32,...,51,-385,70,179,-701,-33,-100,-1379,-41,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5550,22,23,2,19,85,0,-86,14,21,12,...,19,-145,26,67,-265,-12,-38,-522,-15,0
5551,13,13,1,8,36,0,-36,6,8,5,...,8,-60,11,28,-111,-5,-16,-219,-6,0
5552,16,17,1,12,54,0,-54,10,13,7,...,12,-91,16,42,-167,-7,-24,-329,-9,0
5553,22,23,2,19,85,0,-86,14,21,12,...,19,-145,26,67,-265,-12,-38,-522,-15,0


In [23]:
delta_scaled = (delta_df.iloc[0] * mean_norms).astype(int)

delta_scaled

obj                -36
endobj             -39
stream               0
endstream          -43
xref              -230
trailer              0
startxref          238
pages              -37
isEncrypted        -57
ObjStm             -33
JS                 173
Javascript         379
AA                 -52
OpenAction         396
Acroform           -71
JBIG2Decode       -184
RichMedia          720
launch              34
embedded files     103
XFA               1417
Colors              42
Name: 0, dtype: int64

In [24]:
malicious_sample = class_data[data['Class'] == 1].sample(n=1).astype(int)
malicious_mod = malicious_sample - delta_scaled

pred = svc.predict(malicious_mod)
print(f"Predicted class: {pred}")
malicious_mod

Predicted class: [0]




Unnamed: 0,obj,endobj,stream,endstream,xref,trailer,startxref,pages,isEncrypted,ObjStm,...,Javascript,AA,OpenAction,Acroform,JBIG2Decode,RichMedia,launch,embedded files,XFA,Colors
4674,36,39,-1,43,230,-1,-238,38,57,32,...,-379,52,-396,71,184,-720,-34,-103,-1417,-43
