In [None]:
## import pandas as pd
import numpy as np
import tlib

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.datasets import dump_svmlight_file

# Load the tfw1 data with scale=True; the loader also hands back the scaler.
X, y, scaler = tlib.load_tfw1(scale=True)

# Fit a random forest with a fixed seed on the feature matrix as a plain array.
print("Training the classifier...")
rf = RandomForestClassifier(random_state=42).fit(X.to_numpy(), y)

# Generate adversarial examples for every training sample against the fitted forest.
print("Computing adversarial examples for the entire training set...")
X_adv, y_adv, norms = tlib.generate_rf_adversarials(rf, X, y)

# Persist the adversarial samples so later cells can reload them without recomputing.
print("Writing {} samples to file...".format(len(X_adv)))
tlib.dump_csv(
    X_adv,
    y_adv,
    "fin_02_adversarial_examples_for_rf_42_entire_training_data.csv",
)

In [26]:
import pandas as pd
import numpy as np
import tlib

# Reload the tfw1 data (scale=True) together with its scaler.
print("Loading tfw1 data...")
X, y, scaler = tlib.load_tfw1(scale=True)

# Reload the adversarial examples written by the generation cell.
print("Loading adversarial data...")
X_adv, y_adv = tlib.load_csv("fin_02_adversarial_examples_for_rf_42_entire_training_data.csv")

# Report both record counts, one per line.
print(
    f"Total records: {len(X)}",
    f"Adversarial records: {len(X_adv)}",
    sep="\n",
)

def set_values(series, **kwargs):
    """Return a copy of ``series`` with the given labels set to the given values.

    Each keyword name is used as an index label and its argument as the value
    stored under that label. The input series is left untouched; all
    assignments happen on the copy that is returned.
    """
    updated = series.copy()
    for label in kwargs:
        updated.at[label] = kwargs[label]
    return updated

def assess_attack_quality(X_samples, y_samples, X_adv, y_adv, top_n=5):
    """Collect the adversarial examples with the smallest relative perturbation.

    Each adversarial row is compared with the original row that carries the
    same index label. Rows are ranked by the L2 norm of
    ``tlib.percentage_change(original, adversarial)`` and at most ``top_n`` of
    them are kept, walking the ranking from the smallest norm upwards and
    keeping a row only if its norm is strictly greater than the last kept norm
    (starting from 0). Consequently rows with a norm of 0 are skipped and only
    the first row of each group of equal norms is reported.

    As a side effect, the first ``top_n`` distinct percentage norms and their
    occurrence counts are printed.

    Parameters
    ----------
    X_samples : pandas.DataFrame
        Original samples; must contain every index label present in ``X_adv``.
    y_samples : pandas.Series
        Labels of the original samples, looked up by index label.
    X_adv : pandas.DataFrame
        Adversarial samples. Compared position-wise against the matching rows
        of ``X_samples``, so the column order is expected to be the same.
    y_adv : object
        Not used by this function; kept so existing callers keep working.
    top_n : int, default 5
        Maximum number of examples to report. Values <= 0 yield empty lists.

    Returns
    -------
    tuple of four lists of pandas.Series
        ``(rows_original, rows_adversarial, rows_diff, rows_percentage)``, one
        entry per selected example:

        * original row plus ``EVENT`` = original label,
        * adversarial row plus ``EVENT`` = negated original label,
        * ``original - adversarial`` plus ``EVENT`` = negated original label,
        * percentage change plus ``L2_Norm``.

    Notes
    -----
    ``EVENT`` on the adversarial and difference rows is computed as
    ``not y_samples.loc[index]``; it is not read from ``y_adv``.

    ``L2_Norm`` is the L2 norm of the absolute difference
    ``X_samples - X_adv``, not the percentage norm used for the ranking.
    """
    rows_original = []
    rows_adversarial = []
    rows_diff = []
    rows_percentage = []

    # Originals re-ordered so that row i lines up with adversarial row i.
    X_samples_mis = X_samples.loc[X_adv.index]
    percentage_norms = np.linalg.norm(
        tlib.percentage_change(X_samples_mis.to_numpy(), X_adv.to_numpy()),
        ord=2,
        axis=1,
    )
    perturbation_norms = np.linalg.norm(X_samples_mis - X_adv, ord=2, axis=1)

    unique, counts = np.unique(percentage_norms, return_counts=True)
    print(unique[:top_n], counts[:top_n])

    # (position in X_adv, index label) of every example chosen for the report.
    selected = []
    previous = 0
    for position in percentage_norms.argsort():
        # Checked before appending so that top_n <= 0 selects nothing.
        if len(selected) >= top_n:
            break

        norm = percentage_norms[position]
        if norm <= previous:
            # Zero norm, or a duplicate of the norm that was kept last.
            continue

        selected.append((position, X_adv.index[position]))
        previous = norm

    for position, index in selected:
        original = X_samples.loc[index]
        adversarial = X_adv.loc[index]
        label = y_samples.loc[index]
        flipped = not label

        rows_original.append(set_values(original, EVENT=label))
        rows_adversarial.append(set_values(adversarial, EVENT=flipped))
        rows_diff.append(set_values(original - adversarial, EVENT=flipped))
        rows_percentage.append(
            set_values(
                tlib.percentage_change(original, adversarial),
                L2_Norm=perturbation_norms[position],
            )
        )

    return rows_original, rows_adversarial, rows_diff, rows_percentage

def _unscale(frame):
    """Undo the scaler's transform on ``frame`` while keeping its index and columns."""
    return pd.DataFrame(
        scaler.inverse_transform(frame),
        index=frame.index,
        columns=frame.columns,
    )


# Assess quality on the inverse-transformed data.
rows_original, rows_adversarial, rows_diff, rows_percentage = assess_attack_quality(
    _unscale(X),
    y,
    _unscale(X_adv),
    y_adv,
)

columns = X.columns.tolist()
event_columns = columns + ["EVENT"]
norm_columns = columns + ["L2_Norm"]

# Build the result tables, rounded to six decimals.
table_original = pd.DataFrame(rows_original, columns=event_columns).round(6)
table_adversarial = pd.DataFrame(rows_adversarial, columns=event_columns).round(6)
table_diff = pd.DataFrame(rows_diff, columns=event_columns).round(6)
table_percentage = pd.DataFrame(rows_percentage, columns=norm_columns).round(6)

# Print all four tables.
for table in (table_original, table_adversarial, table_diff, table_percentage):
    tlib.print_df(table)

# Only three tables are written to disk; the difference table is printed but not saved.
for table, path in (
    (table_original, "results-original-data.csv"),
    (table_adversarial, "results-adversarial-data.csv"),
    (table_percentage, "results-percentage-data.csv"),
):
    table.to_csv(path)

Loading tfw1 data...
Loading adversarial data...
Total records: 139566
Adversarial records: 139522
[0.00107948 0.00145224 0.00152085 0.00177785 0.00246303] [2 2 2 1 1]


Unnamed: 0,Tp,Cl,pH,Redox,Leit,Trueb,Cl_2,Fm,Fm_2,EVENT
72054,8.4,0.16,8.38,721.0,210.0,0.015,0.106,1550.0,1023.0,False
60587,8.5,0.27,8.29,754.0,212.0,0.024,0.101,597.0,47.0,False
72052,8.4,0.16,8.38,720.0,210.0,0.015,0.105,1585.0,1032.0,True
94693,9.0,0.16,8.5,753.0,217.0,0.017,0.103,1226.0,827.0,False
72055,8.4,0.16,8.38,722.0,210.0,0.015,0.106,1530.0,1030.0,False


Unnamed: 0,Tp,Cl,pH,Redox,Leit,Trueb,Cl_2,Fm,Fm_2,EVENT
72054,8.4,0.16,8.38,720.221693,210.0,0.015,0.106,1550.0,1023.0,True
60587,8.5,0.27,8.29,752.905008,212.0,0.024,0.101,597.0,47.0,True
72052,8.4,0.16,8.38,721.09501,210.0,0.015,0.105,1585.0,1032.0,False
94693,9.0,0.16,8.515112,753.0,217.0,0.017,0.103,1226.0,827.0,True
72055,8.4,0.16,8.38,720.221693,210.0,0.015,0.106,1530.0,1030.0,True


Unnamed: 0,Tp,Cl,pH,Redox,Leit,Trueb,Cl_2,Fm,Fm_2,EVENT
72054,0.0,0.0,0.0,0.778307,0.0,0.0,0.0,0.0,0.0,True
60587,0.0,0.0,0.0,1.094992,0.0,0.0,0.0,0.0,0.0,True
72052,0.0,0.0,0.0,-1.09501,0.0,0.0,0.0,0.0,0.0,False
94693,0.0,0.0,-0.015112,0.0,0.0,0.0,0.0,0.0,0.0,True
72055,0.0,0.0,0.0,1.778307,0.0,0.0,0.0,0.0,0.0,True


Unnamed: 0,Tp,Cl,pH,Redox,Leit,Trueb,Cl_2,Fm,Fm_2,L2_Norm
72054,0.0,0.0,0.0,0.001079,0.0,0.0,0.0,0.0,0.0,0.778307
60587,0.0,0.0,0.0,0.001452,0.0,0.0,0.0,0.0,0.0,1.094992
72052,0.0,0.0,0.0,0.001521,0.0,0.0,0.0,0.0,0.0,1.09501
94693,0.0,0.0,0.001778,0.0,0.0,0.0,0.0,0.0,0.0,0.015112
72055,0.0,0.0,0.0,0.002463,0.0,0.0,0.0,0.0,0.0,1.778307


In [24]:
import pandas as pd
import numpy as np
import tlib

# Reload the tfw1 data (scale=True) together with its scaler.
print("Loading tfw1 data...")
X, y, scaler = tlib.load_tfw1(scale=True)

# Reload the adversarial examples from the CSV written by the generation cell.
print("Loading adversarial data...")
X_adv, y_adv = tlib.load_csv("fin_02_adversarial_examples_for_rf_42_entire_training_data.csv")

# Report both record counts, one per line.
print(
    f"Total records: {len(X)}",
    f"Adversarial records: {len(X_adv)}",
    sep="\n",
)

# First row of each data set, kept as one-row frames.
a = X.iloc[[0]]
b = X_adv.iloc[[0]]

# Print floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

# Values as loaded: first row of X, first row of X_adv, and their percentage
# change, followed by a blank line.
print(
    a.to_numpy()[0],
    b.to_numpy()[0],
    tlib.percentage_change(a, b)[0],
    sep="\n",
    end="\n\n",
)

# The same three lines after running both rows through the scaler's inverse transform.
c = scaler.inverse_transform(a)
d = scaler.inverse_transform(b)
print(
    c[0],
    d[0],
    tlib.percentage_change(c, d)[0],
    sep="\n",
    end="\n\n",
)

Loading tfw1 data...
Loading adversarial data...
Total records: 139566
Adversarial records: 139522
[0.55084746 0.2125     0.88326559 0.75462185 0.32662539 0.04330709
 0.25541126 0.42747897 0.26813272]
[0.55084746 0.2125     0.88326559 0.72813384 0.32662539 0.04330709
 0.25541126 0.42747897 0.26813272]
[0.         0.         0.         0.03510103 0.         0.
 0.         0.         0.        ]

[   6.5      0.17     8.36   749.     211.       0.011    0.118 1677.
  695.   ]
[   6.5          0.17         8.36       733.2396374  211.
    0.011        0.118     1677.         695.       ]
[0.         0.         0.         0.02104187 0.         0.
 0.         0.         0.        ]

