## Functions

In [4]:
! pip install stable-baselines3[extra] pandas 

Collecting stable-baselines3[extra]
  Downloading stable_baselines3-1.8.0-py3-none-any.whl (174 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.5/174.5 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
Collecting gym==0.21
  Downloading gym-0.21.0.tar.gz (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m00:01[0m:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting importlib-metadata~=4.13
  Downloading importlib_metadata-4.13.0-py3-none-any.whl (23 kB)
Collecting autorom[accept-rom-license]~=0.6.0
  Downloading AutoROM-0.6.1-py3-none-any.whl (9.4 kB)
Collecting ale-py==0.7.4
  Downloading ale_py-0.7.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m00:01[0m0:00:01[0m
Collecting AutoROM.accept-rom-license
  Downloading AutoROM.accept-r

In [7]:
import os

import numpy as np
import pandas as pd
import torch as th
from stable_baselines3 import PPO

In [8]:
# Registry of the datasets the notebook can evaluate, keyed by short name.
# An entry carrying a "partition" list must be loaded one partition at a time.
datasets_dict = {
    'MDDR': {'fullname': 'MDDR', 'partition': ['DS1', 'DS2', 'DS3']},
    # Currently disabled datasets:
    # 'DUD': {'fullname': 'DUD'},
    # 'MUV': {'fullname': 'MUV'},
}

In [9]:
def jaccard_similarity(fp1, fp2):
    """Tanimoto (generalised Jaccard) similarity of two fingerprint vectors.

    Computes ``<a, b> / (|a|^2 + |b|^2 - <a, b>)``.

    Both inputs are promoted to float64 before any arithmetic: ``np.dot`` keeps
    the dtype of its operands, so with int8 fingerprints the overlap would wrap
    around as soon as it exceeds 127.

    Args:
        fp1: 1-D array-like fingerprint.
        fp2: 1-D array-like fingerprint of the same length.

    Returns:
        The similarity as a float; ``nan`` when both vectors are all zeros.
    """
    a = np.asarray(fp1, dtype=np.float64)
    b = np.asarray(fp2, dtype=np.float64)
    overlap = np.dot(a, b)
    return overlap / (np.dot(a, a) + np.dot(b, b) - overlap)


In [10]:
def cosine_similarity(a, b):
    """Cosine of the angle between two vectors.

    Inputs are promoted to float64 first: ``np.dot`` preserves the operand
    dtype, so int8 fingerprints would overflow the dot product once more than
    127 positions overlap, while ``np.linalg.norm`` already works in floating
    point — the two would then disagree.

    Args:
        a: array-like vector.
        b: array-like vector of the same length.

    Returns:
        ``<a, b> / (||a|| * ||b||)``; ``nan`` if either vector has zero norm.
    """
    a = np.asarray(a, dtype=np.float64)
    b = np.asarray(b, dtype=np.float64)
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

In [11]:
from scipy.stats import spearmanr
def spearman_correlation(x, y):
    """Return Spearman's rank correlation of ``x`` and ``y``.

    Thin wrapper over ``scipy.stats.spearmanr`` that keeps the coefficient
    (first element of the result) and discards the p-value.
    """
    return spearmanr(x, y)[0]

In [12]:
def pearson_correlation(x, y):
    """Pearson correlation coefficient of two equally long 1-D samples.

    ``np.cov`` normalises by ``n - 1`` by default, so the standard deviations
    must use ``ddof=1`` as well; mixing it with the default ``np.std``
    (``ddof=0``) inflates the result by ``n / (n - 1)`` and lets it exceed 1.

    Args:
        x: 1-D array-like sample.
        y: 1-D array-like sample of the same length.

    Returns:
        The coefficient in ``[-1, 1]``; ``nan`` if either sample is constant.
    """
    x = np.asarray(x, dtype=np.float64)
    y = np.asarray(y, dtype=np.float64)
    covariance = np.cov(x, y)[0][1]
    return covariance / (np.std(x, ddof=1) * np.std(y, ddof=1))

In [13]:
def euclidean_distance(x, y):
    """Euclidean (L2) distance between two vectors.

    The operands are promoted to float64 before subtracting so that narrow
    integer dtypes (e.g. int8 fingerprints or count vectors) cannot wrap around
    in the difference or in its square.

    Args:
        x: array-like vector.
        y: array-like vector of the same shape.

    Returns:
        ``sqrt(sum((x - y) ** 2))`` as a float. Note this is a distance:
        smaller values mean more similar inputs.
    """
    diff = np.asarray(x, dtype=np.float64) - np.asarray(y, dtype=np.float64)
    return np.sqrt(np.sum(diff ** 2))

In [14]:
# Name -> pairwise measure taking two fingerprint vectors.
# The lambdas resolve the target function at call time, so re-running a
# definition cell is picked up without rebuilding this registry.
# NOTE: "euclidean" is a distance (smaller = closer) while evaluateQueries ranks
# candidates by descending score, so it cannot be plugged in there as-is.
similarities = {
    "cosine": lambda fp1, fp2: cosine_similarity(fp1, fp2),
    "euclidean": lambda fp1, fp2: euclidean_distance(fp1, fp2),
    "spearman": lambda fp1, fp2: spearman_correlation(fp1, fp2),
    "pearson": lambda fp1, fp2: pearson_correlation(fp1, fp2),
}


In [15]:
# Scratch vectors: nothing in the visible cells reads x or y afterwards — presumably kept
# for trying the similarity functions by hand (TODO confirm, or remove this cell).
# y = np.random.random((1024))*100
x = np.array([0,2,4,61.10,6.2,7.3,6,5])
y = np.array([0,0,4,6,6,0,1,100])

In [16]:
def loadDataset(name,fp,partition=False):
    """Read the fingerprints and class labels of a dataset (or one partition).

    The dataset directory is ``./Datasets/<name>`` or, for partitioned
    datasets, ``./Datasets/<name>/<partition>``. Three files are read from it:

    * ``ActivityClass.txt`` — first line skipped; on every following line the
      third whitespace-separated field is the number of molecules in that
      class. Classes are numbered 0, 1, 2, ... in file order.
    * ``ActivityDecoyClass.txt`` — same layout; classes are numbered
      -1, -2, ... in file order.
    * ``FP/<fp>.txt`` — one whitespace-separated fingerprint per line, parsed
      as int8. Row ``i`` is given the ``i``-th label accumulated above.

    If the dataset is unknown, or the partition argument does not fit the
    dataset, a message is printed and empty results are returned.

    Args:
        name: key into ``datasets_dict``.
        fp: fingerprint file name without the ``.txt`` extension.
        partition: partition name, or a falsy value for unpartitioned datasets.

    Returns:
        Tuple ``(descriptors, classes_df, classes_count)``: the fingerprint
        array, a DataFrame indexed from 1 with columns ``id`` (0-based row
        number) and ``class``, and the list of sizes of the non-decoy classes.
    """
    labels = []
    active_sizes = []
    label_rows = {}
    fingerprints = []
    root = ""

    if name not in datasets_dict.keys():
        print("Dataset doesn't existe")
    else:
        entry = datasets_dict[name]
        has_partitions = "partition" in entry.keys()

        # Resolve the directory; every mismatch prints a hint and leaves root empty.
        if partition and has_partitions:
            if partition in entry["partition"]:
                root = "./Datasets/"+name+"/"+partition
            else:
                print("Partition doesn't exist")
        elif partition:
            print("This dataset doesn't have a partition")
        elif has_partitions:
            print(entry["partition"])
            print("Enter the partition name")
        else:
            root = "./Datasets/"+name

        if root != "":
            with open(root+"/ActivityClass.txt", 'r') as handle:
                handle.readline()  # header line is not a class row
                for class_id, row in enumerate(handle):
                    size = int(row.split()[2])
                    labels += [class_id]*size
                    active_sizes.append(size)

            with open(root+"/ActivityDecoyClass.txt", 'r') as handle:
                handle.readline()  # header line is not a class row
                for position, row in enumerate(handle, start=1):
                    # Decoy classes get negative ids: -1, -2, ...
                    labels += [-position]*int(row.split()[2])

            with open(root+"/FP/"+fp+".txt", 'r') as handle:
                for row_id, row in enumerate(handle):
                    fingerprints.append(np.array(row.split(),dtype=np.int8))
                    # Frame index is 1-based, the "id" column is the 0-based row number.
                    label_rows[row_id+1] = {"id":row_id,"class":labels[row_id]}

    return np.array(fingerprints),pd.DataFrame(label_rows).T,active_sizes

In [17]:
def evaluateQueries(descrpitors,classes_df,classes_count,name,n,percent,partition=False,sim_function=cosine_similarity):
    """Score similarity-search retrieval for the query molecules of a dataset.

    ``Queries.txt`` in the dataset directory holds one line per class (line
    ``k`` belongs to class ``k``, counted from 0) with whitespace-separated
    1-based molecule ids; the first ``n`` ids of every line are used as
    queries. For each query the candidate molecules are ranked by
    ``sim_function`` in descending order, and for every fraction ``p`` in
    ``percent`` the top ``int(p * n_candidates)`` rows are scored as::

        hits_of_query_class / min(classes_count[class] - 1, rows_taken)

    Candidate pool: if the first line of ``ActivityDecoyClass.txt`` is ``1``
    every molecule is a candidate; otherwise only the query's class ``k`` and
    the decoy class ``-(k) - 1`` are. The query itself is always excluded.

    Args:
        descrpitors: indexable of descriptor vectors, addressed by the 0-based
            ``id`` column of ``classes_df``.
        classes_df: DataFrame with ``id`` and ``class`` columns.
        classes_count: per-class molecule counts, indexed by class number.
        name: key into ``datasets_dict``.
        n: number of queries taken from each line of ``Queries.txt``.
        percent: iterable of cut-off fractions.
        partition: partition name, or a falsy value for unpartitioned datasets.
        sim_function: callable ``(vector, query) -> score``; higher = closer.

    Returns:
        ``{class: {query_id: {fraction: score}}}`` with query ids kept as the
        strings read from the file. A score is ``nan`` when its denominator is
        not positive (nothing could be retrieved). The dict is empty when the
        dataset or partition could not be resolved.
    """
    evaluation = {}
    dataset_path = ""
    multi_decoy = True

    if name in datasets_dict.keys():
        if partition:
            if "partition" in datasets_dict[name].keys():
                if partition in datasets_dict[name]["partition"]:
                    dataset_path = "./Datasets/"+name+"/"+partition
                else:
                    print("Partition doesn't exist")
            else:
                print("This dataset doesn't have a partition")

        else:
            if "partition" in datasets_dict[name].keys():
                print(datasets_dict[name]["partition"])
                print("Enter the partition name")
            else:
                dataset_path = "./Datasets/"+name

        # Touch the filesystem only once a directory was resolved; with an empty
        # path the decoy file would be looked up at "/ActivityDecoyClass.txt".
        if dataset_path != "":
            with open(dataset_path+"/ActivityDecoyClass.txt", 'r') as f:
                if(int(f.readline())== 1):
                    multi_decoy = False

            with open(dataset_path+"/Queries.txt", 'r') as file:
                for clss, line in enumerate(file):
                    classQueries = line.split()[:n]
                    evaluation_querie = {}

                    if multi_decoy:
                        # Each active class k is paired with its own decoy class -(k)-1.
                        decoy_clss = clss*(-1)-1
                        classes_df_tmp = classes_df.loc[classes_df["class"].isin([clss,decoy_clss])].copy()
                    else:
                        decoy_clss = -1
                        classes_df_tmp = classes_df.copy()

                    for q in classQueries:
                        # Query ids in the file are 1-based; descriptors are 0-based.
                        querie = descrpitors[int(q)-1]
                        tmp_df = classes_df_tmp.loc[classes_df_tmp["id"]!=int(q)-1].copy()

                        tmp_df["similarity"] = [sim_function(descrpitors[i] ,querie) for i in tmp_df["id"]]

                        tmp_df = tmp_df.sort_values(by='similarity',ascending=False)

                        evaluation_pressesions = {}
                        print(q)
                        for p in percent:
                            num_rows = int(p * len(tmp_df))
                            subset_df = tmp_df.head(num_rows)
                            hits = list(subset_df["class"]).count(clss)
                            # The query itself is excluded, hence the -1 on the class size.
                            denominator = min(classes_count[clss]-1,len(subset_df))
                            if denominator > 0:
                                evaluation_pressesions[p] = hits/denominator
                            else:
                                # Empty cut-off or single-member class: score is undefined.
                                evaluation_pressesions[p] = float("nan")

                        evaluation_querie[q] = evaluation_pressesions
                    evaluation[clss] = evaluation_querie
    else:
        print("Dataset doesn't existe")

    return evaluation

In [18]:
def evaluationMeanDF(eval):
    """Collapse per-query scores into one row of means per class.

    Args:
        eval: nested dict ``{class: {query: {cutoff: score}}}``.

    Returns:
        DataFrame with one row per class and one column per cut-off holding the
        mean over that class's queries, plus a final ``"mean"`` row averaging
        the class rows. All values are rounded to 3 decimals.
    """
    per_class_means = {
        clss: dict(pd.DataFrame(queries).T.mean())
        for clss, queries in eval.items()
    }
    summary = pd.DataFrame(per_class_means).T
    summary.loc["mean"] = summary.mean()
    return summary.round(3)

In [20]:
# Placeholder value: replace with the path of the saved Stable-Baselines3 PPO model before running.
model_path = "INSER_STABLEBASELINES_MODEL_NAME"
agent = PPO.load(model_path)
# Torch device: the first CUDA GPU when one is available, otherwise the CPU.
dev = th.device('cuda:0' if th.cuda.is_available() else 'cpu')

In [21]:

# Shape a single fingerprint is viewed as before the batch axis is added.
INPUT_SHAPE = (1,1024)
# Sub-modules of the loaded PPO policy; only features_extr is used by the function below.
features_extr = agent.policy.features_extractor
mlp_extr = agent.policy.mlp_extractor.policy_net
def creatDescriptorArrayRL_PPO(fp):
    """Map one fingerprint to the output of the policy's features extractor.

    The fingerprint is reshaped to ``INPUT_SHAPE``, converted to a float tensor
    with a leading batch axis, moved to ``dev`` and passed through
    ``features_extr``. The result is returned as a flat NumPy array of length
    ``INPUT_SHAPE[1]``.

    NOTE(review): the final reshape presumes the extractor emits exactly
    ``INPUT_SHAPE[1]`` values per input, and the extractor runs in whatever
    train/eval mode the loaded policy is in — confirm both for the model used.
    """
    batch = th.tensor(np.array(fp).reshape(INPUT_SHAPE)).float().unsqueeze(0).to(dev)
    embedding = features_extr(batch)
    return embedding.cpu().detach().numpy().reshape((INPUT_SHAPE[1],))

In [22]:
import json
def save_eval_as_json(eval_rl: dict, name: str):
    """Write an evaluation dictionary to ``<name>.json``.

    Args:
        eval_rl: JSON-serialisable (nested) dictionary; non-string keys are
            converted to strings by the ``json`` module.
        name: output path without the ``.json`` extension. If empty, a message
            is printed and nothing is written.

    Returns:
        None.
    """
    if len(name) < 1:
        print("name must be set")
        return
    # ensure_ascii=False emits non-ASCII characters verbatim, so the file encoding
    # is pinned to UTF-8 instead of depending on the platform default.
    with open(name+".json", mode="w", encoding="utf-8") as jf:
        json.dump(eval_rl, jf, ensure_ascii=False)
    
# Smoke test for save_eval_as_json: running this cell writes "test.json" into the working directory.
t = {"ds1":{"1":{"0.1":[0,0.9,0.2,0.8]}}}
save_eval_as_json(t,"test")

In [None]:
# Driver: for every dataset (and partition) evaluate the raw fingerprints and the
# descriptors produced by the PPO features extractor, then write the per-class means as CSV.
fp = "ECFP4"
simil = "Jaccard"
# Similarity used for EVERY ranking below, so that it matches the `simil` label baked into
# the output file names and the fingerprint baseline is comparable with the RL descriptors.
sim_function = jaccard_similarity
queries_n = 10
p = [0.01,0.05]
for key,value in datasets_dict.items():
    dataset = key
    print("-"*14)
    print("Dataset : ",dataset)
    if "partition" in value.keys():
        for partition in value["partition"]:
            print("-"*14)
            print(f"Dataset : {dataset} -> partition : {partition}")
            # Create the output directory up front so a long run cannot fail at to_csv.
            out_dir = "./Evaluation/"+dataset+"/"+partition
            os.makedirs(out_dir, exist_ok=True)

            descrpitors_fp,classes_df,classes_count = loadDataset(dataset,fp,partition)
            eval_fp = evaluateQueries(descrpitors_fp,classes_df,classes_count,dataset,queries_n,p,partition,sim_function=sim_function)
            df_eval_fp = evaluationMeanDF(eval_fp)
            df_eval_fp.to_csv(out_dir+"/evaluation_"+fp+"_fp_"+dataset+"_"+partition+"_"+simil+".csv")

            # One forward pass of the features extractor per fingerprint row.
            descrpitors_rl = np.apply_along_axis(creatDescriptorArrayRL_PPO, axis=1, arr=descrpitors_fp)
            eval_rl = evaluateQueries(descrpitors_rl,classes_df,classes_count,dataset,queries_n,p,partition,sim_function=sim_function)
            save_eval_as_json(eval_rl,f"{dataset}_{partition}_eval_cnn_mha")
            df_eval_rl = evaluationMeanDF(eval_rl)
            df_eval_rl.to_csv(out_dir+"/evaluation_"+fp+"_rl_cnn_mha_"+dataset+"_"+partition+"_"+simil+".csv")
    else:
        # Create the output directory up front so a long run cannot fail at to_csv.
        out_dir = "./Evaluation/"+dataset
        os.makedirs(out_dir, exist_ok=True)

        descrpitors_fp,classes_df,classes_count = loadDataset(dataset,fp)
        eval_fp = evaluateQueries(descrpitors_fp,classes_df,classes_count,dataset,queries_n,p,sim_function=sim_function)
        df_eval_fp = evaluationMeanDF(eval_fp)
        df_eval_fp.to_csv(out_dir+"/evaluation_"+fp+"_fp_"+dataset+"_"+simil+".csv")

        # One forward pass of the features extractor per fingerprint row.
        descrpitors_rl = np.apply_along_axis(creatDescriptorArrayRL_PPO, axis=1, arr=descrpitors_fp)
        eval_rl = evaluateQueries(descrpitors_rl,classes_df,classes_count,dataset,queries_n,p,sim_function=sim_function)
        df_eval_rl = evaluationMeanDF(eval_rl)
        df_eval_rl.to_csv(out_dir+"/evaluation_"+fp+"_rl_"+dataset+"_"+simil+".csv")
    