# Feature Selection

#### Assumptions: 
1. The prediction task is an edge classification problem
2. The labels are 0 and 1 only
3. The ID is linear and sorted

#### Usage: 
Select any number of node and edge features from data that match the assumptions above. The original data is stored in ``Drug_combination_data``.

In [None]:
import pandas as pd
import os

In [None]:
# Directory the raw input TSV files are read from.
original = "../Drug_combination_data"
# Directory the processed CSV files are read from and written to.
data = "../data"

In [None]:
# Make sure the processed-data directory exists. exist_ok=True avoids the
# check-then-create race and keeps re-running this cell harmless.
os.makedirs(data, exist_ok=True)

### Load data

In [None]:
# Read the four tab-separated input tables from the raw-data directory,
# in the same order as they are bound to df1..df4.
df1, df2, df3, df4 = (
    pd.read_csv(f"{original}/{file_name}", sep="\t")
    for file_name in (
        "DrugCombiNet_drug_disease_scores.tsv",        # node features
        "DrugCombiNet_drug_disease_z_scores.tsv",      # node features z-scores
        "DrugCombiNet_drug_drug_scores.tsv",           # edge features
        "groundtruth_cardiovascular_2drugs_only.tsv",  # labels
    )
)

### Initial Preprocessing

There are more drug-drug pairs (edge features) than ground-truth labels, so only the drug-drug pairs that have a label are kept.

In [None]:
# node features 1, node features 2, edge features, label
def preprocess_data(df1, df2, df3, df4):
    """Build the node-feature table and the labelled node+edge feature table.

    Everything is computed in memory from the four arguments; nothing is read
    from or written to disk, and none of the input frames is modified.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Per-drug feature tables. Both need a ``drug`` column; they are merged
        on it.
    df3 : pandas.DataFrame
        Drug-pair (edge) features with ``drugA`` and ``drugB`` columns, which
        are assumed to be its first two columns.
    df4 : pandas.DataFrame
        Labels keyed by ``drugcomb_sorted``. Its own ``drugA``/``drugB``
        columns, if present, are discarded from the final table.

    Returns
    -------
    tuple
        ``(combined, labelled, df)``: the merged node features, the edge
        features joined with their label (one row per ``drugcomb_sorted``),
        and the final table holding the pair identifiers, the edge features,
        drugA's node features (``*_x``), drugB's node features (``*_y``) and
        the label as the last column.

    Raises
    ------
    ValueError
        If the joined table does not have the expected number of columns
        (six edge features and eight node features per drug).
    """
    # The final column names are assigned by position, so the inputs must
    # provide the features in exactly this order.
    edge_names = ["s", "op", "meanSP", "medianSP", "minSP", "maxSP"]
    node_names = edge_names + ["zTD", "zDT"]
    node_a = [f"{name}_x" for name in node_names]
    node_b = [f"{name}_y" for name in node_names]

    # ------- Step 1: get node features -------
    combined = pd.merge(df1, df2, on="drug")

    # ------- Step 2: attach the label to every drugA/drugB pair -------
    pairs = df3.copy()  # keep the helper column out of the caller's frame
    pairs["drugcomb_sorted"] = pairs["drugA"] + "_" + pairs["drugB"]
    labelled = pd.merge(pairs, df4, on=["drugcomb_sorted"])
    labelled = labelled.drop_duplicates(subset=["drugcomb_sorted"], keep="first")
    # The merge suffixes drugA/drugB when df4 has columns of the same name;
    # restore the plain names on the two leading (edge-table) columns.
    labelled = labelled.rename(
        columns={labelled.columns[0]: "drugA", labelled.columns[1]: "drugB"}
    )

    # ------- Step 3: join the node features of drugA and of drugB -------
    df = labelled.drop(columns=["drugA_y", "drugB_y"], errors="ignore")
    for endpoint, suffix in (("drugA", "_x"), ("drugB", "_y")):
        # Suffix the node columns up front so that neither merge has to
        # disambiguate names itself (implicit suffixes would collide).
        key = f"drug{suffix}"
        side = combined.add_suffix(suffix)
        df = df.merge(side, how="left", left_on=endpoint, right_on=key)
        df = df.drop(columns=key)

    # ------- Step 4: name and order the columns -------
    positional_names = (
        ["drugA", "drugB"] + edge_names + ["drugcomb_sorted", "label"] + node_a + node_b
    )
    if len(df.columns) != len(positional_names):
        raise ValueError(
            f"expected {len(positional_names)} columns after joining, "
            f"got {len(df.columns)}: {list(df.columns)}"
        )
    df.columns = positional_names

    # reorder the columns so the label is in the last column
    df = df[["drugcomb_sorted", "drugA", "drugB"] + edge_names + node_a + node_b + ["label"]]
    return combined, labelled, df

# Run the preprocessing on the four tables loaded above. The returned tuple is
# not assigned to anything here.
# NOTE(review): the next cell reads node_features.csv and node_edge_features.csv
# from `data`; this call does not write them - confirm they already exist.
preprocess_data(df1, df2, df3, df4)

#### Convert ID to integer

In [None]:
# Reload the node-feature table and the node+edge-feature table from the
# processed-data directory; df1 and df2 are rebound to these frames.
df1 = pd.read_csv(f"{data}/node_features.csv")
df2 = pd.read_csv(f"{data}/node_edge_features.csv")

In [None]:
# convert ID to int
def convert_int(df1, df2):
    """Turn the string drug IDs of both frames into integers, in place.

    A leading "DB" is stripped from ``df1["drug"]``, ``df2["drugA"]`` and
    ``df2["drugB"]`` and the remainder is stored back as integers. Nothing is
    written to disk. The same two (now modified) frames are returned.
    """
    id_columns = ((df1, "drug"), (df2, "drugA"), (df2, "drugB"))
    for frame, column in id_columns:
        digits = frame[column].str.removeprefix("DB")
        frame[column] = digits.to_numpy(dtype=int)
    return df1, df2
# Convert the ID columns of both frames (modified in place); the returned pair
# is not assigned.
convert_int(df1, df2)

## Select any number of node and edge features

### Tabular data (Random forest, XGBoost, Neural Network)

In [None]:
# Load the integer-ID node table and node+edge table from the processed-data
# directory.
# NOTE(review): no active code in this notebook writes these two files -
# confirm they already exist on disk.
df1 = pd.read_csv(f"{data}/node_features_id.csv")    
df2 = pd.read_csv(f"{data}/node_edge_features_id.csv")

# combine 2 files (df1 = node features, df2 = edge features) and select the features
def baseline_data(df1, df2, node_features, edge_features):
    """Select features and emit every drug pair in both orientations.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Node-feature table. Unused; kept so the signature stays unchanged.
    df2 : pandas.DataFrame
        Table with ``drugcomb_sorted``, ``drugA``, ``drugB``, the edge
        features, the node features of drugA (``<name>_x``) and of drugB
        (``<name>_y``), and ``label``. Not modified.
    node_features : list of str
        Node feature names without the ``_x``/``_y`` suffix.
    edge_features : list of str
        Edge feature column names.

    Returns
    -------
    pandas.DataFrame
        The selected columns with two rows per input row: the original row
        followed by a copy in which drugA/drugB and their node features are
        exchanged. Each pair of rows shares the input row's index label.
    """
    # ------- Step 1: add any number of node features -------
    nodes_a = [f"{name}_x" for name in node_features]
    nodes_b = [f"{name}_y" for name in node_features]
    selected = (
        ["drugcomb_sorted", "drugA", "drugB"]
        + list(edge_features) + nodes_a + nodes_b + ["label"]
    )
    new_data = df2[selected]

    # ------- Step 2: swap the columns for permutation invariance -------
    # Exchange the two drugs by renaming each column to its counterpart and
    # restoring the column order. This pairs every "<name>_x" with its own
    # "<name>_y" for any number of node and edge features.
    counterpart = {"drugA": "drugB", "drugB": "drugA"}
    counterpart.update(zip(nodes_a, nodes_b))
    counterpart.update(zip(nodes_b, nodes_a))
    swapped = new_data.rename(columns=counterpart)[selected]

    # ------- Step 3: Merge the original and the swapped dataframes together -------
    # A stable sort keeps each original row directly ahead of its swapped copy.
    final = pd.concat([new_data, swapped]).sort_index(kind="mergesort")
    return final

# Build the tabular dataset from seven node features per drug and five edge
# features; the returned frame is not assigned.
baseline_data(df1, df2, ["s", "meanSP", "medianSP", "minSP", "maxSP", "zTD","zDT"], 
              ["s", "meanSP", "medianSP", "minSP", "maxSP"])  # specify the node and edge features

### Graph data (GCN, GAT, Graph Transformer)

In [None]:
# Inputs for the graph datasets: the integer-ID node table and the tabular
# baseline features.
# NOTE(review): no active code in this notebook writes baseline_features.csv
# or node_features_id.csv - confirm they already exist on disk.
df1 = pd.read_csv(f"{data}/node_features_id.csv")
df2 = pd.read_csv(f"{data}/baseline_features.csv")

# Keep the pair identifiers, five edge features and the label; the edge
# feature "op" and all node-feature columns are left out.
new_df2 = df2[["drugcomb_sorted","drugA","drugB","s","meanSP","medianSP","minSP","maxSP", "label"]]  # took out op

def gnn_data(df1, df2, node_features, edge_features):
    """Build the node table and the edge table used for the graph models.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Node table with an ``ID`` column, a ``drug`` column and the node
        features.
    df2 : pandas.DataFrame
        Edge table with ``drugcomb_sorted``, ``drugA``, ``drugB``, the edge
        features and ``label``. Not modified, so the function can be called
        repeatedly on the same frame.
    node_features, edge_features : list of str
        Column names to keep in the node and the edge table respectively.

    Returns
    -------
    tuple
        ``(nodes, edges)``. ``edges`` carries ``drugA_ID``/``drugB_ID``, the
        ``ID`` values looked up from ``df1`` via the drug columns (missing
        where a drug is absent from ``df1``). It is also written to
        ``gnn_edge_features_test.csv`` in the processed-data directory.
    """
    mapping = pd.Series(df1["ID"].values, index=df1["drug"]).to_dict()  # drug value -> ID
    nodes = df1[["ID", "drug"] + list(node_features)]

    # assign() returns a new frame, leaving the caller's df2 untouched.
    edges = df2.assign(
        drugA_ID=df2["drugA"].map(mapping),
        drugB_ID=df2["drugB"].map(mapping),
    )
    edges = edges[
        ["drugcomb_sorted", "drugA", "drugA_ID", "drugB", "drugB_ID"]
        + list(edge_features)
        + ["label"]
    ]
    edges.to_csv(f"{data}/gnn_edge_features_test.csv", index=False)
    return nodes, edges

# Build the graph node and edge tables from seven node features and five edge
# features. The returned pair is not assigned; gnn_data itself writes the edge
# table to gnn_edge_features_test.csv.
gnn_data(df1, new_df2, ["s", "meanSP", "medianSP", "minSP", "maxSP", "zTD","zDT"], 
              ["s", "meanSP", "medianSP", "minSP", "maxSP"])  # specify the node and edge features