# Run procedures

## Imports

In [None]:
import pandas as pd
import numpy as np
import json
from pathlib import Path
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

## Functions

In [None]:
def calculate_and_export_metrics(df_test, main_column, path_to_export):
    """Compute per-group binary classification metrics and export them as CSV.

    The rows of ``df_test`` are grouped by the pair (``"Pe"``, ``main_column``).
    For every group the confusion matrix of ``target`` versus ``pred`` is
    computed and one row is written with the following columns:

    * ``true_negatives``, ``false_negatives``, ``true_positives``,
      ``false_positives``: confusion-matrix counts;
    * ``detection_proba``: share of the group's samples predicted positive,
      i.e. (TP + FP) / total;
    * ``precision``, ``recall``: scikit-learn scores, reported as 1.0 when
      their denominator is zero (``zero_division=1.0``);
    * ``accuracy``: scikit-learn accuracy score.

    Parameters
    ----------
    df_test : pandas.DataFrame
        Must contain the columns ``"Pe"``, ``main_column``, ``"target"`` and
        ``"pred"``. The frame is not modified; ``target`` and ``pred`` are
        cast to bool on a copy.
    main_column : str
        Name of the second grouping column (e.g. ``"snr"``).
    path_to_export : str or pathlib.Path
        Destination passed to ``DataFrame.to_csv``.

    Returns
    -------
    None
    """
    # ``assign`` returns a new frame, so the caller's dtypes are left untouched.
    df_eval = df_test.assign(
        target=df_test["target"].astype(bool),
        pred=df_test["pred"].astype(bool),
    )
    group_keys = ["Pe", main_column]

    # One (initially column-less) row per observed (Pe, main_column) pair.
    df_metrics = df_eval[group_keys].groupby(group_keys).any()
    for count_column in ("true_negatives", "false_negatives",
                         "true_positives", "false_positives"):
        df_metrics[count_column] = 0
    # Ratio columns must start as floats: initialising them with the integer 0
    # forces pandas to upcast the column on the first float assignment.
    for ratio_column in ("detection_proba", "precision", "recall", "accuracy"):
        df_metrics[ratio_column] = 0.0

    # A single pass over the groups instead of re-filtering the whole frame
    # for every (Pe, main_column) pair.
    for key, df_preds in df_eval.groupby(group_keys):
        y_true, y_pred = df_preds["target"], df_preds["pred"]
        # Fixing the labels keeps the matrix 2x2 even when a group contains a
        # single class; rows index the true class, columns the predicted one.
        CM = confusion_matrix(y_true, y_pred, labels=[False, True])
        df_metrics.loc[key, "true_negatives"]  = CM[0][0]
        df_metrics.loc[key, "false_negatives"] = CM[1][0]
        df_metrics.loc[key, "true_positives"]  = CM[1][1]
        df_metrics.loc[key, "false_positives"] = CM[0][1]
        df_metrics.loc[key, "detection_proba"] = CM[:, 1].sum() / CM.sum()
        df_metrics.loc[key, "precision"]       = precision_score(y_true, y_pred, zero_division=1.0)
        df_metrics.loc[key, "recall"]          = recall_score(y_true, y_pred, zero_division=1.0)
        df_metrics.loc[key, "accuracy"]        = accuracy_score(y_true, y_pred)

    df_metrics.to_csv(path_to_export)

In [None]:
def hassans_algorithm(sample):
    """Return True when the sample's ``E`` value strictly exceeds its ``eta``.

    ``sample`` is anything indexable by the keys ``"E"`` and ``"eta"``; in this
    file it is a DataFrame row passed by ``DataFrame.apply(..., axis=1)``.
    NOTE(review): ``E`` and ``eta`` presumably hold a measured statistic and
    its decision threshold - confirm against the data set description.
    """
    # bool() turns a NumPy boolean into a plain Python bool, as the original
    # conditional expression did.
    return bool(sample["E"] > sample["eta"])

## Global parameters

In [None]:
# Input and output directories, given relative to the working directory.
dirDatasets = Path("..") / "datasets"
dirResults = Path("..") / "results"

# Columns used as model inputs.
features = ["n_users", "E"]

# Forwarded as ``n_jobs`` to the grid searches.
n_jobs = -1

# Seed shared by the scikit-learn objects and NumPy's global generator.
random_state = 0
np.random.seed(random_state)

## Procedure 1 

### Gridsearch for max depth

In [None]:
# Load the procedure 1 training set, then separate the model inputs from the
# label column.
df_train = pd.read_csv(dirDatasets / "procedure_1_train.csv")
X_train, y_train = df_train[features], df_train["target"]

In [None]:
# Select the tree depth (1 to 5) that maximises recall under 10-fold cross
# validation. The estimator is seeded like every other tree in this notebook,
# so the selected depth is reproducible from run to run.
gs = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=random_state),
    param_grid={"max_depth": np.arange(1, 6)},
    scoring="recall",
    n_jobs=n_jobs,
    cv=10,
)
gs.fit(X_train, y_train)
print(gs.best_params_)

### Training trees and exporting sketches

In [None]:
# Fit one seeded tree for every depth from 1 up to the depth selected by the
# grid search; ``trees[i]`` has ``max_depth == i + 1``.
best_depth = gs.best_params_["max_depth"]
trees = [
    DecisionTreeClassifier(max_depth=depth, random_state=random_state).fit(X_train, y_train)
    for depth in range(1, best_depth + 1)
]

# Text rendering of each fitted tree, keyed by "depth_<n>".
dict_trees = {
    f"depth_{depth}": export_text(tree, feature_names=features)
    for depth, tree in enumerate(trees, start=1)
}

with open(dirResults / "procedure_1_trees_sketch.json", mode="w") as file:
    json.dump(dict_trees, file, indent=4)

### Test 1 evaluation

In [None]:
# Evaluate every trained tree on test set 1, grouping the metrics by snr.
df_test = pd.read_csv(dirDatasets.joinpath("procedure_1_test_1.csv"))

# Depth labels come from the position in ``trees`` (trees[0] has depth 1), not
# from ``gs``: the Procedure 2 cell rebinds ``gs``, and zipping against a range
# built from it could silently skip trees when this cell is re-run afterwards.
for max_depth, tree in enumerate(trees, start=1):
    df_test["pred"] = tree.predict(df_test[features])
    calculate_and_export_metrics(
        df_test,
        "snr",
        dirResults.joinpath("procedure_1_test_1_depth_{}.csv".format(max_depth))
    )

### Test 2 evaluation

In [None]:
# Evaluate every trained tree on test set 2, grouping the metrics by n_users.
df_test = pd.read_csv(dirDatasets.joinpath("procedure_1_test_2.csv"))

# Depth labels come from the position in ``trees`` (trees[0] has depth 1), not
# from ``gs``: the Procedure 2 cell rebinds ``gs``, and zipping against a range
# built from it could silently skip trees when this cell is re-run afterwards.
for max_depth, tree in enumerate(trees, start=1):
    df_test["pred"] = tree.predict(df_test[features])
    calculate_and_export_metrics(
        df_test,
        "n_users",
        dirResults.joinpath("procedure_1_test_2_depth_{}.csv".format(max_depth))
    )

## Procedure 2

In [None]:
# Snr and number of users to consider when evaluating metrics (keep all values when training)
test_snr   = 10
test_users = 64

# Columns kept for the metrics computation
test_columns = ["n_antennas", "Pe", "target", "pred"]

# Out-of-fold predictions restricted to the desired snr/users, one frame per
# number of antennas. They are concatenated once after the loop: growing a
# frame from an empty, object-typed one on every iteration copies all rows each
# time and relies on deprecated pandas dtype handling for empty frames.
test_frames = []

# Best max_depth found by the grid search for each number of antennas
dict_max_depths = dict()

# For each number of antennas, fit new model
for n_antennas in range(64, 257, 16):

    # Open dataset and separate data (for 256 antennas the procedure 1
    # training file is read instead of a procedure 2 file)
    if n_antennas < 256:
        df = pd.read_csv(dirDatasets.joinpath("procedure_2_{}_antennas.csv".format(n_antennas)))
    else:
        df = pd.read_csv(dirDatasets.joinpath("procedure_1_train.csv"))
    df["pred"] = False
    df["n_antennas"] = n_antennas
    X = df[features]
    y = df["target"]

    # Gridsearch CV (seeded estimator so the selected depth is reproducible)
    gs = GridSearchCV(
        estimator  = DecisionTreeClassifier(random_state=random_state),
        param_grid = {"max_depth": np.arange(1, 6)},
        scoring    = "recall",
        n_jobs     = n_jobs,
        cv         = 10
    )
    gs.fit(X, y)
    best_depth = int(gs.best_params_["max_depth"])
    dict_max_depths["antennas_{}".format(n_antennas)] = best_depth

    # Use the best max_depth to train/test each stratified k fold.
    # StratifiedKFold.split yields positional indices, hence .iloc.
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)
    pred_position = df.columns.get_loc("pred")
    for train_index, test_index in skf.split(X, y):
        clf = DecisionTreeClassifier(max_depth=best_depth, random_state=random_state)
        clf.fit(X.iloc[train_index], y.iloc[train_index])
        # Cast so the boolean "pred" column keeps its dtype whatever the dtype
        # of "target" is; the metrics function casts to bool anyway.
        df.iloc[test_index, pred_position] = clf.predict(X.iloc[test_index]).astype(bool)

    # Keeping only samples of desired snr and users
    selected = (df["snr"] == test_snr) & (df["n_users"] == test_users)
    test_frames.append(df.loc[selected, test_columns])

df_test = pd.concat(test_frames, axis=0, ignore_index=True)

# Calculating metrics
calculate_and_export_metrics(
    df_test,
    "n_antennas",
    dirResults.joinpath("procedure_2_tree.csv")
)

# Registering the best tree depth for each n antennas
with open(dirResults.joinpath("procedure_2_max_depths.json"), mode="w") as file:
    json.dump(dict_max_depths, file, indent=4)

## Hassan's algorithm

### Procedure 1.1

In [None]:
# Apply Hassan's rule row by row to test set 1 and export the metrics grouped
# by snr.
hassan_input = dirDatasets / "procedure_1_test_1.csv"
hassan_output = dirResults / "procedure_1_test_1_hassan.csv"

df_test = pd.read_csv(hassan_input)
df_test["pred"] = df_test.apply(hassans_algorithm, axis=1)
calculate_and_export_metrics(df_test, "snr", hassan_output)

### Procedure 1.2

In [None]:
# Apply Hassan's rule row by row to test set 2 and export the metrics grouped
# by n_users.
hassan_input = dirDatasets / "procedure_1_test_2.csv"
hassan_output = dirResults / "procedure_1_test_2_hassan.csv"

df_test = pd.read_csv(hassan_input)
df_test["pred"] = df_test.apply(hassans_algorithm, axis=1)
calculate_and_export_metrics(df_test, "n_users", hassan_output)

### Procedure 2

In [None]:
# Snr and number of users to consider when evaluating metrics
test_snr   = 10
test_users = 64

# Columns kept for the metrics computation
test_columns = ["n_antennas", "Pe", "target", "pred"]

# One frame of predictions per number of antennas, concatenated once after the
# loop instead of growing a frame from an empty, object-typed one.
test_frames = []

for n_antennas in range(64, 257, 16):
    # For 256 antennas the procedure 1 training file is read instead of a
    # procedure 2 file
    if n_antennas < 256:
        df = pd.read_csv(dirDatasets.joinpath("procedure_2_{}_antennas.csv".format(n_antennas)))
    else:
        df = pd.read_csv(dirDatasets.joinpath("procedure_1_train.csv"))

    # Explicit copy: new columns are added below, and assigning to a filtered
    # slice of the original frame triggers pandas' SettingWithCopyWarning.
    df = df.loc[(df["snr"]==test_snr) & (df["n_users"]==test_users)].copy()
    df["n_antennas"] = n_antennas
    df["pred"] = df.apply(hassans_algorithm, axis=1)
    test_frames.append(df[test_columns])

df_test = pd.concat(test_frames, axis=0, ignore_index=True)

calculate_and_export_metrics(
    df_test,
    "n_antennas",
    dirResults.joinpath("procedure_2_hassan.csv")
)