# LLM-Lasso: Adversarial Experiment


In [52]:
from llm_lasso.data_splits import read_train_test_splits, read_baseline_splits
from llm_lasso.task_specific_lasso.llm_lasso import *
from llm_lasso.task_specific_lasso.plotting import plot_llm_lasso_result, plot_heatmap
import os
import json
import matplotlib.pyplot as plt
import pickle

## Step 1: Set up Lung Cancer Splits
Open **`LungCancerExperiment.ipynb`** and run its first few cells to create the Lung Cancer train/test splits that this notebook reads.

## Step 2: Command Line Portion
Run the following scripts from your command line, in this order:
```
./shell_scripts/Lung_TCGA/adversarial/adversarial_gene_names.sh

./shell_scripts/Lung_TCGA/adversarial/llm_score.sh

./shell_scripts/Lung_TCGA/adversarial/llm_lasso_penalties.sh
```

**Note**: You need an OMIM API key to generate the adversarial gene names, because the generation script queries the OMIM API to check that each adversarial gene name does not already exist.

## Step 3: Evaluation

In [None]:
# Experiment settings; the same `config` object is handed to every
# run_*_for_splits call further down in this notebook.
config = LLMLassoExperimentConfig(
    regression=False,
    folds_cv=5,  # number of cross-validation folds
    cross_val_metric=CrossValMetric.ERROR,
    # The LLM gives us penalty factors, not importance scores.
    score_type=PenaltyType.PF,
    max_imp_power=1,
    lambda_min_ratio=0.001,  # Lasso parameter
    run_pure_lasso_after=10,
    lasso_downstream_l2=True,
    n_threads=8,  # number of threads to use for computation
)

In [None]:
# Dataset identifier and number of train/test splits used by the cells below.
DATASET = "Lung_TCGA"
N_SPLITS = 10

# Read the splits stored under ../data/splits/<DATASET>.
splits = read_train_test_splits(f"../data/splits/{DATASET}", N_SPLITS)

In [None]:
import pickle

# NOTE(security): unpickling can execute arbitrary code. Only load a file you
# generated yourself (presumably the output of the Step 2 scripts).
with open("../data/adversarial/Lung_TCGA/new_genenames.pkl", "rb") as f:
    genenames = pickle.load(f)

# The adversarial names are paired with the original columns purely by
# position. zip() would silently truncate on a length mismatch and leave the
# mapping incomplete, so fail early with a clear message instead.
original_names = splits[0].x_train.columns
if len(genenames) != len(original_names):
    raise ValueError(
        f"Expected {len(original_names)} adversarial gene names, "
        f"got {len(genenames)}"
    )

# Maps each original feature name to its adversarial replacement.
genename_mapping = dict(zip(original_names, genenames))

In [None]:
# Replace the column labels of every split's train and test frames with the
# adversarial gene names.
for split_idx in range(N_SPLITS):
    current_split = splits[split_idx]
    for frame in (current_split.x_train, current_split.x_test):
        frame.columns = genenames

In [None]:
# Baseline feature selections (n_features=49) for each method and split,
# translated from the original gene names to their adversarial replacements.
feature_baseline = read_baseline_splits(
    f"../data/baselines/{DATASET}", n_splits=N_SPLITS, n_features=49
)
for key in feature_baseline:
    for i in range(N_SPLITS):
        feature_baseline[key][i] = [
            genename_mapping[name] for name in feature_baseline[key][i]
        ]

# LLM-Score baseline: average the per-trial scores and rank the features from
# highest to lowest mean score.
with open(
    "../data/adversarial/llm-score/Lung_TCGA/trial_scores_llm_score.json",
    encoding="utf-8",
) as f:
    llm_score = json.load(f)
# NOTE(review): assumes each trial's "scores" list is ordered like the columns
# of splits[0].x_train -- confirm against the script that writes the file.
features = splits[0].x_train.columns
scores = np.mean(np.array([trial["scores"] for trial in llm_score]), axis=0)
llm_score_features = features[np.argsort(-scores)].tolist()
# The ranking does not depend on the split, so the same list object is reused
# for every split.
feature_baseline["llm_score"] = [llm_score_features] * N_SPLITS

In [None]:
# Evaluate every entry of `feature_baseline` on the splits; the result is
# filtered on its "n_features" column and plotted in the last cell.
baselines = run_downstream_baselines_for_splits(
    config=config, feature_baseline=feature_baseline, splits=splits
)

In [None]:
# Lasso baseline on the same splits and config, for comparison in the plot.
lasso = run_lasso_baseline_for_splits(config=config, splits=splits)

In [None]:
# Penalty factors keyed by a label for the run ("plain").
# Despite its name, `penalty_list` is a dict.
# NOTE(security): allow_pickle=True lets np.load unpickle the file, which can
# execute arbitrary code -- only load files you generated yourself.
penalty_list = dict(
    plain=np.array(
        np.load(
            "../data/adversarial/llm-lasso/Lung_TCGA/final_scores_plain.pkl",
            allow_pickle=True,
        )
    ),
)

In [None]:
# Run LLM-Lasso on every split, using the penalty factors loaded into
# `penalty_list`.
llm_lasso = run_llm_lasso_cv_for_splits(
    verbose=False, config=config, scores=penalty_list, splits=splits
)

In [None]:
# Keep only the rows with at least one feature before plotting.
dataframes_to_plot = [
    results[results["n_features"] > 0]
    for results in (lasso, baselines, llm_lasso)
]
plot_llm_lasso_result(
    dataframes_to_plot,
    x_lim=30,
    test_error_y_lim=(0.04, 0.1),
    auroc_y_lim=(0.95, 0.99),
    plot_error_bars=False,
)