In [2]:
import json
import os

import numpy as np
import pandas as pd

from llm_lasso.data_splits import read_train_test_splits, read_baseline_splits
from llm_lasso.task_specific_lasso.llm_lasso import *
from llm_lasso.task_specific_lasso.plotting import plot_llm_lasso_result, plot_heatmap

ModuleNotFoundError: No module named 'xgboost'

In [None]:
# Enable IPython's autoreload extension (mode 2) so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Experimentation on ETP Dataset

### Step 1: Load in Data

In [1]:
# Experiment-wide settings used by the cells below.
N_SPLITS = 10
DATASET = "ETP"
BASE_FOLDER = "data/experiment-results"

# Make sure the per-dataset results folder exists before any CSV is written to it.
os.makedirs(f"{BASE_FOLDER}/{DATASET}", exist_ok=True)

# Load the train/test splits stored under data/splits/<DATASET>.
splits = read_train_test_splits(f"data/splits/{DATASET}", N_SPLITS)

NameError: name 'os' is not defined

In [None]:
# Baseline feature selections read from data/baselines/<DATASET>.
# NOTE(review): n_features=49 is hard-coded; confirm it is the intended value for this dataset.
feature_baseline = read_baseline_splits(
    f"data/baselines/{DATASET}",
    n_features=49,
    n_splits=N_SPLITS,
)

### Step 2: Experiment Config

In [None]:
# Single configuration object passed to every run_* call in this notebook.
config = LLMLassoExperimentConfig(
    # General experiment settings
    regression=False,
    folds_cv=5,
    n_threads=8,
    max_features_for_baselines=30,

    # Lasso config
    lambda_min_ratio=0.001,
    relaxed_lasso=False,
    lasso_downstream_l2=True,
    max_imp_power=4,

    # Remaining options
    run_pure_lasso_after=8,
    remove_correlated_features=False,
)

### Step 3: Run Baselines

All experiments (baselines, Lasso, LLM-Lasso, etc.) in this notebook have the same structure:
1. **Step 1**: Look for previous results in the specified CSV file (for the baselines, this is `data/experiment-results/ETP/baselines.csv`, as defined in the variable `baseline_csv` below).
    If the CSV is found and the corresponding rerun flag (e.g., **`RERUN_BASELINES`**) is not set to `True`, we simply load the CSV.
2. **Step 2**: If the CSV is not found, or the rerun flag is set to `True`, run the experiment, e.g., `run_all_baselines_for_splits`.
3. **Step 3**: Save the experiment results to the CSV file.

In [None]:
RERUN_BASELINES = False

baseline_csv = f"{BASE_FOLDER}/{DATASET}/baselines.csv"

# Recompute when a rerun is requested or no cached CSV exists; otherwise reuse the cache.
if RERUN_BASELINES or not os.path.exists(baseline_csv):
    baselines = run_all_baselines_for_splits(
        splits=splits,
        feature_baseline=feature_baseline,
        config=config,
    )
    # Persist the fresh results so later runs can load them instead of recomputing.
    baselines.to_csv(baseline_csv, index=False)
else:
    print(f"CSV found at {baseline_csv}. Loading.")
    baselines = pd.read_csv(baseline_csv)

**Note**: for Lasso, LLM-Lasso, and Adaptive Lasso, this notebook defines an **`EXPERIMENT_NAME`** variable, which is added to the CSV filename to allow for saving multiple CSVs in the same directory with different experiment configurations.

In [None]:
RERUN_LASSO = False
EXPERIMENT_NAME = "logistic"

# EXPERIMENT_NAME is part of the filename, so different configurations get separate CSVs.
lasso_csv = f"{BASE_FOLDER}/{DATASET}/lasso_{EXPERIMENT_NAME}.csv"

# Recompute when a rerun is requested or no cached CSV exists; otherwise reuse the cache.
if RERUN_LASSO or not os.path.exists(lasso_csv):
    lasso = run_lasso_baseline_for_splits(splits=splits, config=config)
    lasso.to_csv(lasso_csv, index=False)
else:
    print(f"CSV found at {lasso_csv}. Loading.")
    lasso = pd.read_csv(lasso_csv)

In [None]:
# With the flag set to True the experiment always runs and the CSV is overwritten.
RERUN_ADAPTIVE_LASSO = True
EXPERIMENT_NAME = "logistic"

adaptive_lasso_csv = f"{BASE_FOLDER}/{DATASET}/adaptive_lasso_{EXPERIMENT_NAME}.csv"

# Recompute when a rerun is requested or no cached CSV exists; otherwise reuse the cache.
if RERUN_ADAPTIVE_LASSO or not os.path.exists(adaptive_lasso_csv):
    adaptive_lasso = run_adaptive_lasso_for_splits(splits=splits, config=config)
    adaptive_lasso.to_csv(adaptive_lasso_csv, index=False)
else:
    print(f"CSV found at {adaptive_lasso_csv}. Loading.")
    adaptive_lasso = pd.read_csv(adaptive_lasso_csv)

In [None]:
RERUN_XGBOOST = False

xgboost_csv = f"{BASE_FOLDER}/{DATASET}/xgboost.csv"

# NOTE(review): the result variable is named `xgboost`; it would shadow the
# xgboost package if that were ever imported into this namespace.
# Recompute when a rerun is requested or no cached CSV exists; otherwise reuse the cache.
if RERUN_XGBOOST or not os.path.exists(xgboost_csv):
    xgboost = run_xgboost_for_splits(
        splits=splits,
        # Feature ordering taken from the "xgboost" entry of the loaded baselines.
        ordered_features=feature_baseline["xgboost"],
        config=config,
    )
    xgboost.to_csv(xgboost_csv, index=False)
else:
    print(f"CSV found at {xgboost_csv}. Loading.")
    xgboost = pd.read_csv(xgboost_csv)

In [None]:
# Penalties generated using the web interface on the subset provided by XGBoost.
manually_tuned_penalties = {
    "AEBP1": 0.8,
    "ANP32B": 0.8,
    "CCND2": 0.2,
    "CD164": 0.8,
    "CD1B": 0.2,
    "CD5": 0.2,
    "DEFA1": 0.8,
    "DEK": 0.8,
    "DNAJC1": 0.8,
    "EGR1": 0.2,
    "ELOVL4": 0.8,
    "EPHB6": 0.8,
    "GALNT2": 0.8,
    "GPX4": 0.8,
    "HIST1H2AD": 0.8,
    "IGFBP7": 0.8,
    "IL7R": 0.8,
    "ILF3": 0.8,
    "JARID2": 0.8,
    "JUN": 0.8,
    "KLF10": 0.8,
    "KLHDC3": 0.8,
    "KMT2E": 0.8,
    "LAT2": 0.7,
    "LEF1": 0.2,
}

# One penalty per feature column of the first split's test set, in column order;
# genes without a manual score fall back to a penalty of 1.
penalties = np.array(
    [manually_tuned_penalties.get(gene, 1) for gene in splits[0].x_test.columns]
)

In [None]:
# Load the penalties stored in etp_manual.json; the next cell iterates over
# this object, one entry per iteration.
with open("data/llm-lasso/etp_manual.json") as f:
    json_penalties = json.loads(f.read())

In [None]:
# Build one row of penalties per entry of `json_penalties`, with the columns
# ordered like the feature columns of the first split's test set.
penalties_per_split = []
for spl in json_penalties:
    # Genes missing from this entry fall back to the default penalty of 1.
    penalties = [spl.get(gene, 1) for gene in splits[0].x_test.columns]
    penalties_per_split.append(penalties)

# Fix: convert the accumulated list of rows. The original converted `penalties`,
# which holds only the last row, so every earlier entry was silently discarded.
# NOTE(review): the result is now 2-D (entries x features); confirm that
# run_llm_lasso_cv_for_splits accepts per-split score arrays in this layout.
penalties_per_split = np.array(penalties_per_split)

In [None]:
# penalties = np.array(np.load("data/llm-lasso/ETP/final_scores_RAG.pkl", allow_pickle=True))

In [None]:
# With the flag set to True the experiment always runs and the CSV is overwritten.
RERUN_LLM_LASSO = True
EXPERIMENT_NAME = "chatgpt"

llm_lasso_csv = f"{BASE_FOLDER}/{DATASET}/llm_lasso_{EXPERIMENT_NAME}.csv"

# Recompute when a rerun is requested or no cached CSV exists; otherwise reuse the cache.
if RERUN_LLM_LASSO or not os.path.exists(llm_lasso_csv):
    llm_lasso = run_llm_lasso_cv_for_splits(
        splits=splits,
        # The key "chatGPT" matches the "1/imp - chatGPT" method name used by the plotting cells.
        scores={"chatGPT": penalties_per_split},
        config=config,
        verbose=True,
    )
    llm_lasso.to_csv(llm_lasso_csv, index=False)
else:
    print(f"CSV found at {llm_lasso_csv}. Loading.")
    llm_lasso = pd.read_csv(llm_lasso_csv)

In [None]:
# This line means that we only plot a few of the (better-performing) baselines!
filtered_baselines = baselines[baselines["method"].isin(["xgboost", "XGBoost_Model", "mrmr"])]

dataframes_to_plot = [
    filtered_baselines, lasso, xgboost, llm_lasso, adaptive_lasso
]

In [None]:
# Plot all collected results, with the LLM-Lasso method ("1/imp - chatGPT") bolded.
plot_llm_lasso_result(
    dataframes_to_plot,
    x_lim=20,
    plot_error_bars=False,
    bolded_methods=["1/imp - chatGPT"],
)

In [None]:
# `method_models` entries come from the method_model column of the dataframes;
# `labels` gives, in the same order, how each method_model is labeled on the plot.
plot_heatmap(
    dataframes_to_plot,
    feature_names=splits[0].x_train.columns,
    method_models=["1/imp - chatGPT", "xgboost", "Lasso"],
    labels=["LLM-Lasso", "XGBoost", "Lasso"],
    sort_by="LLM-Lasso",
)

### Experimentation with prompt tuning

In [None]:
from llm_lasso.llm_penalty.llm import LLMQueryWrapperWithMemory, LLMType

In [None]:
# Scoring instructions followed by the sorted gene list from the "xgboost" baseline.
# NOTE(review): index 9 is hard-coded; presumably it selects the last of the N_SPLITS splits. Confirm.
prompt = f"""
In using gene expression levels for classifying ETP-All vs. non-ETP-All, assign each of these genes a score between 0.1 and 0.9, where 0.1 means very important and 0.9 means less important. It's ok to assign many genes 0.9 if they aren't super important. It's normal for only a few genes to be important.

Think carefully for each gene. Format your response as a JSON.

{sorted(feature_baseline["xgboost"][9])}"""

In [None]:
# Show the full prompt text for inspection before it is sent.
print(prompt)

In [None]:
# Copy the API key from the `constants` module into the OPENAI_API_KEY environment variable.
import os
# NOTE(review): `constants` is not shown in this notebook; presumably a local
# module holding secrets. Confirm it is kept out of version control.
import constants
os.environ["OPENAI_API_KEY"] = constants.OPENAI_API

In [None]:
# Query wrapper configured for the "o1" model, authenticated with the key from `constants`.
llm = LLMQueryWrapperWithMemory(
    api_key=constants.OPENAI_API,
    llm_name="o1",
    llm_type=LLMType.O1,
)

In [None]:
# Send the prompt with an empty system message and keep the model's reply.
output_json = llm.query(
    full_prompt=prompt,
    system_message="",
    sleep_time=1,
)

In [None]:
# Show the reply returned by llm.query.
print(output_json)