# LLM-Lasso: Small-Scale Experiments

In [None]:
from llm_lasso.task_specific_lasso.llm_lasso import *
from llm_lasso.task_specific_lasso.plotting import plot_heatmap, plot_llm_lasso_result
from llm_lasso.data_splits import read_train_test_splits, read_baseline_splits
import numpy as np
import warnings
import json
warnings.filterwarnings("ignore")  # Suppress warnings

In [None]:
%load_ext autoreload
%autoreload 2


## Diabetes
### Step 1: Command-Line Portion

Run the following commands from your command line:
```
./shell_scripts/diabetes/step_01_splits.sh

./shell_scripts/diabetes/step_02_baselines.sh

./shell_scripts/diabetes/step_03_llm_score_baseline.sh

./shell_scripts/diabetes/step_04_lmpriors_baseline.sh

./shell_scripts/diabetes/step_05_llm_lasso_penalties.sh
```

### Step 2: Evaluation

In [None]:
# Read the Diabetes train/test splits produced by the command-line step.
N_SPLITS = 10
splits = read_train_test_splits("../data/splits/diabetes", N_SPLITS)

# The second axis of the first training matrix is used as the feature count.
first_train = splits[0].x_train
n_features = first_train.shape[1]

In [None]:
# LLM-Lasso penalty factors for Diabetes, stored under the "plain" key.
# NOTE: allow_pickle=True unpickles the file, so only load outputs that were
# generated locally by the penalty step.
penalty_list = dict(
    plain=np.array(
        np.load("../data/llm-lasso/diabetes/final_scores_plain.pkl", allow_pickle=True)
    ),
)

In [None]:
# Selected features of the data-driven baselines
# (presumably one list per split -- confirm against read_baseline_splits).
feature_baseline = read_baseline_splits(
    "../data/baselines/diabetes",
    n_splits=N_SPLITS,
    n_features=n_features,
)

# LMPriors: one feature name per line; the same list object is shared by all splits.
with open("../data/lmpriors/diabetes/Diabetes/selected_features.txt", "r") as f:
    lmpriors = [line.strip() for line in f]
feature_baseline["lmpriors"] = [lmpriors for _ in range(N_SPLITS)]

# LLM-Score: the JSON is indexed by the feature count written as a string.
with open("../data/llm-score/diabetes/llmselect_selected_features.json", "r") as f:
    llm_select_genes = json.load(f)[str(n_features)]
feature_baseline["llm_score"] = [llm_select_genes for _ in range(N_SPLITS)]

In [None]:
# Experiment settings for the Diabetes runs.
config = LLMLassoExperimentConfig(
    regression=False,  # classification task, not regression
    score_type=PenaltyType.PF,  # the LLM outputs are penalty factors, not importance scores
    folds_cv=5,  # number of cross-validation folds
    cross_val_metric=CrossValMetric.ERROR,
    lambda_min_ratio=0.001,  # Lasso parameter
    max_imp_power=4,
    run_pure_lasso_after=5,
    lasso_downstream_l2=True,
    n_threads=8,  # number of threads to use for computation
)

In [None]:
# Results for the baseline feature selectors across all splits
# (helper comes from the llm_lasso star import).
baselines = run_downstream_baselines_for_splits(
    config=config,
    feature_baseline=feature_baseline,
    splits=splits,
)

In [None]:
# Plain Lasso reference run on the same splits and configuration.
lasso = run_lasso_baseline_for_splits(config=config, splits=splits)

In [None]:
# LLM-Lasso run using the loaded penalty factors; verbose output is switched off.
llm_lasso = run_llm_lasso_cv_for_splits(
    config=config,
    scores=penalty_list,
    splits=splits,
    verbose=False,
)

In [None]:
# Keep only the rows with a positive feature count, then plot all methods together.
dataframes_to_plot = [
    results[results["n_features"] > 0]
    for results in (lasso, baselines, llm_lasso)
]
plot_llm_lasso_result(
    dataframes_to_plot,
    plot_error_bars=False,
    bolded_methods=["1/imp - plain"],
)


## Bank
### Step 1: Command-Line Portion

Run the following commands from your command line:
```
./shell_scripts/bank/step_01_splits.sh

./shell_scripts/bank/step_02_baselines.sh

./shell_scripts/bank/step_03_llm_score_baseline.sh

./shell_scripts/bank/step_04_lmpriors_baseline.sh

./shell_scripts/bank/step_05_llm_lasso_penalties.sh
```

### Step 2: Evaluation

In [None]:
# Read the Bank train/test splits produced by the command-line step.
N_SPLITS = 10
splits = read_train_test_splits("../data/splits/bank", N_SPLITS)

# The second axis of the first training matrix is used as the feature count.
first_train = splits[0].x_train
n_features = first_train.shape[1]

In [None]:
# LLM-Lasso penalty factors for Bank, stored under the "plain" key.
# NOTE: allow_pickle=True unpickles the file, so only load outputs that were
# generated locally by the penalty step.
penalty_list = dict(
    plain=np.array(
        np.load("../data/llm-lasso/bank/final_scores_plain.pkl", allow_pickle=True)
    ),
)

In [None]:
# Selected features of the data-driven baselines
# (presumably one list per split -- confirm against read_baseline_splits).
feature_baseline = read_baseline_splits(
    "../data/baselines/bank",
    n_splits=N_SPLITS,
    n_features=n_features,
)

# LLM-Score: the JSON is indexed by the feature count written as a string.
with open("../data/llm-score/bank/llmselect_selected_features.json", "r") as f:
    llm_select_genes = json.load(f)[str(n_features)]

# LMPriors: one feature name per line; the same list object is shared by all splits.
with open("../data/lmpriors/bank/Bank/selected_features.txt", "r") as f:
    lmpriors = [line.strip() for line in f]

# Insertion order of the two keys is kept as lmpriors first, llm_score second.
feature_baseline["lmpriors"] = [lmpriors for _ in range(N_SPLITS)]
feature_baseline["llm_score"] = [llm_select_genes for _ in range(N_SPLITS)]

#### For Bank, we have to remove the `duration` feature, which, according to the dataset description, "should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model."

In [None]:
# Remove the `duration` feature from the train/test matrices, from the "plain"
# penalty vector, and from every baseline's selected-feature list.
#
# The column drop and the penalty trim only run while `duration` is still a
# column, so re-running this cell is a no-op instead of raising ValueError from
# `.index("duration")`. If the penalties are reloaded, reload the splits as well
# so that both are trimmed together.
train_columns = splits[0].x_train.columns.tolist()
if "duration" in train_columns:
    # Column position of `duration`; the penalty vector is trimmed at the same
    # position (assumes penalties follow the x_train column order -- TODO confirm).
    duration_idx = train_columns.index("duration")

    for i in range(N_SPLITS):
        splits[i].x_train = splits[i].x_train.drop("duration", axis=1)
        splits[i].x_test = splits[i].x_test.drop("duration", axis=1)

    plain = penalty_list["plain"].tolist()
    penalty_list["plain"] = np.array(
        plain[:duration_idx] + plain[duration_idx+1:]
    )

# In-place removal from the baseline lists. Lists shared between splits are
# only touched once because of the membership check, which also makes this
# part safe to repeat.
for i in range(N_SPLITS):
    for key in feature_baseline:
        if "duration" in feature_baseline[key][i]:
            feature_baseline[key][i].remove("duration")

In [None]:
# Experiment settings for the Bank runs (cross-validation is scored with AUROC).
config = LLMLassoExperimentConfig(
    regression=False,  # classification task, not regression
    score_type=PenaltyType.PF,  # the LLM outputs are penalty factors, not importance scores
    folds_cv=5,  # number of cross-validation folds
    cross_val_metric=CrossValMetric.AUROC,
    lambda_min_ratio=0.001,  # Lasso parameter
    max_imp_power=2,
    run_pure_lasso_after=5,
    lasso_downstream_l2=True,
    n_threads=8,  # number of threads to use for computation
)

In [None]:
# Results for the baseline feature selectors across all splits
# (helper comes from the llm_lasso star import).
baselines = run_downstream_baselines_for_splits(
    config=config,
    feature_baseline=feature_baseline,
    splits=splits,
)

In [None]:
# Plain Lasso reference run on the same splits and configuration.
lasso = run_lasso_baseline_for_splits(config=config, splits=splits)

In [None]:
# LLM-Lasso run using the loaded penalty factors; verbose output is switched off.
llm_lasso = run_llm_lasso_cv_for_splits(
    config=config,
    scores=penalty_list,
    splits=splits,
    verbose=False,
)

In [None]:
# Keep only the rows with a positive feature count, then plot all methods together.
dataframes_to_plot = [
    results[results["n_features"] > 0]
    for results in (lasso, baselines, llm_lasso)
]
plot_llm_lasso_result(
    dataframes_to_plot,
    plot_error_bars=False,
    bolded_methods=["1/imp - plain"],
)


## Spotify
### Step 1: Command-Line Portion

Run the following commands from your command line:
```
./shell_scripts/spotify/step_01_splits.sh

./shell_scripts/spotify/step_02_baselines.sh

./shell_scripts/spotify/step_03_llm_score_baseline.sh

./shell_scripts/spotify/step_04_lmpriors_baseline.sh

./shell_scripts/spotify/step_05_llm_lasso_penalties.sh
```

### Step 2: Evaluation

In [None]:
# Read the Spotify train/test splits produced by the command-line step.
N_SPLITS = 10
splits = read_train_test_splits("../data/splits/spotify", N_SPLITS)

# The second axis of the first training matrix is used as the feature count.
first_train = splits[0].x_train
n_features = first_train.shape[1]

In [None]:
# LLM-Lasso penalty factors for Spotify, stored under the "plain" key.
# NOTE: allow_pickle=True unpickles the file, so only load outputs that were
# generated locally by the penalty step.
penalty_list = dict(
    plain=np.array(
        np.load("../data/llm-lasso/spotify/final_scores_plain.pkl", allow_pickle=True)
    ),
)

In [None]:
# Selected features of the data-driven baselines
# (presumably one list per split -- confirm against read_baseline_splits).
feature_baseline = read_baseline_splits(
    "../data/baselines/spotify",
    n_splits=N_SPLITS,
    n_features=n_features,
)

# LLM-Score: the JSON is indexed by the feature count written as a string.
with open("../data/llm-score/spotify/llmselect_selected_features.json", "r") as f:
    llm_select_genes = json.load(f)[str(n_features)]

# LMPriors: one feature name per line; the same list object is shared by all splits.
with open("../data/lmpriors/spotify/Spotify/selected_features.txt", "r") as f:
    lmpriors = [line.strip() for line in f]

# Insertion order of the two keys is kept as lmpriors first, llm_score second.
feature_baseline["lmpriors"] = [lmpriors for _ in range(N_SPLITS)]
feature_baseline["llm_score"] = [llm_select_genes for _ in range(N_SPLITS)]

In [None]:
# Experiment settings for the Spotify runs.
config = LLMLassoExperimentConfig(
    regression=True,  # regression task
    score_type=PenaltyType.PF,  # the LLM outputs are penalty factors, not importance scores
    folds_cv=5,  # number of cross-validation folds
    cross_val_metric=CrossValMetric.ERROR,
    lambda_min_ratio=0.001,  # Lasso parameter
    max_imp_power=4,
    run_pure_lasso_after=5,
    lasso_downstream_l2=True,
    n_threads=8,  # number of threads to use for computation
)

In [None]:
# Results for the baseline feature selectors across all splits
# (helper comes from the llm_lasso star import).
baselines = run_downstream_baselines_for_splits(
    config=config,
    feature_baseline=feature_baseline,
    splits=splits,
)

In [None]:
# Plain Lasso reference run on the same splits and configuration.
lasso = run_lasso_baseline_for_splits(config=config, splits=splits)

In [None]:
# LLM-Lasso run using the loaded penalty factors; verbose output is switched off.
llm_lasso = run_llm_lasso_cv_for_splits(
    config=config,
    scores=penalty_list,
    splits=splits,
    verbose=False,
)

In [None]:
# Keep only the rows with a positive feature count, then plot all methods together.
dataframes_to_plot = [
    results[results["n_features"] > 0]
    for results in (lasso, baselines, llm_lasso)
]
plot_llm_lasso_result(
    dataframes_to_plot,
    plot_error_bars=False,
    bolded_methods=["1/imp - plain"],
)


## Wine
### Step 1: Command-Line Portion

Run the following commands from your command line:
```
./shell_scripts/wine/step_01_splits.sh

./shell_scripts/wine/step_02_baselines.sh

./shell_scripts/wine/step_03_llm_score_baseline.sh

./shell_scripts/wine/step_04_lmpriors_baseline.sh

./shell_scripts/wine/step_05_llm_lasso_penalties.sh
```

### Step 2: Evaluation

In [None]:
# Read the Wine train/test splits produced by the command-line step.
N_SPLITS = 10
splits = read_train_test_splits("../data/splits/wine", N_SPLITS)

# The second axis of the first training matrix is used as the feature count.
first_train = splits[0].x_train
n_features = first_train.shape[1]

In [None]:
# LLM-Lasso penalty factors for Wine, stored under the "plain" key.
# NOTE: allow_pickle=True unpickles the file, so only load outputs that were
# generated locally by the penalty step.
penalty_list = dict(
    plain=np.array(
        np.load("../data/llm-lasso/wine/final_scores_plain.pkl", allow_pickle=True)
    ),
)

In [None]:
# Selected features of the data-driven baselines
# (presumably one list per split -- confirm against read_baseline_splits).
feature_baseline = read_baseline_splits(
    "../data/baselines/wine", n_splits=N_SPLITS, n_features=n_features)

# LLM-Score: the JSON is indexed by the feature count written as a string.
with open("../data/llm-score/wine/llmselect_selected_features.json", "r") as f:
    llm_select_genes = json.load(f)[f"{n_features}"]

# LMPriors: one feature name per line, with the first character lower-cased
# (presumably to match the dataset's column names -- confirm).
# `x[:1]` is used instead of `x[0]` so that a blank line in the file does not
# raise IndexError; for non-empty names the result is unchanged.
with open("../data/lmpriors/wine/Wine/selected_features.txt", "r") as f:
    lmpriors = [line.strip() for line in f.readlines()]
    lmpriors = [x[:1].lower() + x[1:] for x in lmpriors]
feature_baseline["lmpriors"] = [lmpriors] * N_SPLITS

feature_baseline["llm_score"] = [llm_select_genes] * N_SPLITS

In [None]:
# Experiment settings for the Wine runs.
config = LLMLassoExperimentConfig(
    regression=True,  # regression task
    score_type=PenaltyType.PF,  # the LLM outputs are penalty factors, not importance scores
    folds_cv=5,  # number of cross-validation folds
    cross_val_metric=CrossValMetric.ERROR,
    lambda_min_ratio=0.001,  # Lasso parameter
    max_imp_power=2,
    run_pure_lasso_after=5,
    lasso_downstream_l2=True,
    n_threads=8,  # number of threads to use for computation
)

In [None]:
# Results for the baseline feature selectors across all splits
# (helper comes from the llm_lasso star import).
baselines = run_downstream_baselines_for_splits(
    config=config,
    feature_baseline=feature_baseline,
    splits=splits,
)

In [None]:
# Plain Lasso reference run on the same splits and configuration.
lasso = run_lasso_baseline_for_splits(config=config, splits=splits)

In [None]:
# LLM-Lasso run using the loaded penalty factors; verbose output is switched off.
llm_lasso = run_llm_lasso_cv_for_splits(
    config=config,
    scores=penalty_list,
    splits=splits,
    verbose=False,
)

In [None]:
# Keep only the rows with a positive feature count, then plot all methods together.
dataframes_to_plot = [
    results[results["n_features"] > 0]
    for results in (lasso, baselines, llm_lasso)
]
plot_llm_lasso_result(
    dataframes_to_plot,
    plot_error_bars=False,
    bolded_methods=["1/imp - plain"],
)


## Glioma
### Step 1: Command-Line Portion

Run the following commands from your command line:
```
./shell_scripts/glioma/step_01_splits.sh

./shell_scripts/glioma/step_02_baselines.sh

./shell_scripts/glioma/step_03_llm_score_baseline.sh

./shell_scripts/glioma/step_04_lmpriors_baseline.sh

./shell_scripts/glioma/step_05_llm_lasso_penalties.sh
```

### Step 2: Evaluation

In [None]:
# Read the Glioma train/test splits produced by the command-line step.
N_SPLITS = 10
splits = read_train_test_splits("../data/splits/glioma", N_SPLITS)

# The second axis of the first training matrix is used as the feature count.
first_train = splits[0].x_train
n_features = first_train.shape[1]

In [None]:
# LLM-Lasso penalty factors for Glioma, stored under the "plain" key.
# NOTE: allow_pickle=True unpickles the file, so only load outputs that were
# generated locally by the penalty step.
penalty_list = dict(
    plain=np.array(
        np.load("../data/llm-lasso/glioma/final_scores_plain.pkl", allow_pickle=True)
    ),
)

In [None]:
# Selected features of the data-driven baselines
# (presumably one list per split -- confirm against read_baseline_splits).
feature_baseline = read_baseline_splits(
    "../data/baselines/glioma",
    n_splits=N_SPLITS,
    n_features=n_features,
)

# LLM-Score: the JSON is indexed by the feature count written as a string.
with open("../data/llm-score/glioma/llmselect_selected_features.json", "r") as f:
    llm_select_genes = json.load(f)[str(n_features)]

# LMPriors: one feature name per line; the same list object is shared by all splits.
with open("../data/lmpriors/glioma/Glioma/selected_features.txt", "r") as f:
    lmpriors = [line.strip() for line in f]

# Insertion order of the two keys is kept as lmpriors first, llm_score second.
feature_baseline["lmpriors"] = [lmpriors for _ in range(N_SPLITS)]
feature_baseline["llm_score"] = [llm_select_genes for _ in range(N_SPLITS)]

In [None]:
# Experiment settings for the Glioma runs.
config = LLMLassoExperimentConfig(
    regression=False,  # not a regression task
    score_type=PenaltyType.PF,  # the LLM outputs are penalty factors, not importance scores
    folds_cv=5,  # number of cross-validation folds
    cross_val_metric=CrossValMetric.ERROR,
    lambda_min_ratio=0.001,  # Lasso parameter
    max_imp_power=2,
    run_pure_lasso_after=5,
    lasso_downstream_l2=True,
    n_threads=8,  # number of threads to use for computation
)

In [None]:
# Results for the baseline feature selectors across all splits
# (helper comes from the llm_lasso star import).
baselines = run_downstream_baselines_for_splits(
    config=config,
    feature_baseline=feature_baseline,
    splits=splits,
)

In [None]:
# Plain Lasso reference run on the same splits and configuration.
lasso = run_lasso_baseline_for_splits(config=config, splits=splits)

In [None]:
# LLM-Lasso run using the loaded penalty factors; verbose output is switched off.
llm_lasso = run_llm_lasso_cv_for_splits(
    config=config,
    scores=penalty_list,
    splits=splits,
    verbose=False,
)

In [None]:
# Keep only the rows with a positive feature count, then plot all methods together.
# The test-error axis limits are set explicitly for this dataset.
dataframes_to_plot = [
    results[results["n_features"] > 0]
    for results in (lasso, baselines, llm_lasso)
]
plot_llm_lasso_result(
    dataframes_to_plot,
    test_error_y_lim=(0.13, 0.2),
    plot_error_bars=False,
    bolded_methods=["1/imp - plain"],
)