In [None]:
!pip install together

In [None]:
import copy
import time
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from together import Together
from google.colab import userdata # To load API key securely

## Prepare Dataset

In [None]:
# Path to the diabetes CSV; replace with your own location.
# NOTE(review): the path looks like a mounted Google Drive folder — no mount step is visible in this notebook.
file_name = "/content/drive/MyDrive/KU/TA/Tutorial 3-2. Advanced Topic (LLM)/practice/data/diabetes.csv" # Your Own Path
df = pd.read_csv(file_name)

In [None]:
# Preview the first five rows of the raw data.
df.head(5)

In [None]:
# Target Attribute: the last column of the CSV is used as the label column.
target_attribute = df.columns[-1]
print("Target Attribute:", target_attribute)

In [None]:
# Target Class
# X keeps every column (including the target) with pandas-inferred dtypes;
# the target column is dropped later, when the pool/test split is made.
X = df.convert_dtypes()
# y is taken from the raw frame, so the labels keep their original values.
y = df[target_attribute].to_numpy()

# Sorted unique class labels; a label's position in this list is its numeric class id later on.
label_list = np.unique(y).tolist()
print("Target Class:", label_list)

In [None]:
# Split Pool / Test: hold out 20% as the test set (stratified on the label);
# few-shot training examples are later drawn from the remaining pool.
X_pool, X_test, y_pool, y_test = train_test_split(
    X.drop(columns=target_attribute),
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
shot =  # Number of Training Example

X_train = X_pool.copy()
X_train[target_attribute] = y_pool

sampled_list = []
remainder = shot % len(np.unique(y_pool))

# Balance Class in Training Example
for _, grouped in X_train.groupby(target_attribute):
    sample_num = shot // len(np.unique(y_pool))
    if remainder > 0:
        sample_num += 1
        remainder -= 1

    sampled = grouped.sample(sample_num, random_state=42)
    sampled_list.append(sampled)

X_balanced = pd.concat(sampled_list)
X_train = X_balanced.drop([target_attribute], axis=1)
y_train = X_balanced[target_attribute].to_numpy()

In [None]:
# Few-shot training examples with their labels re-attached (used for display and prompt building).
# An explicit copy guarantees that adding the label column never touches `X_train`;
# `pd.DataFrame(frame)` does not copy its input by default.
train = X_train.copy()
train[target_attribute] = y_train
train

In [None]:
# Held-out test rows with their labels re-attached (for display only).
# An explicit copy guarantees that adding the label column never touches `X_test`;
# `pd.DataFrame(frame)` does not copy its input by default.
test = X_test.copy()
test[target_attribute] = y_test
test

In [None]:
df[target_attribute].value_counts() # Class counts in the original data

In [None]:
train[target_attribute].value_counts() # Class counts in the few-shot training examples

In [None]:
test[target_attribute].value_counts() # Class counts in the held-out test set

## Generate Prompt

In [None]:
# Base Template for LLM Feature Generation
# The bracketed placeholders [TASK], [FEATURES], [EXAMPLES], [NUMBER] and [FEATURE FORMAT]
# are substituted with concrete text before the prompt is sent; the other bracketed
# tokens (e.g. [Feature_name]) are left as-is as part of the instructions to the model.

template_str = """
You are an expert. Given the task description and the list of features and data examples, you are extracting conditions for each answer class to solve the task.

Task:[TASK]

Features:
[FEATURES]

Examples:
[EXAMPLES]

Let's first understand the problem and solve the problem step by step.

Step 1. Analyze the causal relationship or tendency between each feature and task description based on general knowledge and common sense within a short sentence.

Step 2. Based on the above examples and Step 1's results, infer [NUMBER] different conditions per answer, following the format below.
The condition should make sense, well match examples, and must match the format for [condition] according to value type.

Format for Response:
[FEATURE FORMAT]

Format for [Feature Condition]:
For the categorical variable only,
- [Feature_name] is in [list of Categorical_values]
For the numerical variable only,
- [Feature_name] (> or >= or < or <=) [Numerical_value]
- [Feature_name] is within range of [Numerical_range_start, Numerical_range_end]

Answer:
Step 1. The relationship between each feature and the task description:

Step 2.
"""

### 1. Task Description

In [None]:
# Natural-language description of the prediction task; substituted for [TASK] in the prompt.
# Left empty here — fill it in before building the prompt.
task_desc = ""

### 2. Feature Description

In [None]:
# Description of each input feature; substituted for [FEATURES] in both prompt templates.
# Left empty here — fill it in before building the prompts.
feature_desc = '''

'''

In [None]:
# Exercise blank: assign an integer here; it is substituted for [NUMBER] in the prompt.
feature_rule_number =  # Number of rules per Class

In [None]:
# One response-format stanza per class, telling the model how to lay out its rules.
format_list = list(
    map(
        lambda label: f'{feature_rule_number} different conditions for class "{label}":\n- [Condition]\n...',
        label_list,
    )
)
# Stanzas are separated by a blank line in the final prompt.
format_desc = '\n\n'.join(format_list)

### 3. Example

In [None]:
# Train Example (Tabular) -> Text
def serialize(row):
    target_str = f""
    for attr_idx, attr_name in enumerate(list(row.index)):
        if attr_idx < len(list(row.index)) - 1:
            target_str += " is ".join([attr_name, str(row[attr_name]).strip(" .'").strip('"').strip()])
            target_str += ". "
        else:
            if len(attr_name.strip()) < 2:
                continue
            target_str += " is ".join([attr_name, str(row[attr_name]).strip(" .'").strip('"').strip()])
            target_str += "."
    return target_str

In [None]:
def fill_in_templates(fill_in_dict, template_str):
    """Substitute placeholders in a template string.

    Args:
        fill_in_dict: mapping of placeholder text -> replacement text.
        template_str: the template containing the placeholders.

    Returns:
        The template with every occurrence of each present placeholder replaced.
        Replacements are applied in the mapping's iteration order, so text
        inserted by an earlier entry can be rewritten by a later one.
    """
    filled = template_str
    for placeholder in fill_in_dict:
        if placeholder not in filled:
            continue
        filled = filled.replace(placeholder, fill_in_dict[placeholder])
    return filled

In [None]:
# Build the in-context examples block of the prompt.
# Rows are shuffled within each class (seeded, so the prompt is reproducible) and the
# classes stay in the sorted order that `groupby` yields. Concatenating the per-class
# samples avoids `groupby(...).apply`, which relies on the grouping column being
# passed through to the applied function.
df_current = pd.concat(
    [class_rows.sample(frac=1, random_state=42) for _, class_rows in train.groupby(target_attribute)]
)

example_blocks = []
for icl_idx, icl_row in df_current.iterrows():
    answer = icl_row[target_attribute]
    icl_row = icl_row.drop(labels=target_attribute)  # features only; the label goes on its own line
    example_blocks.append(f"{serialize(icl_row)}\nAnswer: {answer}\n")
in_context_desc = "".join(example_blocks)

In [None]:
# Placeholder -> replacement text for the rule-generation template.
# Entries are applied in this order by fill_in_templates.
fill_in_dict = {
                "[TASK]": task_desc,
                "[EXAMPLES]": in_context_desc,
                "[FEATURES]": feature_desc,
                "[FEATURE FORMAT]": format_desc,
                "[NUMBER]": str(feature_rule_number)
            }
prompt = fill_in_templates(fill_in_dict, template_str)

In [None]:
# Inspect the fully substituted rule-generation prompt.
print(prompt)

## Prompt for LLM

In [None]:
# Set API Key for Inference
# The API key is read from the Colab secret named "TOGETHER_API" rather than being hard-coded.
client = Together(api_key=userdata.get("TOGETHER_API"))

In [None]:
# Ask the LLM for class-wise rules, retrying on API errors.
max_try_num = 5
curr_try_num = 0
result = -1  # sentinel: stays -1 when every attempt fails (also clears a stale result on re-run)
while curr_try_num < max_try_num:
    try:
        response = client.chat.completions.create(
            model="meta-llama/Llama-3.3-70B-Instruct-Turbo-Free", # Free Model
            messages=[{"role":"user", "content":_____}], # Fill in the blank!
            max_tokens=1024)
        result = response.choices[0].message.content
        break
    except Exception as e:
        print(e)
        curr_try_num += 1
        # Back off only when another attempt will follow; no point waiting after the last one.
        if curr_try_num < max_try_num:
            time.sleep(10)

In [None]:
# Inspect the raw LLM answer (prints -1 if every request attempt failed).
print(result)

## Featurization

In [None]:
# Parse Rules
# The response is expected to contain one "... conditions for class <label>:" header per
# class, each followed by bullet lines. The leading "C/c" is left out of the splitter so
# either capitalisation matches.
splitter = "onditions for class"

if not isinstance(result, str):
    raise ValueError("No LLM response to parse: the rule-generation request did not return text.")

# Always start from an empty dict so a failed parse can never reuse rules from an earlier run.
rule_dict = {}
splitted = result.split(splitter)
if len(label_list) != 0 and len(splitted) == len(label_list) + 1:
    for rule_raw in splitted[1:]:
        # Text before the first ":" is the class name (minus quotes/brackets/periods).
        class_name = rule_raw.split(":")[0].strip(" .'").strip(' []"')
        rule_parsed = []
        for txt in rule_raw.strip().split("\n")[1:]:
            if len(txt) < 2:
                break  # a (near-)empty line ends this class's bullet list
            # Drop the first token of the line (the bullet marker).
            rule_parsed.append(" ".join(txt.strip().split(" ")[1:]))
        rule_dict[class_name] = rule_parsed

if not rule_dict:
    raise ValueError(
        "Could not find exactly one rule section per class in the LLM response; "
        "re-run the rule-generation request."
    )

In [None]:
# Parsed rules: class name -> list of condition strings.
rule_dict

In [None]:
# Base Template for LLM Code Generation
# [NAME], [FEATURES] and [CONDITIONS] are substituted per class before sending.
# The model is asked to wrap the function in <start>/<end> tags so it can be extracted afterwards.
prompt_code = '''
Provide me a python code for function, given description below.

Function name: [NAME]

Input: Dataframe df_input

Input Features:
[FEATURES]

Output: Dataframe df_output. Create a new dataframe df_output. Each column in df_output refers whether the selected column in df_input follows the condition (1) or not (0). Be sure that the function code well matches with its feature type (i.e., numerical, categorical).

Conditions:
[CONDITIONS]


Wrap only the function part with <start> and <end>, and do not add any comments, descriptions, and package importing lines in the code.
'''

In [None]:
# Build one code-generation prompt per class from its parsed rules.
template_list = []
for class_id, each_rule in rule_dict.items():
    fill_in_dict = {
        # The class id is embedded in the function name so functions can be matched to labels later.
        "[NAME]": f'extracting_features_{class_id}',
        "[CONDITIONS]": '\n'.join(f'- {k}' for k in each_rule),
        "[FEATURES]": feature_desc
    }
    template_list.append(fill_in_templates(fill_in_dict, prompt_code))

In [None]:
print(template_list[0]) # Prompt for the first class in rule_dict ("no" in this tutorial's data)

In [None]:
print(template_list[1]) # Prompt for the second class in rule_dict ("yes" in this tutorial's data)

In [None]:
# Ask the LLM to turn each class's rules into a Python feature-extraction function.
# Dedicated loop names (`code_prompt`, `code_response`) leave the notebook-level `prompt`
# and `result` from the rule-generation step untouched, so earlier cells stay re-runnable.
fct_results = []
for code_prompt in tqdm(template_list):
    curr_try_num = 0
    # One retry budget (`max_try_num`) for both the loop bound and the give-up check, so
    # exactly one entry is appended per prompt.
    while curr_try_num < max_try_num:
        try:
            response = client.chat.completions.create(
                model="meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
                messages=[{"role": "user", "content": code_prompt}],
                max_tokens=1024)
            code_response = response.choices[0].message.content
            fct_results.append(code_response)
            break
        except Exception as e:
            print(e)
            curr_try_num += 1
            if curr_try_num >= max_try_num:
                fct_results.append(-1)  # sentinel: every attempt for this prompt failed
            else:
                time.sleep(10)  # back off before the next attempt

# Extract function code from LLM response
fct_strs = []
for fct_txt in fct_results:
    if not isinstance(fct_txt, str) or '<start>' not in fct_txt:
        raise ValueError(
            "A code-generation request failed or its response has no <start> tag; "
            "re-run this cell."
        )
    fct_strs.append(fct_txt.split('<start>')[1].split('<end>')[0].strip())

In [None]:
# Parse function names
fct_names = []
fct_strs_final = []

# Guard against an empty list; the first snippet decides whether the responses look like code.
if fct_strs and 'def' in fct_strs[0]:
    for fct_str in fct_strs:
        # Split only on the first "def" so a function name that itself contains "def"
        # (e.g. "extracting_features_default") is kept whole.
        fct_names.append(fct_str.split('def', 1)[1].split('(')[0].strip())
    fct_strs_final = fct_strs


In [None]:
# Names of the generated functions, in the same order as fct_strs_final.
fct_names

In [None]:
# Review the first generated function before it is executed.
print(fct_strs_final[0])

In [None]:
# Review the second generated function before it is executed.
print(fct_strs_final[1])

In [None]:
def convert_to_binary_vectors(fct_strs, fct_names, label_list, X_train, X_test):
    """Run the generated rule functions and return per-class binary feature tensors.

    Args:
        fct_strs: source code strings, one generated function each.
        fct_names: function names, aligned with ``fct_strs``.
        label_list: class labels; each must occur (spaces replaced by "_",
            case-insensitive) inside exactly the name of its function.
        X_train: DataFrame passed to every generated function.
        X_test: DataFrame passed to every generated function.

    Returns:
        ``(X_train_dict, X_test_dict)``: dicts mapping each label (in ``label_list``
        order) to a float tensor with one column per rule.

    Raises:
        ValueError: if some label cannot be matched to a function name.
    """
    X_train_dict, X_test_dict = {}, {}

    # Match function names to class labels
    fct_idx_dict = {}
    for idx, name in enumerate(fct_names):
        for label in label_list:
            # str() so non-string labels (e.g. 0 / 1) can be matched as well.
            label_name = '_'.join(str(label).split(' '))
            if label_name.lower() in name.lower():
                fct_idx_dict[label] = idx

    # Check if all class labels are matched
    if len(fct_idx_dict) != len(label_list):
        raise ValueError("Mismatch between rules and label classes")

    for label in label_list:
        fct_idx = fct_idx_dict[label]
        code = fct_strs[fct_idx].strip('` "')
        if code.startswith("python\n"):
            # Leftover language tag from a ```python fence.
            code = code[len("python\n"):]

        # SECURITY: this executes LLM-generated code; review the printed functions first.
        # The code runs in an explicit namespace (a copy of the module globals, so names
        # such as `pd`/`np` resolve) and the function is read back from that namespace.
        # Reading it from `locals()` after a bare `exec` is not reliable: from Python 3.13
        # on, names bound by `exec` inside a function are not visible via `locals()`.
        namespace = dict(globals())
        exec(code, namespace)
        func = namespace[fct_names[fct_idx]]

        X_train_each = func(X_train).astype('int').to_numpy()
        X_test_each = func(X_test).astype('int').to_numpy()
        # Train and test must expose the same number of rule columns.
        assert X_train_each.shape[1] == X_test_each.shape[1]
        X_train_dict[label] = torch.tensor(X_train_each).float()
        X_test_dict[label] = torch.tensor(X_test_each).float()

    return X_train_dict, X_test_dict

In [None]:
# Apply the generated rule functions to the few-shot and test features.
X_train_dict, X_test_dict = convert_to_binary_vectors(fct_strs_final, fct_names, label_list, X_train, X_test)

In [None]:
# Per-class binary rule features for the few-shot training rows.
X_train_dict

In [None]:
# Per-class binary rule features for the test rows.
X_test_dict

## Train Model


In [None]:
class simple_model(nn.Module):
    """Linear scorer with one non-negative weight vector per class.

    Each class has its own feature matrix; the class score of a sample is the dot
    product of its features with that class's weights clamped at zero.
    """

    def __init__(self, X):
        """Create one ``(n_features, 1)`` weight per entry of ``X``, initialised to ``1 / n_features``.

        Args:
            X: sequence of 2-D tensors, one per class; only ``shape[1]`` is read.
        """
        super().__init__()
        per_class_weights = []
        for features in X:
            n_rules = features.shape[1]
            per_class_weights.append(nn.Parameter(torch.ones(n_rules, 1) / n_rules))
        self.weights = nn.ParameterList(per_class_weights)

    def forward(self, x):
        """Return the class scores, one column per entry of ``x``.

        Args:
            x: sequence of 2-D tensors aligned with the weights created in ``__init__``.
        """
        # Negative weights are clamped to zero at use time, not in the stored parameter.
        scores = [features @ self.weights[pos].clamp(min=0) for pos, features in enumerate(x)]
        return torch.cat(scores, dim=-1)

In [None]:
def train(X_train_now, label_list, shot, y_train_num=None, random_state=None):
    """Fit a ``simple_model`` on the per-class rule features.

    With one example per class the model is trained on all data until it fits
    perfectly (at most 200 steps). Otherwise one model is trained per stratified
    fold, the best-on-validation snapshot of each fold is kept, and the returned
    model carries the average of those snapshots' weights.

    NOTE: defining this function rebinds the notebook-level name ``train``, which
    earlier cells use for the few-shot DataFrame; re-run those cells in order.

    Args:
        X_train_now: list of 2-D float tensors, one per class, with aligned rows.
        label_list: class labels; only its length is used.
        shot: total number of training examples.
        y_train_num: integer class ids aligned with the rows of ``X_train_now``.
            When omitted, the module-level ``y_train_num`` is used, as before.
        random_state: optional seed for the fold shuffle; ``None`` keeps it unseeded.

    Returns:
        The trained ``simple_model``.

    Raises:
        NameError: if ``y_train_num`` is neither passed nor defined at module level.
    """
    if y_train_num is None:
        # Backward-compatible fallback for callers that rely on the notebook global.
        if "y_train_num" not in globals():
            raise NameError("name 'y_train_num' is not defined; pass y_train_num=... to train()")
        y_train_num = globals()["y_train_num"]
    y_train_num = np.asarray(y_train_num)

    criterion = nn.CrossEntropyLoss()
    shots_per_class = shot // len(label_list)

    if shots_per_class == 1:
        # Too few examples to hold any out: train on everything, stop once all are fit.
        model = simple_model(X_train_now)
        opt = Adam(model.parameters(), lr=1e-2)
        targets = torch.tensor(y_train_num)
        for _ in range(200):
            opt.zero_grad()
            outputs = model(X_train_now)
            preds = outputs.argmax(dim=1).numpy()
            acc = (y_train_num == preds).sum() / len(preds)
            if acc == 1:
                break
            loss = criterion(outputs, targets)
            loss.backward()
            opt.step()
        return model

    n_splits = 2 if shots_per_class <= 2 else 4
    kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    model_list = []
    for fold, (train_ids, valid_ids) in enumerate(kfold.split(X_train_now[0], y_train_num)):
        model = simple_model(X_train_now)
        opt = Adam(model.parameters(), lr=1e-2)
        X_train_now_fold = [x_train_now[train_ids] for x_train_now in X_train_now]
        X_valid_now_fold = [x_train_now[valid_ids] for x_train_now in X_train_now]
        y_train_fold = torch.tensor(y_train_num[train_ids])
        y_valid_fold = y_train_num[valid_ids]

        max_acc = -1
        for _ in range(200):
            opt.zero_grad()
            outputs = model(X_train_now_fold)
            loss = criterion(outputs, y_train_fold)
            loss.backward()
            opt.step()

            # Validation needs no gradients.
            with torch.no_grad():
                preds = model(X_valid_now_fold).argmax(dim=1).numpy()
            acc = (y_valid_fold == preds).sum() / len(preds)
            if max_acc < acc:
                # Keep a snapshot of the best model seen so far on this fold.
                max_acc = acc
                final_model = copy.deepcopy(model)
                if max_acc >= 1:
                    break
        model_list.append(final_model)

    # Average the per-fold snapshots parameter by parameter.
    sdict = model_list[0].state_dict()
    for key in sdict:
        sdict[key] = torch.stack([fold_model.state_dict()[key] for fold_model in model_list], dim=0).mean(dim=0)

    model = simple_model(X_train_now)
    model.load_state_dict(sdict)
    return model

## Evaluate

In [None]:
def evaluate(pred_probs, answers, multiclass=False):
    """Compute ROC AUC from predicted class probabilities.

    Args:
        pred_probs: 2-D array of per-class scores, one row per sample.
        answers: true class ids.
        multiclass: when False, column 1 of ``pred_probs`` is scored as the positive
            class; otherwise a macro-averaged one-vs-rest AUC over all columns is used.

    Returns:
        The AUC value returned by ``roc_auc_score``.
    """
    binary_task = (multiclass == False)
    if binary_task:
        return roc_auc_score(answers, pred_probs[:, 1])
    return roc_auc_score(answers, pred_probs, multi_class='ovr', average='macro')

In [None]:
# Per-class feature tensors, in the insertion order of the dictionaries.
X_train_now = [*X_train_dict.values()]
X_test_now = [*X_test_dict.values()]

# Convert labels to numeric: a label's class id is its position in label_list.
y_train_num = np.array([label_list.index(lbl) for lbl in y_train])
y_test_num = np.array([label_list.index(lbl) for lbl in y_test])
multiclass = len(label_list) > 2

# Train
trained_model = train(X_train_now, label_list, shot)

# Predict: raw class scores first, then softmax across the class dimension.
test_logits = trained_model(X_test_now).detach().cpu()
test_outputs = F.softmax(test_logits, dim=1).detach()

# Evaluate
result_auc = evaluate(test_outputs.numpy(), y_test_num, multiclass=multiclass)
print("AUC:", result_auc)

## Baseline ML Models

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

def train_and_evaluate_models(X_train, y_train, X_test, y_test, random_state=None):
    """Fit three baseline classifiers and report their test ROC AUC.

    Args:
        X_train: training features (DataFrame).
        y_train: binary training labels (0 / 1).
        X_test: test features (DataFrame).
        y_test: binary test labels (0 / 1); used for scoring only.
        random_state: optional seed passed to every model; ``None`` keeps the
            estimators' default behaviour.

    Returns:
        A list of ``(model_name, auc)`` tuples in the order
        LogReg, RandomForest, XGBoost.
    """
    # Work on copies with string column names so the caller's frames are never modified
    # and all three models see identical inputs.
    X_train = X_train.copy()
    X_test = X_test.copy()
    X_train.columns = X_train.columns.astype(str)
    X_test.columns = X_test.columns.astype(str)

    # `use_label_encoder` is deprecated in XGBoost and therefore omitted. The test split
    # is no longer passed as `eval_set`: without early stopping it does not affect the
    # fitted model, and keeping test labels out of `fit` avoids any appearance of leakage.
    models = [
        ("LogReg", LogisticRegression(solver='liblinear', random_state=random_state)),
        ("RandomForest", RandomForestClassifier(class_weight='balanced', random_state=random_state)),
        ("XGBoost", XGBClassifier(eval_metric='logloss', random_state=random_state)),
    ]

    results = []
    for model_name, model in models:
        model.fit(X_train, y_train)
        y_prob = model.predict_proba(X_test)[:, 1]  # probability of the positive class
        results.append((model_name, roc_auc_score(y_test, y_prob)))

    return results

In [None]:
# Binarise the labels for the baselines (label_list[1] is the positive class).
# New names are used so `y_train` / `y_test` keep their original class labels:
# re-running this cell, or the label-to-index conversion above, then stays correct.
y_train_bin = np.where(y_train == label_list[1], 1, 0)
y_test_bin = np.where(y_test == label_list[1], 1, 0)
experiment_results = train_and_evaluate_models(X_train, y_train_bin, X_test, y_test_bin)

In [None]:
# Add this notebook's rule-based model to the comparison.
# Any earlier "Ours" entry is replaced, so re-running the cell does not add duplicate rows.
experiment_results = [row for row in experiment_results if row[0] != "Ours"] + [("Ours", result_auc)]

In [None]:
# Summary table: one row per model with its test AUC.
result_df = pd.DataFrame(experiment_results, columns=["Model", "AUC"])
result_df