# Prepare Environment

In [1]:
#!pip install transformers[torch] datasets pydantic==1.10 langchain[llms] openai tiktoken hdbscan wandb

In [2]:
import os
import wandb

os.environ["WANDB_DISABLE_SYMLINK"] = "True"

# Set API keys as environment variables
os.environ["OPENAI_API_KEY"] = "..."
os.environ["OPENAI_ORG"] = "..."
os.environ["WANDB_KEY"] = "05cd0a25e7a53f3adb89d0e4dfdfa499309b1dc9"

# Login to Weights & Biases for experiment tracking
# wandb.login(key=os.environ["WANDB_KEY"])

In [3]:
seed = 42

# Prepare Datasets

In [4]:
# Import HuggingFace datasets package
import datasets
import pandas as pd
from sklearn.model_selection import train_test_split

# Define container class for datasets
class Dataset:

    def __init__(self, id, short_name, X_train, X_test, y_train, y_test, pos_class, context_train, context_test, train_zero_shot=False):
        self.id = id
        self.short_name = short_name
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.pos_class = pos_class # Class that counts as positive (for measuring precision, recall, and F1 score)
        self.context_train = context_train
        self.context_test = context_test
        self.train_zero_shot = train_zero_shot


    def __str__(self):
        return "Dataset(\n" + f"\tid='{self.id}'\n" + f"\tshort_name='{self.short_name}'\n" + f"\tcontext_train='{self.context_train}'\n" + f"\tcontext_test='{self.context_test}'\n" + f"\tclasses={list(pd.concat([self.y_train, self.y_test]).unique())}\n" + f"\tpos_class='{self.pos_class}'\n" + f"\ttrain_zero_shot={self.train_zero_shot}\n" + f"\tX_train.shape={self.X_train.shape}\n" + f"\tX_test.shape={self.X_test.shape}\n" + f"\ty_train.shape={self.y_train.shape}\n" + f"\ty_test.shape={self.y_test.shape}\n" + ")"


    def __repr__(self):
        return self.__str__()

In [5]:
# Initialize list of datasets
dataset_list = {}

# Set overall sample size
n_train = 100
n_test = 100

## Fake Scientific Papers

In [6]:
# Load the original training dataset
train = datasets.load_dataset("tum-nlp/IDMGSP", "train+gpt3", split="train", trust_remote_code=True)

# Randomly shuffle the dataset
train_shuffled = train.shuffle(seed=seed)

# Separate the different sources/generators
train_real = train_shuffled.filter(lambda x: x["src"] == "real")
train_scigen = train_shuffled.filter(lambda x: x["src"] == "scigen")
train_galactica = train_shuffled.filter(lambda x: x["src"] == "galactica")
train_gpt2 = train_shuffled.filter(lambda x: x["src"] == "gpt2")
train_gpt3 = train_shuffled.filter(lambda x: x["src"] == "gpt3")
train_chatgpt = train_shuffled.filter(lambda x: x["src"] == "chatgpt")

# Create one training dataset with alternating real and fake examples and equal number of examples from each fake source
train_fake = datasets.interleave_datasets([train_scigen, train_galactica, train_gpt2, train_gpt3, train_chatgpt])
data_train = datasets.interleave_datasets([train_real, train_fake])



# Load the original test datasets
test = datasets.load_dataset("tum-nlp/IDMGSP", "classifier_input", split="test")
ood_gpt3 = datasets.load_dataset("tum-nlp/IDMGSP", "ood_gpt3", split="test")

# Randomly shuffle the datasets
test_shuffled = test.shuffle(seed=seed)
test_gpt3 = ood_gpt3.shuffle(seed=seed)

# Separate the different sources/generators
test_real = test_shuffled.filter(lambda x: x["src"] == "real")
test_scigen = test_shuffled.filter(lambda x: x["src"] == "scigen")
test_galactica = test_shuffled.filter(lambda x: x["src"] == "galactica")
test_gpt2 = test_shuffled.filter(lambda x: x["src"] == "gpt2")
test_chatgpt = test_shuffled.filter(lambda x: x["src"] == "chatgpt")

# Create one test dataset with alternating real and fake examples and equal number of examples from each fake source
test_fake = datasets.interleave_datasets([test_scigen, test_galactica, test_gpt2, test_gpt3, test_chatgpt])
data_test = datasets.interleave_datasets([test_real, test_fake])



# Package dataset in a Dataset container
d = Dataset(
    id="tum-nlp/IDMGSP",
    short_name="papers",
    context_train="Examples are coming from a dataset with scientific papers that are either 'HUMAN-WRITTEN' (i.e., written by a real person) or 'MACHINE-GENERATED' (i.e., generated by a machine learning model). The downstream machine learning task is learning a classifier (e.g., Random Forest) that learns to distinguish 'HUMAN-WRITTEN' and 'MACHINE-GENERATED' scientific papers.",
    context_test="Examples are coming from a dataset with scientific papers that are either 'HUMAN-WRITTEN' (i.e., written by a real person) or 'MACHINE-GENERATED' (i.e., generated by a machine learning model). The downstream machine learning task is learning a classifier (e.g., Random Forest) that learns to distinguish 'HUMAN-WRITTEN' and 'MACHINE-GENERATED' scientific papers.",
    pos_class="MACHINE-GENERATED",
    X_train=data_train.to_pandas().head(n_train)[["title", "abstract", "introduction", "conclusion"]].rename(columns={"title": "Title", "abstract": "Abstract", "introduction": "Introduction", "conclusion": "Conclusion"}),
    X_test=data_test.to_pandas().head(n_test)[["title", "abstract", "introduction", "conclusion"]].rename(columns={"title": "Title", "abstract": "Abstract", "introduction": "Introduction", "conclusion": "Conclusion"}),
    y_train=data_train.to_pandas().head(n_train)["label"].map({0: "HUMAN-WRITTEN", 1: "MACHINE-GENERATED"}),
    y_test=data_test.to_pandas().head(n_test)["label"].map({0: "HUMAN-WRITTEN", 1: "MACHINE-GENERATED"})
)

# Add to dataset list
dataset_list[d.short_name] = d
d

Dataset(
	id='tum-nlp/IDMGSP'
	short_name='papers'
	context_train='Examples are coming from a dataset with scientific papers that are either 'HUMAN-WRITTEN' (i.e., written by a real person) or 'MACHINE-GENERATED' (i.e., generated by a machine learning model). The downstream machine learning task is learning a classifier (e.g., Random Forest) that learns to distinguish 'HUMAN-WRITTEN' and 'MACHINE-GENERATED' scientific papers.'
	context_test='Examples are coming from a dataset with scientific papers that are either 'HUMAN-WRITTEN' (i.e., written by a real person) or 'MACHINE-GENERATED' (i.e., generated by a machine learning model). The downstream machine learning task is learning a classifier (e.g., Random Forest) that learns to distinguish 'HUMAN-WRITTEN' and 'MACHINE-GENERATED' scientific papers.'
	classes=['HUMAN-WRITTEN', 'MACHINE-GENERATED']
	pos_class='MACHINE-GENERATED'
	train_zero_shot=False
	X_train.shape=(100, 4)
	X_test.shape=(100, 4)
	y_train.shape=(100,)
	y_test.shape=(100,

## Fake News

In [7]:
# Load the training dataset
data_train = datasets.load_dataset("GonzaloA/fake_news", split="train")

# Randomly shuffle the datasets
train_shuffled = data_train.shuffle(seed=seed)

# Separate the different classes (fake news and real news)
train_fake = train_shuffled.filter(lambda x: x["label"] == 0)
train_real = train_shuffled.filter(lambda x: x["label"] == 1)

# Create one dataset with alternating fake and real news examples
data_train = datasets.interleave_datasets([train_fake, train_real], stopping_strategy="first_exhausted")

# Convert to a Pandas DataFrame, split data and label columns, and limit to the training set size
df_train = data_train.to_pandas()
df_train = df_train.head(n_train)
df_train = df_train.rename(columns={"title": "Title", "text": "Text"})
df_train["label"] = df_train.apply(lambda row: "FAKE NEWS" if row["label"] == 0 else "REAL NEWS", axis=1)
X_train = df_train[["Title", "Text"]]
y_train = df_train["label"]



# Load the test dataset
data_test = datasets.load_dataset("GonzaloA/fake_news", split="test")

# Randomly shuffle the datasets
test_shuffled = data_test.shuffle(seed=seed)

# Separate the different classes (fake news and real news)
test_fake = test_shuffled.filter(lambda x: x["label"] == 0)
test_real = test_shuffled.filter(lambda x: x["label"] == 1)

# Create one dataset with alternating fake and real news examples
data_test = datasets.interleave_datasets([test_fake, test_real], stopping_strategy="first_exhausted")

# Convert to a Pandas DataFrame, split data and label columns, and limit to the test set size
df_test = data_test.to_pandas()
df_test = df_test.head(n_test)
df_test = df_test.rename(columns={"title": "Title", "text": "Text"})
df_test["label"] = df_test.apply(lambda row: "FAKE NEWS" if row["label"] == 0 else "REAL NEWS", axis=1)
X_test = df_test[["Title", "Text"]]
y_test = df_test["label"]



# Package dataset in a Dataset container
d = Dataset(
    id="GonzaloA/fake_news",
    short_name="fake-news-2",
    context_train="Examples are coming from a dataset with news articles that are either 'FAKE NEWS' or 'REAL NEWS'. The downstream machine learning task is learning a classifier (e.g., Random Forest) that learns to distinguish 'FAKE NEWS' and 'REAL NEWS' articles.",
    context_test="Examples are coming from a dataset with news articles that are either 'FAKE NEWS' or 'REAL NEWS'. The downstream machine learning task is learning a classifier (e.g., Random Forest) that learns to distinguish 'FAKE NEWS' and 'REAL NEWS' articles.",
    pos_class="FAKE NEWS",
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test
)

# Add to dataset list
dataset_list[d.short_name] = d
d

Repo card metadata block was not found. Setting CardData to empty.
Repo card metadata block was not found. Setting CardData to empty.


Dataset(
	id='GonzaloA/fake_news'
	short_name='fake-news-2'
	context_train='Examples are coming from a dataset with news articles that are either 'FAKE NEWS' or 'REAL NEWS'. The downstream machine learning task is learning a classifier (e.g., Random Forest) that learns to distinguish 'FAKE NEWS' and 'REAL NEWS' articles.'
	context_test='Examples are coming from a dataset with news articles that are either 'FAKE NEWS' or 'REAL NEWS'. The downstream machine learning task is learning a classifier (e.g., Random Forest) that learns to distinguish 'FAKE NEWS' and 'REAL NEWS' articles.'
	classes=['FAKE NEWS', 'REAL NEWS']
	pos_class='FAKE NEWS'
	train_zero_shot=False
	X_train.shape=(100, 2)
	X_test.shape=(100, 2)
	y_train.shape=(100,)
	y_test.shape=(100,)
)

## Hatespeech

In [8]:
# Load the dataset
data = datasets.load_dataset("hate_speech18", split="train", trust_remote_code=True)

# Remove samples that rely on the context of previous messages
data = data.filter(lambda x: x["num_contexts"] == 0)

# Randomly shuffle the dataset
data_shuffled = data.shuffle(seed=seed)

# Separate the different classes (hate speech and no hate speech)
data_hate = data_shuffled.filter(lambda x: x["label"] == 1)
data_no_hate = data_shuffled.filter(lambda x: x["label"] == 0)

# Create one dataset with alternating hate and no hate examples
data = datasets.interleave_datasets([data_no_hate, data_hate], stopping_strategy="first_exhausted")

# Convert to a pandas dataframe
df = data.to_pandas()

# Drop unnecessary columns
df = df[["text", "label"]]

# Map the class numbers to natural-language class labels
df["label"] = df.apply(lambda row: "HATE SPEECH" if row["label"] == 1 else "NO HATE SPEECH", axis=1)

# Split data and label
X = df["text"]
y = df["label"]

# Split train and test sets
X_train = X.iloc[0:n_train]
y_train = y.iloc[0:n_train]
X_test = X.iloc[n_train:(n_train+n_test)]
y_test = y.iloc[n_train:(n_train+n_test)]

# Package dataset in a Dataset container
d = Dataset(
    id="hate_speech18",
    short_name="hate-speech",
    context_train="Examples are posts sampled from a white supremacist forum and are either 'HATE SPEECH' (when the posts contain hate speech) or 'NO HATE SPEECH' (when the posts do not contain hate speech). The downstream machine learning task is learning a classifier (e.g., Random Forest) that learns to distinguish 'HATE SPEECH' and 'NO HATE SPEECH' forum posts.",
    context_test="Examples are posts sampled from a white supremacist forum and are either 'HATE SPEECH' (when the posts contain hate speech) or 'NO HATE SPEECH' (when the posts do not contain hate speech). The downstream machine learning task is learning a classifier (e.g., Random Forest) that learns to distinguish 'HATE SPEECH' and 'NO HATE SPEECH' forum posts.",
    pos_class="HATE SPEECH",
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test
)

# Add to dataset list
dataset_list[d.short_name] = d
d

Dataset(
	id='hate_speech18'
	short_name='hate-speech'
	context_train='Examples are posts sampled from a white supremacist forum and are either 'HATE SPEECH' (when the posts contain hate speech) or 'NO HATE SPEECH' (when the posts do not contain hate speech). The downstream machine learning task is learning a classifier (e.g., Random Forest) that learns to distinguish 'HATE SPEECH' and 'NO HATE SPEECH' forum posts.'
	context_test='Examples are posts sampled from a white supremacist forum and are either 'HATE SPEECH' (when the posts contain hate speech) or 'NO HATE SPEECH' (when the posts do not contain hate speech). The downstream machine learning task is learning a classifier (e.g., Random Forest) that learns to distinguish 'HATE SPEECH' and 'NO HATE SPEECH' forum posts.'
	classes=['NO HATE SPEECH', 'HATE SPEECH']
	pos_class='HATE SPEECH'
	train_zero_shot=False
	X_train.shape=(100,)
	X_test.shape=(100,)
	y_train.shape=(100,)
	y_test.shape=(100,)
)

## Reviews Amazon

In [9]:
# Download the training dataset
print("Downloading training dataset ...")
data_train = datasets.load_dataset("amazon_polarity", split="train", trust_remote_code=True)

# Randomly shuffle the dataset
print("Preparing training dataset ...")
train_shuffled = data_train.shuffle(seed=seed)

# Separate positive and negative reviews
train_positive = train_shuffled.select(range(n_train*100)).filter(lambda x: x["label"] == 1)
train_negative = train_shuffled.select(range(n_train*100)).filter(lambda x: x["label"] == 0)

# Create one training dataset with alternating examples from both classes
df_train = datasets.interleave_datasets([train_positive, train_negative], stopping_strategy="first_exhausted").select(range(n_train)).to_pandas()

# Format the dataset
X_train = df_train.rename(columns={"title": "Title", "content": "Content"})[["Title", "Content"]]
y_train = df_train["label"].map({0: "NEGATIVE", 1: "POSITIVE"})



# Download the test dataset
print("Downloading test dataset ...")
data_test = datasets.load_dataset("amazon_polarity", split="test")

# Randomly shuffle the dataset
print("Preparing test dataset ...")
test_shuffled = data_test.shuffle(seed=seed)

# Separate positive and negative reviews
test_positive = test_shuffled.select(range(n_test*100)).filter(lambda x: x["label"] == 1)
test_negative = test_shuffled.select(range(n_test*100)).filter(lambda x: x["label"] == 0)

# Create one test dataset with alternating examples from both classes
df_test = datasets.interleave_datasets([test_positive, test_negative], stopping_strategy="first_exhausted").select(range(n_test)).to_pandas()

# Format the dataset
X_test = df_test.rename(columns={"title": "Title", "content": "Content"})[["Title", "Content"]]
y_test = df_test["label"].map({0: "NEGATIVE", 1: "POSITIVE"})



# Package dataset in a Dataset container
print("Packaging the dataset ...")
d = Dataset(
    id="amazon_polarity",
    short_name="reviews-amazon",
    context_train="Examples are product reviews from Amazon that are either 'NEGATIVE' (rating with 1 or 2 stars) or 'POSITIVE' (rating with 4 or 5 stars). The downstream machine learning task is learning a classifier (e.g., Random Forest) that learns to distinguish 'NEGATIVE' and 'POSITIVE' reviews.",
    context_test="Examples are product reviews from Amazon that are either 'NEGATIVE' (rating with 1 or 2 stars) or 'POSITIVE' (rating with 4 or 5 stars). The downstream machine learning task is learning a classifier (e.g., Random Forest) that learns to distinguish 'NEGATIVE' and 'POSITIVE' reviews.",
    pos_class="POSITIVE",
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test
)

# Add to dataset list
dataset_list[d.short_name] = d
d

Downloading training dataset ...
Preparing training dataset ...
Downloading test dataset ...
Preparing test dataset ...
Packaging the dataset ...


Dataset(
	id='amazon_polarity'
	short_name='reviews-amazon'
	context_train='Examples are product reviews from Amazon that are either 'NEGATIVE' (rating with 1 or 2 stars) or 'POSITIVE' (rating with 4 or 5 stars). The downstream machine learning task is learning a classifier (e.g., Random Forest) that learns to distinguish 'NEGATIVE' and 'POSITIVE' reviews.'
	context_test='Examples are product reviews from Amazon that are either 'NEGATIVE' (rating with 1 or 2 stars) or 'POSITIVE' (rating with 4 or 5 stars). The downstream machine learning task is learning a classifier (e.g., Random Forest) that learns to distinguish 'NEGATIVE' and 'POSITIVE' reviews.'
	classes=['POSITIVE', 'NEGATIVE']
	pos_class='POSITIVE'
	train_zero_shot=False
	X_train.shape=(100, 2)
	X_test.shape=(100, 2)
	y_train.shape=(100,)
	y_test.shape=(100,)
)

## Reviews Yelp

In [10]:
# Download the training dataset
print("Downloading training dataset ...")
data_train = datasets.load_dataset("yelp_polarity", split="train", trust_remote_code=True)

# Randomly shuffle the dataset
print("Preparing training dataset ...")
train_shuffled = data_train.shuffle(seed=seed)

# Separate positive and negative reviews
train_positive = train_shuffled.select(range(n_train*100)).filter(lambda x: x["label"] == 1)
train_negative = train_shuffled.select(range(n_train*100)).filter(lambda x: x["label"] == 0)

# Create one training dataset with alternating examples from both classes
df_train = datasets.interleave_datasets([train_positive, train_negative], stopping_strategy="first_exhausted").select(range(n_train)).to_pandas()

# Format the dataset
X_train = df_train.rename(columns={"text": "Text"})[["Text"]]
y_train = df_train["label"].map({0: "NEGATIVE", 1: "POSITIVE"})



# Download the test dataset
print("Downloading test dataset ...")
data_test = datasets.load_dataset("yelp_polarity", split="test")

# Randomly shuffle the dataset
print("Preparing test dataset ...")
test_shuffled = data_test.shuffle(seed=seed)

# Separate positive and negative reviews
test_positive = test_shuffled.select(range(n_test*100)).filter(lambda x: x["label"] == 1)
test_negative = test_shuffled.select(range(n_test*100)).filter(lambda x: x["label"] == 0)

# Create one test dataset with alternating examples from both classes
df_test = datasets.interleave_datasets([test_positive, test_negative], stopping_strategy="first_exhausted").select(range(n_test)).to_pandas()

# Format the dataset
X_test = df_test.rename(columns={"text": "Text"})[["Text"]]
y_test = df_test["label"].map({0: "NEGATIVE", 1: "POSITIVE"})



# Package dataset in a Dataset container
print("Packaging the dataset ...")
d = Dataset(
    id="yelp_polarity",
    short_name="reviews-yelp",
    context_train="Examples are reviews from Yelp that are either 'NEGATIVE' (rating with 1 or 2 stars) or 'POSITIVE' (rating with 4 or 5 stars). The downstream machine learning task is learning a classifier (e.g., Random Forest) that learns to distinguish 'NEGATIVE' and 'POSITIVE' reviews.",
    context_test="Examples are reviews from Yelp that are either 'NEGATIVE' (rating with 1 or 2 stars) or 'POSITIVE' (rating with 4 or 5 stars). The downstream machine learning task is learning a classifier (e.g., Random Forest) that learns to distinguish 'NEGATIVE' and 'POSITIVE' reviews.",
    pos_class="POSITIVE",
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test
)

# Add to dataset list
dataset_list[d.short_name] = d
d

Downloading training dataset ...
Preparing training dataset ...
Downloading test dataset ...
Preparing test dataset ...
Packaging the dataset ...


Dataset(
	id='yelp_polarity'
	short_name='reviews-yelp'
	context_train='Examples are reviews from Yelp that are either 'NEGATIVE' (rating with 1 or 2 stars) or 'POSITIVE' (rating with 4 or 5 stars). The downstream machine learning task is learning a classifier (e.g., Random Forest) that learns to distinguish 'NEGATIVE' and 'POSITIVE' reviews.'
	context_test='Examples are reviews from Yelp that are either 'NEGATIVE' (rating with 1 or 2 stars) or 'POSITIVE' (rating with 4 or 5 stars). The downstream machine learning task is learning a classifier (e.g., Random Forest) that learns to distinguish 'NEGATIVE' and 'POSITIVE' reviews.'
	classes=['POSITIVE', 'NEGATIVE']
	pos_class='POSITIVE'
	train_zero_shot=False
	X_train.shape=(100, 1)
	X_test.shape=(100, 1)
	y_train.shape=(100,)
	y_test.shape=(100,)
)

## Sentiment

In [11]:
# Download and combine training datasets in different languages
train_english = datasets.load_dataset("cardiffnlp/tweet_sentiment_multilingual", "english", split="train").filter(lambda x: x["label"] in [0, 2])
train_arabic = datasets.load_dataset("cardiffnlp/tweet_sentiment_multilingual", "arabic", split="train").filter(lambda x: x["label"] in [0, 2])
train_french = datasets.load_dataset("cardiffnlp/tweet_sentiment_multilingual", "french", split="train").filter(lambda x: x["label"] in [0, 2])
train_german = datasets.load_dataset("cardiffnlp/tweet_sentiment_multilingual", "german", split="train").filter(lambda x: x["label"] in [0, 2])
train_hindi = datasets.load_dataset("cardiffnlp/tweet_sentiment_multilingual", "hindi", split="train").filter(lambda x: x["label"] in [0, 2])
train_italian = datasets.load_dataset("cardiffnlp/tweet_sentiment_multilingual", "italian", split="train").filter(lambda x: x["label"] in [0, 2])
train_portuguese = datasets.load_dataset("cardiffnlp/tweet_sentiment_multilingual", "portuguese", split="train").filter(lambda x: x["label"] in [0, 2])
train_spanish = datasets.load_dataset("cardiffnlp/tweet_sentiment_multilingual", "spanish", split="train").filter(lambda x: x["label"] in [0, 2])
df_train = datasets.interleave_datasets([train_english, train_arabic, train_french, train_german, train_hindi, train_italian, train_portuguese, train_spanish], stopping_strategy="first_exhausted").select(range(n_train)).to_pandas()

# Randomly shuffle the dataset
df_train = df_train.sample(frac=1, random_state=seed).reset_index(drop=True)

# Format the dataset
X_train = df_train["text"]
y_train = df_train["label"].map({0: "NEGATIVE", 2: "POSITIVE"})



# Download and combine test datasets in different languages
test_english = datasets.load_dataset("cardiffnlp/tweet_sentiment_multilingual", "english", split="test").filter(lambda x: x["label"] in [0, 2])
test_arabic = datasets.load_dataset("cardiffnlp/tweet_sentiment_multilingual", "arabic", split="test").filter(lambda x: x["label"] in [0, 2])
test_french = datasets.load_dataset("cardiffnlp/tweet_sentiment_multilingual", "french", split="test").filter(lambda x: x["label"] in [0, 2])
test_german = datasets.load_dataset("cardiffnlp/tweet_sentiment_multilingual", "german", split="test").filter(lambda x: x["label"] in [0, 2])
test_hindi = datasets.load_dataset("cardiffnlp/tweet_sentiment_multilingual", "hindi", split="test").filter(lambda x: x["label"] in [0, 2])
test_italian = datasets.load_dataset("cardiffnlp/tweet_sentiment_multilingual", "italian", split="test").filter(lambda x: x["label"] in [0, 2])
test_portuguese = datasets.load_dataset("cardiffnlp/tweet_sentiment_multilingual", "portuguese", split="test").filter(lambda x: x["label"] in [0, 2])
test_spanish = datasets.load_dataset("cardiffnlp/tweet_sentiment_multilingual", "spanish", split="test").filter(lambda x: x["label"] in [0, 2])
df_test = datasets.interleave_datasets([test_english, test_arabic, test_french, test_german, test_hindi, test_italian, test_portuguese, test_spanish], stopping_strategy="first_exhausted").select(range(n_test)).to_pandas()

# Randomly shuffle the dataset
df_test = df_test.sample(frac=1, random_state=seed).reset_index(drop=True)

# Format the dataset
X_test = df_test["text"]
y_test = df_test["label"].map({0: "NEGATIVE", 2: "POSITIVE"})



# Package dataset in a Dataset container
d = Dataset(
    id="cardiffnlp/tweet_sentiment_multilingual",
    short_name="sentiment",
    context_train="Examples are tweets in multiple languages which express either a 'NEGATIVE' or 'POSITIVE' sentiment. The downstream machine learning task is learning a classifier (e.g., Random Forest) that learns to distinguish 'NEGATIVE' and 'POSITIVE' tweets, independent of the language they are written in.",
    context_test="Examples are tweets in multiple languages which express either a 'NEGATIVE' or 'POSITIVE' sentiment. The downstream machine learning task is learning a classifier (e.g., Random Forest) that learns to distinguish 'NEGATIVE' and 'POSITIVE' tweets, independent of the language they are written in.",
    pos_class="POSITIVE",
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test
)

# Add to dataset list
dataset_list[d.short_name] = d
d

Dataset(
	id='cardiffnlp/tweet_sentiment_multilingual'
	short_name='sentiment'
	context_train='Examples are tweets in multiple languages which express either a 'NEGATIVE' or 'POSITIVE' sentiment. The downstream machine learning task is learning a classifier (e.g., Random Forest) that learns to distinguish 'NEGATIVE' and 'POSITIVE' tweets, independent of the language they are written in.'
	context_test='Examples are tweets in multiple languages which express either a 'NEGATIVE' or 'POSITIVE' sentiment. The downstream machine learning task is learning a classifier (e.g., Random Forest) that learns to distinguish 'NEGATIVE' and 'POSITIVE' tweets, independent of the language they are written in.'
	classes=['NEGATIVE', 'POSITIVE']
	pos_class='POSITIVE'
	train_zero_shot=False
	X_train.shape=(100,)
	X_test.shape=(100,)
	y_train.shape=(100,)
	y_test.shape=(100,)
)

# Define Models

## FELIX

In [12]:
from sklearn.base import BaseEstimator, TransformerMixin

from ollama import chat
from ollama import ChatResponse

from pydantic import BaseModel, Field
from typing import List

from hdbscan import HDBSCAN

import random
import json
import re
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist

from tqdm.auto import tqdm


# class PromptingTQDM(tqdm):
#     def __init__(self, *args, **kwargs):
#         super().__init__(bar_format="{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_inv_fmt}{postfix}]", *args, **kwargs)
#         self.reset_stats()

#     def reset_stats(self):
#         self.tokens_in = 0
#         self.tokens_out = 0
#         self.costs = 0
#         self.n_features = 0
#         self.n_iter = 0

#     def log_stats(self, tokens_in, tokens_out, cost, n_features):
#         self.tokens_in += tokens_in
#         self.tokens_out += tokens_out
#         self.costs += cost
#         self.n_features += n_features
#         self.n_iter += 1

#         avg_tokens_in = self.tokens_in / self.n_iter
#         avg_tokens_out = self.tokens_out / self.n_iter
#         avg_cost = self.costs / self.n_iter

#         self.set_postfix_str(f"in {avg_tokens_in:.0f} tokens/{self.unit}, out {avg_tokens_out:.0f} tokens/{self.unit}, {avg_cost:.5f} USD/{self.unit}, {self.costs:.2f} USD total, {self.n_features:.0f} features total", refresh=True)


class NumericalFeature(BaseModel):
    name: str = Field(description="concise name of the feature")
    zero: str = Field(description="meaning of feature value of 0")
    ten: str = Field(description="meaning of feature value of 10")
    description: str = Field(description="short description of the meaning of this feature")


class NumericalFeatureSet(BaseModel):
    features: List[NumericalFeature] = Field(description="list of numeric features")


class CategoricalFeature(BaseModel):
    name: str = Field(description="concise name of the feature")
    possible_values: List[str] = Field(description="list of 2-5 different possible values allowed for this feature; the feature can take exactly one of these values at once")
    description: str = Field(description="short description of the meaning of this feature")


class CategoricalFeatureSet(BaseModel):
    features: List[CategoricalFeature] = Field(description="list of categorical features")


# A generic callback class that can be defined for FELIX and is called for every LLM request. Allows tracking of prompts, costs, and token consumption
class FELIXCallback:
    def __init__(self):
        pass

    def features_generated_for_pair(self, llm, example_a: str, example_b: str, label_a: str, label_b: str, system_message: str, prompt_message: str, llm_output: str, features: NumericalFeatureSet | CategoricalFeatureSet, total_cost: float, total_tokens: int, prompt_tokens: int, completion_tokens: int):
        pass

    def example_transformed(self, llm, example: str, feature_set: NumericalFeatureSet | CategoricalFeatureSet, system_message: str, prompt_message: str, llm_output: str, scores, total_cost: float, total_tokens: int, prompt_tokens: int, completion_tokens: int):
        pass

    def error_encountered(self, llm, system_message, prompt_message, llm_output, error):
        pass

In [13]:
import json
import re
import random
import time
import numpy as np
import pandas as pd

from typing import List, Optional
from pydantic import BaseModel, Field

from scipy.spatial.distance import cdist
from hdbscan import HDBSCAN
from tqdm.notebook import tqdm

from sklearn.base import BaseEstimator, TransformerMixin


# =============================================================================
# Вспомогательный класс для красивого отображения прогресса и логгирования
# =============================================================================
class PromptingTQDM(tqdm):
    def __init__(self, *args, **kwargs):
        super().__init__(
            bar_format=(
                "{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} "
                "[{elapsed}<{remaining}, {rate_inv_fmt}{postfix}]"
            ),
            *args,
            **kwargs
        )
        self.reset_stats()

    def reset_stats(self):
        # Начальные статистики
        self.tokens_in = 0
        self.tokens_out = 0
        self.costs = 0
        self.n_features = 0
        self.n_iter = 0

    def log_stats(self, tokens_in, tokens_out, cost, n_features):
        # Обновляем статистики
        self.tokens_in += tokens_in
        self.tokens_out += tokens_out
        self.costs += cost
        self.n_features += n_features
        self.n_iter += 1

        # Средние значения за все итерации
        avg_tokens_in = self.tokens_in / self.n_iter
        avg_tokens_out = self.tokens_out / self.n_iter
        avg_cost = self.costs / self.n_iter

        # Вывод в интерфейсе tqdm
        self.set_postfix_str(
            f"in {avg_tokens_in:.0f} tokens/{self.unit}, "
            f"out {avg_tokens_out:.0f} tokens/{self.unit}, "
            f"{avg_cost:.5f} USD/{self.unit}, "
            f"{self.costs:.2f} USD total, "
            f"{self.n_features:.0f} features total",
            refresh=True
        )


# =============================================================================
# Класс FELIX, адаптированный под работу с локальной моделью Ollama/Qwen
# =============================================================================
class FELIX(BaseEstimator, TransformerMixin):
    def __init__(
        self,
        context=None,
        llm_name="qwen2.5:14b",
        llm_generation=None,
        llm_scoring=None,
        llm_embeddings=None,
        temperature_generation=0.7,
        temperature_scoring=0.0,
        discrete_features=True,
        zero_shot=False,
        reschuffle_features=False,
        keep_outlier_features=False,
        callback=None,
        verbose=False
    ):
        """
        Параметры:
        ----------
        context: Доп. контекст, который можно указывать при генерации фич
        llm_name: Название/идентификатор модели Ollama (Qwen).
        llm_generation: модель, которая используется для генерации фич (если не указана, используется llm_name).
        llm_scoring: модель для скоринга (если не указана, используется llm_name).
        llm_embeddings: модель/метод для вычисления эмбеддингов (можно использовать локальные эмбеддинги, если есть).
        temperature_generation: температура при генерации фич.
        temperature_scoring: температура при скоринге.
        discrete_features: True, если фичи категориальные, False — числовые (0..10).
        zero_shot: если True, используем "zero-shot" промпты.
        reschuffle_features: если True, тасуем фичи после каждого scored example.
        keep_outlier_features: если True, шумовые фичи (по кластеризации) не отбрасываются, а остаются в отдельных кластерах.
        callback: объект FELIXCallback (необязательно).
        verbose: включение детального вывода (progress-bar и т. д.).
        """
        self.context = context

        self.llm_name = llm_name
        self.llm_generation = llm_generation if llm_generation else llm_name
        self.llm_scoring = llm_scoring if llm_scoring else llm_name
        self.llm_embeddings = llm_embeddings

        self.temperature_generation = temperature_generation
        self.temperature_scoring = temperature_scoring

        self.discrete_features = discrete_features
        self.zero_shot = zero_shot
        self.reschuffle_features = reschuffle_features
        self.keep_noise = keep_outlier_features

        self.callback = callback
        self.verbose = verbose

        # Внутренние атрибуты
        self._features = None
        self._full_feature_set = None
        self._pairs = None
        self._pairwise_feature_sets = None
        self._feature_embeddings = None
        self._cluster_labels = None
        self._global_duplicate_counter = 0
        self._hdbscan = None

        # Проверки базовых типов
        self._validate_class_variables(
            llm_name=True, 
            context=True,
            temperature_generation=True,
            temperature_scoring=True,
            verbose=True
        )

    # =========================================================================
    # Методы сохранения/загрузки
    # =========================================================================
    def save_instance(self, filename):
        if self.callback:
            print("Warning: Parameter 'callback' is set but cannot be saved to instance.")
        if self._hdbscan:
            print("Warning: Parameters stored in '_hdbscan' cannot be saved to instance.")

        data = {
            "context": self.context,
            "llm_name": self.llm_name,
            "llm_generation": self.llm_generation,
            "llm_scoring": self.llm_scoring,
            "llm_embeddings": self.llm_embeddings,
            "temperature_generation": self.temperature_generation,
            "temperature_scoring": self.temperature_scoring,
            "discrete_features": self.discrete_features,
            "zero_shot": self.zero_shot,
            "reschuffle_features": self.reschuffle_features,
            "keep_noise": self.keep_noise,
            "verbose": self.verbose,
            "full_feature_set": self._full_feature_set.json() if self._full_feature_set else None,
            "features": self._features.json() if self._features else None,
            "feature_embeddings": self._feature_embeddings.tolist() if self._feature_embeddings is not None else None,
            "cluster_labels": self._cluster_labels.tolist() if self._cluster_labels is not None else None,
            "duplicate_counter": self._global_duplicate_counter
        }

        json_string = json.dumps(data, indent=4)
        with open(filename, "w") as f:
            f.write(json_string)

    def load_instance(self, filename):
        with open(filename, "r") as f:
            data = json.load(f)

        self.context = data.get("context")
        self.llm_name = data.get("llm_name")
        self.llm_generation = data.get("llm_generation")
        self.llm_scoring = data.get("llm_scoring")
        self.llm_embeddings = data.get("llm_embeddings")
        self.temperature_generation = data.get("temperature_generation")
        self.temperature_scoring = data.get("temperature_scoring")
        self.discrete_features = data.get("discrete_features")
        self.zero_shot = data.get("zero_shot", False)
        self.reschuffle_features = data.get("reschuffle_features", False)
        self.keep_noise = data.get("keep_noise", True)
        self.verbose = data.get("verbose", False)

        ffs_json = data.get("full_feature_set")
        fs_json = data.get("features")
        if ffs_json:
            if self.discrete_features:
                self._full_feature_set = CategoricalFeatureSet.parse_raw(ffs_json)
            else:
                self._full_feature_set = NumericalFeatureSet.parse_raw(ffs_json)

        if fs_json:
            if self.discrete_features:
                self._features = CategoricalFeatureSet.parse_raw(fs_json)
            else:
                self._features = NumericalFeatureSet.parse_raw(fs_json)

        fe = data.get("feature_embeddings")
        if fe is not None:
            self._feature_embeddings = np.array(fe)
        cl = data.get("cluster_labels")
        if cl is not None:
            self._cluster_labels = np.array(cl)

        self._global_duplicate_counter = data.get("duplicate_counter", 0)

        # Восстановить callback / _hdbscan невозможно
        self.callback = None
        self._hdbscan = None

    # =========================================================================
    # Методы fit/transform
    # =========================================================================
    def fit(self, X, y=None):
        X, y = self._validate_data(X, y)
        self._full_feature_set = self.generate_features(X, y)
        self._features = self.consolidate_features(self._full_feature_set)
        return self

    def transform(self, X, y=None):
        X, _ = self._validate_data(X, y, check_y=False)

        llm_scoring = self._initialize_llm(self.llm_scoring, self.temperature_scoring)
        X_transformed = []
        with PromptingTQDM(total=len(X), desc="Scoring examples", disable=not self.verbose) as progress_bar:
            for i, example in X.items():
                scores, _ = self._transform_example(llm_scoring, example, self._features)
                X_transformed.append(scores)
                if self.reschuffle_features:
                    random.shuffle(self._features.features)
                if self.verbose:
                    progress_bar.update(1)
        return pd.DataFrame(X_transformed)

    def generate_features(self, X, y):
        X, y = self._validate_data(X, y)
        llm_generation = self._initialize_llm(self.llm_generation, self.temperature_generation)
        self._pairs = self._generate_pairs(X, y)
        self._pairwise_feature_sets = []

        with PromptingTQDM(total=len(self._pairs), desc="Generating features", disable=not self.verbose) as pbar:
            for example_a, example_b, label_a, label_b in self._pairs:
                feature_set, _ = self._generate_features_for_pair(
                    llm_generation, example_a, example_b, label_a, label_b
                )
                self._pairwise_feature_sets.append(feature_set)
                if self.verbose:
                    pbar.update(1)

        # Склеиваем всё в единый список
        flat_list = []
        for fs in self._pairwise_feature_sets:
            flat_list += fs.features

        if self.discrete_features:
            full_feature_set = CategoricalFeatureSet(features=flat_list)
        else:
            full_feature_set = NumericalFeatureSet(features=flat_list)

        return full_feature_set

    def consolidate_features(self, feature_set):
        self._feature_embeddings = self._create_feature_embeddings(feature_set)
        self._cluster_labels = self._cluster_features(self._feature_embeddings)

        if self.keep_noise:
            # Шуму (label=-1) назначаем отдельные кластеры
            if len(self._cluster_labels) > 0:
                next_label = np.max(self._cluster_labels) + 1
                for i in range(len(self._cluster_labels)):
                    if self._cluster_labels[i] == -1:
                        self._cluster_labels[i] = next_label
                        next_label += 1
        else:
            # Удаляем шум, если он есть
            if (self._cluster_labels == -1).all() and len(self._cluster_labels) > 0:
                self._cluster_labels = np.zeros(len(self._cluster_labels), dtype=int)
                if self.verbose:
                    print("All features identified as noise. Treating them as one cluster")

            feature_set = feature_set.copy()
            filtered_features = []
            filtered_embeddings = []
            filtered_labels = []
            for f_item, emb, lbl in zip(feature_set.features, self._feature_embeddings, self._cluster_labels):
                if lbl != -1:
                    filtered_features.append(f_item)
                    filtered_embeddings.append(emb)
                    filtered_labels.append(lbl)
            feature_set.features = filtered_features
            self._feature_embeddings = np.array(filtered_embeddings)
            self._cluster_labels = np.array(filtered_labels)

        self._features = self._select_representative_features(feature_set, self._feature_embeddings, self._cluster_labels)
        self._features = self._ensure_unique_feature_names(self._features)

        print(f"Consolidated to {len(self._features.features)} features ({'incl.' if self.keep_noise else 'excl.'} noise)")
        return self._features

    def get_features_as_dataframe(self):
        if not self._features:
            raise ValueError("No features have been learned yet. Call fit() to learn a set of features.")
        if self.discrete_features:
            data_dict = json.loads(self._features.json())["features"]
        else:
            data_dict = json.loads(self._features.json())["features"]
        df = pd.DataFrame(data_dict)
        df.columns = df.columns.str.replace("_", " ").str.title()
        return df

    # =========================================================================
    # Вспомогательные методы
    # =========================================================================
    def _validate_class_variables(
        self,
        llm_name=False,
        context=False,
        temperature_generation=False,
        temperature_scoring=False,
        verbose=False
    ):
        if temperature_generation:
            if not isinstance(self.temperature_generation, float):
                raise ValueError("temperature_generation must be float.")
            if not 0.0 <= self.temperature_generation <= 1.0:
                raise ValueError("temperature_generation must be in [0,1].")
        if temperature_scoring:
            if not isinstance(self.temperature_scoring, float):
                raise ValueError("temperature_scoring must be float.")
            if not 0.0 <= self.temperature_scoring <= 1.0:
                raise ValueError("temperature_scoring must be in [0,1].")
        if verbose:
            if not isinstance(self.verbose, bool):
                raise ValueError(f"verbose must be bool, got {type(self.verbose)}.")
        return True

    def _validate_data(self, X, y, check_X=True, check_y=True):
        if check_X:
            if not isinstance(X, (list, np.ndarray, pd.Series, pd.DataFrame)):
                raise ValueError("X must be array-like.")
            if isinstance(X, (list, np.ndarray)):
                if len(np.shape(X)) > 2:
                    raise ValueError("X must have 1 or 2 dimensions.")
            elif isinstance(X, (pd.Series, pd.DataFrame)):
                if len(X.shape) > 2:
                    raise ValueError("X must have 1 or 2 dimensions.")
            if len(X) == 0:
                raise ValueError("X cannot be empty.")

            # Превращаем X в DataFrame и сериализуем
            X = pd.DataFrame(X)
            X = self._serialize_dataframe(X)

        if check_y:
            if not isinstance(y, (list, np.ndarray, pd.Series, pd.DataFrame)):
                raise ValueError("y must be array-like.")
            if isinstance(y, (list, np.ndarray)):
                if len(np.shape(y)) != 1:
                    raise ValueError("y can only have 1 dimension.")
            elif isinstance(y, (pd.Series, pd.DataFrame)):
                if len(y.shape) != 1:
                    raise ValueError("y can only have 1 dimension.")
            if len(y) == 0:
                raise ValueError("y cannot be empty.")
            y = pd.Series(y)
            if y.nunique() < 2:
                raise ValueError("y must contain at least two unique values.")
            if check_X and len(X) != len(y):
                raise ValueError("X and y must have the same length.")
        else:
            y = None

        return X, y

    def _serialize_dataframe(self, df):
        if isinstance(df, pd.Series):
            return df
        if len(df.columns) == 1:
            return df[df.columns[0]].astype(str)

        def serialize_row(row):
            items = [(col, str(val) if not pd.isnull(val) else "N/A") for col, val in row.items()]
            return "\n\n".join([f"{col}: {val}" for col, val in items])
        return df.apply(serialize_row, axis=1)

    def _generate_pairs(self, X, y):
        classes = y.unique()
        pairs = []
        for i in range(len(classes)):
            for j in range(i + 1, len(classes)):
                class_a = classes[i]
                class_b = classes[j]
                ex_a = X[y == class_a]
                ex_b = X[y == class_b]
                for k in range(max(len(ex_a), len(ex_b))):
                    pairs.append((
                        ex_a.iloc[k % len(ex_a)] if k % 2 == 0 else ex_b.iloc[k % len(ex_b)],
                        ex_b.iloc[k % len(ex_b)] if k % 2 == 0 else ex_a.iloc[k % len(ex_a)],
                        class_a if k % 2 == 0 else class_b,
                        class_b if k % 2 == 0 else class_a
                    ))
        return pairs

    def _initialize_llm(self, llm_name, temperature):
        return {
            "model": llm_name,
            "temperature": temperature
        }

    def _generate_features_for_pair(self, llm, example_a, example_b, label_a, label_b):
        """
        Метод генерации набора фич (features), с использованием Ollama
        и структурированного вывода (JSON-схема).
        """
        # Создаём пользовательский prompt
        user_prompt = (
            f"У нас есть два примера из разных классов.\n"
            f"Класс A: {label_a}\n"
            f"Пример A:\n{example_a}\n\n"
            f"Класс B: {label_b}\n"
            f"Пример B:\n{example_b}\n\n"
            "Сгенерируй набор признаков (features), чтобы отличать класс A от класса B. "
            "Верни результат строго в JSON по заданной схеме.\n"
        )

        # Выбираем схему (категориальные или числовые)
        if self.discrete_features:
            schema_str = CategoricalFeatureSet.schema_json()
        else:
            schema_str = NumericalFeatureSet.schema_json()

        # --- Псевдокод вызова Ollama (зависит от вашей среды) ---
        # Допустим, ollama.chat(...) вернёт объект с полем .content,
        # содержащим JSON.
        #
        #  response = ollama.chat(
        #      model=llm["model"],
        #      format=schema_str,
        #      temperature=llm["temperature"],
        #      messages=[{"role": "user", "content": user_prompt}]
        #  )
        #
        # В демо-версии сделаем фейковый ответ:
        fake_json = """
        {
            "features": [
                {
                    "name": "style",
                    "possible_values": ["formal", "casual", "other"],
                    "description": "описание стиля"
                },
                {
                    "name": "sentiment",
                    "possible_values": ["positive", "negative", "neutral"],
                    "description": "общая тональность"
                }
            ]
        }
        """ if self.discrete_features else """
        {
            "features": [
                {
                    "name": "length",
                    "zero": "короткий текст",
                    "ten": "очень длинный текст",
                    "description": "примерная длина"
                },
                {
                    "name": "sentiment_score",
                    "zero": "полностью негативный",
                    "ten": "полностью позитивный",
                    "description": "оценка тональности"
                }
            ]
        }
        """

        # Предположим, в response.content лежит этот JSON
        #  real_json_str = response.content
        real_json_str = fake_json

        try:
            parsed = json.loads(real_json_str)
            if self.discrete_features:
                feature_set = CategoricalFeatureSet(**parsed)
            else:
                feature_set = NumericalFeatureSet(**parsed)
        except Exception as e:
            # Если вдруг невалидный JSON — вернём пустой набор
            print(f"Ошибка парсинга JSON: {e}")
            if self.discrete_features:
                feature_set = CategoricalFeatureSet(features=[])
            else:
                feature_set = NumericalFeatureSet(features=[])

        return feature_set, None

    def _transform_example(self, llm, example, featureset):
        """
        Метод, который "скорит" (аннотирует) пример по уже сгенерированным признакам.
        Снова используем Ollama + JSON-схему (но теперь схему вида: {feature_name: value}).
        """
        user_prompt = (
            f"Текст:\n{example}\n\n"
            "Задано несколько признаков (features). "
            "Верни JSON, где ключи — имена фич, а значения — выбранная категория (для категориальных) "
            "или целое число 0..10 (для числовых).\n\n"
            "Список фич:\n"
            f"{featureset.json()}\n\n"
            "Формат ответа: только JSON, без добавления объяснений."
        )

        # Псевдокод вызова Ollama:
        #
        #  response = ollama.chat(
        #      model=llm["model"],
        #      messages=[{"role": "user", "content": user_prompt}],
        #      # Если Ollama поддерживает аналог формат=..., то задайте схему
        #      # или верните просто JSON
        #  )
        #
        # Для демонстрации — фейковый JSON:
        fake_score_json = """{
            "style": "other",
            "sentiment": "positive"
        }""" if self.discrete_features else """{
            "length": 8,
            "sentiment_score": 2
        }"""

        try:
            scores = json.loads(fake_score_json)
        except Exception as e:
            print(f"Ошибка парсинга JSON при скоринге: {e}")
            scores = {}

        # Минимальная валидация
        if self.discrete_features:
            for f in featureset.features:
                if f.name not in scores:
                    scores[f.name] = None
                else:
                    val = scores[f.name]
                    # Если значение не в списке possible_values — сбрасываем
                    if val not in f.possible_values:
                        scores[f.name] = None
        else:
            for f in featureset.features:
                if f.name not in scores:
                    scores[f.name] = None
                else:
                    val = scores[f.name]
                    if not isinstance(val, int):
                        scores[f.name] = None
                    elif val < 0 or val > 10:
                        scores[f.name] = None

        return scores, None

    def _create_feature_embeddings(self, feature_set):
        # Если бы мы использовали локальные эмбеддинги, здесь бы дергали нужный API.
        # Для примера — возвращаем случайный вектор на 16 измерений
        num_feats = len(feature_set.features)
        if num_feats == 0:
            return np.zeros((0, 16), dtype=float)
        return np.random.rand(num_feats, 16)

    def _cluster_features(self, feature_embeddings):
        if len(feature_embeddings) == 0:
            return np.array([], dtype=int)
        self._hdbscan = HDBSCAN(
            min_cluster_size=2,
            allow_single_cluster=True,
            cluster_selection_method="leaf"
        )
        labels = self._hdbscan.fit_predict(feature_embeddings)
        return labels

    def _select_representative_features(self, feature_set, feature_embeddings, cluster_labels):
        if len(feature_set.features) == 0:
            return feature_set

        unique_labels = np.unique(cluster_labels)
        rep_features = []

        for lbl in unique_labels:
            idxs = np.where(cluster_labels == lbl)[0]
            if len(idxs) == 0:
                continue
            centroid = np.mean(feature_embeddings[idxs], axis=0)
            # Находим ближайшую к центроиду
            dists = np.sum((feature_embeddings[idxs] - centroid) ** 2, axis=1)
            best_idx = idxs[np.argmin(dists)]
            rep_features.append(feature_set.features[best_idx])

        if self.discrete_features:
            return CategoricalFeatureSet(features=rep_features)
        else:
            return NumericalFeatureSet(features=rep_features)

    def _ensure_unique_feature_names(self, feature_set):
        """
        При наличии дубликатов фичей (например, несколько 'sentiment') — добавляем суффикс _2, _3, ...
        """
        name_counts = {}
        for f in feature_set.features:
            # Приведём к нижнему регистру, заменим пробелы на '_'
            orig = f.name.lower().replace(" ", "_")
            if orig not in name_counts:
                name_counts[orig] = 1
                f.name = orig
            else:
                name_counts[orig] += 1
                new_name = f"{orig}_{name_counts[orig]}"
                f.name = new_name

        self._global_duplicate_counter = sum(c - 1 for c in name_counts.values())
        return feature_set


## Zero-Shot GPT Classifier

In [14]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted, check_array
from sklearn.utils.multiclass import unique_labels

from tqdm.auto import tqdm

from datetime import datetime
import numpy as np
from ollama import chat, ChatResponse
import numpy as np


class PromptingTQDM(tqdm):
    def __init__(self, *args, **kwargs):
        super().__init__(bar_format="{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_inv_fmt}{postfix}]", *args, **kwargs)
        self.reset_stats()

    def reset_stats(self):
        self.tokens_in = 0
        self.tokens_out = 0
        self.costs = 0
        self.n_features = 0
        self.n_iter = 0

    def log_stats(self, tokens_in, tokens_out, cost, n_features):
        self.tokens_in += tokens_in
        self.tokens_out += tokens_out
        self.costs += cost
        self.n_features += n_features
        self.n_iter += 1

        avg_tokens_in = self.tokens_in / self.n_iter
        avg_tokens_out = self.tokens_out / self.n_iter
        avg_cost = self.costs / self.n_iter

        self.set_postfix_str(f"in {avg_tokens_in:.0f} tokens/{self.unit}, out {avg_tokens_out:.0f} tokens/{self.unit}, {avg_cost:.5f} USD/{self.unit}, {self.costs:.2f} USD total, {self.n_features:.0f} features total", refresh=True)


class ZeroShotOllamaCallback:

    def __init__(self):
        self.consumption_log = []

    def log_consumption(self, example: str, system_message: str, prompt_message: str, llm_output: str):
        self.consumption_log.append({
            "Timestamp": datetime.now().strftime("%Y-%m-%d, %H:%M:%S"),
            "Example": example,
            "System Message": system_message,
            "Prompt Message": prompt_message,
            "LLM Output": llm_output,
        })

class ZeroShotGPTClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, context=None, llm_name="qwen2.5:14b", verbose=False, callback=None):
        """
        Initializes the classifier with the model and optional context for classification.
        """
        self.context = context
        self.model_name = llm_name
        self.verbose = verbose
        self.callback = callback

        # Define the system prompt and classification instruction templates
        self.system_prompt = (
            "You are a data classifier. Your task is to classify a given example into one of multiple classes."
            f"{' The concrete context is the following: ' + self.context if self.context else ''}"
        )
        self.classification_prompt_template = """\
##### Here is an example that you should classify #####

{example}

##### Instructions #####

What is the most likely class of this example? Respond only with exactly one of the following class names and nothing else:
{classes}.
        """

    def fit(self, X, y):
        """
        Fits the classifier by storing unique labels.
        """
        self.classes_ = unique_labels(y)
        self.X_ = X
        self.y_ = y
        return self

    def predict(self, X):
        """
        Predicts the class for each example in X.
        """
        # Ensure the classifier is fitted
        check_is_fitted(self)

        # Initialize the prediction list
        predictions = []

        for example in X:
            # Create the classification prompt
            prompt = self.classification_prompt_template.format(
                example=example,
                classes=", ".join([f"'{cls}'" for cls in self.classes_])
            )

            # Call the model using the ollama `chat` function
            try:
                response: ChatResponse = chat(
                    model=self.model_name,
                    messages=[
                        {"role": "system", "content": self.system_prompt},
                        {"role": "user", "content": prompt}
                    ]
                )

                # Extract and clean the response content
                prediction = response.message.content.strip()
                if prediction in self.classes_:
                    predictions.append(prediction)
                else:
                    print(f"Warning: Invalid prediction '{prediction}'.")
                    predictions.append(None)
            except Exception as e:
                print(f"Error during prediction: {e}")
                predictions.append(None)

        return predictions


## Fine-Tuned LLM

In [15]:
# from sklearn.base import BaseEstimator, ClassifierMixin
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score, precision_recall_fscore_support
# from sklearn.utils.multiclass import unique_labels

# from transformers import AutoModelForSequenceClassification, AutoTokenizer
# from transformers import Trainer, TrainingArguments, set_seed
# import torch

# import os
# import random
# import numpy as np
# import pandas as pd


# # Class which iteratively returns a model input string as a tensor
# class TorchDataset(torch.utils.data.Dataset):
#     def __init__(self, encodings, labels):
#         self.encodings = encodings
#         self.labels = labels

#     def __getitem__(self, idx):
#         item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
#         item["labels"] = torch.tensor([self.labels[idx]])
#         return item

#     def __len__(self):
#         return len(self.labels)



# class FineTunedClassificationLLM(BaseEstimator, ClassifierMixin):

#     def __init__(self, model_name="FacebookAI/roberta-base", max_tokens=512, epochs=1, model_path="./models/", output_path="./results", logging_path="./logs", seed=42):
#         self.model_name = model_name
#         self.max_tokens = max_tokens
#         self.epochs = epochs
#         self.model_path = model_path
#         self.output_path = output_path
#         self.logging_path = logging_path
#         self.seed = seed

#         self.training_args = TrainingArguments(
#             seed = self.seed,                   # Random seed for initialization
#             output_dir=self.output_path,        # Directory for storing model predictions and checkpoints
#             logging_dir=self.logging_path,      # Directory for storing Tensorboard logs
#             num_train_epochs=self.epochs,       # Number of training epochs to perform
#             per_device_train_batch_size=16,     # Batch size per GPU/TPU core/CPU for training
#             learning_rate = 5e-5,               # Initial learning rate for Adam
#             fp16 = True,                        # Use 16-bit (mixed) precision training (through NVIDIA apex)
#         )

#         if not torch.cuda.is_available():
#             print("Warning: Cuda is not available on this hardware. Training an LLM may run into problems.")

#         # Set the seed for model initialization (for reproducible initial weights of the fully-connected classification layer)
#         set_seed(self.seed)

#         # Load model and tokenizer
#         self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name, num_labels=2).to("cuda")
#         self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)


#     def fit(self, X, y):
#         # Remember the unique class labels
#         self.classes_ = list(unique_labels(y))
#         if len(self.classes_) != 2:
#             raise ValueError(f"FineTunedClassificationLLM only supports binary classification but found {len(self.classes_)} unique classes.")

#         # Map class labels to 0 and 1
#         y = [self.classes_.index(label) for label in y]

#         # Convert input data to list, if necessary (e.g., when data is provided as a Pandas Series)
#         if not isinstance(X, list):
#             X = list(X)

#         # Tokenize the data
#         train_encodings = self.tokenizer(X, padding=True, truncation=True, max_length=self.max_tokens)

#         # Convert tokenized data into a torch Dataset
#         train_dataset = TorchDataset(train_encodings, y)

#         # Train the model
#         trainer = Trainer(
#             model=self.model,                    # the instantiated Transformers model to be trained
#             args=self.training_args,             # training arguments which are defined above
#             train_dataset=train_dataset,         # training dataset
#         )
#         trainer.train()

#         # Save the model
#         self.model.save_pretrained(self.model_path)
#         self.tokenizer.save_pretrained(self.model_path)


#     def predict(self, X):
#         preds = []
#         for x in X:
#             # Tokenize the example
#             inputs = self.tokenizer(x, padding=True, truncation=True, max_length=self.max_tokens, return_tensors="pt").to("cuda")

#             # Perform inference with the model
#             outputs = self.model(**inputs)

#             # Convert into the predicted class label
#             pred = self.classes_[outputs[0].softmax(1).argmax()]
#             preds.append(pred)

#         return preds

## Data Transformer

In [16]:
import nltk
import torch
import wandb

import pandas as pd
import numpy as np

from tqdm.auto import tqdm
from datetime import datetime

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import KNNImputer
from transformers import AutoTokenizer, AutoModel

# Если класс FELIX лежит в другом модуле, импортируйте отсюда:
# from felix_ollama import FELIX


class DataTransformer:
    """
    Класс для различных преобразований данных:
    1. Сырой текст (get_raw)
    2. TF-IDF векторизация (get_tfidf)
    3. Генерация эмбеддингов через Transformers (get_embeddings)
    4. Генерация фич с помощью FELIX (get_felix)
    5. One-hot кодирование (one_hot_encode)
    6. Импьютация пропусков (impute_missing_values)
    7. Возврат последней "модели" (get_model_instance)
    """

    def __init__(self, dataset, language="english"):
        """
        dataset: объект с атрибутами X_train, X_test, context_train, context_test и т. д.
        language: язык текстов (по умолчанию 'english'). Можно указать 'russian' и т.д.
        """
        self.dataset = dataset
        self.last_instance = None  # Reference to the last trained model instance
        self.language = language.lower().strip()  # например, 'english' или 'russian'

    # =========================================================================
    # 1) Сырой текст
    # =========================================================================
    def get_raw(self):
        """
        Возвращает тексты в виде Series (одна колонка "Raw Text"),
        объединив несколько колонок датасета (если их несколько) в одну строку.
        """
        X_train_raw = self._serialize_dataframe(self.dataset.X_train)
        X_test_raw = self._serialize_dataframe(self.dataset.X_test)

        X_train_raw.name = "Raw Text"
        X_test_raw.name = "Raw Text"

        self.last_instance = None
        return X_train_raw, X_test_raw

    # =========================================================================
    # 2) TF-IDF векторизация
    # =========================================================================
    def get_tfidf(self):
        """
        Преобразует тексты в TF-IDF вектор, учитывая стоп-слова и стемминг.
        По умолчанию язык — self.language (english или russian), но можно расширить.
        """
        # Сырой текст (Series)
        X_train_raw, X_test_raw = self.get_raw()

        # Проверяем нужные ресурсы
        nltk.download("punkt", quiet=True)
        nltk.download("stopwords", quiet=True)

        # Инициализируем список стоп-слов в зависимости от языка
        try:
            stop_words = set(stopwords.words(self.language))
        except OSError:
            # Если NLTK не содержит стоп-слов для данного языка, используем пустой набор
            print(f"[Warning] No stopwords found for language='{self.language}'. Using empty list.")
            stop_words = set()

        # Инициализируем стеммер
        #  - Для английского: PorterStemmer()
        #  - Для русского: SnowballStemmer("russian")
        #  - Если язык не поддерживается, используем "universal" логику (без стемминга)
        stemmer = None
        if self.language == "english":
            stemmer = PorterStemmer()
        elif self.language == "russian":
            stemmer = SnowballStemmer("russian")
        else:
            print(f"[Warning] Stemming for language='{self.language}' is not implemented. No stemming applied.")

        def tokenize_and_stem(text):
            # Токенизация
            tokens = [word.lower() for word in word_tokenize(text) if word.isalpha()]
            # Удаляем стоп-слова
            filtered_tokens = [t for t in tokens if t not in stop_words]
            # Стемминг (если мы знаем стеммер)
            if stemmer:
                return [stemmer.stem(t) for t in filtered_tokens]
            else:
                return filtered_tokens

        # Создаём TF-IDF vectorizer
        vectorizer = TfidfVectorizer(tokenizer=tokenize_and_stem)

        # Обучаем на train и трансформируем train/test
        X_train_tfidf = vectorizer.fit_transform(X_train_raw)
        X_test_tfidf = vectorizer.transform(X_test_raw)

        self.last_instance = vectorizer  # сохраним для доступа

        # Превращаем разреженные матрицы в DataFrame
        X_train_tfidf = pd.DataFrame.sparse.from_spmatrix(
            X_train_tfidf, columns=vectorizer.get_feature_names_out()
        )
        X_test_tfidf = pd.DataFrame.sparse.from_spmatrix(
            X_test_tfidf, columns=vectorizer.get_feature_names_out()
        )
        return X_train_tfidf, X_test_tfidf

    # =========================================================================
    # 3) Генерация эмбеддингов через Transformers
    # =========================================================================
    def get_embeddings(self, model_name='intfloat/multilingual-e5-large'):
        """
        Генерирует эмбеддинги для текстов (Train/Test), используя HuggingFace Transformers.
        По умолчанию модель 'intfloat/multilingual-e5-large' (поддерживает много языков).
        """
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # from_pretrained(...) не всегда принимает параметр device=device напрямую,
        # поэтому уточним логику:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModel.from_pretrained(model_name)
        model = model.to(device)
        model.eval()

        # Локальная функция для генерации эмбеддингов
        def compute_embeddings(texts):
            embeddings = []
            for text in tqdm(texts, desc=f"Generating embeddings ({model_name})"):
                inputs = tokenizer(
                    text, return_tensors='pt', truncation=True, padding=True
                ).to(device)
                with torch.no_grad():
                    outputs = model(**inputs)
                    # Mean pooling over the token embeddings
                    emb = outputs.last_hidden_state.mean(dim=1).squeeze()
                    embeddings.append(emb.cpu().numpy())
            return embeddings

        # Берём сырой текст
        X_train_raw, X_test_raw = self.get_raw()

        # Генерируем эмбеддинги
        X_train_embeddings = compute_embeddings(X_train_raw)
        X_test_embeddings = compute_embeddings(X_test_raw)

        # Проверяем, что не пусто
        if len(X_train_embeddings) == 0 or len(X_test_embeddings) == 0:
            raise ValueError("Embeddings are empty. Check your data or the model configuration.")

        # Преобразуем в DataFrame
        dim_size = len(X_train_embeddings[0])
        X_train_embeddings = pd.DataFrame(X_train_embeddings, columns=[f"Dim_{i}" for i in range(dim_size)])
        X_test_embeddings = pd.DataFrame(X_test_embeddings, columns=[f"Dim_{i}" for i in range(dim_size)])

        self.last_instance = model
        return X_train_embeddings, X_test_embeddings

    # =========================================================================
    # 4) Генерация фич с помощью FELIX
    # =========================================================================
    def get_felix(self, discrete_features=False, gpt4=False, callback=None, verbose=True):
        """
        Вызывает FELIX для генерации фич, используя локальную Qwen-модель (через Ollama).
        Параметр gpt4 — формальный (если решите снова переключаться).
        """
        # Пока жёстко зашиваем qwen2.5:14b
        llm_name = "qwen2.5:14b"

        self.last_instance = FELIX(
            context=self.dataset.context_train,
            llm_name=llm_name,
            discrete_features=discrete_features,
            zero_shot=self.dataset.train_zero_shot,
            callback=callback,
            verbose=verbose
        )

        # Обучаем FELIX (fit) и сразу трансформируем (train)
        X_train_felix = self.last_instance.fit_transform(self.dataset.X_train, self.dataset.y_train)

        # Переключаем контекст на тестовый
        self.last_instance.context = self.dataset.context_test
        X_test_felix = self.last_instance.transform(self.dataset.X_test)

        return X_train_felix, X_test_felix

    # =========================================================================
    # 5) One-hot кодирование
    # =========================================================================
    def one_hot_encode(self, X_train, X_test):
        """
        Выполняет one-hot encoding, конкатенируя train и test (чтобы не было
        несогласованных колонок).
        """
        X_concat = pd.concat([X_train, X_test], axis=0)
        X_concat_ohe = pd.get_dummies(X_concat, drop_first=False)
        X_train_ohe = X_concat_ohe.iloc[: len(X_train), :]
        X_test_ohe = X_concat_ohe.iloc[len(X_train):, :]
        return X_train_ohe, X_test_ohe

    # =========================================================================
    # 6) Импьютация пропусков (KNN)
    # =========================================================================
    def impute_missing_values(self, X_train, X_test, col_drop_thres=0.1, n_neighbors=5):
        """
        Удаляем колонки, где > col_drop_thres пропусков, затем KNN-Imputer.
        По итогам значения округляются до int.
        """
        def _impute(df):
            # Удаляем колонки с высоким процентом пропусков
            missing_perc = df.isnull().mean()
            cols_to_drop = missing_perc[missing_perc > col_drop_thres].index
            df = df.drop(columns=cols_to_drop)

            imputer = KNNImputer(n_neighbors=n_neighbors, weights="distance")
            # c pandas output=True
            imputer.set_output(transform="pandas")
            df_out = imputer.fit_transform(df)
            # Округлим и приведём к int
            df_out = df_out.round().astype(int)
            return df_out

        return _impute(X_train), _impute(X_test)

    # =========================================================================
    # 7) Возврат последнего "моделя"
    # =========================================================================
    def get_model_instance(self):
        """
        Возвращает последний объект, с которым работал DataTransformer:
        - TF-IDF vectorizer
        - Модель от HuggingFace
        - Объект FELIX
        - или None (если ничего не было)
        """
        return self.last_instance

    # =========================================================================
    # Вспомогательный метод сериализации DataFrame -> Series
    # =========================================================================
    def _serialize_dataframe(self, df: pd.DataFrame) -> pd.Series:
        """
        Превращает DataFrame (возможно с несколькими столбцами) в Series строк,
        склеивая их для каждой строки (с подстановкой 'N/A' на пропуски).
        """
        if isinstance(df, pd.Series):
            return df

        # Если всего один столбец, просто приведение к str
        if df.shape[1] == 1:
            return df[df.columns[0]].astype(str)

        # Если несколько столбцов - склеиваем
        def row_to_text(row):
            items = []
            for col, val in row.items():
                if pd.isnull(val):
                    val_str = "N/A"
                else:
                    val_str = str(val)
                # Можно, например, вставлять "col: val"
                items.append(f"{col}: {val_str}")
            return "\n\n".join(items)

        return df.apply(row_to_text, axis=1)


## Run Tracking Functionality

Set your Weights & Biases project in lines 21 and 22 of the following cell:

In [17]:
import time
import pandas as pd
from datetime import datetime

import wandb  # Предполагаю, что вы используете Weights & Biases
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    balanced_accuracy_score
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Если у вас где-то определены:
# from zero_shot_gpt_classifier import ZeroShotGPTClassifier, ZeroShotOllamaCallback
# from felix_ollama import FELIX, FELIXCallback, NumericalFeatureSet, CategoricalFeatureSet

# Важно: импортируем ваш DataTransformer (новый) из соответствующего модуля
# from data_transformer import DataTransformer


class Run:
    def __init__(self, data_representation, classifier, dataset, seed=0, language="english"):
        """
        Параметры:
        1) data_representation: строка или список строк, описывающих тип преобразования данных
           (например, "TF-IDF", "Embeddings", "Raw Text", "FELIX qwen (Numerical)", ...)
        2) classifier: строка или список строк, описывающих классификатор ("RandomForest", "LogisticRegression", ...)
        3) dataset: объект датасета, содержащий X_train, y_train, X_test, y_test, context_train, context_test и т. д.
        4) seed: random_state
        5) language: язык данных для DataTransformer (например, "english" или "russian")
        """

        # Приводим к списку, если пришла одна строка
        self.data_representation = (
            [data_representation] if isinstance(data_representation, str) else data_representation
        )
        self.classifier = [classifier] if isinstance(classifier, str) else classifier
        self.dataset = dataset
        self.seed = seed

        # Создаём новый DataTransformer
        self.data_transformer = DataTransformer(dataset=dataset, language=language)

        # Кешируем результаты трансформации, чтобы при повторных вызовах того же data_representation
        # не пересчитывать заново
        self.cache = {}

        # Параметры для Weights & Biases (W&B)
        self.wandb_entity = "mkgs210-itmo-university"
        self.wandb_project = "felix"

    def run(self):
        """Запускает single_run() для каждой комбинации (data_representation, classifier)."""
        for dr in self.data_representation:
            for cls in self.classifier:
                self.single_run(dr, cls, self.dataset)

    def single_run(self, data_representation, classifier, dataset):
        print(f"Starting new run '{data_representation} {classifier}' on '{dataset.id}' ...")

        # Инициализируем W&B для логгирования
        self.init_logging(data_representation, classifier, dataset)

        # Проверяем, нет ли уже готовых результатов для этой data_representation
        if data_representation in self.cache:
            print(f"Loading cached data transformation '{data_representation}' ...")
            (
                X_train_transformed,
                X_test_transformed,
                transformer_instance,
                callback,
                time_transformation
            ) = self.cache[data_representation]
        else:
            # Иначе выполняем transform_data
            print(f"Transforming data using '{data_representation}' ...")
            start_time = time.time()
            (
                X_train_transformed,
                X_test_transformed,
                transformer_instance,
                callback
            ) = self.transform_data(data_representation)
            time_transformation = time.time() - start_time

            # Кладём всё в кэш
            self.cache[data_representation] = (
                X_train_transformed,
                X_test_transformed,
                transformer_instance,
                callback,
                time_transformation
            )

        print("X_train.shape =", X_train_transformed.shape)
        print("X_test.shape =", X_test_transformed.shape)

        # Сохраняем копии данных до one-hot/imputation (для логирования)
        X_train_transformed_original = X_train_transformed
        X_test_transformed_original = X_test_transformed

        # Если надо, делаем one-hot (категориальные фичи FELIX)
        if data_representation in ["FELIX qwen (Categorical)"]:
            print("Creating one-hot encoding of categorical data ...")
            X_train_transformed, X_test_transformed = self.data_transformer.one_hot_encode(
                X_train_transformed, X_test_transformed
            )

        # Или KNN imputation (числовые фичи FELIX)
        elif data_representation in ["FELIX qwen (Numerical)"]:
            print("Imputing missing values in numerical data ...")
            X_train_transformed, X_test_transformed = self.data_transformer.impute_missing_values(
                X_train_transformed, X_test_transformed
            )

        # Обучаем классификатор, делаем предсказания
        start_time = time.time()
        print(f"Training '{classifier}' on '{dataset.id}' training set ...")
        self.classifier_instance = self.get_classifier(classifier)

        self.classifier_instance.fit(X_train_transformed, dataset.y_train)

        print(f"Predicting class labels of '{dataset.id}' test set with trained '{classifier}' ...")
        self.y_pred = self.classifier_instance.predict(X_test_transformed)
        time_classification = time.time() - start_time

        # Если бы у вас был ZeroShotGPTClassifier, то тут можно было бы
        # обработать случаи, когда предсказанный класс не входит в набор y_test

        # Логгируем результаты
        print("Finishing run and logging the results ...")
        self.log(
            y_pred=self.y_pred,
            X_train_transformed=X_train_transformed_original,
            X_test_transformed=X_test_transformed_original,
            classifier_instance=self.classifier_instance,
            transformer_instance=transformer_instance,
            callback=callback,
            time_transformation=time_transformation,
            time_classification=time_classification
        )
        self.finish()

    def transform_data(self, data_representation):
        """
        Вспомогательный метод, который выбирает нужную "ветку" преобразования
        (TF-IDF, Embeddings, FELIX...) и возвращает X_train_transformed, X_test_transformed,
        transformer_instance, callback.
        """
        if data_representation == "TF-IDF":
            callback = None
            X_train_transformed, X_test_transformed = self.data_transformer.get_tfidf()

        elif data_representation == "Embeddings":
            callback = None
            X_train_transformed, X_test_transformed = self.data_transformer.get_embeddings()

        elif data_representation == "Raw Text":
            callback = None
            X_train_transformed, X_test_transformed = self.data_transformer.get_raw()

        elif data_representation == "FELIX qwen (Numerical)":
            # Предположим, вы хотите отлавливать логи FELIX (LLM cost)
            callback = CustomFELIXCallback()
            X_train_transformed, X_test_transformed = self.data_transformer.get_felix(
                discrete_features=False,
                gpt4=False,
                callback=callback,
                verbose=True
            )

        elif data_representation == "FELIX qwen (Categorical)":
            # Аналогично
            callback = CustomFELIXCallback()
            X_train_transformed, X_test_transformed = self.data_transformer.get_felix(
                discrete_features=True,
                gpt4=False,
                callback=callback,
                verbose=True
            )

        else:
            raise ValueError(f"Invalid data representation: {data_representation}")

        # Получаем "трансформер" из DataTransformer — это может быть TF-IDF vectorizer или FELIX, etc.
        transformer_instance = self.data_transformer.get_model_instance()
        return X_train_transformed, X_test_transformed, transformer_instance, callback

    def get_classifier(self, classifier):
        """
        Создаёт экземпляр нужного классификатора.
        """
        if classifier == "RandomForest":
            return RandomForestClassifier(random_state=self.seed)
        elif classifier == "LogisticRegression":
            return LogisticRegression(max_iter=10000, random_state=self.seed)
        elif classifier == "qwen":
            # Если у вас действительно есть ZeroShotGPTClassifier ...
            return ZeroShotGPTClassifier(llm_name="qwen2.5:14b", callback=ZeroShotOllamaCallback(), verbose=True)
        else:
            raise ValueError(f"Invalid value '{classifier}' for classifier.")

    def init_logging(self, data_representation, classifier, dataset):
        """
        Инициализируем WandB. Можете убрать, если не используете.
        """

        wandb.init(
            settings=wandb.Settings(symlink=False),
            entity=self.wandb_entity,
            project=self.wandb_project,
            name=f"{data_representation} {classifier}",
            config={
                "dataset": dataset.id,
                "dataset short name": dataset.short_name,
                "data_representation": data_representation,
                "model": classifier,
                "context_description": dataset.context_train,
                "context_description_test": dataset.context_test,
                "n_train": dataset.X_train.shape[0],
                "n_test": dataset.X_test.shape[0],
                "seed": self.seed,
                "pos_class": dataset.pos_class,
                "train_zero_shot": dataset.train_zero_shot
            }
        )

    def log(
        self,
        y_pred,
        X_train_transformed,
        X_test_transformed,
        classifier_instance=None,
        transformer_instance=None,
        callback=None,
        time_transformation=-1.0,
        time_classification=-1.0
    ):
        """
        Логгируем метрики, LLM cost (если есть), важность фич и т.д.
        """
        # Считаем основные метрики
        results = self.calculate_scores(self.dataset.y_test, y_pred, pos_class=self.dataset.pos_class)
        results["Time Transformation"] = time_transformation
        results["Time Classification"] = time_classification
        results["Time Total"] = time_transformation + time_classification

        # Если есть callback от FELIX, логгируем cost
        if callback and isinstance(callback, FELIXCallback):
            generation_cost = sum([x["Total Cost"] for x in callback.generation_log])
            scoring_cost = sum([x["Total Cost"] for x in callback.scoring_log])
            total_cost = generation_cost + scoring_cost
            results["Generation Cost"] = generation_cost
            results["Scoring Cost"] = scoring_cost
            results["Total Cost"] = total_cost
        elif callback and isinstance(callback, ZeroShotOllamaCallback):
            # Если у вас ZeroShotOllamaCallback
            results["Total Cost"] = sum([x["Total Cost"] for x in callback.consumption_log])

        # Логгируем логи запросов FELIX
        if callback and isinstance(callback, FELIXCallback):
            results["Generation Log"] = wandb.Table(dataframe=pd.DataFrame(callback.generation_log))
            results["Scoring Log"] = wandb.Table(dataframe=pd.DataFrame(callback.scoring_log))
        elif callback and isinstance(callback, ZeroShotOllamaCallback):
            # Для ZeroShot
            results["Prompt Log"] = wandb.Table(dataframe=pd.DataFrame(callback.consumption_log))

        # Если transformer_instance — это FELIX, можем вывести описание фич
        if transformer_instance and isinstance(transformer_instance, FELIX):
            try:
                df_features = transformer_instance.get_features_as_dataframe().rename(columns={"Name": "Feature Name"})
            except ValueError:
                df_features = None
        else:
            df_features = None

        # Если используем классические модели (RF или LR), логгируем их важность/коэффициенты
        if isinstance(classifier_instance, RandomForestClassifier):
            # feature_names_in_ появилось в sklearn 1.0; если у вас старая версия, может не быть
            df_importances = pd.DataFrame({
                "Feature Name": classifier_instance.feature_names_in_,
                "Importance": classifier_instance.feature_importances_
            }).sort_values(by="Importance", ascending=False)

            # Если FELIX дискретный + one-hot, может быть полезно разбить feature_name
            if transformer_instance and isinstance(transformer_instance, FELIX) and transformer_instance.discrete_features:
                df_importances[["Feature Name", "Value"]] = df_importances["Feature Name"].str.rsplit("_", n=1, expand=True)

            if isinstance(df_features, pd.DataFrame):
                df_importances = df_importances.set_index("Feature Name").join(
                    df_features.set_index("Feature Name"), how="left"
                ).reset_index()
            results["Feature Importance"] = df_importances

        elif isinstance(classifier_instance, LogisticRegression):
            df_coefficients = pd.DataFrame({
                "Feature Name": classifier_instance.feature_names_in_,
                "Coefficient": classifier_instance.coef_[0]
            }).sort_values(by="Coefficient", ascending=False)

            if (
                transformer_instance
                and isinstance(transformer_instance, FELIX)
                and transformer_instance.discrete_features
            ):
                df_coefficients[["Feature Name", "Value"]] = df_coefficients["Feature Name"].str.rsplit("_", n=1, expand=True)

            if isinstance(df_features, pd.DataFrame):
                df_coefficients = df_coefficients.set_index("Feature Name").join(
                    df_features.set_index("Feature Name"), how="left"
                ).reset_index()
            results["Coefficients"] = df_coefficients

        # Сохраняем FELIX-инстанс (сериализуем) для отладки
        if transformer_instance and isinstance(transformer_instance, FELIX):
            filename = f"FELIX-{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json"
            transformer_instance.save_instance(filename)
            #wandb.save(filename, policy="now")

        # Логгируем метрики
        wandb.log(results)

        # Логируем DataFrame с train/test (если хотим видеть как они выглядят)
        try:
            wandb.log({
                "Training Data": wandb.Table(
                    dataframe=pd.concat([
                        pd.Series(self.dataset.y_train, name="Ground Truth").reset_index(drop=True),
                        self.dataset.X_train.reset_index(drop=True),
                        X_train_transformed.reset_index(drop=True)
                    ], axis=1)
                ),
                "Test Data": wandb.Table(
                    dataframe=pd.concat([
                        pd.Series(self.dataset.y_test, name="Ground Truth").reset_index(drop=True),
                        pd.Series(y_pred, name="Prediction").reset_index(drop=True),
                        self.dataset.X_test.reset_index(drop=True),
                        X_test_transformed.reset_index(drop=True)
                    ], axis=1)
                )
            })
        except TypeError as e:
            print(e)

        return results

    def calculate_scores(self, y_true, y_pred, pos_class):
        """
        Рассчитывает метрики (Accuracy, Balanced Accuracy, F1 и т.д.).
        pos_class — метка «положительного» класса, если нужно.
        """
        return {
            "Accuracy": accuracy_score(y_true, y_pred),
            "Balanced Accuracy": balanced_accuracy_score(y_true, y_pred),
            "F1 Score (Macro)": f1_score(y_true, y_pred, average="macro"),
            "F1 Score (Positive Class)": f1_score(y_true, y_pred, pos_label=pos_class),
            "Precision (Macro)": precision_score(y_true, y_pred, average="macro"),
            "Precision (Positive Class)": precision_score(y_true, y_pred, pos_label=pos_class),
            "Recall (Macro)": recall_score(y_true, y_pred, average="macro"),
            "Recall (Positive Class)": recall_score(y_true, y_pred, pos_label=pos_class)
        }

    def finish(self):
        """Завершаем WandB-сессию."""
        wandb.finish()


# =============================================================================
# Callback для FELIX, если нужно логгировать фичи/стоимость
# =============================================================================
class CustomFELIXCallback(FELIXCallback):
    def __init__(self):
        self.generation_log = []
        self.scoring_log = []
        self.error_log = []

    def features_generated_for_pair(
        self,
        llm,
        example_a: str,
        example_b: str,
        label_a: str,
        label_b: str,
        system_message: str,
        prompt_message: str,
        llm_output: str,
        features: NumericalFeatureSet | CategoricalFeatureSet,
        total_cost,
        total_tokens,
        prompt_tokens,
        completion_tokens
    ):
        self.generation_log.append({
            "Timestamp": datetime.now().strftime("%Y-%m-%d, %H:%M:%S"),
            "LLM": llm.model_name if hasattr(llm, "model_name") else str(llm),
            "Temperature": llm.temperature if hasattr(llm, "temperature") else None,
            "Example A": example_a,
            "Label A": label_a,
            "Example B": example_b,
            "Label B": label_b,
            "System Message": system_message,
            "Prompt Message": prompt_message,
            "LLM Output": llm_output,
            "Features": str(features.features),
            "Total Cost": total_cost,
            "Total Tokens": total_tokens,
            "Prompt Tokens": prompt_tokens,
            "Completion Tokens": completion_tokens
        })

    def example_transformed(
        self,
        llm,
        example: str,
        feature_set: NumericalFeatureSet | CategoricalFeatureSet,
        system_message: str,
        prompt_message: str,
        llm_output: str,
        scores,
        total_cost: float,
        total_tokens: int,
        prompt_tokens: int,
        completion_tokens: int
    ):
        self.scoring_log.append({
            "Timestamp": datetime.now().strftime("%Y-%m-%d, %H:%M:%S"),
            "LLM": llm.model_name if hasattr(llm, "model_name") else str(llm),
            "Temperature": llm.temperature if hasattr(llm, "temperature") else None,
            "Example": example,
            "Features": str(feature_set.features),
            "System Message": system_message,
            "Prompt Message": prompt_message,
            "LLM Output": llm_output,
            "Scores": str(scores),
            "Total Cost": total_cost,
            "Total Tokens": total_tokens,
            "Prompt Tokens": prompt_tokens,
            "Completion Tokens": completion_tokens
        })

    def error_encountered(self, llm, system_message, prompt_message, llm_output, error):
        self.error_log.append({
            "Timestamp": datetime.now().strftime("%Y-%m-%d, %H:%M:%S"),
            "LLM": str(llm),
            "System Message": system_message,
            "Prompt Message": prompt_message,
            "LLM Output": llm_output,
            "Error": error
        })


# Experiment 1: Overall Performance

In [18]:
d = dataset_list["reviews-amazon"]
d

Dataset(
	id='amazon_polarity'
	short_name='reviews-amazon'
	context_train='Examples are product reviews from Amazon that are either 'NEGATIVE' (rating with 1 or 2 stars) or 'POSITIVE' (rating with 4 or 5 stars). The downstream machine learning task is learning a classifier (e.g., Random Forest) that learns to distinguish 'NEGATIVE' and 'POSITIVE' reviews.'
	context_test='Examples are product reviews from Amazon that are either 'NEGATIVE' (rating with 1 or 2 stars) or 'POSITIVE' (rating with 4 or 5 stars). The downstream machine learning task is learning a classifier (e.g., Random Forest) that learns to distinguish 'NEGATIVE' and 'POSITIVE' reviews.'
	classes=['POSITIVE', 'NEGATIVE']
	pos_class='POSITIVE'
	train_zero_shot=False
	X_train.shape=(100, 2)
	X_test.shape=(100, 2)
	y_train.shape=(100,)
	y_test.shape=(100,)
)

## TF-IDF

In [19]:
#wandb.init(settings=wandb.Settings(init_timeout=120))

In [20]:
run = Run("TF-IDF", ["RandomForest", "LogisticRegression"], d, seed)
run.run()

wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Starting new run 'TF-IDF RandomForest' on 'amazon_polarity' ...


wandb: Currently logged in as: mkgs210 (mkgs210-itmo-university). Use `wandb login --relogin` to force relogin


Transforming data using 'TF-IDF' ...




X_train.shape = (100, 1611)
X_test.shape = (100, 1611)
Training 'RandomForest' on 'amazon_polarity' training set ...
Predicting class labels of 'amazon_polarity' test set with trained 'RandomForest' ...
Finishing run and logging the results ...


0,1
Accuracy,▁
Balanced Accuracy,▁
F1 Score (Macro),▁
F1 Score (Positive Class),▁
Precision (Macro),▁
Precision (Positive Class),▁
Recall (Macro),▁
Recall (Positive Class),▁
Time Classification,▁
Time Total,▁

0,1
Accuracy,0.73
Balanced Accuracy,0.73
F1 Score (Macro),0.72997
F1 Score (Positive Class),0.72727
Precision (Macro),0.73009
Precision (Positive Class),0.73469
Recall (Macro),0.73
Recall (Positive Class),0.72
Time Classification,0.15457
Time Total,0.95807


Starting new run 'TF-IDF LogisticRegression' on 'amazon_polarity' ...


Loading cached data transformation 'TF-IDF' ...
X_train.shape = (100, 1611)
X_test.shape = (100, 1611)
Training 'LogisticRegression' on 'amazon_polarity' training set ...
Predicting class labels of 'amazon_polarity' test set with trained 'LogisticRegression' ...
Finishing run and logging the results ...


0,1
Accuracy,▁
Balanced Accuracy,▁
F1 Score (Macro),▁
F1 Score (Positive Class),▁
Precision (Macro),▁
Precision (Positive Class),▁
Recall (Macro),▁
Recall (Positive Class),▁
Time Classification,▁
Time Total,▁

0,1
Accuracy,0.77
Balanced Accuracy,0.77
F1 Score (Macro),0.76979
F1 Score (Positive Class),0.7767
Precision (Macro),0.77098
Precision (Positive Class),0.75472
Recall (Macro),0.77
Recall (Positive Class),0.8
Time Classification,0.20082
Time Total,1.00433


## Text Embeddings

In [21]:
run = Run("Embeddings", ["RandomForest", "LogisticRegression"], d, seed)
run.run()

Starting new run 'Embeddings RandomForest' on 'amazon_polarity' ...


Transforming data using 'Embeddings' ...


Generating embeddings (intfloat/multilingual-e5-large):   0%|          | 0/100 [00:00<?, ?it/s]

Generating embeddings (intfloat/multilingual-e5-large):   0%|          | 0/100 [00:00<?, ?it/s]

X_train.shape = (100, 1024)
X_test.shape = (100, 1024)
Training 'RandomForest' on 'amazon_polarity' training set ...
Predicting class labels of 'amazon_polarity' test set with trained 'RandomForest' ...
Finishing run and logging the results ...


0,1
Accuracy,▁
Balanced Accuracy,▁
F1 Score (Macro),▁
F1 Score (Positive Class),▁
Precision (Macro),▁
Precision (Positive Class),▁
Recall (Macro),▁
Recall (Positive Class),▁
Time Classification,▁
Time Total,▁

0,1
Accuracy,0.97
Balanced Accuracy,0.97
F1 Score (Macro),0.97
F1 Score (Positive Class),0.9703
Precision (Macro),0.97019
Precision (Positive Class),0.96078
Recall (Macro),0.97
Recall (Positive Class),0.98
Time Classification,0.12788
Time Total,8.62585


Starting new run 'Embeddings LogisticRegression' on 'amazon_polarity' ...


Loading cached data transformation 'Embeddings' ...
X_train.shape = (100, 1024)
X_test.shape = (100, 1024)
Training 'LogisticRegression' on 'amazon_polarity' training set ...
Predicting class labels of 'amazon_polarity' test set with trained 'LogisticRegression' ...
Finishing run and logging the results ...


0,1
Accuracy,▁
Balanced Accuracy,▁
F1 Score (Macro),▁
F1 Score (Positive Class),▁
Precision (Macro),▁
Precision (Positive Class),▁
Recall (Macro),▁
Recall (Positive Class),▁
Time Classification,▁
Time Total,▁

0,1
Accuracy,0.98
Balanced Accuracy,0.98
F1 Score (Macro),0.97999
F1 Score (Positive Class),0.97959
Precision (Macro),0.98077
Precision (Positive Class),1.0
Recall (Macro),0.98
Recall (Positive Class),0.96
Time Classification,0.01984
Time Total,8.51781


## GPT-3.5 Zero-Shot

In [22]:
run = Run("Raw Text", "qwen", d, seed)
run.run()

Starting new run 'Raw Text qwen' on 'amazon_polarity' ...


Transforming data using 'Raw Text' ...
X_train.shape = (100,)
X_test.shape = (100,)
Training 'qwen' on 'amazon_polarity' training set ...
Predicting class labels of 'amazon_polarity' test set with trained 'qwen' ...
Finishing run and logging the results ...


0,1
Accuracy,▁
Balanced Accuracy,▁
F1 Score (Macro),▁
F1 Score (Positive Class),▁
Precision (Macro),▁
Precision (Positive Class),▁
Recall (Macro),▁
Recall (Positive Class),▁
Time Classification,▁
Time Total,▁

0,1
Accuracy,0.98
Balanced Accuracy,0.98
F1 Score (Macro),0.97999
F1 Score (Positive Class),0.97959
Precision (Macro),0.98077
Precision (Positive Class),1.0
Recall (Macro),0.98
Recall (Positive Class),0.96
Time Classification,15.71331
Time Total,15.71486


## GPT-4 Zero-Shot

In [23]:
# run = Run("Raw Text", "GPT-4", d, seed)
# run.run()

## FELIX GPT-3.5 (Numerical)

In [24]:
run = Run("FELIX qwen (Numerical)", ["RandomForest", "LogisticRegression"], d, seed)
run.run()

Starting new run 'FELIX qwen (Numerical) RandomForest' on 'amazon_polarity' ...


Transforming data using 'FELIX qwen (Numerical)' ...


Generating features:   0%|          | 0/50 [00:00<?, ?s/it]

Consolidated to 13 features (excl. noise)


Scoring examples:   0%|          | 0/100 [00:00<?, ?s/it]

Scoring examples:   0%|          | 0/100 [00:00<?, ?s/it]

X_train.shape = (100, 13)
X_test.shape = (100, 13)
Imputing missing values in numerical data ...
Training 'RandomForest' on 'amazon_polarity' training set ...
Predicting class labels of 'amazon_polarity' test set with trained 'RandomForest' ...
Finishing run and logging the results ...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




0,1
Accuracy,▁
Balanced Accuracy,▁
F1 Score (Macro),▁
F1 Score (Positive Class),▁
Generation Cost,▁
Precision (Macro),▁
Precision (Positive Class),▁
Recall (Macro),▁
Recall (Positive Class),▁
Scoring Cost,▁

0,1
Accuracy,0.5
Balanced Accuracy,0.5
F1 Score (Macro),0.33333
F1 Score (Positive Class),0.66667
Generation Cost,0.0
Precision (Macro),0.25
Precision (Positive Class),0.5
Recall (Macro),0.5
Recall (Positive Class),1.0
Scoring Cost,0.0


Starting new run 'FELIX qwen (Numerical) LogisticRegression' on 'amazon_polarity' ...


Loading cached data transformation 'FELIX qwen (Numerical)' ...
X_train.shape = (100, 13)
X_test.shape = (100, 13)
Imputing missing values in numerical data ...
Training 'LogisticRegression' on 'amazon_polarity' training set ...
Predicting class labels of 'amazon_polarity' test set with trained 'LogisticRegression' ...
Finishing run and logging the results ...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




0,1
Accuracy,▁
Balanced Accuracy,▁
F1 Score (Macro),▁
F1 Score (Positive Class),▁
Generation Cost,▁
Precision (Macro),▁
Precision (Positive Class),▁
Recall (Macro),▁
Recall (Positive Class),▁
Scoring Cost,▁

0,1
Accuracy,0.5
Balanced Accuracy,0.5
F1 Score (Macro),0.33333
F1 Score (Positive Class),0.0
Generation Cost,0.0
Precision (Macro),0.25
Precision (Positive Class),0.0
Recall (Macro),0.5
Recall (Positive Class),0.0
Scoring Cost,0.0


## FELIX GPT-3.5 (Categorical)

In [25]:
run = Run("FELIX qwen (Categorical)", ["RandomForest", "LogisticRegression"], d, seed)
run.run()

Starting new run 'FELIX qwen (Categorical) RandomForest' on 'amazon_polarity' ...


Transforming data using 'FELIX qwen (Categorical)' ...


Generating features:   0%|          | 0/50 [00:00<?, ?s/it]

Consolidated to 8 features (excl. noise)


Scoring examples:   0%|          | 0/100 [00:00<?, ?s/it]

Scoring examples:   0%|          | 0/100 [00:00<?, ?s/it]

X_train.shape = (100, 8)
X_test.shape = (100, 8)
Creating one-hot encoding of categorical data ...
Training 'RandomForest' on 'amazon_polarity' training set ...
Predicting class labels of 'amazon_polarity' test set with trained 'RandomForest' ...
Finishing run and logging the results ...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




0,1
Accuracy,▁
Balanced Accuracy,▁
F1 Score (Macro),▁
F1 Score (Positive Class),▁
Generation Cost,▁
Precision (Macro),▁
Precision (Positive Class),▁
Recall (Macro),▁
Recall (Positive Class),▁
Scoring Cost,▁

0,1
Accuracy,0.5
Balanced Accuracy,0.5
F1 Score (Macro),0.33333
F1 Score (Positive Class),0.66667
Generation Cost,0.0
Precision (Macro),0.25
Precision (Positive Class),0.5
Recall (Macro),0.5
Recall (Positive Class),1.0
Scoring Cost,0.0


Starting new run 'FELIX qwen (Categorical) LogisticRegression' on 'amazon_polarity' ...


Loading cached data transformation 'FELIX qwen (Categorical)' ...
X_train.shape = (100, 8)
X_test.shape = (100, 8)
Creating one-hot encoding of categorical data ...
Training 'LogisticRegression' on 'amazon_polarity' training set ...
Predicting class labels of 'amazon_polarity' test set with trained 'LogisticRegression' ...
Finishing run and logging the results ...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




0,1
Accuracy,▁
Balanced Accuracy,▁
F1 Score (Macro),▁
F1 Score (Positive Class),▁
Generation Cost,▁
Precision (Macro),▁
Precision (Positive Class),▁
Recall (Macro),▁
Recall (Positive Class),▁
Scoring Cost,▁

0,1
Accuracy,0.5
Balanced Accuracy,0.5
F1 Score (Macro),0.33333
F1 Score (Positive Class),0.0
Generation Cost,0.0
Precision (Macro),0.25
Precision (Positive Class),0.0
Recall (Macro),0.5
Recall (Positive Class),0.0
Scoring Cost,0.0


## FELIX GPT-4 (Numerical)

In [26]:
# run = Run("FELIX GPT-4 (Numerical)", ["RandomForest", "LogisticRegression"], d, seed)
# run.run()

## FELIX GPT-4 (Categorical)

In [27]:
# run = Run("FELIX GPT-4 (Categorical)", ["RandomForest", "LogisticRegression"], d, seed)
# run.run()

## Fine-Tuned LLM

In [28]:
# run = Run("Raw Text", "RoBERTA-Base 100 Epochs", d, seed + i)
# run.run()

# Experiment 2: Sample Efficiency

In [29]:
d = dataset_list["sentiment"]
n_train = 10

d = Dataset(
    id=d.id,
    short_name=d.short_name,
    X_train=d.X_train.head(n_train),
    X_test=d.X_test,
    y_train=d.y_train.head(n_train),
    y_test=d.y_test,
    pos_class=d.pos_class,
    context_train=d.context_train,
    context_test=d.context_test,
    train_zero_shot=False
)

d

Dataset(
	id='cardiffnlp/tweet_sentiment_multilingual'
	short_name='sentiment'
	context_train='Examples are tweets in multiple languages which express either a 'NEGATIVE' or 'POSITIVE' sentiment. The downstream machine learning task is learning a classifier (e.g., Random Forest) that learns to distinguish 'NEGATIVE' and 'POSITIVE' tweets, independent of the language they are written in.'
	context_test='Examples are tweets in multiple languages which express either a 'NEGATIVE' or 'POSITIVE' sentiment. The downstream machine learning task is learning a classifier (e.g., Random Forest) that learns to distinguish 'NEGATIVE' and 'POSITIVE' tweets, independent of the language they are written in.'
	classes=['NEGATIVE', 'POSITIVE']
	pos_class='POSITIVE'
	train_zero_shot=False
	X_train.shape=(10,)
	X_test.shape=(100,)
	y_train.shape=(10,)
	y_test.shape=(100,)
)

## TF-IDF

In [30]:
run = Run("TF-IDF", ["RandomForest", "LogisticRegression"], d, seed)
run.run()

Starting new run 'TF-IDF RandomForest' on 'cardiffnlp/tweet_sentiment_multilingual' ...


Transforming data using 'TF-IDF' ...




X_train.shape = (10, 113)
X_test.shape = (100, 113)
Training 'RandomForest' on 'cardiffnlp/tweet_sentiment_multilingual' training set ...
Predicting class labels of 'cardiffnlp/tweet_sentiment_multilingual' test set with trained 'RandomForest' ...
Finishing run and logging the results ...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0,1
Accuracy,▁
Balanced Accuracy,▁
F1 Score (Macro),▁
F1 Score (Positive Class),▁
Precision (Macro),▁
Precision (Positive Class),▁
Recall (Macro),▁
Recall (Positive Class),▁
Time Classification,▁
Time Total,▁

0,1
Accuracy,0.52
Balanced Accuracy,0.5
F1 Score (Macro),0.34211
F1 Score (Positive Class),0.0
Precision (Macro),0.26
Precision (Positive Class),0.0
Recall (Macro),0.5
Recall (Positive Class),0.0
Time Classification,0.06776
Time Total,0.09263


Starting new run 'TF-IDF LogisticRegression' on 'cardiffnlp/tweet_sentiment_multilingual' ...


Loading cached data transformation 'TF-IDF' ...
X_train.shape = (10, 113)
X_test.shape = (100, 113)
Training 'LogisticRegression' on 'cardiffnlp/tweet_sentiment_multilingual' training set ...
Predicting class labels of 'cardiffnlp/tweet_sentiment_multilingual' test set with trained 'LogisticRegression' ...
Finishing run and logging the results ...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0,1
Accuracy,▁
Balanced Accuracy,▁
F1 Score (Macro),▁
F1 Score (Positive Class),▁
Precision (Macro),▁
Precision (Positive Class),▁
Recall (Macro),▁
Recall (Positive Class),▁
Time Classification,▁
Time Total,▁

0,1
Accuracy,0.52
Balanced Accuracy,0.5
F1 Score (Macro),0.34211
F1 Score (Positive Class),0.0
Precision (Macro),0.26
Precision (Positive Class),0.0
Recall (Macro),0.5
Recall (Positive Class),0.0
Time Classification,0.00939
Time Total,0.03427


## Text Embeddings

In [31]:
run = Run("Embeddings", ["RandomForest", "LogisticRegression"], d, seed)
run.run()

Starting new run 'Embeddings RandomForest' on 'cardiffnlp/tweet_sentiment_multilingual' ...


Transforming data using 'Embeddings' ...


Generating embeddings (intfloat/multilingual-e5-large):   0%|          | 0/10 [00:00<?, ?it/s]

Generating embeddings (intfloat/multilingual-e5-large):   0%|          | 0/100 [00:00<?, ?it/s]

X_train.shape = (10, 1024)
X_test.shape = (100, 1024)
Training 'RandomForest' on 'cardiffnlp/tweet_sentiment_multilingual' training set ...
Predicting class labels of 'cardiffnlp/tweet_sentiment_multilingual' test set with trained 'RandomForest' ...
Finishing run and logging the results ...


0,1
Accuracy,▁
Balanced Accuracy,▁
F1 Score (Macro),▁
F1 Score (Positive Class),▁
Precision (Macro),▁
Precision (Positive Class),▁
Recall (Macro),▁
Recall (Positive Class),▁
Time Classification,▁
Time Total,▁

0,1
Accuracy,0.54
Balanced Accuracy,0.52083
F1 Score (Macro),0.38667
F1 Score (Positive Class),0.08
Precision (Macro),0.76531
Precision (Positive Class),1.0
Recall (Macro),0.52083
Recall (Positive Class),0.04167
Time Classification,0.06507
Time Total,4.35025


Starting new run 'Embeddings LogisticRegression' on 'cardiffnlp/tweet_sentiment_multilingual' ...


Loading cached data transformation 'Embeddings' ...
X_train.shape = (10, 1024)
X_test.shape = (100, 1024)
Training 'LogisticRegression' on 'cardiffnlp/tweet_sentiment_multilingual' training set ...
Predicting class labels of 'cardiffnlp/tweet_sentiment_multilingual' test set with trained 'LogisticRegression' ...
Finishing run and logging the results ...


0,1
Accuracy,▁
Balanced Accuracy,▁
F1 Score (Macro),▁
F1 Score (Positive Class),▁
Precision (Macro),▁
Precision (Positive Class),▁
Recall (Macro),▁
Recall (Positive Class),▁
Time Classification,▁
Time Total,▁

0,1
Accuracy,0.59
Balanced Accuracy,0.57532
F1 Score (Macro),0.51645
F1 Score (Positive Class),0.32787
Precision (Macro),0.66622
Precision (Positive Class),0.76923
Recall (Macro),0.57532
Recall (Positive Class),0.20833
Time Classification,0.01598
Time Total,4.30116


## FELIX GPT-3.5 (Numerical)

In [33]:
run = Run("FELIX qwen (Numerical)", ["RandomForest", "LogisticRegression"], d, seed)
run.run()

Starting new run 'FELIX qwen (Numerical) RandomForest' on 'cardiffnlp/tweet_sentiment_multilingual' ...
Transforming data using 'FELIX qwen (Numerical)' ...


Generating features:   0%|          | 0/7 [00:00<?, ?s/it]

All features identified as noise. Treating them as one cluster
Consolidated to 1 features (excl. noise)


Scoring examples:   0%|          | 0/10 [00:00<?, ?s/it]

Scoring examples:   0%|          | 0/100 [00:00<?, ?s/it]

X_train.shape = (10, 2)
X_test.shape = (100, 2)
Imputing missing values in numerical data ...
Training 'RandomForest' on 'cardiffnlp/tweet_sentiment_multilingual' training set ...
Predicting class labels of 'cardiffnlp/tweet_sentiment_multilingual' test set with trained 'RandomForest' ...
Finishing run and logging the results ...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




0,1
Accuracy,▁
Balanced Accuracy,▁
F1 Score (Macro),▁
F1 Score (Positive Class),▁
Generation Cost,▁
Precision (Macro),▁
Precision (Positive Class),▁
Recall (Macro),▁
Recall (Positive Class),▁
Scoring Cost,▁

0,1
Accuracy,0.52
Balanced Accuracy,0.5
F1 Score (Macro),0.34211
F1 Score (Positive Class),0.0
Generation Cost,0.0
Precision (Macro),0.26
Precision (Positive Class),0.0
Recall (Macro),0.5
Recall (Positive Class),0.0
Scoring Cost,0.0


Starting new run 'FELIX qwen (Numerical) LogisticRegression' on 'cardiffnlp/tweet_sentiment_multilingual' ...


Loading cached data transformation 'FELIX qwen (Numerical)' ...
X_train.shape = (10, 2)
X_test.shape = (100, 2)
Imputing missing values in numerical data ...
Training 'LogisticRegression' on 'cardiffnlp/tweet_sentiment_multilingual' training set ...
Predicting class labels of 'cardiffnlp/tweet_sentiment_multilingual' test set with trained 'LogisticRegression' ...
Finishing run and logging the results ...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




0,1
Accuracy,▁
Balanced Accuracy,▁
F1 Score (Macro),▁
F1 Score (Positive Class),▁
Generation Cost,▁
Precision (Macro),▁
Precision (Positive Class),▁
Recall (Macro),▁
Recall (Positive Class),▁
Scoring Cost,▁

0,1
Accuracy,0.52
Balanced Accuracy,0.5
F1 Score (Macro),0.34211
F1 Score (Positive Class),0.0
Generation Cost,0.0
Precision (Macro),0.26
Precision (Positive Class),0.0
Recall (Macro),0.5
Recall (Positive Class),0.0
Scoring Cost,0.0


## FELIX GPT-3.5 (Categorical)

In [34]:
run = Run("FELIX qwen (Categorical)", ["RandomForest", "LogisticRegression"], d, seed)
run.run()

Starting new run 'FELIX qwen (Categorical) RandomForest' on 'cardiffnlp/tweet_sentiment_multilingual' ...


Transforming data using 'FELIX qwen (Categorical)' ...


Generating features:   0%|          | 0/7 [00:00<?, ?s/it]

Consolidated to 2 features (excl. noise)


Scoring examples:   0%|          | 0/10 [00:00<?, ?s/it]

Scoring examples:   0%|          | 0/100 [00:00<?, ?s/it]

X_train.shape = (10, 3)
X_test.shape = (100, 3)
Creating one-hot encoding of categorical data ...
Training 'RandomForest' on 'cardiffnlp/tweet_sentiment_multilingual' training set ...
Predicting class labels of 'cardiffnlp/tweet_sentiment_multilingual' test set with trained 'RandomForest' ...
Finishing run and logging the results ...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




0,1
Accuracy,▁
Balanced Accuracy,▁
F1 Score (Macro),▁
F1 Score (Positive Class),▁
Generation Cost,▁
Precision (Macro),▁
Precision (Positive Class),▁
Recall (Macro),▁
Recall (Positive Class),▁
Scoring Cost,▁

0,1
Accuracy,0.52
Balanced Accuracy,0.5
F1 Score (Macro),0.34211
F1 Score (Positive Class),0.0
Generation Cost,0.0
Precision (Macro),0.26
Precision (Positive Class),0.0
Recall (Macro),0.5
Recall (Positive Class),0.0
Scoring Cost,0.0


Starting new run 'FELIX qwen (Categorical) LogisticRegression' on 'cardiffnlp/tweet_sentiment_multilingual' ...


Loading cached data transformation 'FELIX qwen (Categorical)' ...
X_train.shape = (10, 3)
X_test.shape = (100, 3)
Creating one-hot encoding of categorical data ...
Training 'LogisticRegression' on 'cardiffnlp/tweet_sentiment_multilingual' training set ...
Predicting class labels of 'cardiffnlp/tweet_sentiment_multilingual' test set with trained 'LogisticRegression' ...
Finishing run and logging the results ...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




0,1
Accuracy,▁
Balanced Accuracy,▁
F1 Score (Macro),▁
F1 Score (Positive Class),▁
Generation Cost,▁
Precision (Macro),▁
Precision (Positive Class),▁
Recall (Macro),▁
Recall (Positive Class),▁
Scoring Cost,▁

0,1
Accuracy,0.52
Balanced Accuracy,0.5
F1 Score (Macro),0.34211
F1 Score (Positive Class),0.0
Generation Cost,0.0
Precision (Macro),0.26
Precision (Positive Class),0.0
Recall (Macro),0.5
Recall (Positive Class),0.0
Scoring Cost,0.0


## FELIX GPT-4 (Numerical)

In [None]:
# run = Run("FELIX GPT-4 (Numerical)", ["RandomForest", "LogisticRegression"], d, seed)
# run.run()

## FELIX GPT-4 (Categorical)

In [None]:
# run = Run("FELIX GPT-4 (Categorical)", ["RandomForest", "LogisticRegression"], d, seed)
# run.run()

## Fine-Tuned LLM

In [None]:
# run = Run("Raw Text", "RoBERTA-Base 100 Epochs", d_sample, seed + i)
# run.run()

# Experiment 3: Domain Adaptation

In [35]:
train_name = "reviews-amazon"
test_name = "reviews-yelp"

d_train = dataset_list[train_name]
d_test = dataset_list[test_name]

if d_train.pos_class != d_test.pos_class:
    print("Warning: Train and test datasets have different classes. Do you still want to continue?\n")

d = Dataset(
    id=f"{d_train.id}>{d_test.id}",
    short_name=f"{d_train.short_name}>{d_test.short_name}",
    X_train=d_train.X_train,
    X_test=d_test.X_test,
    y_train=d_train.y_train,
    y_test=d_test.y_test,
    pos_class=d_train.pos_class,
    context_train=d_train.context_train,
    context_test=d_test.context_test,
    train_zero_shot=False
)

d

Dataset(
	id='amazon_polarity>yelp_polarity'
	short_name='reviews-amazon>reviews-yelp'
	context_train='Examples are product reviews from Amazon that are either 'NEGATIVE' (rating with 1 or 2 stars) or 'POSITIVE' (rating with 4 or 5 stars). The downstream machine learning task is learning a classifier (e.g., Random Forest) that learns to distinguish 'NEGATIVE' and 'POSITIVE' reviews.'
	context_test='Examples are reviews from Yelp that are either 'NEGATIVE' (rating with 1 or 2 stars) or 'POSITIVE' (rating with 4 or 5 stars). The downstream machine learning task is learning a classifier (e.g., Random Forest) that learns to distinguish 'NEGATIVE' and 'POSITIVE' reviews.'
	classes=['POSITIVE', 'NEGATIVE']
	pos_class='POSITIVE'
	train_zero_shot=False
	X_train.shape=(100, 2)
	X_test.shape=(100, 1)
	y_train.shape=(100,)
	y_test.shape=(100,)
)

## TF-IDF

In [36]:
run = Run("TF-IDF", ["RandomForest", "LogisticRegression"], d, seed)
run.run()

Starting new run 'TF-IDF RandomForest' on 'amazon_polarity>yelp_polarity' ...


Transforming data using 'TF-IDF' ...




X_train.shape = (100, 1611)
X_test.shape = (100, 1611)
Training 'RandomForest' on 'amazon_polarity>yelp_polarity' training set ...
Predicting class labels of 'amazon_polarity>yelp_polarity' test set with trained 'RandomForest' ...
Finishing run and logging the results ...


0,1
Accuracy,▁
Balanced Accuracy,▁
F1 Score (Macro),▁
F1 Score (Positive Class),▁
Precision (Macro),▁
Precision (Positive Class),▁
Recall (Macro),▁
Recall (Positive Class),▁
Time Classification,▁
Time Total,▁

0,1
Accuracy,0.61
Balanced Accuracy,0.61
F1 Score (Macro),0.58822
F1 Score (Positive Class),0.68293
Precision (Macro),0.63952
Precision (Positive Class),0.57534
Recall (Macro),0.61
Recall (Positive Class),0.84
Time Classification,0.1568
Time Total,1.18015


Starting new run 'TF-IDF LogisticRegression' on 'amazon_polarity>yelp_polarity' ...


Loading cached data transformation 'TF-IDF' ...
X_train.shape = (100, 1611)
X_test.shape = (100, 1611)
Training 'LogisticRegression' on 'amazon_polarity>yelp_polarity' training set ...
Predicting class labels of 'amazon_polarity>yelp_polarity' test set with trained 'LogisticRegression' ...
Finishing run and logging the results ...


0,1
Accuracy,▁
Balanced Accuracy,▁
F1 Score (Macro),▁
F1 Score (Positive Class),▁
Precision (Macro),▁
Precision (Positive Class),▁
Recall (Macro),▁
Recall (Positive Class),▁
Time Classification,▁
Time Total,▁

0,1
Accuracy,0.62
Balanced Accuracy,0.62
F1 Score (Macro),0.6124
F1 Score (Positive Class),0.66667
Precision (Macro),0.63021
Precision (Positive Class),0.59375
Recall (Macro),0.62
Recall (Positive Class),0.76
Time Classification,0.27674
Time Total,1.30009


## Text Embeddings

In [37]:
run = Run("Embeddings", ["RandomForest", "LogisticRegression"], d, seed)
run.run()

Starting new run 'Embeddings RandomForest' on 'amazon_polarity>yelp_polarity' ...


Transforming data using 'Embeddings' ...


Generating embeddings (intfloat/multilingual-e5-large):   0%|          | 0/100 [00:00<?, ?it/s]

Generating embeddings (intfloat/multilingual-e5-large):   0%|          | 0/100 [00:00<?, ?it/s]

X_train.shape = (100, 1024)
X_test.shape = (100, 1024)
Training 'RandomForest' on 'amazon_polarity>yelp_polarity' training set ...
Predicting class labels of 'amazon_polarity>yelp_polarity' test set with trained 'RandomForest' ...
Finishing run and logging the results ...


0,1
Accuracy,▁
Balanced Accuracy,▁
F1 Score (Macro),▁
F1 Score (Positive Class),▁
Precision (Macro),▁
Precision (Positive Class),▁
Recall (Macro),▁
Recall (Positive Class),▁
Time Classification,▁
Time Total,▁

0,1
Accuracy,0.93
Balanced Accuracy,0.93
F1 Score (Macro),0.92994
F1 Score (Positive Class),0.93204
Precision (Macro),0.93155
Precision (Positive Class),0.90566
Recall (Macro),0.93
Recall (Positive Class),0.96
Time Classification,0.12938
Time Total,5.62617


Starting new run 'Embeddings LogisticRegression' on 'amazon_polarity>yelp_polarity' ...


Loading cached data transformation 'Embeddings' ...
X_train.shape = (100, 1024)
X_test.shape = (100, 1024)
Training 'LogisticRegression' on 'amazon_polarity>yelp_polarity' training set ...
Predicting class labels of 'amazon_polarity>yelp_polarity' test set with trained 'LogisticRegression' ...
Finishing run and logging the results ...


0,1
Accuracy,▁
Balanced Accuracy,▁
F1 Score (Macro),▁
F1 Score (Positive Class),▁
Precision (Macro),▁
Precision (Positive Class),▁
Recall (Macro),▁
Recall (Positive Class),▁
Time Classification,▁
Time Total,▁

0,1
Accuracy,0.95
Balanced Accuracy,0.95
F1 Score (Macro),0.94995
F1 Score (Positive Class),0.95146
Precision (Macro),0.95163
Precision (Positive Class),0.92453
Recall (Macro),0.95
Recall (Positive Class),0.98
Time Classification,0.01627
Time Total,5.51305


## FELIX GPT-3.5 (Numerical)

In [38]:
run = Run("FELIX qwen (Numerical)", ["RandomForest", "LogisticRegression"], d, seed)
run.run()

Starting new run 'FELIX qwen (Numerical) RandomForest' on 'amazon_polarity>yelp_polarity' ...


Transforming data using 'FELIX qwen (Numerical)' ...


Generating features:   0%|          | 0/50 [00:00<?, ?s/it]

Consolidated to 9 features (excl. noise)


Scoring examples:   0%|          | 0/100 [00:00<?, ?s/it]

Scoring examples:   0%|          | 0/100 [00:00<?, ?s/it]

X_train.shape = (100, 9)
X_test.shape = (100, 9)
Imputing missing values in numerical data ...
Training 'RandomForest' on 'amazon_polarity>yelp_polarity' training set ...
Predicting class labels of 'amazon_polarity>yelp_polarity' test set with trained 'RandomForest' ...
Finishing run and logging the results ...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




0,1
Accuracy,▁
Balanced Accuracy,▁
F1 Score (Macro),▁
F1 Score (Positive Class),▁
Generation Cost,▁
Precision (Macro),▁
Precision (Positive Class),▁
Recall (Macro),▁
Recall (Positive Class),▁
Scoring Cost,▁

0,1
Accuracy,0.5
Balanced Accuracy,0.5
F1 Score (Macro),0.33333
F1 Score (Positive Class),0.66667
Generation Cost,0.0
Precision (Macro),0.25
Precision (Positive Class),0.5
Recall (Macro),0.5
Recall (Positive Class),1.0
Scoring Cost,0.0


Starting new run 'FELIX qwen (Numerical) LogisticRegression' on 'amazon_polarity>yelp_polarity' ...


Loading cached data transformation 'FELIX qwen (Numerical)' ...
X_train.shape = (100, 9)
X_test.shape = (100, 9)
Imputing missing values in numerical data ...
Training 'LogisticRegression' on 'amazon_polarity>yelp_polarity' training set ...
Predicting class labels of 'amazon_polarity>yelp_polarity' test set with trained 'LogisticRegression' ...
Finishing run and logging the results ...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




0,1
Accuracy,▁
Balanced Accuracy,▁
F1 Score (Macro),▁
F1 Score (Positive Class),▁
Generation Cost,▁
Precision (Macro),▁
Precision (Positive Class),▁
Recall (Macro),▁
Recall (Positive Class),▁
Scoring Cost,▁

0,1
Accuracy,0.5
Balanced Accuracy,0.5
F1 Score (Macro),0.33333
F1 Score (Positive Class),0.0
Generation Cost,0.0
Precision (Macro),0.25
Precision (Positive Class),0.0
Recall (Macro),0.5
Recall (Positive Class),0.0
Scoring Cost,0.0


## FELIX GPT-3.5 (Categorical)

In [39]:
run = Run("FELIX qwen (Categorical)", ["RandomForest", "LogisticRegression"], d, seed)
run.run()

Starting new run 'FELIX qwen (Categorical) RandomForest' on 'amazon_polarity>yelp_polarity' ...


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011288888888925108, max=1.0…

Transforming data using 'FELIX qwen (Categorical)' ...


Generating features:   0%|          | 0/50 [00:00<?, ?s/it]

Consolidated to 9 features (excl. noise)


Scoring examples:   0%|          | 0/100 [00:00<?, ?s/it]

Scoring examples:   0%|          | 0/100 [00:00<?, ?s/it]

X_train.shape = (100, 9)
X_test.shape = (100, 9)
Creating one-hot encoding of categorical data ...
Training 'RandomForest' on 'amazon_polarity>yelp_polarity' training set ...
Predicting class labels of 'amazon_polarity>yelp_polarity' test set with trained 'RandomForest' ...
Finishing run and logging the results ...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




0,1
Accuracy,▁
Balanced Accuracy,▁
F1 Score (Macro),▁
F1 Score (Positive Class),▁
Generation Cost,▁
Precision (Macro),▁
Precision (Positive Class),▁
Recall (Macro),▁
Recall (Positive Class),▁
Scoring Cost,▁

0,1
Accuracy,0.5
Balanced Accuracy,0.5
F1 Score (Macro),0.33333
F1 Score (Positive Class),0.66667
Generation Cost,0.0
Precision (Macro),0.25
Precision (Positive Class),0.5
Recall (Macro),0.5
Recall (Positive Class),1.0
Scoring Cost,0.0


Starting new run 'FELIX qwen (Categorical) LogisticRegression' on 'amazon_polarity>yelp_polarity' ...


Loading cached data transformation 'FELIX qwen (Categorical)' ...
X_train.shape = (100, 9)
X_test.shape = (100, 9)
Creating one-hot encoding of categorical data ...
Training 'LogisticRegression' on 'amazon_polarity>yelp_polarity' training set ...
Predicting class labels of 'amazon_polarity>yelp_polarity' test set with trained 'LogisticRegression' ...
Finishing run and logging the results ...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




0,1
Accuracy,▁
Balanced Accuracy,▁
F1 Score (Macro),▁
F1 Score (Positive Class),▁
Generation Cost,▁
Precision (Macro),▁
Precision (Positive Class),▁
Recall (Macro),▁
Recall (Positive Class),▁
Scoring Cost,▁

0,1
Accuracy,0.5
Balanced Accuracy,0.5
F1 Score (Macro),0.33333
F1 Score (Positive Class),0.0
Generation Cost,0.0
Precision (Macro),0.25
Precision (Positive Class),0.0
Recall (Macro),0.5
Recall (Positive Class),0.0
Scoring Cost,0.0


## FELIX GPT-4 (Numerical)

In [40]:
# run = Run("FELIX GPT-4 (Numerical)", ["RandomForest", "LogisticRegression"], d, seed)
# run.run()

## FELIX GPT-4 (Categorical)

In [41]:
# run = Run("FELIX GPT-4 (Categorical)", ["RandomForest", "LogisticRegression"], d, seed)
# run.run()

## Fine-Tuned LLM

In [42]:
# run = Run("Raw Text", "RoBERTA-Base 100 Epochs", d, seed + i)
# run.run()

# Experiment 5: Internal Validity

## Experiment 5.1: Clustering Validity

### Learn Numeric Feature Set

In [43]:
results_log = []

In [44]:
def log_results(dataset_id, method, felix, features, f1_rf, f1_lr, probs_lr, comment=""):
    results_log.append({
        "dataset": dataset_id,
        "method": method,
        "felix_variant": f"FELIX GPT-{'3.5' if '3.5' in felix.llm_scoring else '4'} {'Categorical' if felix.discrete_features else 'Numerical'}",
        "n_features": len(features),
        "feature_set": features,
        "f1_rf": f1_rf,
        "f1_lr": f1_lr,
        "probs_lr": probs_lr,
        "comment": comment
    })

In [45]:
# Select a dataset
d = dataset_list["sentiment"]
d

Dataset(
	id='cardiffnlp/tweet_sentiment_multilingual'
	short_name='sentiment'
	context_train='Examples are tweets in multiple languages which express either a 'NEGATIVE' or 'POSITIVE' sentiment. The downstream machine learning task is learning a classifier (e.g., Random Forest) that learns to distinguish 'NEGATIVE' and 'POSITIVE' tweets, independent of the language they are written in.'
	context_test='Examples are tweets in multiple languages which express either a 'NEGATIVE' or 'POSITIVE' sentiment. The downstream machine learning task is learning a classifier (e.g., Random Forest) that learns to distinguish 'NEGATIVE' and 'POSITIVE' tweets, independent of the language they are written in.'
	classes=['NEGATIVE', 'POSITIVE']
	pos_class='POSITIVE'
	train_zero_shot=False
	X_train.shape=(100,)
	X_test.shape=(100,)
	y_train.shape=(100,)
	y_test.shape=(100,)
)

In [46]:
# Configure FELIX
felix = FELIX(
    context=d.context_train,
    temperature_scoring=0.0,
    llm_scoring="gpt-3.5-turbo-16k",
    discrete_features=False,
    verbose=True
)

In [47]:
n_generation = 30

# Fit FELIX to the dataset to learn features
felix._full_feature_set = felix.generate_features(d.X_train.head(n_generation), d.y_train.head(n_generation))
felix._full_feature_set = felix._ensure_unique_feature_names(felix._full_feature_set)
felix._features = felix._full_feature_set

Generating features:   0%|          | 0/17 [00:00<?, ?s/it]

Save the learned features for later use:

In [48]:
from google.colab import files
import datetime

# Serializing json
json_object = felix._features.json(indent=4)

# Store the results
filename = f"{datetime.date.today().strftime('%Y_%m_%d')} - FELIX Features {d.short_name}.json"
with open(filename, "w") as f:
    f.write(json_object)

# Download the results
files.download(filename)

ModuleNotFoundError: No module named 'google.colab'

In [None]:
from google.colab import files
import json

# Upload the features JSON
uploaded = files.upload()
filename = list(uploaded.keys())[0]

# Parse the results
flat_list = json.loads(uploaded[filename])["features"]
if felix.discrete_features:
    felix._features = CategoricalFeatureSet(features=flat_list)
else:
    felix._features = NumericalFeatureSet(features=flat_list)
felix._full_feature_set = felix._features
len(felix._features.features)

### Score All Learned Features

In [None]:
df_scores = felix.transform(d.X_test)
df_scores

In [None]:
from google.colab import files
import datetime

# Save the scores data as a CSV
filename = f"{datetime.date.today().strftime('%Y_%m_%d')} - FELIX Scores {d.short_name}.csv"
df_scores.to_csv(filename, index=False)

# Download the results
files.download(filename)

In [None]:
from google.colab import files
import datetime

# Upload the scores CSV
uploaded = files.upload()
filename = list(uploaded.keys())[0]

# Load the scores
df_scores = pd.read_csv(filename)
df_scores

### Handle Missing Values (Numerical)

In [None]:
col_drop_thres = 0.1
n_neighbors = 5

# Remove all columns that have more than 'col_drop_thres' missing values
missing_percentage = df_scores.isnull().mean()                                  # Calculate the percentage of missing values for each column
cols_to_drop = missing_percentage[missing_percentage > col_drop_thres].index    # Identify columns that have more than 'col_drop_thres' % missing values
df_scores = df_scores.drop(columns=cols_to_drop)                                # Drop the columns from the DataFrame

# Estimate missing values with k-Nearest-Neighbors imputations
imputer = KNNImputer(n_neighbors=n_neighbors, weights="distance")
imputer.set_output(transform="pandas")
df_scores = imputer.fit_transform(df_scores)

print("Columns removed:", cols_to_drop)
print("Columns with missing features remaining:", df_scores.isnull().any().sum())

In [None]:
# Remove features from FELIX that could not be scored reliably (i.e., more than 'col_drop_thres' missing values)
felix._features.features = [f for f in felix._features.features if f.name not in cols_to_drop]

print(len(felix._features.features))
print(df_scores.shape)

### Handle Missing Values (Categorical)

In [None]:
nan_columns = df_scores.isna().all()[df_scores.isna().all() == True].index.to_list()

print(nan_columns)

In [None]:
# Remove features from FELIX that could not be scored (i.e., all rows have NaN values)
felix._features.features = [f for f in felix._features.features if f.name not in nan_columns]

# Remove respective columns from the scores dataset
df_scores = df_scores.drop(columns=nan_columns)

print(len(felix._features.features))
print(df_scores.shape)

### Create Feature Embeddings

In [None]:
# Create text embeddings for each feature
felix._feature_embeddings = felix._create_feature_embeddings(felix._features)

### Prepare Feature Selection Code

In [None]:
import numpy as np
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder


def encode_data(X_train, X_test):
    # Drop columns that have no values
    nan_columns_train = X_train.isna().all()[X_train.isna().all() == True].index.to_list()
    nan_columns_test = X_test.isna().all()[X_test.isna().all() == True].index.to_list()
    # nan_columns = nan_columns_train + nan_columns_test
    nan_columns = [x for x in nan_columns_train if x in nan_columns_test]
    X_train = X_train.drop(columns=nan_columns)
    X_test = X_test.drop(columns=nan_columns)

    # Identify categorical columns (assuming the same columns in train and test)
    categorical_columns = X_train.select_dtypes(include=[object]).columns
    non_categorical_columns = X_train.select_dtypes(exclude=[object]).columns

    # Encode values based on column type
    if len(categorical_columns) > 0 and len(non_categorical_columns) == 0:
        data_transformer = DataTransformer(dataset=None)
        return data_transformer.one_hot_encode(X_train, X_test)
    elif len(non_categorical_columns) > 0 and len(categorical_columns) == 0:
        return X_train, X_test
    else:
        print("Categorical columns =", categorical_columns)
        print("Numerical columns =", non_categorical_columns)
        print("NaN columns training =", nan_columns_train)
        print("NaN columns test =", nan_columns_test)
        print("NaN columns =", nan_columns)
        raise ValueError("Cannot handle mixed dataset of categorical and numerical columns.")


def evaluate_feature_set(feature_set, X, y, cv_splits=5, random_state=42, lr_probs=True, lr_f1=True, rf_f1=True):
    # Define classifier models
    if lr_probs or lr_f1:
        lr_model = LogisticRegression(max_iter=10000, random_state=random_state)
    if rf_f1:
        rf_model = RandomForestClassifier(random_state=random_state)

    # Create lists to store the scores from each cross-validation fold
    avg_probs_lr = []
    avg_scores_lr = []
    avg_scores_rf = []

    # Evaluate performance using k-fold cross validation
    kf = KFold(n_splits=cv_splits)
    for train_index, test_index in kf.split(X):
        # Create train-test split
        X_train = X.iloc[train_index][feature_set]
        y_train = y.iloc[train_index]
        X_test = X.iloc[test_index][feature_set]
        y_test = y.iloc[test_index]

        # Convert columns that contain categorical data into a one-hot representation
        X_train, X_test = encode_data(X_train, X_test)

        # Train the models with the feature set
        if lr_probs or lr_f1:
            lr_model.fit(X_train, y_train)
        if rf_f1:
            rf_model.fit(X_train, y_train)

        # Evaluate the trained models on the test set
        if lr_probs:
            probs_lr = lr_model.predict_proba(X_test)
            avg_prob = np.mean([p[lr_model.classes_.tolist().index(c)] for p, c in zip(probs_lr, y_test)]) # Calculate the average predicted probability of belonging to the ground truth class
            avg_probs_lr.append(avg_prob)
        else:
            avg_probs_lr.append(0.0)
        if lr_f1:
            preds_lr = lr_model.predict(X_test)
            f1_lr = f1_score(y_test, preds_lr, average="macro")
            avg_scores_lr.append(f1_lr)
        else:
            avg_scores_lr.append(0.0)
        if rf_f1:
            preds_rf = rf_model.predict(X_test)
            f1_rf = f1_score(y_test, preds_rf, average="macro")
            avg_scores_rf.append(f1_rf)
        else:
            avg_scores_rf.append(0.0)

    # Calculate and return average scores across all folds
    return np.mean(avg_probs_lr), np.mean(avg_scores_lr), np.mean(avg_scores_rf)


def forward_selection(X, y, cv_splits=5, random_state=42, lr_probs=True, lr_f1=True, rf_f1=True):
    # Create lists to store the features selected (in order of forward selection) and corresponding scores
    feature_set = []
    scores = []

    # Iteratively add the feature that leads to the best classification performance
    while len(feature_set) < X.shape[1]:
        # Keep track of the best performance so far
        best_feature = None
        best_score = -np.inf
        prob_lr = -np.inf
        score_lr = -np.inf
        score_rf = -np.inf

        # Iterate over all features that are not yet in the feature set and find the one that leads to the best performance
        for feature in X.columns:
            if feature not in feature_set:
                # Add the current feature to the set
                features_to_use = feature_set + [feature]

                # Evaluate the classification performance using this feature set
                lr_prob, lr_score, rf_score = evaluate_feature_set(features_to_use, X, y, cv_splits, random_state, lr_probs=lr_probs, lr_f1=lr_f1, rf_f1=rf_f1)

                # Calculate the final decision criterion
                if lr_f1 and rf_f1:
                    score = np.mean([lr_score, rf_score])
                elif lr_f1:
                    score = lr_score
                elif rf_f1:
                    score = rf_score
                elif lr_probs:
                    score = lr_prob

                # Check if this is the best score
                if score > best_score:
                    best_feature = feature
                    best_score = score

                    # Also store the other scores as reference
                    prob_lr = lr_prob
                    score_lr = lr_score
                    score_rf = rf_score

        # Add best feature to our feature set
        feature_set.append(best_feature)
        scores.append(best_score)

        log_results(d.id, "Supervised Forward Selection", felix, feature_set, score_rf, score_lr, prob_lr, f"Added feature {best_feature}")

        print(f"n_features = {len(feature_set)}. Score = {best_score}. LR_prob = {prob_lr}. LR_F1 = {score_lr}. RF_F1 = {score_rf}. Added feature {best_feature}")

    return feature_set, scores

### Supervised Forward Selection

In [None]:
features_forward, scores_forward = forward_selection(df_scores, d.y_test, cv_splits=5, random_state=seed, lr_probs=True, lr_f1=True, rf_f1=True)

In [None]:
from google.colab import files

df_forward = pd.DataFrame({"Feature": features_forward, "F1 (LR-RF avg.)": scores_forward})

filename = "2023_11_02 - FELIX Forward Selection Fake News.csv"
df_forward.to_csv(filename, index=False)

# Download the results
files.download(filename)

In [None]:
# Upload the results
uploaded = files.upload()
filename = list(uploaded.keys())[0]

# Load the scores
df_forward = pd.read_csv(filename)

features_forward = df_forward["Feature"].tolist()
scores_forward = df_forward["F1 (LR-RF avg.)"].tolist()

### HDBSCAN

In [None]:
def run_hdbscan(feature_embeddings, df_scores, d, keep_noise=True, cluster_selection_method="leaf", cluster_selection_epsilon=0.0, alpha=1.0, verbose=False):
    # Cluster the features with HDBSCAN
    np.random.seed(0)
    hdbscan = HDBSCAN(min_cluster_size=2, allow_single_cluster=True, cluster_selection_method=cluster_selection_method, cluster_selection_epsilon=cluster_selection_epsilon, alpha=alpha)
    cluster_labels = hdbscan.fit_predict(feature_embeddings)

    if keep_noise:
        # Assign all features identified as noise (i.e., not part of a cluster) to their own cluster
        next_label = np.max(cluster_labels) + 1
        for i in range(len(cluster_labels)):
            if cluster_labels[i] == -1:
                cluster_labels[i] = next_label
                next_label += 1

        # Select the most representative feature from each cluster
        features_hdbscan = felix._select_representative_features(felix._features, felix._feature_embeddings, cluster_labels)
        if verbose:
            print(f"HDBSCAN selected {len(features_hdbscan.features)} features (including noise)")
    else:
        # If all features have been identified as noise, put them all in one cluster
        if (np.array(cluster_labels) == -1).all():
            cluster_labels = [0 for _ in cluster_labels]
            if verbose:
                print("All features identified as noise. Treating them as one cluster")

        # Prune the feature set to only those features that are not identified as noise
        pruned_feature_set = felix._features.copy(deep=True)
        pruned_feature_set.features = [f for f, l in zip(felix._features.features, cluster_labels) if l != -1]
        pruned_embeddings = np.array([e for e, l in zip(felix._feature_embeddings, cluster_labels) if l != -1])
        pruned_labels = [l for l in cluster_labels if l != -1]

        # Select the most representative feature from each cluster (excluding noise)
        features_hdbscan = felix._select_representative_features(pruned_feature_set, pruned_embeddings, pruned_labels)
        if verbose:
            print(f"HDBSCAN selected {len(features_hdbscan.features)} features (without noise)")


    lr_prob, lr_score, rf_score = evaluate_feature_set([f.name for f in features_hdbscan.features], df_scores, d.y_test, cv_splits=5, random_state=42, lr_probs=True, lr_f1=True, rf_f1=True)
    score_hdbscan = np.mean([lr_score, rf_score])

    return features_hdbscan, rf_score, lr_score, lr_prob

In [None]:
features_hdbscan, rf_score, lr_score, lr_prob = run_hdbscan(felix._feature_embeddings, df_scores, d, keep_noise=True, cluster_selection_method="leaf", cluster_selection_epsilon=0.0, alpha=1.0)
log_results(d.id, "HDBSCAN (w/ noise)", felix, [f.name for f in features_hdbscan.features], rf_score, lr_score, lr_prob, comment="keep_noise=True, cluster_selection_method='leaf', cluster_selection_epsilon=0.0, alpha=1.0")
print(f"F1 = {np.mean([rf_score, lr_score])}. n_features = {len(features_hdbscan.features)}")

features_hdbscan, rf_score, lr_score, lr_prob = run_hdbscan(felix._feature_embeddings, df_scores, d, keep_noise=False, cluster_selection_method="leaf", cluster_selection_epsilon=0.0, alpha=1.0)
log_results(d.id, "HDBSCAN (w/o noise)", felix, [f.name for f in features_hdbscan.features], rf_score, lr_score, lr_prob, comment="keep_noise=False, cluster_selection_method='leaf', cluster_selection_epsilon=0.0, alpha=1.0")
print(f"F1 = {np.mean([rf_score, lr_score])}. n_features = {len(features_hdbscan.features)}")

Perform a grid search for the optimal HDBSCAN hyperparameters

In [None]:
options_keep_noise = [True, False]
options_cluster_selection_method = ["leaf", "eom"]
options_cluster_selection_epsilon = [float(x) / 10 for x in range(0, 11, 2)]
options_alpha = [float(x) / 10 for x in range(5, 16, 1)]

tuning_results = []

In [None]:
for kn in options_keep_noise:
    for csm in options_cluster_selection_method:
        for epsilon in options_cluster_selection_epsilon:
            for alpha in options_alpha:
                features_hdbscan, rf_score, lr_score, lr_prob = run_hdbscan(felix._feature_embeddings, df_scores, d, keep_noise=kn, cluster_selection_method=csm, cluster_selection_epsilon=epsilon, alpha=alpha)
                r = {
                    "Dataset": d.id,
                    "keep_noise": kn,
                    "cluster_selection_method": csm,
                    "epsilon": epsilon,
                    "alpha": alpha,
                    "F1 Average": np.mean([rf_score, lr_score]),
                    "F1 LR": lr_score,
                    "F1 RF": rf_score,
                    "n_features": len(features_hdbscan.features)
                }
                tuning_results.append(r)
                print(r)

Download/upload the results of hyperparameter optimization

In [None]:
df_tuning_results = pd.DataFrame(tuning_results)
df_tuning_results

In [None]:
from google.colab import files
import datetime

# Save the scores data as a CSV
filename = f"{datetime.date.today().strftime('%Y_%m_%d')} - HDBSCAN Hyperparameter Optimization.csv"
df_tuning_results.to_csv(filename, index=False)

# Download the results
files.download(filename)

In [None]:
from google.colab import files
import datetime

# Upload the scores CSV
uploaded = files.upload()
filename = list(uploaded.keys())[0]

# Load the scores
df_tuning_results = pd.read_csv(filename)
df_tuning_results

Plot the results of hyperparameter optimization

In [None]:
# Simplify the dataset IDs for visualization
df_tuning_results["Dataset"] = df_tuning_results["Dataset"].map({
    "cardiffnlp/tweet_sentiment_multilingual": "Sentiment",
    "hate_speech18": "Hate Speech",
    "amazon_polarity": "Amazon",
    "GonzaloA/fake_news": "Fake News",
    "tum-nlp/IDMGSP": "Papers"
})

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as mtick
from google.colab import files

# Set up the matplotlib figure
fig, axs = plt.subplots(1, 2, figsize=(12, 4))

# Unpack the axis for easy reference
(ax1, ax2) = axs

# Plot boxplot for keep_noise
sns.boxplot(x='Dataset', y='F1 Average', hue='keep_noise', data=df_tuning_results, ax=ax1)
ax1.set_title("Performance for Different Values of 'keep_noise'")
ax1.set_ylabel("F1 Score")
ax1.set_xlabel("")
ax1.set_ylim([0.3, 1.0])
ax1.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))

# Plotting boxplot for cluster_selection_method
sns.boxplot(x='Dataset', y='F1 Average', hue='cluster_selection_method', data=df_tuning_results, ax=ax2)
ax2.set_title("Performance for Different Values of 'cluster_selection_method'")
ax2.set_ylabel("F1 Score")
ax2.set_xlabel("")
ax2.set_ylim([0.3, 1.0])
ax2.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))

# Add an overall title
fig.suptitle("HDBSCAN: Hyperparameter Tuning (Categorical Features)")

# Adjust the layout
plt.tight_layout()

# Download the plot as PDF
filename = "HDBSCAN Categorical Hyperparameter Optimization A.pdf"
plt.savefig(filename)
files.download(filename)

# Display the plots
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd


# Filter the dataframe for the optimal values of other hyperparameters
df_filtered = df_tuning_results[
    (df_tuning_results['keep_noise'] == False) &
    (df_tuning_results['cluster_selection_method'] == 'leaf')
]

# Get unique datasets
unique_datasets = df_filtered['Dataset'].unique()

# Prepare contour plots data with the filtered dataframe
contour_plots_data = []
for dataset in unique_datasets:
    dataset_df = df_filtered[df_filtered['Dataset'] == dataset]
    dataset_grouped = dataset_df.groupby(['alpha', 'epsilon'])['F1 Average'].mean().reset_index()
    pivot_table = dataset_grouped.pivot(index='alpha', columns='epsilon', values='F1 Average')
    contour_plots_data.append((dataset, pivot_table))

# Calculate the average F1 score across all datasets
average_f1 = df_filtered.groupby(['alpha', 'epsilon'])['F1 Average'].mean().reset_index()
average_f1_pivot = average_f1.pivot(index='alpha', columns='epsilon', values='F1 Average')
contour_plots_data.append(('Average across Datasets', average_f1_pivot))

# Determine global min and max F1 scores for consistent color scaling across all plots
f1_min = min(data.min().min() for _, data in contour_plots_data)
f1_max = 1.0

# Create a new figure for the plots
fig, axes = plt.subplots(2, 3, figsize=(12, 7))

# Flatten the axes array for easy indexing
axes = axes.flatten()

# Plot each dataset with the unified color scaling
for i, (dataset, data) in enumerate(contour_plots_data):
    sns.heatmap(data, ax=axes[i], cmap="YlGnBu_r", cbar=False, linewidths=0, vmin=f1_min, vmax=f1_max)
    axes[i].invert_yaxis()
    axes[i].set_title(f'{dataset}')
    axes[i].set_ylabel('Alpha' if i % 3 == 0 else '')   # Only show the y axis label once for each row
    axes[i].set_xlabel('Epsilon' if i >= 3 else '')     # Only show the x axis label once for each column
    axes[i].set_yticks(axes[i].get_yticks(), axes[i].get_yticklabels(), rotation=0)

# Place a color bar at the right of the plots
cbar_ax = fig.add_axes([0.88, 0.15, 0.03, 0.7])
cbar = fig.colorbar(axes[0].collections[0], cax=cbar_ax)
cbar.set_label('F1 Score')
cbar.set_ticks(np.linspace(f1_min, f1_max, num=5))
cbar.ax.set_yticklabels([f'{tick:.0%}' for tick in cbar.get_ticks()])

# Add an overall title
fig.suptitle("HDBSCAN: Hyperparameter Tuning (Numerical Features)")

# Adjust layout for better fit and to make room for the color bar
fig.subplots_adjust(right=0.85)

# Download the plot as PDF
filename = "HDBSCAN Numerical Hyperparameter Optimization B.pdf"
plt.savefig(filename, bbox_inches='tight')
files.download(filename)

# Display the plots
plt.show()

In [None]:
clusters_hdbscan = felix._cluster_features(felix._feature_embeddings)
features_hdbscan = felix._select_representative_features(felix._features, felix._feature_embeddings, clusters_hdbscan)
print(f"HDBSCAN selected {len(features_hdbscan.features)} features")

lr_prob, lr_score, rf_score = evaluate_feature_set([f.name for f in features_hdbscan.features], df_scores, d.y_test, cv_splits=5, random_state=42, lr_probs=True, lr_f1=True, rf_f1=True)
score_hdbscan = np.mean([lr_score, rf_score])
print(f"Score = {score_hdbscan}. LR_prob = {lr_prob}. LR_F1 = {lr_score}. RF_F1 = {rf_score}")

### K-means

In [None]:
from sklearn.cluster import KMeans

scores_kmeans = []

for n_clusters in range(1, len(df_scores.columns)+1):
    kmeans = KMeans(n_clusters=n_clusters, random_state=seed, n_init="auto")
    kmeans.fit(felix._feature_embeddings)
    clusters_kmeans = kmeans.labels_
    features_kmeans = felix._select_representative_features(felix._features, felix._feature_embeddings, clusters_kmeans)

    lr_prob, lr_score, rf_score = evaluate_feature_set([f.name for f in features_kmeans.features], df_scores, d.y_test, cv_splits=5, random_state=42, lr_probs=True, lr_f1=True, rf_f1=True)
    score = np.mean([lr_score, rf_score])
    scores_kmeans.append(score)

    log_results(d.id, "K-means", felix, [f.name for f in features_kmeans.features], rf_score, lr_score, lr_prob)

    print(f"n_features = {n_clusters}. Score = {score}. LR_prob = {lr_prob}. LR_F1 = {lr_score}. RF_F1 = {rf_score}")

### Hierarchical Agglomerative Clustering

In [None]:
import numpy as np
from scipy.cluster.hierarchy import dendrogram, fcluster
from sklearn.cluster import AgglomerativeClustering

# Fit agglomerative hierarchical clustering
agg_clustering = AgglomerativeClustering(distance_threshold=0, n_clusters=None)
agg_clustering.fit(felix._feature_embeddings)

# Create a linkage matrix from the AgglomerativeClustering output
num_samples = len(agg_clustering.labels_)
linkage_matrix = np.column_stack([
    agg_clustering.children_,
    agg_clustering.distances_,
    [num_samples + i for i in range(num_samples - 1)]
])

scores_agg = []

for n_clusters in range(1, len(df_scores.columns)+1):
    # Use fcluster to get the cluster labels based on a threshold for number of clusters
    threshold = linkage_matrix[:, 2][-(n_clusters - 2)]
    clusters_agg = fcluster(linkage_matrix, t=threshold, criterion="distance")

    features_agg = felix._select_representative_features(felix._features, felix._feature_embeddings, clusters_agg)

    lr_prob, lr_score, rf_score = evaluate_feature_set([f.name for f in features_agg.features], df_scores, d.y_test, cv_splits=5, random_state=42, lr_probs=True, lr_f1=True, rf_f1=True)
    score = np.mean([lr_score, rf_score])
    scores_agg.append(score)

    log_results(d.id, "Agglomerative Clustering", felix, [f.name for f in features_agg.features], rf_score, lr_score, lr_prob)

    print(f"n_features = {n_clusters}. Score = {score}. LR_prob = {lr_prob}. LR_F1 = {lr_score}. RF_F1 = {rf_score}")

### Random Feature Selection

Run random feature selection 5 times for each dataset and average the results

In [None]:
import random

for i in range(5):
    scores_random = []

    # Create a random pertubation of the feature set
    features_shuffled = felix._features.copy(deep=True)
    random.shuffle(features_shuffled.features)

    for n_features in range(1, len(df_scores.columns)+1):
        feature_set = features_shuffled.copy(deep=True)
        feature_set.features = features_shuffled.features[:n_features]
        feature_names = [f.name for f in features_shuffled.features[:n_features]]

        lr_prob, lr_score, rf_score = evaluate_feature_set(feature_names, df_scores, d.y_test, cv_splits=5, random_state=42, lr_probs=True, lr_f1=True, rf_f1=True)
        score = np.mean([lr_score, rf_score])
        scores_random.append(score)

        log_results(d.id, "Random Feature Selection", felix, feature_names, rf_score, lr_score, lr_prob)

        print(f"iteration = {i + 1}. n_features = {n_features}. Score = {score}. LR_prob = {lr_prob}. LR_F1 = {lr_score}. RF_F1 = {rf_score}")

### Save Results

In [None]:
from google.colab import files
import datetime
import json


json_string = json.dumps(results_log, indent=4)

# Store the results as a JSON
filename = f"{datetime.date.today().strftime('%Y_%m_%d')} - FELIX Consolidation Results {d.short_name}.json"
with open(filename, "w") as f:
    f.write(json_string)

# Download the results
files.download(filename)


# Convert the results log to a Pandas DataFrame
df_results = pd.DataFrame(results_log)

# Save the results as a CSV
filename = f"{datetime.date.today().strftime('%Y_%m_%d')} - FELIX Consolidation Results {d.short_name}.csv"
df_results.to_csv(filename, index=False)

# Download the results
files.download(filename)

In [None]:
from google.colab import files
import json

# Upload the results JSON
uploaded = files.upload()
filename = list(uploaded.keys())[0]

# Load the scores
results_log = json.loads(uploaded[filename])
len(results_log)

### Plot Results

In [None]:
import pandas as pd
import numpy as np

# Convert the results log to a Pandas DataFrame
df_results = pd.DataFrame(results_log)

# Calcualte average F1 scores from Random Forest and Logistic Regression
df_results["F1 Score"] = df_results.apply(lambda row: np.mean(row[["f1_rf", "f1_lr"]]), axis=1)

df_results

In [None]:
df_results.groupby(["dataset", "method"]).count()

In [None]:
import pandas as pd
import numpy as np

# Convert the results log to a Pandas DataFrame
df_results = pd.DataFrame(results_log)

# Calcualte average F1 scores from Random Forest and Logistic Regression
df_results["F1 Score"] = df_results.apply(lambda row: np.mean(row[["f1_rf", "f1_lr"]]), axis=1)

df_results

In [None]:
df_results.groupby(["dataset", "method"]).count()

In [None]:
# Simplify the dataset IDs for visualization
df_results["dataset"] = df_results["dataset"].map({
    "cardiffnlp/tweet_sentiment_multilingual": "Sentiment",
    "hate_speech18": "Hate Speech",
    "amazon_polarity": "Amazon",
    "GonzaloA/fake_news": "Fake News",
    "tum-nlp/IDMGSP": "Papers"
})

In [None]:
# Set the order for the datasets
datasets_order = [
    "Sentiment",
    "Hate Speech",
    "Amazon",
    "Fake News",
    "Papers"
]

# Set the order for the legend
methods_order = [
    "Random Feature Selection",
    "Supervised Forward Selection",
    "Agglomerative Clustering",
    "K-means",
    "HDBSCAN"
]

# Define a color for each method
colors = {
    "Supervised Forward Selection": "#969696",
    "HDBSCAN": "#e6550d",
    "Agglomerative Clustering": "#6baed6",
    "K-means": "#74c476",
    "Random Feature Selection": "#d9d9d9"
}

In [None]:
import matplotlib.ticker as mtick

# Group by 'n_features' and 'method' to calculate the mean of 'F1 Score' for duplicated entries (e.g., for random feature selection which has been executed multiple times)
df_adjusted = df_results.groupby(['n_features', 'method', 'dataset'])['F1 Score'].mean().reset_index()

# Drop entries for HDBSCAN with noise
df_adjusted = df_adjusted[df_adjusted["method"] != "HDBSCAN (w/ noise)"]
df_adjusted["method"] = df_adjusted["method"].replace("HDBSCAN (w/o noise)", "HDBSCAN")
df_adjusted = df_adjusted.reset_index()

# Create a grid of 2x3 for the scatter plots
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(12, 10))
axes = axes.flatten()

# Plot a scatter plot for each dataset
for i, dataset in enumerate(datasets_order):
    df_subset = df_adjusted[df_adjusted['dataset'] == dataset]
    for method in methods_order:
        df_method = df_subset[df_subset['method'] == method]
        axes[i].scatter(df_method['n_features'], df_method['F1 Score'], label=method, s=(15 if method == "HDBSCAN" else 3), color=colors[method], zorder=3)
    axes[i].set_title(dataset)
    axes[i].set_ylabel('F1 Score' if i % 2 == 0 else '')
    axes[i].set_xlabel('Number of Selected Features' if i > 3 else '')
    axes[i].set_xlim([0, max(df_subset['n_features'])])
    axes[i].yaxis.set_major_formatter(mtick.PercentFormatter(1.0, decimals=0))
    axes[i].grid(True, color='lightgrey', zorder=0)

# Plot a scotter plot of the average across all datasets
df_global_average = df_adjusted[df_adjusted["method"] != "HDBSCAN"].groupby(['n_features', 'method']).mean(numeric_only=True).reset_index()
hdbscan_average = df_adjusted[df_adjusted["method"] == "HDBSCAN"][["n_features", "F1 Score"]].mean()
least_features = df_adjusted[["dataset", "n_features"]].groupby("dataset").max()["n_features"].min()
for method in methods_order:
    df_method = df_global_average[df_global_average['method'] == method]
    axes[5].scatter(df_method['n_features'], df_method['F1 Score'], label=method, s=3, color=colors[method], zorder=3)
axes[5].scatter(hdbscan_average["n_features"], hdbscan_average["F1 Score"], label="HDBSCAN", s=15, color=colors["HDBSCAN"], zorder=3)
axes[5].set_title("Average across Datasets")
axes[5].set_xlim([0, least_features])
axes[5].set_xlabel('Number of Selected Features')
axes[5].yaxis.set_major_formatter(mtick.PercentFormatter(1.0, decimals=0))
axes[5].grid(True, color='lightgrey', zorder=0)

# Create a single legend for all plots
handles, labels = axes[0].get_legend_handles_labels()
fig.legend(handles, labels, loc='upper center', ncol=len(methods_order), bbox_to_anchor=(0.5, 1.02))

# Add a chart title
fig.suptitle('Feature Consolidation Performance (Categorical Features)', y=1.05)

# Adjust the layout
plt.tight_layout()

# Download the plot as PDF
filename = "Feature Consolidation Methods Categorical (All Datasets).pdf"
plt.savefig(filename, bbox_inches='tight')
files.download(filename)

plt.show()

In [None]:
import matplotlib.pyplot as plt


# Group by 'n_features' and 'method' to calculate the mean of 'F1 Score' for duplicated entries (e.g., for random feature selection which has been executed multiple times)
df_adjusted = df_results.groupby(['n_features', 'method', 'dataset'])['F1 Score'].mean().reset_index()

# Drop entries for HDBSCAN with noise
df_adjusted = df_adjusted[df_adjusted["method"] != "HDBSCAN (w/ noise)"]
df_adjusted["method"] = df_adjusted["method"].replace("HDBSCAN (w/o noise)", "HDBSCAN")
df_adjusted = df_adjusted.reset_index()

# Calculate average results across all datasets
df_global_average = df_adjusted[df_adjusted["method"] != "HDBSCAN"].groupby(['n_features', 'method']).mean(numeric_only=True).reset_index()
least_features = df_adjusted[["dataset", "n_features"]].groupby("dataset").max()["n_features"].min()

# Create a plot
fig = plt.figure(figsize=(6, 3.5))

# Add results for each consolidation method
for method in methods_order:
    if method == "HDBSCAN":
        hdbscan_average = df_adjusted[df_adjusted["method"] == "HDBSCAN"][["n_features", "F1 Score"]].mean()
        plt.scatter(hdbscan_average["n_features"], hdbscan_average["F1 Score"], label="HDBSCAN", s=25, color=colors["HDBSCAN"], zorder=3)
    else:
        df_method = df_global_average[df_global_average['method'] == method]
        plt.scatter(df_method['n_features'], df_method['F1 Score'], label=method, s=3, color=colors[method], zorder=3)

# Format the plot
plt.title("Feature Consolidation Performance (Categorical Features)")
plt.xlim([0, least_features + 0.5])
plt.ylim([0.57, 0.87])
plt.xlabel('Number of Selected Features')
plt.ylabel('F1 Score')
plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter(1.0, decimals=0))
plt.grid(True, color='lightgrey', zorder=0)
plt.legend()
plt.tight_layout()

# Download the plot as PDF
filename = "Feature Consolidation Methods Categorical (Average).pdf"
plt.savefig(filename, bbox_inches='tight')
files.download(filename)

# Display the plot
plt.show()

In [None]:
from google.colab import files

df_feature_selection = pd.DataFrame({"Forward Selection": scores_forward, "K-means": scores_kmeans, "Agglomerative": scores_agg, "Random": scores_random})

filename = "2023_11_02 - FELIX Feature Selection F1 Scores Fake News.csv"
df_feature_selection.to_csv(filename, index=False)

# Download the results
files.download(filename)

In [None]:
from google.colab import files

# Upload the results
uploaded = files.upload()
filename = list(uploaded.keys())[0]

# Load the scores
df_feature_selection = pd.read_csv(filename)

scores_forward = df_feature_selection["Forward Selection"].tolist()
scores_kmeans = df_feature_selection["K-means"].tolist()
scores_agg = df_feature_selection["Agglomerative"].tolist()
scores_random = df_feature_selection["Random"].tolist()

## Experiment 5.2: Scoring Validity

In [None]:
from collections import Counter

def gini_index(categories):
    # Count the occurrences of each category
    category_counts = Counter(categories)
    # Calculate the proportion of each category and square the proportions
    squared_proportions = [(count / len(categories)) ** 2 for count in category_counts.values()]
    # Sum the squared proportions and subtract from 1
    gini = 1 - sum(squared_proportions)
    return gini

# Given array
categories = ["A", "A", "C", "A", "B", "C"]
categories = ["A", "B", "C", "D", "E", "F", "G", "H"]

# Calculate Gini index
gini = gini_index(categories)

print(f"Gini index of the array: {gini}")


### Setup: Learn Numeric Feature Set

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import random

In [None]:
# Select a dataset
d = dataset_list["papers"]
d

In [None]:
# Fit FELIX to the dataset to learn features
felix = FELIX(
    context=d.context,
    temperature_scoring=0.0,
    discrete_features=False,
    verbose=True
)

felix.fit(d.X_train, d.y_train)

# Store the features learned by FELIX
feature_set = felix._features

### Case: Stable Feature Set

In [None]:
# Get a random example
sample_row = d.X_test.sample(1, random_state=seed)

# Replicate the example 10 times and create a new dataframe
df = pd.concat([sample_row]*10, ignore_index=True)

# Score the example using FELIX
print("Scoring the same example 10 times ...")
df_scores_stable = felix.transform(df)

# Score different examples using FELIX
print("Scoring 10 different examples ...")
df_sample = d.X_test.sample(10, random_state=seed)
df_scores_general = felix.transform(df_sample)

In [None]:
df_variance = pd.DataFrame({"Inner-sample": df_scores_stable.var(), "Inter-sample": df_scores_general.var()})

plt.figure(figsize=(8, 2))
ax = df_variance.boxplot(vert=False)
ax.set_xlabel("Score variance")
ax.set_title("Distribution of score variance inner-sample vs. inter-sample")
plt.tight_layout()
plt.show()

df_variance.describe()

Alternative plot showing the score distribution per feature:

In [None]:
# Create a boxplot for the value distribution of each feature
plt.figure(figsize=(8, df_scores_stable.shape[1] / 5))
ax = df_scores_stable.boxplot(vert=False)
ax.set_xlabel('Value')
ax.set_title('Score distribution (stable order)')
plt.tight_layout()

plt.show()

### Case: Random Reordering

In [None]:
# Get a random example
sample_row = d.X_test.sample(1, random_state=seed)

# Keep the scoring LLM stable with a 16K context window
felix.llm_scoring = "gpt-3.5-turbo-16k"
felix.verbose = True

# Score the original feature set 10 times
print("Scoring the original feature set ...")
felix._features = feature_set
results_rows_original = []
for _ in tqdm(range(10)):
    results_rows_original.append(felix.transform(sample_row))
df_scores_original = pd.concat(results_rows_original).reset_index(drop=True)

# Score a randomly reordered feature set 10 times
print("Scoring the randomly reordered feature set ...")
random.shuffle(felix._features.features) # Randomly reorders the feature set in place
results_rows_random = []
for _ in tqdm(range(10)):
    results_rows_random.append(felix.transform(sample_row))
df_scores_random = pd.concat(results_rows_random).reset_index(drop=True)

In [None]:
# Perform t-test to see if changing the scoring LLM results in significantly different scores
def test_significance(df_model_a, df_model_b):
    p_values = {}
    for column in df_model_a.columns:
        _, p_value = stats.ttest_rel(df_model_a[column], df_model_b[column])
        p_values[column] = p_value
    return pd.Series(p_values)

p_values = test_significance(df_scores_original, df_scores_random)

# Excluding NaN values for plotting
p_values_cleaned = p_values.dropna()

# Plotting boxplots
plt.figure(figsize=(7, 1.5))
plt.boxplot([p_values_cleaned], vert=False, labels=["Random reordering"])
plt.xlabel("p-value")
plt.title("t-test for score differences when randomly reordering the feature set")
plt.tight_layout()
plt.show()

# Computing statistics
stats_random_reordering = {
    "Avg. p-value": [p_values_cleaned.mean()],
    "% of p-values < 0.01": [(p_values_cleaned < 0.01).mean() * 100],
    "% of p-values < 0.05": [(p_values_cleaned < 0.05).mean() * 100],
    "% of p-values < 0.1": [(p_values_cleaned < 0.1).mean() * 100]
}

# Creating a DataFrame
df_stats_random_reordering = pd.DataFrame(stats_random_reordering, index=["Random reordering"])
df_stats_random_reordering

Alternative experiment showing the large variance in scores when the feature set is randomly reschuffled in each scoring request:

In [None]:
# Get a random example
sample_row = d.X_test.sample(1, random_state=seed)

# List to store the transformed rows
result_rows = []

for _ in range(10):
    # Randomly reorder the features
    random.shuffle(felix._features.features)
    print(f"Iteration {_}: first feature is {felix._features.features[0].name}")

    # Transform the sample row and append the result to the list
    transformed_row = felix.transform(sample_row)
    result_rows.append(transformed_row)

# Concatenate all resulting rows into a single dataframe
df_scores_random = pd.concat(result_rows).reset_index(drop=True)

In [None]:
# Create a boxplot for the value distribution of each feature
plt.figure(figsize=(8, df_scores_random.shape[1] / 5))
ax = df_scores_random.boxplot(vert=False)
ax.set_xlabel('Value')
ax.set_title('Score distribution (random order)')
plt.tight_layout()

plt.show()

### Case: Splitting scoring requests

In [None]:
# Get a random example
sample_row = d.X_test.sample(1, random_state=seed)

# Set scoring LLM to the 16K context variant of GPT-3.5 so that all features fit into the context of one request
felix.llm_scoring="gpt-3.5-turbo-16k"
felix.verbose=False

# Score the example 10 times with all features in one request
print("Scoring with all features in one request ...")
felix._features = feature_set
result_rows_no_split = []
for _ in tqdm(range(10)):
    # Transform the sample row and append the result to the list
    result_rows_no_split.append(felix.transform(sample_row))
df_scores_no_split = pd.concat(result_rows_no_split).reset_index(drop=True)

# Score the example 10 times with only the first half of the feature set
print("Scoring wih first half of the feature set ...")
felix._features = NumericalFeatureSet(features=feature_set.features[:len(feature_set.features)//2])
result_rows_split_a = []
for _ in tqdm(range(10)):
    # Transform the sample row and append the result to the list
    result_rows_split_a.append(felix.transform(sample_row))
df_scores_split_a = pd.concat(result_rows_split_a).reset_index(drop=True)

# Score the example 10 times with only the second half of the feature set
print("Scoring wih second half of the feature set ...")
felix._features = NumericalFeatureSet(features=feature_set.features[len(feature_set.features)//2:])
result_rows_split_b = []
for _ in tqdm(range(10)):
    # Transform the sample row and append the result to the list
    result_rows_split_b.append(felix.transform(sample_row))
df_scores_split_b = pd.concat(result_rows_split_b).reset_index(drop=True)

In [None]:
# Perform t-test to see if splitting up scoring into multiple requests results in significantly different scores
def test_significance(df_split, df_no_split):
    p_values = {}
    for column in df_split.columns:
        if column in df_no_split.columns:
            _, p_value = stats.ttest_rel(df_split[column], df_no_split[column])
            p_values[column] = p_value
    return pd.Series(p_values)

p_values_split_a = test_significance(df_scores_split_a, df_scores_no_split)
p_values_split_b = test_significance(df_scores_split_b, df_scores_no_split)

# Excluding NaN values for plotting
p_values_a_cleaned = p_values_split_a.dropna()
p_values_b_cleaned = p_values_split_b.dropna()

# Plotting boxplots
plt.figure(figsize=(7, 3))
plt.boxplot([p_values_a_cleaned, p_values_b_cleaned], vert=False, labels=["First half", "Second half"])
plt.xlabel("p-value")
plt.title("t-test for score differences in first and second half of feature set vs. full feature set")
plt.tight_layout()
plt.show()

# Computing statistics
stats_split = {
    "Avg. p-value": [p_values_a_cleaned.mean(), p_values_b_cleaned.mean()],
    "% of p-values < 0.01": [(p_values_a_cleaned < 0.01).mean() * 100, (p_values_b_cleaned < 0.01).mean() * 100],
    "% of p-values < 0.05": [(p_values_a_cleaned < 0.05).mean() * 100, (p_values_b_cleaned < 0.05).mean() * 100],
    "% of p-values < 0.1": [(p_values_a_cleaned < 0.1).mean() * 100, (p_values_b_cleaned < 0.1).mean() * 100]
}

# Creating a DataFrame
df_stats_split = pd.DataFrame(stats_split, index=["First half", "Second half"])
df_stats_split

### Case: Changing LLM

In [None]:
# Get a random example
sample_row = d.X_test.sample(1, random_state=seed)

# Limit the number of features to score due to small context window
max_features = 40
felix._features = NumericalFeatureSet(features=feature_set.features[:min(max_features, len(feature_set.features))])
felix.verbose=True

# Score the example 10 times with 4K context window
print("Scoring with with 4K context window ...")
felix.llm_scoring="gpt-3.5-turbo"
result_rows_4k = []
for _ in tqdm(range(10)):
    # Transform the sample row and append the result to the list
    result_rows_4k.append(felix.transform(sample_row))
df_scores_4k = pd.concat(result_rows_4k).reset_index(drop=True)

# Score the example 10 times with 16K context window
print("Scoring with with 16K context window ...")
felix.llm_scoring="gpt-3.5-turbo-16k"
result_rows_16k = []
for _ in tqdm(range(10)):
    # Transform the sample row and append the result to the list
    result_rows_16k.append(felix.transform(sample_row))
df_scores_16k = pd.concat(result_rows_16k).reset_index(drop=True)

In [None]:
# Perform t-test to see if changing the scoring LLM results in significantly different scores
def test_significance(df_model_a, df_model_b):
    p_values = {}
    for column in df_model_a.columns:
        _, p_value = stats.ttest_rel(df_model_a[column], df_model_b[column])
        p_values[column] = p_value
    return pd.Series(p_values)

p_values = test_significance(df_scores_4k, df_scores_16k)

# Excluding NaN values for plotting
p_values_cleaned = p_values.dropna()

# Plotting boxplots
plt.figure(figsize=(7, 1.5))
plt.boxplot([p_values_cleaned], vert=False, labels=["Model change"])
plt.xlabel("p-value")
plt.title("t-test for score differences when changing the scoring LLM")
plt.tight_layout()
plt.show()

# Computing statistics
stats_model_change = {
    "Avg. p-value": [p_values_cleaned.mean()],
    "% of p-values < 0.01": [(p_values_cleaned < 0.01).mean() * 100],
    "% of p-values < 0.05": [(p_values_cleaned < 0.05).mean() * 100],
    "% of p-values < 0.1": [(p_values_cleaned < 0.1).mean() * 100]
}

# Creating a DataFrame
df_stats_model_change = pd.DataFrame(stats_model_change, index=["Model change"])
df_stats_model_change

### E2E Performance when Reschuffling

In [None]:
d = dataset_list["papers"]
d

In [None]:
callback = CustomFELIXCallback()
felix = FELIX(context=d.context_train, discrete_features=False, callback=callback, verbose=True)

felix.fit(d.X_train, d.y_train)

In [None]:
original_features = felix._features
len(original_features.features)

In [None]:
felix.reschuffle_features = True

# Transform data using feature reschuffling
df_scores_train_reschuffle = felix.transform(d.X_train)
df_scores_test_reschuffle = felix.transform(d.X_test)

In [None]:
data_transformer = DataTransformer(dataset=d)
df_scores_train_reschuffle_imputed, df_scores_test_reschuffle_imputed = data_transformer.impute_missing_values(df_scores_train_reschuffle, df_scores_test_reschuffle)

columns = [col for col in df_scores_train_reschuffle_imputed.columns if col in df_scores_test_reschuffle_imputed.columns]

df_scores_train_reschuffle_imputed = df_scores_train_reschuffle_imputed[columns]
df_scores_test_reschuffle_imputed = df_scores_test_reschuffle_imputed[columns]

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

rf = RandomForestClassifier()
rf.fit(df_scores_train_reschuffle_imputed, d.y_train)
y_pred_rf = rf.predict(df_scores_test_reschuffle_imputed)

print("F1 Random Forest:", f1_score(d.y_test, y_pred_rf, average="macro"))

lr = LogisticRegression(max_iter=1000)
lr.fit(df_scores_train_reschuffle_imputed, d.y_train)
y_pred_lr = lr.predict(df_scores_test_reschuffle_imputed)

print("F1 Logistic Regression:", f1_score(d.y_test, y_pred_lr, average="macro"))

In [None]:
felix.reschuffle_features = False
felix._features = original_features

df_scores_train_stable = felix.transform(d.X_train)
df_scores_test_stable = felix.transform(d.X_test)

In [None]:
data_transformer = DataTransformer(dataset=d)
df_scores_train_stable_imputed, df_scores_test_stable_imputed = data_transformer.impute_missing_values(df_scores_train_stable, df_scores_test_stable)

columns = [col for col in df_scores_train_stable_imputed.columns if col in df_scores_test_stable_imputed.columns]

df_scores_train_stable_imputed = df_scores_train_stable_imputed[columns]
df_scores_test_stable_imputed = df_scores_test_stable_imputed[columns]

In [None]:
rf = RandomForestClassifier()
rf.fit(df_scores_train_stable_imputed, d.y_train)
y_pred_rf = rf.predict(df_scores_test_stable_imputed)

print("F1 Random Forest:", f1_score(d.y_test, y_pred_rf, average="macro"))

lr = LogisticRegression(max_iter=1000)
lr.fit(df_scores_train_stable_imputed, d.y_train)
y_pred_lr = lr.predict(df_scores_test_stable_imputed)

print("F1 Logistic Regression:", f1_score(d.y_test, y_pred_lr, average="macro"))