In [2]:
import os
import gzip
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tqdm.notebook import tqdm
from joblib import Parallel, delayed
from concurrent.futures import ProcessPoolExecutor

In [3]:
def load_imdb_data(data_file):
    df = pd.read_csv(data_file)
    texts = df['review'].tolist()
    labels = [1 if sentiment == "positive" else 0 for sentiment in df['sentiment'].tolist()]
    return texts, labels

data_file = "./IMDB.csv"
texts, labels = load_imdb_data(data_file)

In [4]:
class TextClassificationDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        return text, label

In [13]:
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)
val_texts, val_labels = val_texts[:500], val_labels[:500]
batch_size=1

train_dataset = TextClassificationDataset(train_texts, train_labels)
val_dataset = TextClassificationDataset(val_texts, val_labels)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

In [14]:
def predict_class(x1, dataloader, k):
    distances_from_x1 = []
    train_labels = []

    for x2, train_label in dataloader:
        x2 = x2[0]
        Cx1 = len(gzip.compress(x1.encode()))
        Cx2 = len(gzip.compress(x2.encode()))
        x1x2 = "".join([x1, x2])
        Cx1x2 = len(gzip.compress(x1x2.encode()))
        ncd = (Cx1x2 - min(Cx1, Cx2)) / max(Cx1, Cx2)

        distances_from_x1.append(ncd)
        train_labels.append(train_label.item())

    sorted_idx = np.argsort(np.array(distances_from_x1))
    top_k_class = np.array(train_labels)[sorted_idx[:k]].tolist()

    pred = max(set(top_k_class), key=top_k_class.count)

    return pred

In [11]:
def func(x, y):
    return x + y


data_x = np.array([1, 2, 3, 4])
data_y = np.array([1, 2, 3, 4])

with ProcessPoolExecutor() as pool:
    results = pool.map(func, data_x)

results

<generator object _chain_from_iterable_of_lists at 0x7f1720d22cf0>

In [15]:
k_m = [1, 5, 15, 25, 50]
tasks = [(x1[0], train_dataloader, 1) for x1, _ in val_dataloader]

In [16]:
preds = Parallel(n_jobs=-1)(delayed(predict_class)(*task) for task in tqdm(tasks))

  0%|          | 0/500 [00:00<?, ?it/s]

In [19]:
classification_report(val_labels, preds, output_dict=True)['macro avg']['f1-score']

0.7114413504544799