In [1]:
import os
import gzip
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tnrange, tqdm

In [2]:
def load_imdb_data(data_file):
    df = pd.read_csv(data_file)
    texts = df['review'].tolist()
    labels = [1 if sentiment == "positive" else 0 for sentiment in df['sentiment'].tolist()]
    return texts, labels

In [3]:
data_file = "../data/IMDB.csv"
texts, labels = load_imdb_data(data_file)

In [14]:
class TextClassificationDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        return text, label

In [121]:
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)
val_texts, val_labels = val_texts[:500], val_labels[:500]

In [122]:
batch_size=1

train_dataset = TextClassificationDataset(train_texts, train_labels)
val_dataset = TextClassificationDataset(val_texts, val_labels)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

In [19]:
k_m = [1, 5, 10, 15, 25, 50]

In [124]:
len(val_dataloader)

500

In [125]:
f1_scores = []
with tqdm(total=len(k_m) * len(val_dataloader)) as pbar:
    for k in k_m:
        predicts = []
        labels_ = []
        for x1, val_label in val_dataloader:
            distances_from_x1 = []
            tr_labels = []
            for x2, train_label in train_dataloader:
                x1, x2 = x1[0], x2[0]
                Cx1 = len(gzip.compress(x1.encode()))
                Cx2 = len(gzip.compress(x2.encode()))
                x1x2 = "".join([x1, x2])
                Cx1x2 = len(gzip.compress(x1x2.encode()))

                ncd = (Cx1x2 - min(Cx1, Cx2)) / max(Cx1, Cx2)
                distances_from_x1.append(ncd)
                tr_labels.append(train_label[0])

            sorted_idx = np.argsort(np.array(distances_from_x1))
            top_k_class = np.array(tr_labels)[sorted_idx[:k]].tolist()
            predict_class = max(set(top_k_class), key=top_k_class.count)
            predicts.append(predict_class)
            labels_.append(val_label)
            pbar.update(1)
        report = classification_report(labels_, predicts, output_dict=True)
        f1_macro = report['macro avg']['f1-score']
        f1_scores.append(f1_macro)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
100%|██████████| 3000/3000 [3:24:16<00:00,  4.09s/it]  


In [127]:
f_scores_macro_split = np.array_split(f1_scores, len(k_m))

In [131]:
table_macro = pd.DataFrame(f_scores_macro_split, index=k_m, columns=['f1_score'])


In [132]:
table_macro.sort_values(by="f1_score", ascending=False)

Unnamed: 0,f1_score
25,0.488035
1,0.335989
5,0.335989
10,0.335989
15,0.335989
50,0.328859


In [133]:
table_macro

Unnamed: 0,f1_score
1,0.335989
5,0.335989
10,0.335989
15,0.335989
25,0.488035
50,0.328859
