In [1]:
import pandas as pd
import numpy as np
from datasets import load_dataset

In [2]:
dataset = load_dataset('sst2')
dataset.save_to_disk('/Users/makarwuckert/Desktop/gzippy')

Downloading readme:   0%|          | 0.00/5.27k [00:00<?, ?B/s]

Downloading data: 100%|██████████| 3.11M/3.11M [00:00<00:00, 4.78MB/s]
Downloading data: 100%|██████████| 72.8k/72.8k [00:00<00:00, 218kB/s]
Downloading data: 100%|██████████| 148k/148k [00:00<00:00, 445kB/s]


Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/67349 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/872 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1821 [00:00<?, ? examples/s]

In [3]:
df_train = pd.DataFrame(dataset['train'])
df_val = pd.DataFrame(dataset['validation'])
df_test = pd.DataFrame(dataset['test'])

In [4]:
print("5 Training samples:\n", df_train.sample(5))
print("5 Validation samples:\n", df_val.sample(5))
print("5 Testing samples:\n", df_test.sample(5))

5 Training samples:
          idx                                           sentence  label
44530  44530  nettelbeck has crafted an engaging fantasy of ...      1
23286  23286                                     is more fully       1
23892  23892  working from a surprisingly sensitive script c...      1
34140  34140               the astute direction of cardoso and       1
35118  35118                                          schiffer       1
5 Validation samples:
      idx                                           sentence  label
507  507  the minor figures surrounding ( bobby ) ... fo...      1
282  282  while there 's something intrinsically funny a...      1
327  327  it proves quite compelling as an intense , bro...      1
96    96  it 's difficult to imagine the process that pr...      0
284  284           directed in a paint-by-numbers manner .       0
5 Testing samples:
        idx                                           sentence  label
1143  1143              a worthwhile wa

In [6]:
df_train['label'].value_counts()

label
1    37569
0    29780
Name: count, dtype: int64

In [7]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67349 entries, 0 to 67348
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   idx       67349 non-null  int64 
 1   sentence  67349 non-null  object
 2   label     67349 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 1.5+ MB


In [8]:
df_train.describe()

Unnamed: 0,idx,label
count,67349.0,67349.0
mean,33674.0,0.557826
std,19442.125977,0.496649
min,0.0,0.0
25%,16837.0,0.0
50%,33674.0,1.0
75%,50511.0,1.0
max,67348.0,1.0


### Little bit preprocessing

In [9]:
df_train['sentence'] = df_train['sentence'].str.lower()
df_train['sentence'].head(3)

0         hide new secretions from the parental units 
1                 contains no wit , only labored gags 
2    that loves its characters and communicates som...
Name: sentence, dtype: object

In [15]:
import gzip 

def compression_ratio(text):
    compressed = len(gzip.compress(text.encode()))
    original = len(text.encode())
    compression_ratio = f"{(original / compressed):2f}"
    return compression_ratio

print("Compression ratio Training Set:\n", compression_ratio(''.join(df_train['sentence'])))

Compression ratio Training Set:
 2.631293


In [16]:
def calculate_ncd(x1, x2):
    Cx1 = len(gzip.compress(x1.encode()))
    Cx2 = len(gzip.compress(x2.encode()))
    x1x2 = ' '.join([x1, x2])
    Cx1x2 = len(gzip.compress(x1x2.encode()))

    ncd = (Cx1x2 - min(Cx1, Cx2)) / max(Cx1, Cx2)
    return ncd

In [18]:
df_val.iterrows()

<generator object DataFrame.iterrows at 0x7f7b9558f890>

In [39]:
from tqdm import tqdm
from collections import Counter

k = 2

predicted_classes = []

for row_val in tqdm(df_val.iterrows(), total=df_val.shape[0]):
    val_sentence = row_val[1]['sentence']
    val_label = row_val[1]['label']
    c_val_sentence = len(gzip.compress(val_sentence.encode()))
    distance_from_val_instance = []

    for row_train in df_train.iterrows():
        train_sentence = row_train[1]['sentence']
        train_label = row_train[1]['label']
        c_train_sentence = len(gzip.compress(train_sentence.encode()))

        train_plus_val = ' '.join([val_sentence, train_sentence])
        c_train_plus_val = len(gzip.compress(train_plus_val.encode()))

        ncd = ((c_train_plus_val - min(c_train_sentence, c_val_sentence)) / 
                max(c_val_sentence, c_train_sentence))
        
        distance_from_val_instance.append(ncd)
    
    sorted_idx = np.argsort(np.array(distance_from_val_instance))
    top_k_class = np.array(df_train['sentence'])[sorted_idx[:k]]
    predicted_class = Counter(top_k_class).most_common()[1][1]

    predicted_classes.append(predicted_class)

print("Accuracy:\n", np.mean(np.array(predicted_classes) == df_val['label'].values))
    

100%|██████████| 872/872 [2:11:05<00:00,  9.02s/it]  

Accuracy:
 0.5091743119266054





In [41]:
df_test['label'].value_counts()

Unnamed: 0,idx,sentence,label
0,0,uneasy mishmash of styles and genres .,-1
1,1,this film 's relationship to actual tension is...,-1
2,2,"by the end of no such thing the audience , lik...",-1


In [40]:
from sklearn.metrics import classification_report

true_labels = df_test['label']
true_labels

0      -1
1      -1
2      -1
3      -1
4      -1
       ..
1816   -1
1817   -1
1818   -1
1819   -1
1820   -1
Name: label, Length: 1821, dtype: int64