In [1]:
import gzip
import pickle
from sklearn.neighbors import KNeighborsClassifier

In [4]:
# Interpolated into the dataset filename below to select which pickle to load.
N_SAMPLES = 500

# NOTE: pickle.load can execute arbitrary code; only open dataset files you trust.
with open(f"sentiment-dataset-{N_SAMPLES}.pickle", "rb") as f:
    dataset = pickle.load(f)

# The pickle unpacks into four parts: training inputs/labels and test inputs/labels.
train_x, train_y, test_x, test_y = dataset

# Display the first training text next to its label.
# NOTE(review): label 1 presumably means "positive" — confirm against the dataset builder.
train_x[0], train_y[0]

("Previous reviewer Claudio Carvalho gave a much better recap of the film's plot details than I could. What I recall mostly is that it was just so beautiful, in every sense - emotionally, visually, editorially - just gorgeous.<br /><br />If you like movies that are wonderful to look at, and also have emotional content to which that beauty is relevant, I think you will be glad to have seen this extraordinary and unusual work of art.<br /><br />On a scale of 1 to 10, I'd give it about an 8.75. The only reason I shy away from 9 is that it is a mood piece. If you are in the mood for a really artistic, very romantic film, then it's a 10. I definitely think it's a must-see, but none of us can be in that mood all the time, so, overall, 8.75.",
 1)

In [7]:
# Length in bytes of the gzip-compressed UTF-8 encoding of the first training text.
x_compressed = len(gzip.compress(train_x[0].encode("utf-8")))
x_compressed

451

In [8]:
# Same measurement for the second training text.
x2_compressed = len(gzip.compress(train_x[1].encode("utf-8")))
x2_compressed

639

In [9]:
# Compressed length of the two texts joined by a single space, compressed as one string.
xx2 = len(gzip.compress((" ".join([train_x[0], train_x[1]])).encode("utf-8")))
xx2

1024

In [13]:
# Normalized compression distance between the two example texts:
# (compressed length of the pair - smaller individual length) / larger individual length.
# Stored under its own name so that re-running this cell can never overwrite the
# `ncd` function that the matrix-building cells call.
ncd_example = (xx2 - min(x_compressed, x2_compressed)) / max(x_compressed, x2_compressed)
ncd_example

0.8967136150234741

In [21]:
def ncd(x, x2):
    """Return the normalized compression distance between two strings.

    Each string, and the two strings joined by a single space, is UTF-8
    encoded and gzip-compressed. The result is
    (joined length - smaller individual length) / larger individual length.

    Args:
        x: First text.
        x2: Second text.

    Returns:
        The distance as a float; smaller values mean the joined text
        compressed better relative to the individual texts.
    """
    def compressed_len(text):
        # Only the size of the gzip output matters, not its bytes.
        return len(gzip.compress(text.encode("utf-8")))

    len_first = compressed_len(x)
    len_second = compressed_len(x2)
    len_joined = compressed_len(" ".join((x, x2)))

    smaller, larger = sorted((len_first, len_second))
    return (len_joined - smaller) / larger

In [22]:
# Pairwise distances within the training set: row i holds ncd(train_x[i], t) for every training text t.
train_ncd = [[ncd(row_text, col_text) for col_text in train_x] for row_text in train_x]

In [23]:
# Print how many distances the first row holds, then display the row itself.
print(len(train_ncd[0]))
train_ncd[0]

401


[0.026607538802660754,
 0.8967136150234741,
 0.8337028824833703,
 0.844789356984479,
 0.83991683991684,
 0.8226164079822617,
 0.8969072164948454,
 0.8021505376344086,
 0.8631790744466801,
 0.8625277161862528,
 0.8048780487804879,
 0.8824833702882483,
 0.862992125984252,
 0.8520408163265306,
 0.8512544802867383,
 0.835920177383592,
 0.8603104212860311,
 0.811529933481153,
 0.8580931263858093,
 0.8755980861244019,
 0.8181818181818182,
 0.8314855875831486,
 0.9324866310160428,
 0.8489361702127659,
 0.9553372041089773,
 0.8717105263157895,
 0.8492239467849224,
 0.9112627986348123,
 0.8381374722838137,
 0.8470066518847007,
 0.8350515463917526,
 0.8425720620842572,
 0.8871866295264624,
 0.9157581764122894,
 0.835920177383592,
 0.8898963730569949,
 0.9588457899716177,
 0.8514412416851441,
 0.8217391304347826,
 0.85,
 0.8582089552238806,
 0.8536585365853658,
 0.835920177383592,
 0.8615384615384616,
 0.8292682926829268,
 0.9052734375,
 0.8625277161862528,
 0.8812392426850258,
 0.943045563549160

In [30]:
# Distances from each test text to every training text: one row per test text, one column per training text.
test_ncd = [[ncd(query_text, reference_text) for reference_text in train_x] for query_text in test_x]

In [31]:
# Fit a 7-nearest-neighbour classifier, passing each training text's row of NCD values as its feature vector.
# NOTE(review): with the default metric the classifier compares these rows with its own distance;
# NCD is not used directly as the neighbour distance (that would need metric="precomputed").
# Confirm this is intended.
neigh = KNeighborsClassifier(n_neighbors=7)
neigh.fit(train_ncd, train_y)

In [32]:
# Score the classifier on the test rows of NCD values against the test labels.
print(f"Accuracy: {neigh.score(test_ncd, test_y)}")

Accuracy: 0.7029702970297029


70% accuracy with 500 samples and only a few lines of code. Not bad!

# Sped up version

In [43]:
import pickle
import gzip
from sklearn.neighbors import KNeighborsClassifier
import multiprocessing

# CPU count reported by multiprocessing; used as the worker-pool size.
NUM_PROCESSES = multiprocessing.cpu_count()
# Interpolated into the dataset filename below to select which pickle to load.
N_SAMPLES = 10000

# NOTE: pickle.load can execute arbitrary code; only open dataset files you trust.
with open(f"sentiment-dataset-{N_SAMPLES}.pickle", "rb") as f:
    dataset = pickle.load(f)

# The pickle unpacks into four parts: training inputs/labels and test inputs/labels.
train_x, train_y, test_x, test_y = dataset

def ncd(x, x2):
    """Compute the normalized compression distance of two strings using gzip.

    The individual texts and their space-joined concatenation are UTF-8
    encoded and compressed; only the compressed sizes are used.

    Args:
        x: First text.
        x2: Second text.

    Returns:
        (size of the compressed pair - min individual size) / max individual
        size, as a float.
    """
    # Individual compressed sizes, in argument order.
    single_sizes = [len(gzip.compress(text.encode("utf-8"))) for text in (x, x2)]

    pair_bytes = " ".join([x, x2]).encode("utf-8")
    pair_size = len(gzip.compress(pair_bytes))

    return (pair_size - min(single_sizes)) / max(single_sizes)

In [44]:
# Initialize the NCD matrices with zero placeholders:
# train_ncd is len(train_x) x len(train_x); test_ncd is len(test_x) x len(train_x).
train_ncd = [[0] * len(train_x) for _ in range(len(train_x))]
test_ncd = [[0] * len(train_x) for _ in range(len(test_x))]

def ncd_worker(data_row: tuple[int, str]):
    """Compute one row of an NCD matrix.

    Args:
        data_row: An (index, text) pair, as produced by enumerate().

    Returns:
        A tuple (index, distances) where distances[j] is the NCD between the
        given text and train_x[j]. The index is passed through unchanged so
        the caller can place the row in its matrix.
    """
    row_index = data_row[0]
    text = data_row[1]
    # train_x is read from module scope rather than passed in with each task.
    distances = [ncd(text, reference) for reference in train_x]
    return row_index, distances

In [45]:
# Compute the NCD rows in parallel; each task is an (index, text) pair from enumerate.
# NOTE(review): ncd_worker is defined in the notebook itself; on platforms whose default
# start method is "spawn", worker processes may be unable to import it — confirm on the target OS.
with multiprocessing.Pool(NUM_PROCESSES) as pool:
    # Training texts against every training text.
    train_data = enumerate(train_x)
    train_results = pool.map(ncd_worker, train_data)

    # Test texts against every training text.
    test_data = enumerate(test_x)
    test_results = pool.map(ncd_worker, test_data)

In [46]:
# Copy each computed row into its slot in the training matrix.
for i, row in train_results:
    train_ncd[i] = row

# Fill the test matrix in its own top-level loop. Nesting it inside the loop above
# would redo this copy once per training row, and skip it entirely if there were
# no training rows.
for i, row in test_results:
    test_ncd[i] = row

In [47]:
# Fit a 7-nearest-neighbour classifier on the training rows of NCD values,
# then score it on the test rows and report the result.
neigh = KNeighborsClassifier(n_neighbors=7)
neigh.fit(train_ncd, train_y)
accuracy = neigh.score(test_ncd, test_y)
print(f"Accuracy: {accuracy}")

Accuracy: 0.7571214392803598


This works because gzip compresses the concatenation of two similar strings more effectively than the concatenation of two dissimilar strings: content shared between the strings only has to be encoded once, so similar texts end up with a smaller normalized compression distance.