In [None]:
import numpy as np
import pandas as pd
import sklearn
import sklearn.metrics
import sklearn.model_selection

from scipy.spatial.distance import cosine
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors

from sentence_transformers import SentenceTransformer

## Data loading

In [None]:
# Load the two country samples (Russian file first, then Italian).
# Both frames are expected to carry a 'content' column, which the
# paragraph/sentence splitting cells read.
russia_df, italy_df = (
    pd.read_csv(csv_path)
    for csv_path in ('data/ru_sample_ex.csv', 'data/it_sample_ex.csv')
)

In [None]:
def _split_paragraphs(frame):
    """Return every paragraph of every article in ``frame['content']``.

    Articles are cut on blank lines (``'\\n\\n'``); the resulting pieces are
    pooled into one flat list, in article order. No piece is filtered out.
    """
    paragraphs = []
    for article in frame['content'].values:
        paragraphs.extend(article.split('\n\n'))
    return paragraphs


italy_docs = _split_paragraphs(italy_df)
russia_docs = _split_paragraphs(russia_df)

In [None]:
def _split_sentences(frame):
    """Return the sentence-like chunks of every article in ``frame['content']``.

    Articles are cut on '.' and only chunks longer than 20 characters are
    kept, which drops empty strings and very short fragments.
    """
    sentences = []
    for article in frame['content'].values:
        for chunk in article.split('.'):
            if len(chunk) > 20:
                sentences.append(chunk)
    return sentences


italy_sents = _split_sentences(italy_df)
russia_sents = _split_sentences(russia_df)

In [None]:
# Sentence-embedding model used for both the paragraph-level and the
# sentence-level encodings.
model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")

In [None]:
# Embed the paragraphs: Russian corpus first, then Italian.
embeddings_rus, embeddings_ita = (
    model.encode(paragraphs) for paragraphs in (russia_docs, italy_docs)
)

In [None]:
# Embed the sentences: Russian corpus first, then Italian.
sembeddings_rus, sembeddings_ita = map(model.encode, (russia_sents, italy_sents))

In [None]:
# 50/50 split of the raw sentence strings (Russian sentences first, then
# Italian). The labels (1 = Russia, 0 = Italy) are discarded here.
# NOTE(review): this uses the same test_size/random_state as the
# sentence-embedding split, presumably so that S_test[i] lines up with
# Z_test[i] — confirm, or split texts and embeddings in a single call.
S_train, S_test, _, _ = sklearn.model_selection.train_test_split(
    russia_sents + italy_sents,
    [1] * len(russia_sents) + [0] * len(italy_sents),
    test_size=0.5,
    random_state=42,
)

In [None]:
# 50/50 split of the sentence embeddings and their labels
# (1 = Russia, 0 = Italy). Rows are stacked Russian-first so they match the
# order in which the label list is built.
Z_train, Z_test, z_train, z_test = sklearn.model_selection.train_test_split(
    np.vstack([sembeddings_rus, sembeddings_ita]),
    [1] * len(russia_sents) + [0] * len(italy_sents),
    test_size=0.5,
    random_state=42,
)

In [None]:
# 50/50 split of the raw paragraph strings (Russian paragraphs first, then
# Italian). The labels (1 = Russia, 0 = Italy) are discarded here.
# NOTE(review): this uses the same test_size/random_state as the
# paragraph-embedding split, presumably so that D_test[i] lines up with
# X_test[i] — confirm, or split texts and embeddings in a single call.
D_train, D_test, _, _ = sklearn.model_selection.train_test_split(
    russia_docs + italy_docs,
    [1] * len(russia_docs) + [0] * len(italy_docs),
    test_size=0.5,
    random_state=42,
)

In [None]:
# 50/50 split of the paragraph embeddings and their labels
# (1 = Russia, 0 = Italy). Rows are stacked Russian-first so they match the
# order in which the label list is built.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    np.vstack([embeddings_rus, embeddings_ita]),
    [1] * len(russia_docs) + [0] * len(italy_docs),
    test_size=0.5,
    random_state=42,
)

## Paragraph classification

In [None]:
# 5-nearest-neighbour classifier on paragraph embeddings, scored with F1
# (positive class = 1, i.e. Russia) on the held-out half.
neigh = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
outputs = neigh.predict(X_test)
f1_5nn = sklearn.metrics.f1_score(y_true=y_test, y_pred=outputs)
f1_5nn

In [None]:
# 1-nearest-neighbour variant of the paragraph classifier, scored with F1
# on the same held-out half.
neigh1 = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)
outputs1 = neigh1.predict(X_test)
f1_1nn = sklearn.metrics.f1_score(y_true=y_test, y_pred=outputs1)
f1_1nn

In [None]:
# 3-nearest-neighbour variant of the paragraph classifier, scored with F1
# on the same held-out half.
neigh3 = KNeighborsClassifier(n_neighbors=3)
# The estimator has to be fitted before it can predict; calling predict()
# on an unfitted KNeighborsClassifier raises NotFittedError.
neigh3.fit(X_train, y_train)
outputs3 = neigh3.predict(X_test)
f1_3nn = sklearn.metrics.f1_score(y_test, outputs3)
f1_3nn

In [None]:
# Recompute and display the 5-NN paragraph F1 (from `outputs`) so it can be
# read next to the 1-NN and 3-NN scores.
f1_5nn = sklearn.metrics.f1_score(y_true=y_test, y_pred=outputs)
f1_5nn

In [None]:
# Number of paragraphs in the held-out test split.
len(y_test)

In [None]:
# Number of test paragraphs for which the 5-NN prediction differs from the label.
sum(predicted != expected for predicted, expected in zip(outputs, y_test))

## Error analysis

In [None]:
# Print every misclassified test paragraph together with the label the
# 5-NN classifier predicted for it (1 -> "Russie", 0 -> "Pas Russie").
for paragraph, predicted, expected in zip(D_test, outputs, y_test):
    if predicted == expected:
        continue
    print(paragraph)
    print('Predicted:', "Russie" if predicted else "Pas Russie")

In [None]:
# Number of sentences in the training split (z_train holds the labels for Z_train).
len(z_train)

## Sentence classification

In [None]:
# 5-nearest-neighbour classifier on sentence embeddings, scored with F1
# (positive class = 1, i.e. Russia) on the held-out half.
neigh_sent = KNeighborsClassifier(n_neighbors=5).fit(Z_train, z_train)
outputs_sent = neigh_sent.predict(Z_test)
fsent_5nn = sklearn.metrics.f1_score(y_true=z_test, y_pred=outputs_sent)
fsent_5nn

In [None]:
# 3-nearest-neighbour classifier on sentence embeddings, scored with F1 on
# the held-out half.
# `neigh_sent` and `outputs_sent` are deliberately reused and now refer to
# the 3-NN model and its predictions.
neigh_sent = KNeighborsClassifier(n_neighbors=3)
neigh_sent.fit(Z_train, z_train)
outputs_sent = neigh_sent.predict(Z_test)
# Store the score under its own name: writing it to `fsent_5nn` mislabelled
# the 3-NN result and silently overwrote the real 5-NN score.
fsent_3nn = sklearn.metrics.f1_score(z_test, outputs_sent)
fsent_3nn

In [None]:
# 9-nearest-neighbour classifier on sentence embeddings, scored with F1 on
# the held-out half.
# `neigh_sent` and `outputs_sent` are deliberately reused and now refer to
# the 9-NN model and its predictions.
neigh_sent = KNeighborsClassifier(n_neighbors=9)
neigh_sent.fit(Z_train, z_train)
outputs_sent = neigh_sent.predict(Z_test)
# Store the score under its own name: writing it to `fsent_5nn` mislabelled
# the 9-NN result and silently overwrote the real 5-NN score.
fsent_9nn = sklearn.metrics.f1_score(z_test, outputs_sent)
fsent_9nn