In [None]:
import os
import pandas as pd
import json
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

# Sentences of the train split: one list of first-column values per sentence.
all_lemmas = []

for folder in tqdm(['1', '2', '3', '4']):
    folder_path = os.path.join('assets/annotated_corpus/train/', folder)

    for file in os.listdir(folder_path):
        # Only files named annotation*.tsv are read; this check cannot fail,
        # so it lives outside the try block.
        if not (file.endswith('.tsv') and file.startswith('annotation')):
            continue

        file_path = os.path.join(folder_path, file)

        try:
            df = pd.read_csv(file_path, sep='\t', header=None)
            lemma_list = df[0].tolist()
        except Exception as exc:
            # Loading stays best-effort (one bad file must not abort the run),
            # but the failure is reported instead of being silently dropped.
            # Unlike a bare `except:`, this lets KeyboardInterrupt through.
            print(f"Skipping {file_path}: {exc!r}")
            continue

        # A NaN cell in the first column closes the current sentence.
        sentence_lemmas = []
        for lemma in lemma_list:
            if str(lemma) != 'nan':
                sentence_lemmas.append(lemma)
            else:
                all_lemmas.append(sentence_lemmas)
                sentence_lemmas = []

        # Flush the last sentence when the file does not end with a NaN row.
        if len(sentence_lemmas) > 0:
            all_lemmas.append(sentence_lemmas)

In [None]:
import os
import pandas as pd
import json
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

# Sentences of the test split: one list of first-column values per sentence.
all_lemmas_test = []

for folder in tqdm(['1', '2', '3', '4']):
    folder_path = os.path.join('assets/annotated_corpus/test/', folder)

    for file in os.listdir(folder_path):
        # Guard clause: ignore everything that is not an annotation TSV.
        if not (file.startswith('annotation') and file.endswith('.tsv')):
            continue

        file_path = os.path.join(folder_path, file)
        df = pd.read_csv(file_path, sep='\t', header=None)
        lemma_list = df[0].tolist()

        sentence_lemmas = []
        for lemma in lemma_list:
            if str(lemma) == 'nan':
                # A NaN cell closes the current sentence.
                all_lemmas_test.append(sentence_lemmas)
                sentence_lemmas = []
                continue
            sentence_lemmas.append(lemma)

        # Keep the trailing sentence when no NaN row follows it.
        if sentence_lemmas:
            all_lemmas_test.append(sentence_lemmas)

In [None]:
# NOTE(review): this rebinds all_lemmas to the very same list object as
# all_lemmas_test, so every later cell that reads all_lemmas works on the
# test split, and the train sentences loaded above are no longer reachable
# under this name -- confirm this is intended.
all_lemmas = all_lemmas_test

In [None]:
# Flatten the per-sentence lists into a single list of tokens, in corpus order.
all_words = [token for sentence in all_lemmas for token in sentence]

In [None]:
# Preview only: echoing the whole token list would flood the notebook output.
all_words[:20]

#### Delete stop words

In [None]:
import nltk
from nltk.corpus import stopwords
# Build the stop-word set once so every membership test is a hash lookup.
stop_words = set(stopwords.words('english'))

# Keep, in order, every token that is not in the English stop-word set.
filtered = [w for w in all_words if w not in stop_words]


In [None]:
# Preview only: echoing the whole filtered list would flood the notebook output.
filtered[:20]

#### Count words and remove rare ones

In [None]:
# Tally how many times each remaining token occurs.
counts = {}
for word in filtered:
    counts[word] = counts.get(word, 0) + 1

In [None]:
# Tokens that occur fewer than 5 times are treated as rare and dropped.
rear_words = [k for k, v in counts.items() if v < 5]

# Testing membership against the list rescans it for every token (quadratic
# overall); a set gives the same answer with a constant-time lookup.
rear_words_lookup = set(rear_words)

filtered_final = [filt for filt in tqdm(filtered) if filt not in rear_words_lookup]

In [None]:
# Frequency table of the tokens that survived the rare-word filter.
counted_filtered = {}
for word in filtered_final:
    counted_filtered.setdefault(word, 0)
    counted_filtered[word] += 1

In [None]:
# Token frequencies ordered from most to least frequent.
dict(sorted(counted_filtered.items(), key=lambda pair: pair[1], reverse=True))

In [None]:
# Order the frequency table from most to least frequent, then store it as JSON.
sorted_counts = dict(sorted(counted_filtered.items(), key=lambda pair: pair[1], reverse=True))
with open("counted_tokens.json", "w") as file:
    json.dump(sorted_counts, file)

#### tf-idf

In [None]:
from math import log
from tqdm import tqdm

def compute_term_idf(term_document_matrix, terms):
    """Count, for every term, the number of documents that contain it.

    Despite the name, the returned values are plain document frequencies
    (number of rows with a non-zero entry in the term's column); the
    logarithmic inverse is applied later by the caller.

    Args:
        term_document_matrix: 2-D sequence with one row per document and one
            column per entry of ``terms``.
        terms: column labels, in the same order as the matrix columns.

    Returns:
        dict mapping each term to its document count (a Python ``int``).

    Raises:
        IndexError: if the matrix has fewer columns than ``terms`` has entries.
    """
    tdm_np = np.asarray(term_document_matrix)

    # With no documents at all there are no columns to inspect: every term
    # then occurs in zero documents.
    if tdm_np.ndim < 2 and tdm_np.size == 0:
        return {term: 0 for term in terms}

    # One vectorised pass over all columns instead of one slice per term.
    doc_counts = np.count_nonzero(tdm_np, axis=0)

    return {term: int(doc_counts[i]) for i, term in enumerate(terms)}


def create_term_document_matrix(docs, counted_filtered):
    """Build a document-by-term matrix of raw occurrence counts.

    Args:
        docs: iterable of documents, each a list of hashable tokens.
        counted_filtered: mapping whose keys are the vocabulary; only the keys
            are used, and their order fixes the column order.

    Returns:
        (term_document_matrix, terms): a list with one row per document
        holding the count of every term in that document, and the list of
        terms in column order.
    """
    from collections import Counter  # local import: only this function needs it

    terms = list(counted_filtered.keys())
    term_document_matrix = []
    for doc in tqdm(docs):
        # Count the document once instead of rescanning it for every term.
        token_counts = Counter(doc)
        term_document_matrix.append([token_counts[term] for term in terms])

    return term_document_matrix, terms

# Number of sentences (documents) in all_lemmas at the moment this cell runs;
# it is not refreshed if all_lemmas is rebound or modified afterwards.
N = len(all_lemmas)

def compute_tf_idf(term_document_matrix, term_doc_count, terms):
    """Weight raw term counts by the inverse document frequency of each term.

    Every entry becomes ``tf * log(n_docs / df)``, where ``n_docs`` is the
    number of rows of ``term_document_matrix`` and ``df`` is the document
    count of the term taken from ``term_doc_count``.

    Args:
        term_document_matrix: list of rows (one per document) of raw counts,
            with columns aligned with ``terms``.
        term_doc_count: mapping term -> number of documents containing it.
        terms: column labels, in matrix column order.

    Returns:
        list of rows with the tf-idf weights, same layout as the input.
    """
    # Derive the document count from the matrix itself rather than from a
    # module-level variable that can go stale.
    n_docs = len(term_document_matrix)

    # The idf weight depends on the term only, so compute it once per term
    # (from the argument, not a global). A term present in no document gets
    # weight 0 instead of a division by zero; its counts are all 0 anyway.
    idf_weights = []
    for term in terms:
        doc_count = term_doc_count[term]
        idf_weights.append(log(n_docs / doc_count) if doc_count else 0.0)

    tf_idf_matrix = []
    for row in tqdm(term_document_matrix):
        tf_idf_matrix.append([tf * idf for tf, idf in zip(row, idf_weights)])

    return tf_idf_matrix

def get_doc_vector(matrix, sent):
    """Return the row of ``matrix`` at position ``sent``."""
    doc_row = matrix[sent]
    return doc_row

def get_word_vector(matrix, word):
    """Return column ``word`` of ``matrix`` as a NumPy array."""
    dense = np.array(matrix)
    column = dense[:, word]
    return column

**Считаем матрицу term doc**

In [None]:
# Build the sentence-by-term count matrix; `terms` lists the column labels in
# the same order as the matrix columns.
term_document_matrix, terms = create_term_document_matrix(all_lemmas, counted_filtered)

In [None]:
import pickle
# Persist the count matrix and its column labels as two pickle files.
for pickle_path, payload in (
    ("term_document_matrix_tr", term_document_matrix),
    ("terms_tr", terms),
):
    with open(pickle_path, "wb") as fp:
        pickle.dump(payload, fp)

In [None]:
# Heat map of the raw counts: rows are sentences, columns are terms.
fig, ax = plt.subplots(dpi=500)
ax.imshow(np.asarray(term_document_matrix), interpolation='none')
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set(title='Term-document matrix (raw counts)', xlabel='term index', ylabel='sentence index')
plt.show()

Считаем idf для каждого слова

In [None]:
# For each term, the number of sentences with a non-zero count. Despite the
# variable name this is a document frequency; the inverse/log is applied later.
term_idf = compute_term_idf(term_document_matrix, terms)

#### Считаем tf-idf матрицу

In [None]:
# Re-weight every raw count by the log inverse document frequency of its term.
tf_idf_matrix = compute_tf_idf(term_document_matrix, term_idf, terms)

In [None]:
# Heat map of the tf-idf weights: rows are sentences, columns are terms.
fig, ax = plt.subplots(dpi=500)
ax.imshow(np.asarray(tf_idf_matrix), interpolation='none')
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set(title='tf-idf matrix', xlabel='term index', ylabel='sentence index')
plt.show()

In [None]:
# Column 1 of the tf-idf matrix: the weights of the term at index 1 across
# all sentences.
get_word_vector(tf_idf_matrix,1)

In [None]:
# Row 1 of the tf-idf matrix: the weights of all terms for the sentence at
# index 1.
get_doc_vector(tf_idf_matrix,1)

In [None]:
# tf-idf column of 'monday'; terms.index raises ValueError when the word is
# not one of the terms.
vector1 = get_word_vector(tf_idf_matrix,terms.index('monday'))

In [None]:
# tf-idf column of 'wednesday'; terms.index raises ValueError when the word is
# not one of the terms.
vector2 = get_word_vector(tf_idf_matrix,terms.index('wednesday'))

In [None]:
# Cosine similarity of the two tf-idf columns, computed with NumPy directly:
# the notebook's cosine_similarity helper is only defined in a later cell, so
# calling it here raises NameError on a fresh kernel (Restart & Run All).
print(np.dot(vector1, vector2) / (np.linalg.norm(vector1) * np.linalg.norm(vector2)))

### word2vec

In [None]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
# model = Word2Vec(sentences=all_lemmas, vector_size=100, window=3, min_count=1, workers=4)
# model.save("word2vec.model")

In [None]:
# Load a previously saved model from the relative path "word2vec.model". The
# code that would create and save it is commented out in the cell above, so
# the file has to exist already.
model = Word2Vec.load("word2vec.model")

In [None]:
# Continue training the loaded model on all_lemmas for 100 epochs.
# NOTE(review): all_lemmas was rebound to all_lemmas_test earlier and the
# model.save call below is commented out, so this trains on the test
# sentences and the updated weights live only in this kernel -- confirm both
# are intended.
model.train(all_lemmas, total_examples=len(all_lemmas), epochs=100)

In [None]:
# model.save("word2vec.model")

In [None]:
# Look up the embeddings of a few hand-picked words, grouped by theme.
# vector1 and vector2 are rebound here (they held tf-idf columns before).

# Nationalities
vector1, vector2 = (model.wv[word] for word in ('georgian', 'british'))

# War and military vocabulary
vector3, vector4, vector4_1, vector4_2 = (
    model.wv[word] for word in ('war', 'crime', 'soldier', 'military')
)

# Political parties
vector5, vector6 = (model.wv[word] for word in ('republican', 'democrat'))

# Days of the week
vector7, vector8, vector9, vector10 = (
    model.wv[word] for word in ('tuesday', 'monday', 'sunday', 'wednesday')
)

# Countries
vector11, vector12, vector12_1 = (
    model.wv[word] for word in ('afghanistan', 'iraq', 'iran')
)

# Animals
vector13, vector14 = (model.wv[word] for word in ('panda', 'monkey'))

In [None]:
from scipy.spatial import distance
# Cosine similarity of vector1 and vector2, obtained from SciPy's cosine
# distance (similarity = 1 - distance).
1 - distance.cosine(vector1, vector2)

In [None]:
import math
def cosine_similarity(v1,v2):
    """Compute the cosine similarity of v1 to v2: (v1 dot v2) / (||v1|| * ||v2||).

    Args:
        v1, v2: equally long sequences of numbers (lists or 1-D arrays).

    Returns:
        The cosine of the angle between the vectors, or 0.0 when either
        vector has zero norm (the angle is undefined in that case).

    Raises:
        ValueError: if the vectors differ in length.
    """
    # Indexing by len(v1) alone silently ignored the tail of a longer v2.
    if len(v1) != len(v2):
        raise ValueError(
            f"vectors must have the same length, got {len(v1)} and {len(v2)}"
        )

    sumxx, sumxy, sumyy = 0, 0, 0
    for x, y in zip(v1, v2):
        sumxx += x*x
        sumyy += y*y
        sumxy += x*y

    denominator = math.sqrt(sumxx*sumyy)
    # A zero vector (e.g. an all-zero embedding) would otherwise divide by zero.
    if denominator == 0:
        return 0.0
    return sumxy/denominator

In [None]:
# Print the cosine similarity of one word pair per theme, in a fixed order.
for left, right in (
    (vector1, vector2),
    (vector3, vector4),
    (vector7, vector8),
    (vector11, vector12),
    (vector13, vector14),
):
    print(cosine_similarity(left, right))

In [None]:
from sklearn.decomposition import PCA
# Project every vocabulary embedding onto its first two principal components.
words = list(model.wv.key_to_index)
X = [model.wv[word] for word in words]
# A fixed seed keeps the projection reproducible when scikit-learn selects a
# randomized SVD solver; it has no effect on the deterministic solvers.
pca = PCA(n_components=2, random_state=42)
result = pca.fit_transform(X)

In [None]:
# One entry per themed word group: (marker colour, vectors, labels).
word_groups = [
    ('red', [vector1, vector2], ['georgian', 'british']),
    ('blue', [vector3, vector4, vector4_1, vector4_2], ['war', 'crime', 'soldier', 'military']),
    ('orange', [vector5, vector6], ['republican', 'democrat']),
    ('black', [vector7, vector8, vector9, vector10], ['tuesday', 'monday', 'sunday', 'wednesday']),
    ('green', [vector11, vector12], ['afghanistan', 'iraq']),
    ('yellow', [vector13, vector14], ['panda', 'monkey']),
]

plt.figure(dpi=500)

for color, vectors, names in word_groups:
    for vec, name in zip(vectors, names):
        # Project each vector once and reuse the coordinates for both the
        # marker and its label (previously four transform calls per word).
        x, y = pca.transform(vec.reshape(1, -1))[0]
        plt.plot(x, y, 'o', color=color)
        plt.annotate(name, (x, y))

plt.show()

In [None]:
def get_sentence_emb(sent):
    """Return the average word2vec embedding of a tokenised sentence.

    Tokens the model cannot look up contribute a zero vector but still count
    towards the sentence length, so they pull the average towards zero.

    Args:
        sent: sequence of tokens.

    Returns:
        1-D NumPy array with the model's vector size; all zeros for an empty
        sentence.
    """
    # Take the dimensionality from the model instead of hard-coding 100.
    sum_vector = np.zeros(model.wv.vector_size)

    # An empty sentence would divide by zero and yield a vector of NaNs.
    if len(sent) == 0:
        return sum_vector

    for token in sent:
        try:
            emd = model.wv[token]
        except (KeyError, TypeError):
            # Presumably KeyError is an out-of-vocabulary token and TypeError
            # a non-string token (e.g. a float); both are skipped, which
            # equals adding zeros. A bare `except:` would also hide
            # KeyboardInterrupt.
            continue
        sum_vector += emd

    return sum_vector/len(sent)

In [None]:
# Quick check: embedding of the sentence at index 6.
get_sentence_emb(all_lemmas[6])

### Write test data to files

In [None]:
def create_annotation(all_lemmas, output_filename="assets/annotated_corpus/test/embedding_test.tsv"):
    """Write one sentence embedding per line to a tab-separated file.

    Each line holds the sentence index followed by the embedding components;
    every field, including the last one, is followed by a tab.

    Args:
        all_lemmas: iterable of tokenised sentences.
        output_filename: destination path; defaults to the test-split
            embedding file, so existing calls keep writing to the same place.
    """
    with open(output_filename, "w", encoding="utf-8") as f:
        for index, sent in enumerate(all_lemmas):
            embed = get_sentence_emb(sent)
            # The trailing tab after the last value is kept so the file
            # format stays unchanged.
            values = "".join(f"{emb}\t" for emb in embed)
            f.write(f"{index}\t{values}\n")

In [None]:
# Write one embedding row per sentence of the test split.
create_annotation(all_lemmas_test)