In [1]:
import spacy
import nltk
import os
import json
import glob
from pathlib import Path
from itertools import zip_longest
import pandas

from math import sqrt, pow, exp


# Fetch the NLTK "stopwords" corpus into the local nltk_data directory; the
# call reports "already up-to-date" and returns True when nothing needs fetching.
nltk.download("stopwords")

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sergeipetrov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Load the `ru_core_news_lg` spaCy pipeline; `nlp` is the parser the helper
# functions in this notebook call on every text.
nlp = spacy.load("ru_core_news_lg")

# Russian stop-word collection from spaCy's language data.
# NOTE(review): `spacy.lang.ru` is reached by attribute access without an
# explicit import; presumably it is imported as a side effect of `spacy.load`
# above. Confirm on a fresh kernel.
spacy_ru_stopwords = spacy.lang.ru.STOP_WORDS

# Flag each of those words as a stop word on its entry in the pipeline vocabulary.
for word in spacy_ru_stopwords:
    nlp.vocab[word].is_stop = True



In [3]:
def json_to_txt(path: str, segments_key=None) -> None:
    """Write the joined segment texts of a JSON file to a sibling ``.txt`` file.

    The JSON document is either a list of segment objects or a mapping that
    holds such a list under ``segments_key``. Every segment must have a
    ``'text'`` entry; the texts are joined with single spaces and written to
    a file with the same stem and a ``.txt`` extension in the same directory.

    Args:
        path: Path to the JSON file (relative or absolute).
        segments_key: Key under which the segment list is stored. When the
            key is not found in the loaded document, the document itself is
            used as the segment list.

    Raises:
        KeyError: If a segment has no ``'text'`` entry.
    """
    source = Path(os.path.abspath(path))

    # Read and write as UTF-8 explicitly: relying on the platform default
    # encoding corrupts (or fails on) Cyrillic text on non-UTF-8 locales.
    with open(source, 'r', encoding='utf-8') as f:
        data = json.load(f)

    segments = data[segments_key] if segments_key in data else data

    content = ' '.join(segment['text'] for segment in segments)
    with open(source.parent / f'{source.stem}.txt', 'w', encoding='utf-8') as f:
        f.write(content)


def convert_jsons_in_folder(folder: str):
    """Convert every ``.json`` file under ``folder`` (recursively) to ``.txt``.

    Each file found is handed to ``json_to_txt`` with
    ``segments_key='segments'``.

    Args:
        folder: Directory to search, with or without a trailing separator.
    """
    # Join the parts so '**' is always its own path component. Plain string
    # concatenation turns 'dir' + '**/*.json' into 'dir**/*.json', which is
    # not recursive and also matches sibling directories whose names merely
    # start with 'dir'.
    pattern = os.path.join(folder, '**', '*.json')
    for file_path in glob.glob(pattern, recursive=True):
        json_to_txt(file_path, segments_key='segments')



In [4]:
# Pass each folder with a trailing slash, the same form used for these folders
# everywhere else in this notebook.
convert_jsons_in_folder("./experiments/test1_noised/")
convert_jsons_in_folder("./experiments/test2_noised/")
# convert_jsons_in_folder("./experiments/test2/")
# convert_jsons_in_folder("./experiments/test3/")
# convert_jsons_in_folder("./experiments/test4/")
# convert_jsons_in_folder("./experiments/test5_full_day1/")

In [5]:
from collections import defaultdict
from pprint import pprint


def remove_punctuation(text: str) -> str:
    """Return ``text`` with every '?', '!', '.' and ',' character removed."""
    return text.translate(str.maketrans('', '', '?!.,'))

def process_text(text: str) -> list[str]:
    """Lemmatize ``text`` with the notebook's spaCy pipeline.

    Punctuation is stripped with ``remove_punctuation`` first; the result
    holds one lemma string per token, in document order.
    """
    doc = nlp(remove_punctuation(text))
    lemmas = []
    for token in doc:
        lemmas.append(token.lemma_)
    return lemmas


def spacy_sim(example: str, to_validate: str) -> float:
    """Similarity of two texts as reported by spaCy's ``Doc.similarity``.

    Both texts have punctuation removed and are parsed with ``nlp`` before
    the reference document is compared with the candidate document.
    """
    reference_doc, candidate_doc = (
        nlp(remove_punctuation(raw_text)) for raw_text in (example, to_validate)
    )
    return reference_doc.similarity(candidate_doc)


def jaccard_similarity(example: str, to_validate: str, is_raw: bool = True) -> float:
    """Jaccard index ``|A & B| / |A | B|`` of the two texts' token sets.

    Args:
        example: Reference text.
        to_validate: Text compared against the reference.
        is_raw: When True, tokens are the whitespace-separated words of each
            text exactly as written. When False, tokens are the lemmas
            returned by ``process_text``.

    Returns:
        A value between 0.0 and 1.0. Two texts that both yield no tokens are
        treated as identical and score 1.0.
    """
    if is_raw:
        # str.split() without arguments never yields empty strings, so no
        # extra filtering is needed.
        x = set(example.split())
        y = set(to_validate.split())
    else:
        x = set(process_text(example))
        y = set(process_text(to_validate))

    union = x | y
    if not union:
        # Both token sets are empty: return the conventional value for two
        # empty sets instead of dividing 0 by 0.
        return 1.0
    return len(x & y) / len(union)


def squared_sum(x) -> float:
    """Return the Euclidean (L2) norm of ``x`` rounded to three decimal places.

    Despite the name, this is the square root of the sum of squares, not the
    sum of squares itself.
    """
    total_of_squares = sum(value * value for value in x)
    norm = sqrt(total_of_squares)
    return round(norm, 3)


def euclidean_sim(example: str, to_validate: str) -> float:
    """Turn the Euclidean distance between two document vectors into a score.

    Each text is stripped of punctuation and parsed with ``nlp``. The score
    is ``1 / exp(d)``, where ``d`` is the Euclidean distance between the two
    ``Doc.vector`` values: identical vectors score 1.0 and the score decays
    toward 0 as the distance grows.
    """
    example_vec = nlp(remove_punctuation(example)).vector
    candidate_vec = nlp(remove_punctuation(to_validate)).vector

    # Component-wise squared differences, summed and square-rooted.
    squared_gaps = (pow(u - v, 2) for u, v in zip(example_vec, candidate_vec))
    gap = sqrt(sum(squared_gaps))
    return 1 / exp(gap)


def cosine_sim(example: str, to_validate: str) -> float:
    """Cosine similarity between the spaCy document vectors of two texts.

    Each text is stripped of punctuation and parsed with ``nlp``; the
    similarity is the dot product of the two ``Doc.vector`` values divided by
    the product of their norms as returned by ``squared_sum``.

    Args:
        example: Reference text.
        to_validate: Text compared against the reference.

    Returns:
        The cosine similarity, or 0.0 when the product of the norms is zero
        (for example when a text yields an all-zero vector), so the division
        is never 0/0.
    """
    x = nlp(remove_punctuation(example)).vector
    y = nlp(remove_punctuation(to_validate)).vector

    denominator = squared_sum(x) * squared_sum(y)
    if denominator == 0:
        # A zero-length vector has no direction; report "no similarity"
        # rather than producing NaN or raising on division by zero.
        return 0.0

    numerator = sum(a * b for a, b in zip(x, y))
    return numerator / float(denominator)


def find_sim(example_path: str, to_validate_path: str) -> dict:
    """Score one candidate text file against a reference text file.

    Args:
        example_path: Path to the reference text file.
        to_validate_path: Path to the text file being evaluated.

    Returns:
        A dict with the candidate file's stem under ``"name"`` and one entry
        per metric: ``"spacy"``, ``"jaccard_raw"``, ``"jaccard_process"``,
        ``"euclidean"`` and ``"cosine"``.
    """
    # Read both files as UTF-8 explicitly so Cyrillic text is decoded the
    # same way regardless of the platform's default encoding.
    with open(example_path, 'r', encoding='utf-8') as f:
        example_text = f.read()

    with open(to_validate_path, 'r', encoding='utf-8') as f:
        to_validate_text = f.read()

    name = Path(os.path.abspath(to_validate_path)).stem

    return {
        "name": name,
        "spacy": spacy_sim(example=example_text, to_validate=to_validate_text),
        "jaccard_raw": jaccard_similarity(example=example_text, to_validate=to_validate_text, is_raw=True),
        "jaccard_process": jaccard_similarity(example=example_text, to_validate=to_validate_text, is_raw=False),
        "euclidean": euclidean_sim(example=example_text, to_validate=to_validate_text),
        "cosine": cosine_sim(example=example_text, to_validate=to_validate_text),
    }

def find_sims(example_path: str, folder: str) -> list[dict]:
    """Score every ``.txt`` file under ``folder`` (recursively) against a reference.

    Args:
        example_path: Path to the reference text file.
        folder: Directory to search, with or without a trailing separator.

    Returns:
        One ``find_sim`` result dict per text file found, in the order
        ``glob`` yields the files.
    """
    # Join the parts so '**' is always its own path component. Concatenating
    # a folder without a trailing slash would produce 'dir**/*.txt', which is
    # not recursive and also matches sibling directories sharing the prefix.
    pattern = os.path.join(folder, '**', '*.txt')
    return [
        find_sim(example_path, file_path)
        for file_path in glob.glob(pattern, recursive=True)
    ]


In [6]:
# (reference text, folder of candidate texts) pairs to evaluate.
data_to_compare = [
    ("./nir_audio/example_1.txt", "./experiments/test1_noised/"),
    ("./nir_audio/example_2.txt", "./experiments/test2_noised/"),
]

frames = []

for example, folder in data_to_compare:
    sim_data = find_sims(example_path=example, folder=folder)

    # Transpose the per-file result dicts into one list of values per column.
    data_for_pandas = defaultdict(list)
    for d in sim_data:
        for k, v in d.items():
            data_for_pandas[k].append(v)

    df = pandas.DataFrame(data_for_pandas)
    frames.append(df)
    # os.path.join avoids the doubled slash that f"{folder}/result.csv"
    # produced for folders that already end with '/'.
    df.to_csv(os.path.join(folder, "result.csv"), decimal=',')

# ignore_index renumbers the rows 0..N-1; without it each per-folder frame
# keeps its own 0-based labels and the combined CSV has duplicate row labels.
df_combined = pandas.concat(frames, ignore_index=True)
df_combined.to_csv("./experiments/noised_combined_result.csv", decimal=',')