Inspired by https://easychair.org/publications/preprint/dKWn/open

## Imports

In [None]:
from collections import OrderedDict
import numpy as np
import matplotlib.pyplot as plt
from plotly import graph_objects as go

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE

In [None]:
from mogra.datatypes import SSwar
from mogra.raagfinder.parse import RAAG_DB

## Vectorize

In [None]:
def _mukhyanga_document(raag):
    """Flatten one raag's "mukhyanga" phrases into a single text document.

    Symbols inside a phrase are joined with single spaces; non-empty phrases
    are joined with " . " so the phrase boundary survives as its own token.
    """
    phrase_texts = (
        " ".join(phrase)
        for phrase in RAAG_DB[raag]["mukhyanga"]
        if len(phrase) > 0
    )
    return " . ".join(phrase_texts)


# One document per RAAG_DB entry, kept in RAAG_DB's key order so that row i of
# any matrix built from these documents corresponds to the i-th key.
raagdocs = OrderedDict(
    (raag, _mukhyanga_document(raag)) for raag in RAAG_DB.keys()
)

In [None]:
# raagdocs

In [None]:
# tfidf = TfidfVectorizer(
#     analyzer="word",
#     ngram_range=(2, 4),
#     lowercase=False,
#     token_pattern=r"\S+",
#     preprocessor=None,
#     tokenizer=None,
#     strip_accents=None,
#     # stop_words=["\n", "X"]
# )

# X = tfidf.fit_transform([
#     'S R . ,D S . S R . `S D',
#     'S R ,D S . S R S D',
# ])
# # get the vocabulary and idfs
# vocab = tfidf.vocabulary_
# idf = tfidf.idf_


# sorted_idf = sorted(zip(vocab.keys(), idf), key=lambda x: x[1], reverse=True)
# plt.plot([x[1] for x in sorted_idf])
# plt.xticks(
#     range(len(sorted_idf)),
#     [x[0] for x in sorted_idf],
#     rotation=90,
# )
# plt.xlabel("N-grams")
# plt.ylabel("IDF")


In [None]:
# TF-IDF over n-grams of 2 to 5 consecutive symbols.
# - token_pattern r"\S+" makes every whitespace-delimited symbol a token, so
#   the "." phrase separators and prefixed symbols such as ",D" / "`S" are
#   kept intact.
# - lowercase=False keeps the symbols exactly as written.
# preprocessor, tokenizer and strip_accents are left at their library
# defaults (None).
tfidf = TfidfVectorizer(
    analyzer="word",
    token_pattern=r"\S+",
    ngram_range=(2, 5),
    lowercase=False,
)

# Rows of X follow the order of raagdocs; columns are n-grams.
X = tfidf.fit_transform(raagdocs.values())

# vocab maps n-gram -> column index in X; idf[j] is the IDF weight of column j.
vocab, idf = tfidf.vocabulary_, tfidf.idf_

In [None]:
# `idf` is ordered by column index, while `vocab` (n-gram -> column index)
# iterates in insertion order. Put the n-grams in column order first so each
# one is paired with its own IDF weight.
terms_by_column = sorted(vocab, key=vocab.get)

# (n-gram, idf) pairs, rarest n-grams (highest IDF) first.
sorted_idf = sorted(
    zip(terms_by_column, idf),
    key=lambda pair: pair[1],
    reverse=True,
)

# IDF curve across the vocabulary, highest to lowest.
plt.plot([weight for _, weight in sorted_idf])

In [None]:
# The highest IDF value belongs to the n-grams that occur in the fewest
# documents; those columns are dropped from the feature matrix.
max_idf = idf.max()

# Column indices of X whose IDF is below the maximum. The mask is taken on
# `idf` itself (which is in column order) so that the indices really address
# columns of X, not positions in an IDF-sorted list.
indices = np.flatnonzero(idf < max_idf).tolist()

# Dense matrix restricted to the retained columns (one row per raag document).
X_f = X[:, indices].toarray()

## tSNE

In [None]:
# 2-D t-SNE embedding of the filtered TF-IDF rows.
# The iteration count is deliberately left at the library default (1000, the
# value previously passed explicitly): the keyword was renamed from `n_iter`
# to `max_iter` in scikit-learn 1.5 and the old name was later removed, so
# omitting it keeps this cell working on both older and newer versions.
tsne = TSNE(
    n_components=2,
    perplexity=8,  # must stay below the number of rows in X_f
    random_state=0,  # fixed seed so the layout is reproducible
    verbose=1,
)
X_embedded = tsne.fit_transform(X_f)

In [None]:
# Scatter plot of the 2-D embedding, one labelled point per raag document.
raag_labels = list(raagdocs.keys())
embedding_trace = go.Scatter(
    x=X_embedded[:, 0],
    y=X_embedded[:, 1],
    mode="markers+text",
    text=raag_labels,
    textposition="top center",
    marker={"size": 5},
    textfont={"size": 8},  # small font to limit label overlap
)

fig = go.Figure()
fig.add_trace(embedding_trace)
fig.show()

## Test

In [None]:
# Two test sequences for "bhoop", in the same space-separated notation as the
# raag documents.

# sargam
# Adjacent string literals are concatenated as-is, so every fragment that is
# followed by another one must end with a space; otherwise two symbols fuse
# into a single bogus token (e.g. ".S").
bhoop_sargam_seq = (
    "G G R G R S . S ,D S . "
    "S P G . S ,D S . "
    "S G R . R P G . G D P . P `S D . D R `S . `S D `S . "
    "`S P D G P R G . "
    "S P G . S ,D S . "
    "G P R G R S"
)

# transcription!
bhoop_transcription_seq = "S G R S ,D S R S R G R G P D P G P D P G S P G R S R S R G R G R G P G R G P ,D D P D `S D P G R G P D P G R G R S R G R G R ,D R S ,D S ,D S R S ,D S R G P D P G"

# Sequence used by the following cells; switch to bhoop_sargam_seq to test
# with the sargam phrases instead.
bhoop_seq = bhoop_transcription_seq

In [None]:
# Project the test sequence into the already-fitted TF-IDF space (transform
# only, no refit) and take the single resulting row as a dense 1-D vector
# over the full vocabulary.
bhoop_tfidf_matrix = tfidf.transform([bhoop_seq])
bhoopseq_transform = bhoop_tfidf_matrix.toarray()[0]

In [None]:
# Euclidean distance from the bhoop test vector to every raag row, measured
# on the same filtered columns that were used to build X_f.
bhoop_filtered = bhoopseq_transform[indices]
dist_array = np.array([
    np.linalg.norm(bhoop_filtered - raag_row)
    for raag_row in X_f
])

# Rank raags from nearest to farthest and print them, leaving out the
# "all" and "none" entries.
sorted_dist = sorted(
    zip(list(raagdocs.keys()), dist_array),
    key=lambda name_and_dist: name_and_dist[1],
)
for raag, dist in sorted_dist:
    if raag not in ["all", "none"]:
        print(f"distance from {raag}: {dist:.2f}")