In [None]:
%load_ext autoreload
%autoreload 2

from TexSoup import TexSoup
import glob
import pandas as pd

from obsidianizer.latex_tools.utils import load_drafts_entries, save_cleaned_sentences_to_latex, print_differences_in_journals
from obsidianizer.latex_tools.journal_processing import get_sentences
from obsidianizer.nlp.bow import generate_word_cloud_image
from obsidianizer.latex_tools.plots import get_statistics_email_draft
from obsidianizer.nlp.translation import get_translator, get_journal_translator
import datetime as dt
from obsidianizer.nlp.text_cleanup import n_grams_function
from obsidianizer.obsidian.journal_tools import create_obsidian_files_from_journal

from obsidianizer.nlp.text_cleanup import get_most_used_words, remove_stop_words
from obsidianizer.obsidian.vault import load_vault
import itertools

from obsidianizer.nlp.text_cleanup import filter_entries_by_languages

In [None]:
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler


In [None]:
# Dead branch: `if 0` is always false, so this keras import never executes.
if 0:
    from keras.preprocessing.sequence import pad_sequences

In [None]:
# Select the compute device: CUDA when a GPU is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Defaults for CPU-only machines, so both names exist whichever branch runs.
n_gpu = 0
cuda_core = None

# `device` is a torch.device, not a str: `device == "cuda"` is always False,
# which silently skipped this branch even on GPU machines. Compare `.type`.
if device.type == "cuda":
    n_gpu = torch.cuda.device_count()
    cuda_core = torch.cuda.get_device_name(0)

## Load item email drafts from file

The following cells show how to load the items generated by the email function.

In [None]:
# Location of the drafts text file; the path is relative, so it resolves
# against the directory the notebook kernel is running in.
filepath = "../../../knowledge/Randiary.txt"

In [None]:
# Load the drafts file into `journal_df` and display it.
# NOTE(review): presumably a pandas DataFrame with one row per entry
# (`.iloc` is used on it in the next cell) — confirm against load_drafts_entries.
journal_df = load_drafts_entries(filepath)
journal_df

In [None]:
# Keep only the first 1000 rows (presumably to keep the embedding and
# projection cells below fast — confirm before running on the full journal).
journal_df = journal_df.iloc[:1000]

# 1. Preprocess entries

We need to preprocess the sentences properly. This includes:
- Dividing the entry text into sentences.
- Autocorrecting words (however imperfect the correction may be).
- Translating into a common language (English).
- Tokenizing the words.

In [None]:
# Run the project's sentence extraction, then filter the entries by language ("en").
# NOTE(review): presumably get_sentences adds the "sentences" column read in the
# next cell, and mode="all" keeps entries whose sentences are all English —
# confirm against the helper implementations.
journal_df = get_sentences(journal_df)
journal_df = filter_entries_by_languages(journal_df, ["en"], mode  = "all")

In [None]:
# Flatten the per-entry sentence collections into one flat list of sentences.
x_train = [
    sentence
    for entry_sentences in journal_df["sentences"]
    for sentence in entry_sentences
]
x_train

In [None]:
# The sentences to tokenize are the flattened journal sentences in `x_train`.

# Wrap every sentence in the [CLS] ... [SEP] markers BERT expects.
sentences = ["[CLS] " + query + " [SEP]" for query in x_train]
print(sentences[0])

# Tokenize each wrapped sentence with the 'bert-base-uncased' BERT tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
tokenized_texts = list(map(tokenizer.tokenize, sentences))
print("Tokenize the first sentence:")
print(tokenized_texts[0])

In [None]:
from sentence_transformers import SentenceTransformer
# Sentence-embedding model, loaded by name; used below through `model.encode`.
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

In [None]:
# Encode every sentence in `x_train`; the result feeds the projection and
# clustering cells further down.
sentence_embeddings = model.encode(x_train)

In [None]:
# Inspect the dimensions of the embedding output.
sentence_embeddings.shape

In [None]:
from sklearn.manifold import TSNE
import plotly.express as px

# Alias the sentence embeddings as the generic input of the projection cells.
features = sentence_embeddings

# Reduce the embeddings to 3 components with t-SNE (fixed random_state).
# NOTE(review): `projections` is reassigned by the UMAP cell that follows, so
# this t-SNE result is discarded when the notebook is run top to bottom.
tsne = TSNE(n_components=3, random_state=0)
projections = tsne.fit_transform(features)


In [None]:
from umap import UMAP
import plotly.express as px


# Reduce `features` to 3 components with UMAP (fixed random_state); this
# rebinds `projections`, which the plotting cells below read.
umap_3d = UMAP(n_components=3, init='random', random_state=0)
projections = umap_3d.fit_transform(features)

In [None]:
# Inspect the dimensions of the projection output.
projections.shape

In [None]:
# One row per sentence, with its three projection coordinates alongside.
sentences_df = pd.DataFrame(
    {
        "sentence": x_train,
        "projection_x": projections[:, 0],
        "projection_y": projections[:, 1],
        "projection_z": projections[:, 2],
    }
)
sentences_df

In [None]:
# 2-D scatter of the first two projection components; hovering a point shows
# the sentence it represents.
fig = px.scatter(sentences_df, x="projection_x", y="projection_y",
                 hover_data=["sentence"])

fig.show()

In [None]:
# 3-D scatter of all three projection components; hovering a point shows the
# sentence it represents.
fig_3d = px.scatter_3d(
    sentences_df, x="projection_x", y="projection_y", z="projection_z",
    hover_data=["sentence"],
)
# Shrink the markers so individual points stay distinguishable.
fig_3d.update_traces(marker_size=2)
fig_3d.show()

In [None]:
# Stand-alone UMAP demo on the iris dataset bundled with Plotly Express.
# NOTE(review): this cell rebinds `features`, `umap_3d` and `fig_3d`, which
# earlier cells bound to the journal-sentence analysis. Re-running the earlier
# UMAP cell after this one would project the iris features instead.
from umap import UMAP
import plotly.express as px

df = px.data.iris()

# Label-based slice: every column up to and including 'petal_width'.
features = df.loc[:, :'petal_width']

umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)

proj_2d = umap_2d.fit_transform(features)
proj_3d = umap_3d.fit_transform(features)

# The projections are passed directly, so x/y/z refer to their column positions.
fig_2d = px.scatter(
    proj_2d, x=0, y=1,
    color=df.species, labels={'color': 'species'}
)
fig_3d = px.scatter_3d(
    proj_3d, x=0, y=1, z=2,
    color=df.species, labels={'color': 'species'}
)
fig_3d.update_traces(marker_size=5)

fig_2d.show()
fig_3d.show()

In [None]:
def community_detection(embeddings, threshold=0.75, min_community_size=10, init_max_size=1000):
    """
    Fast community detection over a set of embeddings.

    A community is built around every embedding that has at least
    ``min_community_size`` neighbours (itself included) with cosine
    similarity >= ``threshold``. Overlapping communities are then dropped,
    largest first, so no index appears in more than one returned community.

    Relies on ``util`` (imported from ``sentence_transformers`` at notebook
    level) being in scope when the function is called.

    Args:
        embeddings: The embeddings to cluster; passed twice to
            ``util.pytorch_cos_sim`` to obtain the pairwise similarity matrix.
        threshold: Minimum cosine similarity for two embeddings to count as
            members of the same community.
        min_community_size: Minimum number of members a community must have.
        init_max_size: Number of most-similar entries inspected per row before
            falling back to a full scan. Values larger than the number of
            embeddings are capped to it.

    Returns:
        A list of communities, largest first; each community is a list of row
        indices into ``embeddings``. On the fast path members are ordered by
        decreasing similarity to the community's seed row; on the full-scan
        fallback they are in index order. Returns ``[]`` when there are fewer
        embeddings than ``min_community_size``.
    """
    # Pairwise cosine similarity of every embedding with every other one
    cos_scores = util.pytorch_cos_sim(embeddings, embeddings)
    n_embeddings = len(cos_scores)

    # With fewer embeddings than the minimum size no community can exist.
    # Returning early also keeps topk() from being asked for more entries than
    # a row holds, which raises.
    if n_embeddings < min_community_size:
        return []

    # topk() raises when k exceeds the row length, so cap the look-ahead window
    init_max_size = min(init_max_size, n_embeddings)

    # The k-th best score of each row decides whether that row can seed a community
    top_k_values, _ = cos_scores.topk(k=min_community_size, largest=True)

    # Step 1) Collect a candidate community for every qualifying row
    extracted_communities = []
    for i in range(len(top_k_values)):
        if top_k_values[i][-1] >= threshold:
            new_cluster = []

            # Only check top k most similar entries
            top_val_large, top_idx_large = cos_scores[i].topk(k=init_max_size, largest=True)
            top_idx_large = top_idx_large.tolist()
            top_val_large = top_val_large.tolist()

            if top_val_large[-1] < threshold:
                # The window already reaches below the threshold, so it
                # contains every member: take entries until the first miss.
                for idx, val in zip(top_idx_large, top_val_large):
                    if val < threshold:
                        break

                    new_cluster.append(idx)
            else:
                # The window may be too small: iterate over all entries (slow)
                for idx, val in enumerate(cos_scores[i].tolist()):
                    if val >= threshold:
                        new_cluster.append(idx)

            extracted_communities.append(new_cluster)

    # Largest cluster first
    extracted_communities = sorted(extracted_communities, key=lambda x: len(x), reverse=True)

    # Step 2) Remove overlapping communities
    unique_communities = []
    extracted_ids = set()

    for community in extracted_communities:
        # Keep a community only if none of its members is already assigned
        add_cluster = True
        for idx in community:
            if idx in extracted_ids:
                add_cluster = False
                break

        if add_cluster:
            unique_communities.append(community)
            for idx in community:
                extracted_ids.add(idx)

    return unique_communities

In [None]:
import time
from sentence_transformers import SentenceTransformer, util

print("Start clustering")
start_time = time.time()

# Two parameters to tune:
#   min_community_size: only keep communities with at least this many sentences
#   threshold: sentence pairs with a cosine similarity at or above this value count as similar
clusters = community_detection(sentence_embeddings, threshold=0.81, min_community_size=10)

# Print every community together with the sentences it contains
for i, cluster in enumerate(clusters):
    print(f"\nCluster {i + 1}, #{len(cluster)} Elements ")
    for sentence_id in cluster:
        print("\t", x_train[sentence_id])

print(f"Clustering done after {time.time() - start_time:.2f} sec")


In [None]:
from sklearn.cluster import AgglomerativeClustering

In [None]:


# Agglomerative (hierarchical) clustering of the sentence embeddings into 10 clusters.
# Alternative settings left by the author: affinity='cosine', linkage='average', distance_threshold=0.4
clustering_model = AgglomerativeClustering(n_clusters=10, distance_threshold=None)
clustering_model.fit(sentence_embeddings)
cluster_assignment = clustering_model.labels_

# Group the sentences by the cluster id assigned to them
clustered_sentences = {}
for sentence_id, cluster_id in enumerate(cluster_assignment):
    clustered_sentences.setdefault(cluster_id, []).append(x_train[sentence_id])

# Show each cluster (numbered from 1) followed by its sentences
for i, cluster in clustered_sentences.items():
    print("Cluster ", i + 1)
    print(cluster)
    print("")


In [None]:
# Attach each sentence's cluster label as a column (mutates `sentences_df` in
# place) so the plot below can colour points by cluster.
sentences_df["color"] = cluster_assignment

In [None]:
# Cluster ids are categorical labels. Passed as integers, Plotly Express maps
# them onto a continuous colour scale; passing them as strings gives each
# cluster its own discrete colour and legend entry. The Series is renamed so
# the legend is titled "cluster" and does not clash with the existing column.
fig_3d = px.scatter_3d(
    sentences_df, x="projection_x", y="projection_y", z="projection_z",
    hover_data=["sentence"],
    color=sentences_df["color"].astype(str).rename("cluster"),
)
# Shrink the markers so individual points stay distinguishable.
fig_3d.update_traces(marker_size=2)
fig_3d.show()