In [None]:
%load_ext autoreload
%autoreload 2

from TexSoup import TexSoup
import glob
import pandas as pd

from obsidianizer.latex_tools.utils import load_drafts_entries, save_cleaned_sentences_to_latex, print_differences_in_journals
from obsidianizer.latex_tools.journal_processing import get_sentences
from obsidianizer.nlp.bow import generate_word_cloud_image
from obsidianizer.latex_tools.plots import get_statistics_email_draft
from obsidianizer.nlp.translation import get_translator, get_journal_translator
import datetime as dt
from obsidianizer.nlp.text_cleanup import n_grams_function
from obsidianizer.obsidian.journal_tools import create_obsidian_files_from_journal

from obsidianizer.nlp.text_cleanup import get_most_used_words, remove_stop_words
from obsidianizer.obsidian.vault import load_vault


from obsidianizer.nlp.text_cleanup import filter_entries_by_languages

## Load item email drafts from file

The following cells show how to load the items generated by the email function.

In [None]:
# Relative path to the drafts text file; the next cell passes it to load_drafts_entries.
filepath = "../../../knowledge/Randiary.txt"

In [None]:
# Load the draft entries from `filepath`.
# NOTE(review): the result is presumably a pandas DataFrame (a later cell slices it
# with .iloc) — confirm against load_drafts_entries.
journal_df = load_drafts_entries(filepath)
# Bare last expression so the notebook renders the loaded entries.
journal_df

In [None]:
# Keep only the first 1000 rows (positional slice); later cells work on this subset.
journal_df = journal_df.iloc[:1000]

# 1. Preprocess entries

We need to preprocess the sentences properly. This includes:
- Dividing the entry text into sentences.
- Autocorrecting words (no matter how bad this is).
- Translating into a common language (English).
- Tokenizing the words.

### Split into sentences

In [None]:
# Split each entry into sentences.
# NOTE(review): presumably this adds the "sentences" column that later cells read —
# confirm in get_sentences.
journal_df = get_sentences(journal_df)
journal_df

In [None]:
# Filter the entries by language, passing ["en"] with mode "all".
# NOTE(review): presumably this keeps only entries whose sentences are all detected
# as English — confirm in filter_entries_by_languages.
journal_df = filter_entries_by_languages(journal_df, ["en"], mode  = "all")
journal_df

In [None]:
import itertools
# Flatten the per-entry values of the "sentences" column into one flat list; this list
# is later passed to build_lda as the training corpus.
# Assumes each value is an iterable of sentence strings — TODO confirm.
x_train = list(itertools.chain.from_iterable(journal_df["sentences"]))
x_train

# Tokenization

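We will tokenize the sentences of each entry with `nltk_tokenizer` and store the result in a new `tokenized_sentences` column.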

In [None]:
from obsidianizer.nlp.tokenizers import nltk_tokenizer

In [None]:
# Apply nltk_tokenizer to each entry's "sentences" value and store the result in the
# "tokenized_sentences" column.
journal_df["tokenized_sentences"] = journal_df["sentences"].apply(nltk_tokenizer)

In [None]:
# Display the tokenized column to inspect the tokenizer output.
journal_df["tokenized_sentences"]

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

def build_lda(x_train, num_of_topic=10):
    """Fit a bag-of-words LDA topic model on a collection of text documents.

    Args:
        x_train: Iterable of raw text documents (strings) that is handed to
            ``CountVectorizer.fit_transform``.
        num_of_topic: Number of topics (``n_components``) of the LDA model.

    Returns:
        A tuple ``(lda, vec, feature_names)`` with the fitted
        ``LatentDirichletAllocation`` model, the fitted ``CountVectorizer`` and
        the list of vocabulary terms, indexed like the columns of the
        document-term matrix (and therefore like ``lda.components_``).
    """
    vec = CountVectorizer()
    transformed_x_train = vec.fit_transform(x_train)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
    # Prefer the replacement and fall back for older installations; .tolist()
    # keeps the return value a plain list, as the old method returned.
    if hasattr(vec, "get_feature_names_out"):
        feature_names = vec.get_feature_names_out().tolist()
    else:
        feature_names = vec.get_feature_names()

    # random_state is fixed so repeated runs give the same topics.
    lda = LatentDirichletAllocation(n_components=num_of_topic, max_iter=5, learning_method="online", random_state=0)
    lda.fit(transformed_x_train)

    return lda, vec, feature_names


def display_word_distribution(model, feature_names, n_word):
    """Print the highest-weighted words of every topic in a fitted topic model.

    Args:
        model: Object exposing ``components_``, an iterable with one weight
            array per topic (each array must support ``argsort``).
        feature_names: Sequence mapping a column index to its word.
        n_word: Number of top words to print for each topic.

    Returns:
        None. For each topic a ``Topic <idx>:`` header and the list of words
        are written to standard output.
    """
    for topic_idx, weights in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        # argsort is ascending, so reverse it and keep the leading n_word indices.
        top_indices = weights.argsort()[::-1][:n_word]
        print([feature_names[idx] for idx in top_indices])



In [None]:
# Fit the LDA model on the flattened sentences (default of 10 topics), then print the
# 5 highest-weighted words of each topic.
lda_model, vec, feature_names = build_lda(x_train)
display_word_distribution(model=lda_model, feature_names=feature_names, n_word=5)

In [None]:
from umap import UMAP
import plotly.express as px

# Demo on plotly's bundled iris sample data, not on the journal entries loaded earlier.
df = px.data.iris()

# Label-based slice: keep every column up to and including 'petal_width'.
features = df.loc[:, :'petal_width']

# One reducer per target dimensionality; random_state is fixed on both.
umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)

proj_2d = umap_2d.fit_transform(features)
proj_3d = umap_3d.fit_transform(features)

# Scatter the projected coordinates (columns 0, 1 and, for 3D, 2), coloured by species.
fig_2d = px.scatter(
    proj_2d, x=0, y=1,
    color=df.species, labels={'color': 'species'}
)
fig_3d = px.scatter_3d(
    proj_3d, x=0, y=1, z=2,
    color=df.species, labels={'color': 'species'}
)
# Set the marker size of the 3D traces to 5.
fig_3d.update_traces(marker_size=5)

fig_2d.show()
fig_3d.show()

In [None]:
from sklearn.manifold import TSNE
import plotly.express as px

# Same demo as the UMAP cell, again on plotly's bundled iris sample data.
# Note: this rebinds the notebook-level names `df` and `features`.
df = px.data.iris()

# Label-based slice: keep every column up to and including 'petal_width'.
features = df.loc[:, :'petal_width']

# 2-component t-SNE with a fixed random_state.
tsne = TSNE(n_components=2, random_state=0)
projections = tsne.fit_transform(features)

# Scatter the two projected coordinates, coloured by species.
fig = px.scatter(
    projections, x=0, y=1,
    color=df.species, labels={'color': 'species'}
)
fig.show()

In [None]:
import spacy
# Load the 'en_core_web_md' spaCy model (it must already be installed in the environment).
nlp = spacy.load('en_core_web_md')
# Process a sample sentence with the model.
doc = nlp("This is some text that I am processing with Spacy", )
# The tokens of `doc` expose a `.vector` attribute, which later cells read.
# The vector for 'text' is inspected in a later cell via doc[3].vector
# (presumably index 3 is the token 'text' — confirm against the tokenization).


In [None]:
import numpy as np
# Stack the per-token vectors of `doc` along axis 1, so tokens index the second axis.
# NOTE(review): assuming each word.vector is 1-D, the result has shape
# (vector_dim, n_tokens) rather than the more common (n_tokens, vector_dim) — confirm
# this orientation is intended.
word_embeddings = np.stack([word.vector for word in doc], axis = 1)
word_embeddings.shape

In [None]:
# Display the vector of the token at index 3 of `doc`.
doc[3].vector