# Example of journal analysis

This document contains a set of analyses performed on the entries of my diary from 2019-2020.

In [None]:
%load_ext autoreload
%autoreload 2

from TexSoup import TexSoup
import glob
import pandas as pd

from obsidianizer.latex_tools.utils import load_drafts_entries, save_cleaned_sentences_to_latex, print_differences_in_journals

from obsidianizer.nlp.bow import generate_word_cloud_image

from obsidianizer.latex_tools.plots import get_statistics_email_draft
from obsidianizer.latex_tools.journal_processing import get_sentences, get_translations, get_autocorrections, get_stems, get_lemmatizations
from obsidianizer.nlp.translation import get_translator, get_journal_translator
from obsidianizer.journal.summary_plots import get_size_of_writings_per_entry_figs, get_amount_writing_figs
import datetime as dt

from obsidianizer.journal.summary_plots import get_ngrams_figs 
from obsidianizer.nlp.text_cleanup import join_list_of_lists_of_strings

from obsidianizer.plots.utils import charts_dict_to_tab

from obsidianizer import EXAMPLE_JOURNAL_PATH, EXAMPLE_CLEANED_JOURNAL_PATH, JOURNALS_PATH, EXAMPLE_CLEANED_JOURNAL_PATH

## Load email draft items from file

The following cells show how to load the items generated by the email function.

In [None]:
# Input file for the analysis: the example journal path constant exported by
# the obsidianizer package.
filepath = EXAMPLE_JOURNAL_PATH

In [None]:
# Load the journal entries from `filepath`. The result is bound to
# `journal_df`, which the later cells read and rebind.
# NOTE(review): presumably a pandas DataFrame with one row per entry — confirm
# against load_drafts_entries.
journal_df = load_drafts_entries(filepath)
# Bare expression as the last line of the cell so the notebook renders it.
journal_df

# 1. Preprocess entries

We need to preprocess the entries properly. This includes:
- Splitting the entry text into sentences.
- Autocorrecting words (no matter how bad this is).
- Translating into a common language (English).
- Tokenizing the words.

### Split into sentences

In [None]:
# Split the entry text into sentences; the returned object replaces
# `journal_df`.
journal_df = get_sentences(journal_df)
# Render the updated frame as the cell output.
journal_df

### Translation and autocorrection

In [None]:
# Translate the journal text and rebind `journal_df` to the result.
# NOTE(review): target language is presumably English, as stated in the
# preprocessing notes of this notebook — confirm in get_translations.
journal_df = get_translations(journal_df)

In [None]:
# Autocorrection step, currently disabled: the two lines below are left
# commented out, so `journal_df` is not autocorrected in this run.
#journal_df = get_autocorrections(journal_df)
#journal_df

### Lemmatization and stemming

In [None]:
# Apply the lemmatization step and rebind `journal_df` to the result.
journal_df = get_lemmatizations(journal_df)

In [None]:
# Apply the stemming step and rebind `journal_df` to the result.
# NOTE(review): presumably this is what produces the "stemming" column read by
# the word-cloud cell further down — confirm in get_stems.
journal_df = get_stems(journal_df)

### Show final dataframe

In [None]:
# Render the fully preprocessed frame as the cell output.
journal_df

# 2. Basic statistics of the entries

## 2.1 Timeseries of journal entries

In [None]:
# Build the statistics chart for the journal entries; it is rendered in the
# next cell.
statistics_chart = get_statistics_email_draft(journal_df)

In [None]:
# Render the chart built in the previous cell.
statistics_chart.show()

## 2.2 Amount of writing by categories

In [None]:
# Build the "amount of writing" figures from the journal.
# NOTE(review): presumably a dict of charts, since it is passed to
# charts_dict_to_tab in the next cell — confirm in get_amount_writing_figs.
amount_writting_charts = get_amount_writing_figs(journal_df)

In [None]:
# Convert the charts built in the previous cell into a tabbed view.
amount_writting_tabs = charts_dict_to_tab(amount_writting_charts)

In [None]:
# Render the tabbed charts as the cell output.
amount_writting_tabs

In [None]:
# Build the size-of-writings-per-entry figures and show them as tabs in one
# step; the result is the cell output and is not stored in a variable.
charts_dict_to_tab(get_size_of_writings_per_entry_figs(journal_df))

## 2.3 N-gram analysis

In [None]:
# Build the n-gram figures for n = 2, 3, 4 and 5, using only the first 200
# rows of the journal.
charts = get_ngrams_figs(
    journal_df.iloc[:200],
    n_list=[2, 3, 4, 5],
)

In [None]:
# Show the n-gram figures as tabs; the result is the cell output.
charts_dict_to_tab(charts)

## Word-cloud

Generate a word cloud from the entries.

In [None]:
# Take the "stemming" column of the first 1000 rows and join it into the
# single value passed to the word-cloud generator in the next cell.
# NOTE(review): presumably each cell holds a list of stemmed strings and the
# result is one string — confirm in join_list_of_lists_of_strings.
all_joined_text = join_list_of_lists_of_strings(journal_df.iloc[:1000]["stemming"])

In [None]:
# Generate the word cloud from the joined text and the image at the given
# relative path.
# NOTE(review): the image is presumably used as the cloud's mask/shape —
# confirm in generate_word_cloud_image.
word_cloud = generate_word_cloud_image(
    "../../img/ying_jang.png",
    all_joined_text,
)

In [None]:
# Convert the word cloud to an image and show it inline.
# NOTE(review): `display` is not imported in this notebook; presumably it is
# the built-in that IPython/Jupyter injects into the namespace — confirm if
# this code is ever run as a plain script.
display(word_cloud.to_image())