"""
Analyze all Joplin notes and print word statistics.
Requirements: pip install joppy markdown beautifulsoup4 nltk
Usage: API_TOKEN=XYZ python note_stats.py
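
Example output (the counts below are placeholders; actual values depend
on your notes):
    Notes: 42
    Words: 12345
    Most common words:
    - note: 99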
"""
import os
import re
import string

from bs4 import BeautifulSoup
from joppy.api import Api
from markdown import Markdown
import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize


def markdown_to_text(markdown_string: str) -> str:
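    """Strip markdown formatting and return plain text.

    The markdown is rendered to HTML first so that code blocks, inline
    code and $...$ formulas can be removed with regexes; BeautifulSoup
    then extracts the text, and URLs are stripped from the result.
    """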
    # Convert markdown to HTML.
    md = Markdown(extensions=["nl2br", "sane_lists", "tables"])
    html = md.convert(markdown_string)
    exclude_patterns_html = (
        r"<pre>.*?<\/pre>",  # code blocks
        r"<code>.*?<\/code>",  # inline code
        r"\$.*?\$",  # formulas (https://meta.stackexchange.com/a/263344)
    )
    for pattern in exclude_patterns_html:
        html = re.sub(pattern, " ", html, flags=re.DOTALL)
    # Convert HTML to plain text.
    text = BeautifulSoup(html, "html.parser").get_text()
    exclude_patterns_text = (r"http[A-Za-z0-9-._~:/?#\[\]@!$&'\(\)\*+,;=]*",)  # links
    for pattern in exclude_patterns_text:
        text = re.sub(pattern, " ", text, flags=re.DOTALL)
    return text


def analyze_text(text: str):
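    """Print the word count and the ten most frequent words.

    Tokens are lowercased; punctuation, single-character tokens and
    tokens containing digits are dropped. English and German stopwords
    are removed before counting word frequencies.
    """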
    tokens = word_tokenize(text)
    tokens = [
        # Normalize to lower case.
        word.lower()
        for word in tokens
        if word not in ("...", "''", "``", "--", "++")
        # punctuation
        and word not in string.punctuation
        # single-character words
        and len(word) > 1
        # words containing at least one digit
        and not any(character.isdigit() for character in word)
    ]
    print("Words:", len(tokens))
    # Remove English and German stopwords. Build the stopword set once
    # instead of rebuilding it for every token.
    stop_words = set(stopwords.words("english") + stopwords.words("german"))
    tokens = [word for word in tokens if word not in stop_words]
    fdist = FreqDist(tokens)
    # fdist.plot(50)  # Uncomment to plot the 50 most common words.
    print("Most common words:")
    for word, count in fdist.most_common(10):
        print(f"- {word}: {count}")


def main():
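    """Fetch all notes via the Joplin data API and print word statistics."""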
    # Download the required NLTK data ("punkt" tokenizer and "stopwords"
    # corpus). Packages that are already up to date are skipped.
    nltk.download("punkt", quiet=True)
    nltk.download("stopwords", quiet=True)
    # Get all notes from Joplin.
    api = Api(token=os.getenv("API_TOKEN"))
    notes = api.get_all_notes(fields="id,title,body")
    print("Notes:", len(notes))
    # Concatenate all note bodies and convert them to plain text.
    text = markdown_to_text("\n".join(note.body for note in notes))
    # Analyze the combined text.
    analyze_text(text)


if __name__ == "__main__":
    main()