# Logbook Analyzer

This notebook loads a Zola-like blog from GitHub and analyzes it to give you useful data and help keep its posts in order.

---

In [None]:
#@title Load: Configurations
#@markdown Loads basic configurations for data load and analysis.

from re import search, compile, M, S

import pandas as pd

# Repository coordinates and length limits used by the cells below.  The
# trailing `#@param` markers are Colab form annotations for each value.
USER = 'lopes'        #@param{type:'string'}
REPO = 'logbook'      #@param{type:'string'}
LEN_TITLE = 50        #@param{type:'integer'}
LEN_DESCRIPTION = 75  #@param{type:'integer'}
LEN_TAG = 15          #@param{type:'integer'}

# Accumulators for parsed posts and for rule violations.
posts = list()
nonconforming = list()

# File name: four digits, one or more "-word" groups, then a literal ".md".
# The dot is escaped; unescaped it would accept any character before "md".
re_fname = compile(r'^\d{4}(-[a-z0-9]+)+\.md$', flags=M)
# Front matter: from a line starting with "+++" up to the NEXT such line.
# The quantifier is non-greedy so a "+++" line later in the post body is not
# swallowed into the header (with DOTALL a greedy `.*` runs to the last one).
re_header = compile(r'^\+{3}.*?^\+{3}', flags=M|S)
# Header fields, each expected alone on its own line.
re_title = compile(r'^\s*title\s*=\s*\"(.+)\"\s*$', flags=M)
re_date = compile(r'^\s*date\s*=\s*(\d{4}-\d{2}-\d{2})\s*$', flags=M)
re_description = compile(r'^\s*description\s*=\s*\"(.+)\"\s*$', flags=M)
re_tags = compile(r'^\s*tags\s*=\s*\[(.+)\]\s*$', flags=M)
# A single tag: lowercase letters and digits only.
re_tag = compile(r'^[a-z0-9]+$', flags=M)

print(f'🟢 All configurations loaded')

In [None]:
#@title Load: File
#@markdown Loads or reloads data from GitHub.
#@markdown Requires all configurations loaded.

# Remove any previous checkout so the clone below starts from a clean directory.
# NOTE(review): REPO is interpolated unquoted into `rm -rf`; a value containing
# spaces or a path such as `..` would remove unintended directories — confirm
# the value is trusted before running.
!rm -rf {REPO}
# Clone the blog repository from GitHub using the configured USER and REPO.
!git clone https://github.com/{USER}/{REPO}.git

print(f'\n🟢 Data is loaded under ./{REPO}')

In [None]:
#@title Load: Data
#@markdown Loads data from files for further analysis.
#@markdown Requires all configurations loaded.

from os import walk
from os.path import abspath, join
from datetime import datetime

# Start from empty accumulators so re-running this cell does not append the
# same posts and findings a second time.
posts = []
nonconforming = []

# Walk every file under <REPO>/content and parse the header of each post.
for root, _dirs, files in walk(abspath(join(REPO, 'content')), topdown=False):
  for f in files:
    # Only Markdown files are analyzed; `_index.md` files are skipped.
    if not f.lower().endswith('.md') or f == '_index.md':
      continue

    path = join(root, f)
    if not re_fname.search(f) or len(f) > 40:
      nonconforming.append({'file': path, 'reason': 'file name'})

    # Explicit encoding so reading does not depend on the platform default.
    with open(path, 'r', encoding='utf-8') as post:
      p = post.read()

    try:
      header = re_header.search(p).group(0)
      title = re_title.search(header).group(1)
      date = datetime.strptime(re_date.search(header).group(1), '%Y-%m-%d')
      description = re_description.search(header).group(1)
      tags = re_tags.search(header).group(1).replace('"','').replace(' ','').split(',')
    except (AttributeError, ValueError):
      # AttributeError: a pattern did not match, so search() returned None.
      # ValueError: the date has the right shape but is not a valid calendar
      # date; record it instead of aborting the whole load.
      nonconforming.append({'file': path, 'reason': 'header'})
      continue

    if len(title) > LEN_TITLE:
      nonconforming.append({'file': path, 'reason': 'title length'})
    if len(description) > LEN_DESCRIPTION:
      nonconforming.append({'file': path, 'reason': 'description length'})
    for t in tags:
      # One finding per offending tag, whether it breaks the character rule,
      # the length rule, or both.
      if not re_tag.search(t) or len(t) > LEN_TAG:
        nonconforming.append({'file': path, 'reason': 'bad tag'})

    posts.append({
        'file': path,
        'title': title,
        'date': date,
        'description': description,
        'tags': tags
    })

# Name the columns explicitly so both frames keep their schema even when no
# rows were collected.
df = pd.DataFrame(posts, columns=['file', 'title', 'date', 'description', 'tags'])
df_errors = pd.DataFrame(nonconforming, columns=['file', 'reason'])

print(f'🟢 {len(posts)} posts loaded')
print(f'🟡 {len(nonconforming)} non-compliance')

---

In [None]:
#@title Data Show
#@markdown Run this cell only if you want to inspect the loaded data.  Uncomment whichever line you need.
#@markdown Requires all data loaded.

# df
# df_errors.sort_values(by=['file'])

In [None]:
#@title Frequency Analysis
#@markdown Chart of frequency of post creation.
#@markdown Requires all data loaded.

# Bar chart of how many posts fall in each calendar month.  Only months that
# have at least one post get a bar.
if df.empty:
  # With no posts loaded there is no usable 'date' column to plot.
  print('🟡 No posts loaded; nothing to plot')
else:
  ts = df['date'].dt.to_period('M')
  ax = ts.value_counts().sort_index().plot(kind='bar', title='Posts per Month')
  ax.set_xlabel('Month')
  ax.set_ylabel('Posts')

In [None]:
#@title Word Cloud
#@markdown Word cloud based on posts' descriptions.
#@markdown Requires all data loaded.

from wordcloud import WordCloud, STOPWORDS

# Extra words to ignore, in addition to the STOPWORDS set shipped with wordcloud.
wc_stopwords = [
  'rule', 'rules', 'based', 'detected', 'many', 'someone',
  'learn', 'using', 'para', 'de', 'e', 'na',
]

# All post descriptions, lower-cased and joined into one space-separated text.
descriptions_text = ' '.join(df['description'].str.lower())

# Rendering options, kept together so they are easy to tweak.
wc_options = dict(
  width=1920,
  height=1080,
  stopwords=[*wc_stopwords, *STOPWORDS],
  collocations=True,
  max_words=150,
  mode='RGBA',
  background_color=None,
  colormap='rainbow',
)

wc = WordCloud(**wc_options)
wc.generate_from_text(descriptions_text)

# The last expression is the cell's output: the rendered image.
wc.to_image()

In [None]:
#@title Ad-Hoc Header Analyzer
#@markdown Paste a header and run this snippet of code to have it analyzed.
#@markdown Requires parameters from data.

header = "+++ title = \"Arch Linux Hardened Installation Guide\" date  = 2020-07-07 description = \"Step-by-step guide to perform a hardened Arch Linux installation.\"  [taxonomies] tags = [\"unix\", \"arch\", \"security\"]  [extra] image = \"images/logos/archlinux.png\" +++" #@param{type:'string'}


def header_field(pattern, text):
  """Return capture group 1 of the first match of `pattern` in `text`.

  Returns None when the pattern does not match, so a missing field can be
  reported instead of raising AttributeError on `None.group(1)`.
  """
  found = search(pattern, text)
  return found.group(1) if found else None


# Text printed in place of a field that is absent from the pasted header.
MISSING = '(not found)'

title = header_field(r'\s+title\s*=\s*"(.+?)"\s', header)
date = header_field(r'\s+date\s*=\s*([\d-]+)\s', header)
description = header_field(r'\s+description\s*=\s*"(.+?)"\s', header)
tags = header_field(r'\s+tags\s*=\s*\[(.*?)\]', header)

# Output for a complete header is unchanged; a missing field prints MISSING.
if title is not None:
  print(f'Title......: "{title}" ({len(title)} chars)')
else:
  print(f'Title......: {MISSING}')
print(f'Date.......: {date if date is not None else MISSING}')
if description is not None:
  print(f'Description: "{description}" ({len(description)} chars)')
else:
  print(f'Description: {MISSING}')
if tags is not None:
  print(f'Tags.......: [{tags}]')
else:
  print(f'Tags.......: {MISSING}')