In [5]:
# Get list of all markdown filenames
import os
import re

# Get path of current notebook
CURRENT_PATH = os.getcwd()
# Get parent dir of current notebook
NOTES_PATH = os.path.dirname(CURRENT_PATH)
# Get list of all markdown filenames without path
def get_all_note_files(notes_path=NOTES_PATH):
    return [f for f in os.listdir(notes_path) if f.endswith('.md')]

note_files = get_all_note_files()
note_files[:8]



['http.md',
 'csv.md',
 'pcde-module6-content.md',
 'document-database.md',
 'markdown.md',
 'find.md',
 'mapbox.md',
 'intro-python.md']

In [15]:
# Create note files metadata dataframe
import pandas as pd
import numpy as np

# Create dataframe with note files metadata
def construct_notes_meta_df(path=NOTES_PATH):
    note_files = get_all_note_files(path)
    sizes = [os.path.getsize(f'{path}/{f}') for f in note_files]
    ctimes = [os.path.getctime(f'{path}/{f}') for f in note_files]
    mtimes = [os.path.getmtime(f'{path}/{f}') for f in note_files]
    df = pd.DataFrame({
        'filename': note_files,
        'size': sizes,
        'ctime': ctimes,
        'mtime': mtimes,
    })
    # Convert ctime and mtime columns to datetime
    df['ctime'] = pd.to_datetime(df['ctime'], unit='s')
    df['mtime'] = pd.to_datetime(df['mtime'], unit='s')
    return df

# Create a df summarizing function
def summarize_df(df):
    print(f'DF shape: {df.shape}')
    print(f'DF column dtypes: {df.dtypes}')
    # Return random subset of rows
    return df.sample(8)

notes_df = construct_notes_meta_df()
summarize_df(notes_df)


DF shape: (161, 4)
DF column dtypes: filename            object
size                 int64
ctime       datetime64[ns]
mtime       datetime64[ns]
dtype: object


Unnamed: 0,filename,size,ctime,mtime
45,mime-type.md,3219,2023-05-05 17:20:38.886245120,2023-05-05 13:25:31.000000000
131,jinja.md,2031,2023-03-13 20:21:02.531521280,2023-03-13 09:51:26.012999936
40,nextjs.md,195,2023-03-27 11:24:49.953961728,2023-03-27 11:16:20.836998912
116,sql-eda.md,6480,2023-03-27 11:24:49.957152512,2023-03-27 11:10:18.694999040
0,http.md,3470,2023-03-07 16:00:51.093276672,2023-03-07 11:20:10.497998848
25,redis.md,1598,2023-03-23 14:26:55.477783296,2023-03-23 14:25:23.101999104
144,pcde-module5-content.md,56869,2023-01-27 16:11:25.871063552,2023-01-25 09:17:25.000000000
110,numpy.md,16187,2023-02-17 10:18:18.051048448,2023-02-16 16:27:29.207000064


In [44]:
# Helper functions
def human_readable_size(size):
    if size < 1024:
        return f'{size}B'
    elif size < 1024 ** 2:
        return f'{size / 1024:.1f}KB'
    elif size < 1024 ** 3:
        return f'{size / (1024 ** 2):.1f}MB'
    else:
        return f'{size / (1024 ** 3):.1f}GB'
# Get stats on note sizes
def summarize_notes_df_sizes_stats(df):
    size_mean = df['size'].mean()
    size_median = df['size'].median()
    size_std = df['size'].std()
    size_min = df['size'].min()
    size_max = df['size'].max()
    print(f'Size mean: {human_readable_size(size_mean)}')
    print(f'Size median: {human_readable_size(size_median)}')
    print(f'Size std: {human_readable_size(size_std)}')
    print(f'Size min: {human_readable_size(size_min)}')
    print(f'Size 1st quartile: {human_readable_size(df["size"].quantile(0.25))}')
    print(f'Size 3rd quartile: {human_readable_size(df["size"].quantile(0.75))}')
    print(f'Size max: {human_readable_size(size_max)}')

summarize_notes_df_sizes_stats(notes_df)


Size mean: 6.3KB
Size median: 3.3KB
Size std: 8.9KB
Size min: 0B
Size 1st quartile: 1.4KB
Size 3rd quartile: 7.3KB
Size max: 55.5KB


In [55]:
# Plot note sizes
import matplotlib.pyplot as plt
import seaborn as sns

# Plot note sizes assuming a log-normal distribution
def plot_notes_df_sizes(df):
    # Use a histplot
    sns.histplot(data=df['size'], x='size', bins=20, log_scale=True)
    plt.xlabel('log(size)')
    plt.ylabel('density')
    plt.title('Note sizes distribution')
    plt.show()

In [78]:
# Markdown helper functions
# Extract only prose elements of markdown text
def extract_prose(text):
    # Remove front matter
    text = re.sub(r'---.*?---', '', text, flags=re.DOTALL)
    # Remove italic bold
    text = re.sub(r'\*\*\*(.*?)\*\*\*', r'\1', text)
    # Remove bold
    text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)
    # Remove italic
    text = re.sub(r'\*(.*?)\*', r'\1', text)
    # Remove code blocks including contents within them
    text = re.sub(r'```.*?```', '', text, flags=re.DOTALL)
    # Remove inline code
    text = re.sub(r'`.*?`', '', text)
    # Remove the 2nd occurrence of square brackets for markdown reference links
    # e.g. [link][1] -> [link]
    text = re.sub(r'\[(.*?)\]\[.*?\]', r'[\1]', text)
    # Remove parenthesis and contents within them following square brackets of markdown links
    # e.g. [link](https://www.google.com) -> [link]
    text = re.sub(r'\[.*?\]\(.*?\)', '', text)
    # Remove square brackets excluding contents within them
    text = re.sub(r'\[.*?\]', '', text)
    # Remove images
    text = re.sub(r'!\[.*?\]\(.*?\)', '', text)
    # Remove headings
    text = re.sub(r'#+ .*', '', text)
    # Remove horizontal rules
    text = re.sub(r'---', '', text)
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove latex math
    text = re.sub(r'\$.*?\$', '', text)
    # Remove newlines
    text = re.sub(r'\n', ' ', text)
    # Remove extra spaces
    text = re.sub(r' +', ' ', text)
    return text

# Create a word counting dictionary from markdown text
# e.g. the key is a word and the value is the number of times the word appears in the text
def count_words(text):
    text = extract_prose(text)
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Split text into words
    words = text.split()
    # Create a word counting dictionary using dict comprehension
    word_counts = {w: words.count(w) for w in words}
    return word_counts

# Print words and their counts in order of descending counts
def print_word_counts(word_counts):
    # Sort word counts in descending order of counts
    sorted_word_counts = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)
    # Print words and their counts
    for word, count in sorted_word_counts:
        print(f'{word}: {count}')

def filter_out_common_words_from_text(text, common_words):
    # Remove common words from text
    for word in common_words:
        print(f'Removing {word}')
        text = text.replace(word, '')
    # Remove extra spaces
    text = re.sub(r' +', ' ', text)
    # Remove punctuation and asterisks
    text = re.sub(r'[^\w\s]', '', text)
    return text

# sql_note_text = open(f'{NOTES_PATH}/sql.md').read()
# extract_prose(sql_note_text)

pronouns_set = ['i', 'me', 'my', 'mine', 'myself', 'we', 'us', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves']
prepositions_set = ['aboard', 'about', 'above', 'across', 'after', 'against', 'along', 'amid', 'among', 'anti', 'around', 'as', 'at', 'before', 'behind', 'below', 'beneath', 'beside', 'besides', 'between', 'beyond', 'but', 'by', 'concerning', 'considering', 'despite', 'down', 'during', 'except', 'excepting', 'excluding', 'following', 'for', 'from', 'in', 'inside', 'into', 'like', 'minus', 'near', 'of', 'off', 'on', 'onto', 'opposite', 'outside', 'over', 'past', 'per', 'plus', 'regarding', 'round', 'save', 'since', 'than', 'through', 'to', 'toward', 'towards', 'under', 'underneath', 'unlike', 'until', 'up', 'upon', 'versus', 'via', 'with', 'within', 'without']
alphabet_set = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
common_words = pronouns_set + prepositions_set + alphabet_set
sql_note_text = open(f'{NOTES_PATH}/sql.md').read()
sql_note_text = extract_prose(sql_note_text)
sql_note_text = filter_out_common_words_from_text(sql_note_text, common_words)
print(sql_note_text)
# sql_word_counts = count_words(sql_note_text)
# print_word_counts(sql_word_counts)

Removing i
Removing me
Removing my
Removing mine
Removing myself
Removing we
Removing us
Removing our
Removing ours
Removing ourselves
Removing you
Removing your
Removing yours
Removing yourself
Removing yourselves
Removing he
Removing him
Removing his
Removing himself
Removing she
Removing her
Removing hers
Removing herself
Removing it
Removing its
Removing itself
Removing they
Removing them
Removing their
Removing theirs
Removing themselves
Removing aboard
Removing about
Removing above
Removing across
Removing after
Removing against
Removing along
Removing amid
Removing among
Removing anti
Removing around
Removing as
Removing at
Removing before
Removing behind
Removing below
Removing beneath
Removing beside
Removing besides
Removing between
Removing beyond
Removing but
Removing by
Removing concerning
Removing considering
Removing despite
Removing down
Removing during
Removing except
Removing excepting
Removing excluding
Removing following
Removing for
Removing from
Removing in
Removi