In [1]:
from glob import glob
import pandas as pd
import os
import nbformat
import probml_utils.url_utils as url_utils
help(url_utils)

Help on module probml_utils.url_utils in probml_utils:

NAME
    probml_utils.url_utils

FUNCTIONS
    check_dead_urls(urls: Any, print_dead_url=True)
        returns if urls are dead or not
    
    colab_to_githubraw_url(url)
        convert colab .ipynb url to github raw .ipynb url
    
    colab_url_to_github_url(url)
        convert colab .ipynb url to github .ipynb url
    
    create_firestore_db(key_path)
    
    dict_to_csv(key_value_dict, csv_name, columns=['key', 'url'])
    
    extract_scripts_name_from_caption(caption)
        extract foo.py from ...{https//:<path/to/>foo.py}{foo.py}...
        Input: caption
        Output: ['foo.py']
    
    figure_url_mapping_from_lof(lof_file_path, csv_name, convert_to_colab_url=True, base_url='https://github.com/probml/pyprobml/blob/master/notebooks', book_no=1)
    
    github_to_rawcontent_url(github_url)
    
    github_url_to_colab_url(url)
        convert github .ipynb url to colab .ipynb url
    
    is_dead_url(link)
       

In [2]:
root_path = "../notebooks" 
def get_notebook_path(book_str, chap_no, nb_name):
    return os.path.join(root_path, book_str, chap_no, nb_name)

def seperate_stuffs(nb_path):
    return nb_path.split("/")[-3:]

In [3]:
book2 = glob("../notebooks/book2/*/*.ipynb") + glob("../notebooks/book1/*/*.ipynb")
book2
len(book2)

495

In [4]:
nb_list = list(map(seperate_stuffs,book2))
df_nb_list = pd.DataFrame(nb_list, columns=["book_no","chap_no","nb_name"])
df_nb_list

Unnamed: 0,book_no,chap_no,nb_name
0,book2,18,bnn_mnist_sgld.ipynb
1,book2,33,ab_test_demo.ipynb
2,book2,33,thompson_sampling_linear_gaussian.ipynb
3,book2,33,bandits.ipynb
4,book2,09,ising_image_denoise_demo.ipynb
...,...,...,...
490,book1,08,lrschedule_tf.ipynb
491,book1,08,steepestDescentDemo.ipynb
492,book1,08,newtonsMethodMinQuad.ipynb
493,book1,22,matrix_factorization_recommender_surprise_lib....


In [5]:
df_nb_list_grp = df_nb_list.groupby("nb_name").agg(lambda x: list(x)).reset_index()
df_nb_list_grp

Unnamed: 0,nb_name,book_no,chap_no
0,DiracGAN.ipynb,[book2],[25]
1,GAN_JAX_CelebA_demo.ipynb,[book2],[25]
2,GAN_loss_types.ipynb,[book2],[25]
3,IPM_divergences.ipynb,[book2],[25]
4,KLfwdReverseMixGauss.ipynb,[book1],[06]
...,...,...,...
446,word_analogies_jax.ipynb,[book1],[20]
447,word_analogies_torch.ipynb,[book1],[20]
448,xor_heaviside.ipynb,[book1],[13]
449,yeast_data_viz.ipynb,[book1],[21]


In [6]:
def is_query_in_nb(notebook, query):
    """
    fun should take one argument: code
    """
    nb = nbformat.read(notebook, as_version=4)
    for cell in nb.cells:
        code = cell["source"]
        if query in code:
            return 1
    return 0
            
def get_n_cells_nb(notebook):
    """
    fun should take one argument: code
    """
    nb = nbformat.read(notebook, as_version=4)
    return len(nb.cells)

def get_original_nb(df_nb_list_grp_ser):
    nb_name = df_nb_list_grp_ser["nb_name"]
    books = df_nb_list_grp_ser["book_no"]
    chaps = df_nb_list_grp_ser["chap_no"]
    t = []
    for book, chap in zip(books, chaps):
        nb_path = get_notebook_path(book,chap,nb_name)
        n_cells = is_query_in_nb(nb_path,"Source of this notebook")
        t.append(n_cells)
    return t

In [7]:
df_nb_list_grp["is_source_present"] = df_nb_list_grp.apply(get_original_nb, axis=1)
df_nb_list_grp

Unnamed: 0,nb_name,book_no,chap_no,is_source_present
0,DiracGAN.ipynb,[book2],[25],[0]
1,GAN_JAX_CelebA_demo.ipynb,[book2],[25],[0]
2,GAN_loss_types.ipynb,[book2],[25],[0]
3,IPM_divergences.ipynb,[book2],[25],[0]
4,KLfwdReverseMixGauss.ipynb,[book1],[06],[0]
...,...,...,...,...
446,word_analogies_jax.ipynb,[book1],[20],[0]
447,word_analogies_torch.ipynb,[book1],[20],[0]
448,xor_heaviside.ipynb,[book1],[13],[0]
449,yeast_data_viz.ipynb,[book1],[21],[0]


In [8]:
# Delete duplicate notebooks from book2

# def del_duplicate_notebook(df_root_ser):
#     is_source = df_root_ser["is_source_present"]
#     nb_name = df_root_ser["nb_name"]
#     #print(is_source)
#     for i in range(len(is_source)):
#         if is_source[i] == 1 and df_root_ser["book_no"][i] == "book2": #delete only book2's duplicate notebook:
#             print(df_root_ser["book_no"][i])
#             nb_path = get_notebook_path(df_root_ser["book_no"][i], df_root_ser["chap_no"][i],nb_name)
#             os.remove(nb_path)

# df_root = df_nb_list_grp
# df_nb_list_grp.apply(del_duplicate_notebook,axis=1)

In [9]:
def get_root_col(df_root_ser, col):
    is_source = df_root_ser["is_source_present"]
    nb_name = df_root_ser["nb_name"]
    
    if is_source.count(0) == 0:
        print(f"{nb_name} is not in pyprobml!")
        return df_root_ser[col][0]
    
    elif is_source.count(0) > 1:
        print(f"{nb_name} - multiple copies exist")
    
    else:
        return df_root_ser[col][is_source.index(0)]

df_root = df_nb_list_grp
df_root["chap_no"] = df_nb_list_grp.apply(get_root_col, col = "chap_no" ,axis=1)
df_root["book_no"] = df_nb_list_grp.apply(get_root_col, col = "book_no" ,axis=1)
df_root

bootstrap_filter.ipynb is not in pyprobml!
bootstrap_filter_maneuver.ipynb is not in pyprobml!
ekf_mlp.ipynb is not in pyprobml!
ekf_vs_ukf.ipynb is not in pyprobml!
gauss-bp-1d-line.ipynb is not in pyprobml!
pendulum_1d.ipynb is not in pyprobml!
rbpf_maneuver.ipynb is not in pyprobml!
rbpf_maneuver_demo.ipynb is not in pyprobml!
sis_vs_smc.ipynb is not in pyprobml!
bootstrap_filter.ipynb is not in pyprobml!
bootstrap_filter_maneuver.ipynb is not in pyprobml!
ekf_mlp.ipynb is not in pyprobml!
ekf_vs_ukf.ipynb is not in pyprobml!
gauss-bp-1d-line.ipynb is not in pyprobml!
pendulum_1d.ipynb is not in pyprobml!
rbpf_maneuver.ipynb is not in pyprobml!
rbpf_maneuver_demo.ipynb is not in pyprobml!
sis_vs_smc.ipynb is not in pyprobml!


Unnamed: 0,nb_name,book_no,chap_no,is_source_present
0,DiracGAN.ipynb,book2,25,[0]
1,GAN_JAX_CelebA_demo.ipynb,book2,25,[0]
2,GAN_loss_types.ipynb,book2,25,[0]
3,IPM_divergences.ipynb,book2,25,[0]
4,KLfwdReverseMixGauss.ipynb,book1,06,[0]
...,...,...,...,...
446,word_analogies_jax.ipynb,book1,20,[0]
447,word_analogies_torch.ipynb,book1,20,[0]
448,xor_heaviside.ipynb,book1,13,[0]
449,yeast_data_viz.ipynb,book1,21,[0]


In [10]:
df_root[df_root["book_no"].isna()]

Unnamed: 0,nb_name,book_no,chap_no,is_source_present


In [11]:
df_root

Unnamed: 0,nb_name,book_no,chap_no,is_source_present
0,DiracGAN.ipynb,book2,25,[0]
1,GAN_JAX_CelebA_demo.ipynb,book2,25,[0]
2,GAN_loss_types.ipynb,book2,25,[0]
3,IPM_divergences.ipynb,book2,25,[0]
4,KLfwdReverseMixGauss.ipynb,book1,06,[0]
...,...,...,...,...
446,word_analogies_jax.ipynb,book1,20,[0]
447,word_analogies_torch.ipynb,book1,20,[0]
448,xor_heaviside.ipynb,book1,13,[0]
449,yeast_data_viz.ipynb,book1,21,[0]


In [12]:
url_utils.github_to_rawcontent_url("https://github.com/probml/pyprobml/blob/master/notebooks/book2/03/dtheory.ipynb")

'https://raw.githubusercontent.com/probml/pyprobml/master/notebooks/book2/03/dtheory.ipynb'

In [13]:
# check dead urls
df_root["url"] = df_root.apply(lambda x: url_utils.make_url_from_chapter_no_and_script_name(chapter_no=int(x["chap_no"]), 
                                                                                            script_name=x["nb_name"], 
                                                                                            book_no=int(x["book_no"][-1]),
                                                                                         convert_to_which_url = "github-raw"), axis=1)
df_root

Unnamed: 0,nb_name,book_no,chap_no,is_source_present,url
0,DiracGAN.ipynb,book2,25,[0],https://raw.githubusercontent.com/probml/pypro...
1,GAN_JAX_CelebA_demo.ipynb,book2,25,[0],https://raw.githubusercontent.com/probml/pypro...
2,GAN_loss_types.ipynb,book2,25,[0],https://raw.githubusercontent.com/probml/pypro...
3,IPM_divergences.ipynb,book2,25,[0],https://raw.githubusercontent.com/probml/pypro...
4,KLfwdReverseMixGauss.ipynb,book1,06,[0],https://raw.githubusercontent.com/probml/pypro...
...,...,...,...,...,...
446,word_analogies_jax.ipynb,book1,20,[0],https://raw.githubusercontent.com/probml/pypro...
447,word_analogies_torch.ipynb,book1,20,[0],https://raw.githubusercontent.com/probml/pypro...
448,xor_heaviside.ipynb,book1,13,[0],https://raw.githubusercontent.com/probml/pypro...
449,yeast_data_viz.ipynb,book1,21,[0],https://raw.githubusercontent.com/probml/pypro...


In [14]:
url_utils.check_dead_urls(list(df_root["url"]))

0 dead urls detected!


[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [26]:
# Add colab url
df_root["colab_url"] = df_root.apply(lambda x: url_utils.make_url_from_chapter_no_and_script_name(chapter_no=int(x["chap_no"]), 
                                                                                            script_name=x["nb_name"], 
                                                                                            book_no=int(x["book_no"][-1]),
                                                                                         convert_to_which_url = "colab"), axis=1)

# Add colab url
df_root["github_url"] = df_root.apply(lambda x: url_utils.make_url_from_chapter_no_and_script_name(chapter_no=int(x["chap_no"]), 
                                                                                            script_name=x["nb_name"], 
                                                                                            book_no=int(x["book_no"][-1]),
                                                                                         convert_to_which_url = "github"), axis=1)
df_root


Unnamed: 0,nb_name,book_no,chap_no,is_source_present,url,md_url,md_colab_url,colab_url,github_url
0,DiracGAN.ipynb,book2,25,[0],https://colab.research.google.com/github/probm...,[<span id=colab>colab</span>](https://colab.re...,[<span id=colab>colab</span>](https://colab.re...,https://colab.research.google.com/github/probm...,https://github.com/probml/pyprobml/blob/master...
1,GAN_JAX_CelebA_demo.ipynb,book2,25,[0],https://colab.research.google.com/github/probm...,[<span id=colab>colab</span>](https://colab.re...,[<span id=colab>colab</span>](https://colab.re...,https://colab.research.google.com/github/probm...,https://github.com/probml/pyprobml/blob/master...
2,GAN_loss_types.ipynb,book2,25,[0],https://colab.research.google.com/github/probm...,[<span id=colab>colab</span>](https://colab.re...,[<span id=colab>colab</span>](https://colab.re...,https://colab.research.google.com/github/probm...,https://github.com/probml/pyprobml/blob/master...
3,IPM_divergences.ipynb,book2,25,[0],https://colab.research.google.com/github/probm...,[<span id=colab>colab</span>](https://colab.re...,[<span id=colab>colab</span>](https://colab.re...,https://colab.research.google.com/github/probm...,https://github.com/probml/pyprobml/blob/master...
4,KLfwdReverseMixGauss.ipynb,book1,06,[0],https://colab.research.google.com/github/probm...,[<span id=colab>colab</span>](https://colab.re...,[<span id=colab>colab</span>](https://colab.re...,https://colab.research.google.com/github/probm...,https://github.com/probml/pyprobml/blob/master...
...,...,...,...,...,...,...,...,...,...
446,word_analogies_jax.ipynb,book1,20,[0],https://colab.research.google.com/github/probm...,[<span id=colab>colab</span>](https://colab.re...,[<span id=colab>colab</span>](https://colab.re...,https://colab.research.google.com/github/probm...,https://github.com/probml/pyprobml/blob/master...
447,word_analogies_torch.ipynb,book1,20,[0],https://colab.research.google.com/github/probm...,[<span id=colab>colab</span>](https://colab.re...,[<span id=colab>colab</span>](https://colab.re...,https://colab.research.google.com/github/probm...,https://github.com/probml/pyprobml/blob/master...
448,xor_heaviside.ipynb,book1,13,[0],https://colab.research.google.com/github/probm...,[<span id=colab>colab</span>](https://colab.re...,[<span id=colab>colab</span>](https://colab.re...,https://colab.research.google.com/github/probm...,https://github.com/probml/pyprobml/blob/master...
449,yeast_data_viz.ipynb,book1,21,[0],https://colab.research.google.com/github/probm...,[<span id=colab>colab</span>](https://colab.re...,[<span id=colab>colab</span>](https://colab.re...,https://colab.research.google.com/github/probm...,https://github.com/probml/pyprobml/blob/master...


In [21]:
t = df_root["url"][1]
t

'https://colab.research.google.com/github/probml/pyprobml/blob/master/notebooks/book2/25/GAN_JAX_CelebA_demo.ipynb'

In [35]:
enclose_span = lambda text,nb_id: f"<span id={nb_id}>{text}</span>"
to_md_url = lambda text, url: f"[{text}]({url})"

#to_md_url(enclose_span("GAN_JAX_CelebA_demo.ipynb"), t)
df_root["md_colab_url"] = df_root.apply(lambda x: to_md_url(enclose_span("colab", x["nb_name"]), x["colab_url"]),axis=1)
df_root["md_github_url"] = df_root.apply(lambda x: to_md_url(enclose_span("github", x["nb_name"]), x["github_url"]),axis=1)
df_root

Unnamed: 0,nb_name,book_no,chap_no,is_source_present,url,md_url,md_colab_url,colab_url,github_url,md_github_url
0,DiracGAN.ipynb,book2,25,[0],https://colab.research.google.com/github/probm...,[<span id=colab>colab</span>](https://colab.re...,[<span id=DiracGAN.ipynb>colab</span>](https:/...,https://colab.research.google.com/github/probm...,https://github.com/probml/pyprobml/blob/master...,[<span id=DiracGAN.ipynb>github</span>](https:...
1,GAN_JAX_CelebA_demo.ipynb,book2,25,[0],https://colab.research.google.com/github/probm...,[<span id=colab>colab</span>](https://colab.re...,[<span id=GAN_JAX_CelebA_demo.ipynb>colab</spa...,https://colab.research.google.com/github/probm...,https://github.com/probml/pyprobml/blob/master...,[<span id=GAN_JAX_CelebA_demo.ipynb>github</sp...
2,GAN_loss_types.ipynb,book2,25,[0],https://colab.research.google.com/github/probm...,[<span id=colab>colab</span>](https://colab.re...,[<span id=GAN_loss_types.ipynb>colab</span>](h...,https://colab.research.google.com/github/probm...,https://github.com/probml/pyprobml/blob/master...,[<span id=GAN_loss_types.ipynb>github</span>](...
3,IPM_divergences.ipynb,book2,25,[0],https://colab.research.google.com/github/probm...,[<span id=colab>colab</span>](https://colab.re...,[<span id=IPM_divergences.ipynb>colab</span>](...,https://colab.research.google.com/github/probm...,https://github.com/probml/pyprobml/blob/master...,[<span id=IPM_divergences.ipynb>github</span>]...
4,KLfwdReverseMixGauss.ipynb,book1,06,[0],https://colab.research.google.com/github/probm...,[<span id=colab>colab</span>](https://colab.re...,[<span id=KLfwdReverseMixGauss.ipynb>colab</sp...,https://colab.research.google.com/github/probm...,https://github.com/probml/pyprobml/blob/master...,[<span id=KLfwdReverseMixGauss.ipynb>github</s...
...,...,...,...,...,...,...,...,...,...,...
446,word_analogies_jax.ipynb,book1,20,[0],https://colab.research.google.com/github/probm...,[<span id=colab>colab</span>](https://colab.re...,[<span id=word_analogies_jax.ipynb>colab</span...,https://colab.research.google.com/github/probm...,https://github.com/probml/pyprobml/blob/master...,[<span id=word_analogies_jax.ipynb>github</spa...
447,word_analogies_torch.ipynb,book1,20,[0],https://colab.research.google.com/github/probm...,[<span id=colab>colab</span>](https://colab.re...,[<span id=word_analogies_torch.ipynb>colab</sp...,https://colab.research.google.com/github/probm...,https://github.com/probml/pyprobml/blob/master...,[<span id=word_analogies_torch.ipynb>github</s...
448,xor_heaviside.ipynb,book1,13,[0],https://colab.research.google.com/github/probm...,[<span id=colab>colab</span>](https://colab.re...,[<span id=xor_heaviside.ipynb>colab</span>](ht...,https://colab.research.google.com/github/probm...,https://github.com/probml/pyprobml/blob/master...,[<span id=xor_heaviside.ipynb>github</span>](h...
449,yeast_data_viz.ipynb,book1,21,[0],https://colab.research.google.com/github/probm...,[<span id=colab>colab</span>](https://colab.re...,[<span id=yeast_data_viz.ipynb>colab</span>](h...,https://colab.research.google.com/github/probm...,https://github.com/probml/pyprobml/blob/master...,[<span id=yeast_data_viz.ipynb>github</span>](...


In [38]:
df_final = df_root[["nb_name", "md_colab_url", "md_github_url"]]
df_final = df_final.sort_values(by='nb_name', key=lambda col: col.str.lower())
df_final.columns = ["Notebook", "Colab url", "Github url"]
df_final

Unnamed: 0,Notebook,Colab url,Github url
10,ab_test_demo.ipynb,[<span id=ab_test_demo.ipynb>colab</span>](htt...,[<span id=ab_test_demo.ipynb>github</span>](ht...
11,activation_fun_deriv_jax.ipynb,[<span id=activation_fun_deriv_jax.ipynb>colab...,[<span id=activation_fun_deriv_jax.ipynb>githu...
12,activation_fun_plot.ipynb,[<span id=activation_fun_plot.ipynb>colab</spa...,[<span id=activation_fun_plot.ipynb>github</sp...
13,adf_logistic_regression_demo.ipynb,[<span id=adf_logistic_regression_demo.ipynb>c...,[<span id=adf_logistic_regression_demo.ipynb>g...
14,advi_beta_binom_jax.ipynb,[<span id=advi_beta_binom_jax.ipynb>colab</spa...,[<span id=advi_beta_binom_jax.ipynb>github</sp...
...,...,...,...
446,word_analogies_jax.ipynb,[<span id=word_analogies_jax.ipynb>colab</span...,[<span id=word_analogies_jax.ipynb>github</spa...
447,word_analogies_torch.ipynb,[<span id=word_analogies_torch.ipynb>colab</sp...,[<span id=word_analogies_torch.ipynb>github</s...
448,xor_heaviside.ipynb,[<span id=xor_heaviside.ipynb>colab</span>](ht...,[<span id=xor_heaviside.ipynb>github</span>](h...
449,yeast_data_viz.ipynb,[<span id=yeast_data_viz.ipynb>colab</span>](h...,[<span id=yeast_data_viz.ipynb>github</span>](...


In [39]:
df_final.to_markdown("common_readme.md", index=False)