In [1]:
import pandas as pd
# read in all commits in sample
all_commits_in_sample = pd.read_csv("6All_Commits_in_Sample.csv")

In [2]:
# for each unique notebook, take the latest commit
first_commits = all_commits_in_sample.groupby("notebook").first()

In [3]:
# read in all notebooks
all_notebooks = pd.read_csv("1Notebook_Sample.csv")

In [4]:
# determine which notebooks are missing from first_commits from all_notebooks
missing_notebooks = all_notebooks[~all_notebooks["Repository"].isin(first_commits["repo"].unique())]

In [5]:
import subprocess

# get latest_commit_id
missing_notebooks["commit"] = ""
failed = 0
for notebook in missing_notebooks["Repository"]:
    command = f"git ls-remote https://github.com/{notebook}.git"
    try:
        output = subprocess.check_output(command, shell=True).decode("utf-8")
        latest_commit_id = output.split()[0]
        missing_notebooks.loc[missing_notebooks["Repository"] == notebook, "commit"] = latest_commit_id
    except:
        missing_notebooks.loc[missing_notebooks["Repository"] == notebook, "commit"] = None
        failed += 1
print("Failed: ", failed)

# drop repos that were not found
missing_notebooks = missing_notebooks[missing_notebooks["commit"].notnull()]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_notebooks["commit"] = ""
remote: Repository not found.
fatal: repository 'https://github.com/userkimcs/uit-grad.git/' not found
remote: Repository not found.
fatal: repository 'https://github.com/darshak10ramani/jupyternotebook.git/' not found
remote: Repository not found.
fatal: repository 'https://github.com/Oxxkar/PythonDataScienceHandbook.git/' not found
remote: Repository not found.
fatal: repository 'https://github.com/brucewuquant/ie598.git/' not found


Failed:  4


In [6]:
# rename relevant columns
missing_notebooks = missing_notebooks.rename(columns={"Repository": "repo", "Notebook Path": "notebook"})

In [7]:
# make notebook the index column
missing_notebooks = missing_notebooks.set_index("notebook")
# keep only repo, notebook, and commit columns in missing_notebooks
missing_notebooks = missing_notebooks[["repo", "commit"]]
# keep only repo, notebook, and commit columns in first_commits
first_commits = first_commits[["repo", "commit"]]

In [8]:
# merge missing_notebooks with first_commits
merged = pd.concat([first_commits, missing_notebooks])

In [9]:
def create_url(repo, commit_hash, notebook):
    return f"https://raw.githubusercontent.com/{repo}/{commit_hash}/{notebook}"

# Create a new column with the URL strings
merged['url'] = merged.apply(lambda x: create_url(x['repo'], x['commit'], x.name), axis=1)

In [10]:
# for each URL, replace any instance of '\ ' with '%20'
merged['url'] = merged['url'].str.replace(r'\ ', '%20')

In [11]:
# fetch each notebook from github
import requests
def fetch_response_json(url):
    response = requests.get(url)
    if response.status_code == 200:
        return response.json()
    else:
        return None

notebooks = []
for url in merged['url']:
    notebook = fetch_response_json(url)
    if notebook is None:
        print("Error: ", url)
        failed += 1
    else:
        notebooks.append((url, notebook))

print("Total failed: ", failed)

Error:  https://raw.githubusercontent.com/jpbacher/foreclosures/b321f02281384481e3c455f194e094b2adee9085/06_FinalScript.ipynb
Error:  https://raw.githubusercontent.com/rfeinman/hw-mathtools/79c43583b73f8eb288cdef7d86e38ad17d0e5d2e/Feinman_Reuben_HW6/hw6.ipynb
Error:  https://raw.githubusercontent.com/reedan88/QAQC_Sandbox/6788f56e857ddc6cbd2c117b67ccce2c17624aa3/Metadata_Review/Metadata_Review.ipynb
Error:  https://raw.githubusercontent.com/stanlo229/MicrosoftMLwithPython/48d137dc459384c0645f5c90c3658bdf1fb96d90/Module5/Bias-Variance-Trade-Off.ipynb
Error:  https://raw.githubusercontent.com/Rishivikram1/mxnet/768a0312b191d76d2a9ad273a030b3f15b17cffb/chapter02_supervised-learning/softmax-regression-scratch.ipynb
Error:  https://raw.githubusercontent.com/jmartu/TFM/395c4a27c0d66fef032f9fe8e4cf2cc3c53cdae1/convnet/VGG16_04_bottleneck.ipynb
Error:  https://raw.githubusercontent.com/SultanovAR/augmentation/6302b7c830945f50aa9d1c62c0dbb7b57bbe2bb6/deeppavlov/models/evolution/Results_analysis

In [12]:
# only support notebooks with version 4 or above
# filter by supported notebooks and count the number of unsupported notebooks
unsupported_count = 0
supported = []
for notebook in notebooks:
    if notebook[1]['nbformat'] < 4:
        unsupported_count += 1
    else:
        supported.append(notebook)
failed += unsupported_count
print("Total failed: ", failed)

Total failed:  37


In [13]:
# for each notebook, get each cell's source code
def get_single_cell_src_lines(notebook, cell_id):
        cell = notebook[1]['cells'][cell_id]
        if type(cell['source']) == list:
            src_lines = cell['source']
        else:
            assert type(cell['source']) == str
            src_lines = cell['source'].split('\n')
        return (notebook[0], ''.join(src_lines))

notebook_cells = []
for notebook in supported:
    cells = []
    for cell_id in range(len(notebook[1]['cells'])):
        # get only code cells
        if notebook[1]['cells'][cell_id]['cell_type'] == 'code':
            cell = get_single_cell_src_lines(notebook, cell_id)
            cells.append(cell)
    notebook_cells.append(cells)

In [16]:
# count the total number of cells and total number of empty and nonempty cells
empty_notebooks = 0
total_cells = 0
empty_cells = 0
nonempty_cells = 0
more_than_one_line = 0
for notebook in notebook_cells:
    if len(notebook) == 0:
        empty_notebooks += 1
        failed += 1
        continue
    total_cells += len(notebook)
    for cell in notebook:
        # empty cell
        if cell[1] == '':
            empty_cells += 1
        # nonempty cell
        elif cell[1] != '':
            nonempty_cells += 1
        # more than one code line
        lines = cell[1].split('\n')
        comment_lines = 0
        for line in lines:
            if line.startswith('#') or line.strip() == '':
                comment_lines += 1
        if len(lines) - comment_lines > 1:
            more_than_one_line += 1

print("Empty notebooks: ", empty_notebooks)
print("Total failed (incl. empty): ", failed)
print("Total cells: ", total_cells)
print("Empty cells: ", empty_cells)
print("Nonempty cells: ", nonempty_cells)        
print("Cells with more than one line: ", more_than_one_line)   

Empty notebooks:  5
Total failed (incl. empty):  42
Total cells:  6344
Empty cells:  246
Nonempty cells:  6098
Cells with more than one line:  4311


In [17]:
# remove empty notebooks, empty cells, and cells with only one line (after removing comments and blank lines)
# empty notebooks also count if all cells are empty or one line
new_notebooks = []
for notebook in notebook_cells:
    new_notebooks.append([cell for cell in notebook if cell[1] != ''])
notebook_cells = new_notebooks

# new_notebooks = []
# for notebook in notebook_cells:
#     cells = []
#     for cell in notebook:
#         comment_lines = 0
#         lines = cell.split('\n')
#         for line in lines:
#             if line.startswith('#') or line.strip() == '':
#                 comment_lines += 1
#         if len(lines) - comment_lines > 1:
#             cells.append(cell)
#     new_notebooks.append(cells)
# notebook_cells = new_notebooks
        
notebook_cells = [notebook for notebook in notebook_cells if len(notebook) != 0]

In [18]:
# print number of notebooks fetched
print("Number of notebooks fetched: ", len(notebook_cells))

Number of notebooks fetched:  233


In [19]:
# get all cells
all_cells = []
for notebook in notebook_cells:
    for cell in notebook:
        all_cells.append(cell)

print("Total cells: ", len(all_cells))

Total cells:  6098


In [21]:
# print all cells to a folder called 'all_cells'
import os

if not os.path.exists("all_cells"):
    os.makedirs("all_cells")

for i in range(len(all_cells)):
    with open("all_cells/" + str(i) + ".py", "w") as f:
        f.write(all_cells[i][1])

In [None]:
# print names of all urls into a single file called 'all_cells_urls.txt'
with open("all_cells_urls.txt", "w") as f:
    for cell in all_cells:
        f.write(cell[0] + '\n')