# Take an Inventory of All Colab Notebooks

Find all Colab notebooks in Google Drive, collect stats (e.g. names, provenance chains), and extract the first text cell of each so you can take inventory of what you have there.

This uses the GDrive API.

In [None]:
import json
import os
import re

import pandas as pd
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from IPython.display import Markdown, display
from tqdm import tqdm

# OAuth scopes passed to the credential loader and the authorization flow below.
# NOTE(review): the cells visible in this notebook only call files.list and
# files.get_media, so the narrower drive.readonly scope may be sufficient —
# confirm before changing (a cached token.json would need to be regenerated).
SCOPES = ['https://www.googleapis.com/auth/drive']

In [None]:
#@markdown authenticate with Google
credpath = '/content/drive/MyDrive/keys/gdrive-credentials.json' #@param {type:'string'}

# Start from the token cached by a previous run, if one exists on disk.
creds = (
    Credentials.from_authorized_user_file('token.json', SCOPES)
    if os.path.exists('token.json')
    else None
)

if not (creds and creds.valid):
    if creds and creds.expired and creds.refresh_token:
        # An expired token that still carries a refresh token is renewed in place.
        creds.refresh(Request())
    else:
        # No usable token: run the interactive console flow using the client
        # secrets file at `credpath`.
        # NOTE(review): run_console appears to have been removed from newer
        # google-auth-oauthlib releases, and `port` looks like a
        # run_local_server argument — confirm against the installed version.
        flow = InstalledAppFlow.from_client_secrets_file(credpath, SCOPES)
        creds = flow.run_console(port=0)
    # Cache the (new or refreshed) credentials so the next run can reuse them.
    with open('token.json', 'w') as token:
        token.write(creds.to_json())

Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=125818656934-snl68hp0qqsnnsth5lkgo59b6fhgsptj.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&state=bESTVxLXLDga7swDgBl9Z4ZokYGqHX&prompt=consent&access_type=offline
Enter the authorization code: 4/1AdQt8qi9TR8m7Y388yZmPXc9NiaoUxN_IGp1Uo1CIEjxaaTJ4uTAbptQZnc


In [None]:
# Build a Drive v3 API client from the credentials obtained above.
service = build(serviceName='drive', version='v3', credentials=creds)
# Handle on the files resource; later cells call .list() and .get_media() on it.
files = service.files()

## Search for all Colab Notebooks in my Google Drive

In [None]:
# Columns of `colabmeta`, matching the file fields requested from the API below.
# Passing them explicitly keeps the frame well-formed even when the search
# returns nothing (otherwise sort_values would raise KeyError on an empty frame).
COLAB_COLUMNS = ['id', 'name', 'mimeType', 'webContentLink', 'webViewLink', 'modifiedByMeTime']

page_token = None
all_files = []

# Page through the search results until the API stops returning a nextPageToken.
while True:
    response = files.list(
        q="mimeType='application/vnd.google.colaboratory'",
        spaces='drive',
        fields='nextPageToken, '
               'files(id, mimeType, name, webContentLink, webViewLink, modifiedByMeTime)',
        pageToken=page_token,
    ).execute()
    all_files.extend(response.get('files', []))

    page_token = response.get('nextPageToken', None)
    if page_token is None:
        break

# One row per notebook, most recently modified (by me) first.
colabmeta = pd.DataFrame(all_files, columns=COLAB_COLUMNS).sort_values('modifiedByMeTime', ascending=False)
print(len(colabmeta))
# Show up to two random rows; the min() guard avoids a ValueError on tiny results.
colabmeta.sample(min(2, len(colabmeta)))

195


Unnamed: 0,id,name,mimeType,webContentLink,webViewLink,modifiedByMeTime
103,1iDxMSytyio2Y1w7wXNkI8d1eXXdhDbt-,My VQGAN updates.ipynb,application/vnd.google.colaboratory,https://drive.google.com/uc?id=1iDxMSytyio2Y1w...,https://colab.research.google.com/drive/1iDxMS...,2021-09-15T21:56:39.714Z
113,12dTHThcO1p4Er8oLZ6WWqmM8D828Q7w_,SpellingCorrection.ipynb,application/vnd.google.colaboratory,https://drive.google.com/uc?id=12dTHThcO1p4Er8...,https://colab.research.google.com/drive/12dTHT...,2021-09-01T22:25:14.405Z


## Collect contents of files and extract first markdown cell

In [None]:
def _first_markdown(filedata):
    """Return the stripped text of the first markdown cell, or None if there is none.

    Notebooks without a top-level 'cells' list are treated as having no cells
    instead of raising KeyError.
    """
    for cell in filedata.get('cells', []):
        if cell.get('cell_type') == 'markdown':
            return "".join(cell.get('source', '')).strip()
    return None


def _provenance(filedata):
    """Return (provenance, len(provenance)) from metadata.colab, or (None, None).

    Only the failures expected from a missing or malformed metadata entry are
    caught, so unrelated errors (including KeyboardInterrupt) still surface.
    """
    try:
        provenance = filedata['metadata']['colab']['provenance']
        return provenance, len(provenance)
    except (KeyError, TypeError):
        return None, None


# Download every notebook found above and pull out the per-file details.
details = []
for fileId in tqdm(colabmeta.id.tolist()):
    fileraw = files.get_media(fileId=fileId).execute()
    filedata = json.loads(fileraw.decode('utf-8'))

    provenance, nprov = _provenance(filedata)
    details.append((fileId, _first_markdown(filedata), provenance, nprov))

# Join the extracted details back onto the Drive metadata by file id.
detaildf = pd.DataFrame(details, columns=['id', 'first_md', 'provenance', 'nprov'])
data = colabmeta.merge(detaildf, on='id')
data.sample(2)

Unnamed: 0,id,name,mimeType,webContentLink,webViewLink,modifiedByMeTime,first_md,provenance,nprov
28,1GT07sahsWXiB01H7y22lKGSAqqpAb-C7,Term Weighting OCS Example,application/vnd.google.colaboratory,https://drive.google.com/uc?id=1GT07sahsWXiB01...,https://colab.research.google.com/drive/1GT07s...,2022-07-19T19:51:45.804Z,# Testing Term-Weighting in [Open Creativity S...,[],0.0
57,1RbK9mty8z19ChXoBXG0FBWoTf7TKZT-k,ZeroShotForGame3.ipynb,application/vnd.google.colaboratory,https://drive.google.com/uc?id=1RbK9mty8z19ChX...,https://colab.research.google.com/drive/1RbK9m...,2022-04-12T22:12:38.800Z,,[],0.0


In [None]:
def pretty_name(x):
    """Turn a notebook file name into a cleaner, human-readable title.

    Steps, in order: drop the '.ipynb' suffix text and underscores, split
    camelCase words, re-join a few known proper names that the split breaks,
    remove [bracketed] and (parenthesised) asides, remove standalone version
    tokens such as 'v2' or 'v4.1', and collapse leftover whitespace.

    Args:
        x: The original notebook name (a string).

    Returns:
        The cleaned-up name with no leading, trailing, or repeated whitespace.
    """
    x = x.replace('.ipynb', '').replace('_', ' ')
    # Split camelCase: "SpellingCorrection" -> "Spelling Correction".
    x = re.sub(r'([a-z])([A-Z])', r'\1 \2', x)
    # Undo the split for known names that are legitimately mixed-case.
    x = x.replace('ru DALLE', 'ruDALLE').replace('yle GAN', 'yleGAN').replace('i Thi', 'iThi').replace('thi Trust', 'thiTrust')
    x = re.sub(r'\[.*?\]', '', x)
    x = re.sub(r'\(.*?\)', '', x)
    # Only strip standalone version tokens; the word boundaries keep
    # identifiers such as "Conv2D" intact, and \d+ groups consume the whole
    # version ("v2.0.1") rather than leaving trailing digits behind.
    x = re.sub(r'\bv\d+(?:\.\d+)*\b', '', x)
    # Removing bracket groups / versions can leave runs of spaces; collapse them.
    return ' '.join(x.split())
# Add a cleaned-up display title for every notebook, derived from its file name.
data['pretty_name'] = [pretty_name(name) for name in data['name']]
data.sample()

Unnamed: 0,id,name,mimeType,webContentLink,webViewLink,modifiedByMeTime,first_md,provenance,nprov,pretty_name
149,1ygDHov1sSf67MwMD0Aw-BJ5KIBXnAF7J,Working with Book Recommendation Ground Truth,application/vnd.google.colaboratory,https://drive.google.com/uc?id=1ygDHov1sSf67Mw...,https://colab.research.google.com/drive/1ygDHo...,2020-04-30T01:15:06.736Z,,[],0.0,Working with Book Recommendation Ground Truth


## Analysis

In [None]:
# Notebooks whose provenance chain is the longest (all ties are shown)
data.loc[data['nprov'].eq(data['nprov'].max())]

Unnamed: 0,id,name,mimeType,webContentLink,webViewLink,modifiedByMeTime,first_md,provenance,nprov,pretty_name
68,1Z5UAANeevPHQ_OkI17cRw5ZxkY1hCkPc,My Disco Diffusion,application/vnd.google.colaboratory,https://drive.google.com/uc?id=1Z5UAANeevPHQ_O...,https://colab.research.google.com/drive/1Z5UAA...,2022-02-12T03:40:54.122Z,"# Disco Diffusion v4.1 - Now with Video Inits,...",[{'file_id': '1sHfRn5Y0YKYKi1k-ifUSBFRNJ8_1sa3...,11.0,My Disco Diffusion
69,16q8ExajysAdr4MCBRNeexNoCQqsmWjHR,"Copy of Disco Diffusion v4.1 [w/ Video Inits, ...",application/vnd.google.colaboratory,https://drive.google.com/uc?id=16q8ExajysAdr4M...,https://colab.research.google.com/drive/16q8Ex...,2022-02-11T09:07:28.110Z,"# Disco Diffusion v4.1 - Now with Video Inits,...",[{'file_id': '1sHfRn5Y0YKYKi1k-ifUSBFRNJ8_1sa3...,11.0,Copy of Disco Diffusion


In [186]:
# Render a small markdown report: one section per sampled notebook.
sections = []
for _, row in data.sample(2).iterrows():
    # Missing first-markdown values (None, or NaN after a round-trip) become ''.
    first_md = row.first_md if isinstance(row.first_md, str) else ''
    # Demote the first heading by one level so it nests under the section title.
    # Anchoring on a line start avoids rewriting a '#' that appears mid-sentence.
    fmd = re.sub(r'^#', '##', first_md, count=1, flags=re.MULTILINE)
    # The space after '#' is required for the title to render as a heading.
    sections.append(f'''# {row.pretty_name}

<{row.webViewLink}>

Last Modified: {row.modifiedByMeTime}
Length of Provenance: {row.nprov}

Details:

{fmd}

----

''')

fullmddoc = ''.join(sections)

display(Markdown(fullmddoc))

#Lab04 Adrienne Text Mining

<https://colab.research.google.com/drive/12gNN26pcGphb4xP9Lm3ECaDDQ8eJpC6p>

Last Modified: 2022-04-20T19:08:19.505Z
Length of Provenance: 1.0

Details:

##Text Mining Lab 4

----

#Firearm Deaths By State

<https://colab.research.google.com/drive/1GHuQGOeJTaqeIcc96gevdJuL13bxyaCz>

Last Modified: 2022-06-04T20:25:53.363Z
Length of Provenance: 0.0

Details:



----



The generated document is pasted here for easy browsing: https://docs.google.com/document/d/1D4CxYN-b_7nsHGl0aJTYDo4yJrTjOL3SzFE3UY_XhNs/edit#