# Process Notebooks

In [1]:
import string
from processors.import_processor import get_imports
import logging
import nbformat
import json
import pandas as pd
import numpy as np
import pandas as pd
import pypandoc
import re

logging.basicConfig(filename="log/processNotebooks.log", filemode='w')
# Creating an object
logger = logging.getLogger()
# Setting the threshold of logger to DEBUG

logger.setLevel(logging.DEBUG)
text = ''

In [2]:
DATA_DIRECTORY = 'data-1k/'
BASE64_IMG_DIRECTORY = 'base64Images/'
SAMPLE_DATA = 'sample-1000.txt'

 # Lambdas and Helper methods

In [3]:
def getLanguage(x):
    """
    Takes each notebook and extracts the language information from the notebook.
    """
    language = None
    # x = row.raw
    if 'kernelspec' in x['metadata'].keys() :
        language = x['metadata']['kernelspec']['name']
    elif 'language_info' in x['metadata'].keys():
        language = x['metadata']['language_info']['name']
    else:
        logger.exception("kernelspec and language_info keys not present in metadata. printing metadata entry for notebook")
        logger.error(x['metadata'])
        language = None
    return language  

In [4]:
def getSourceCodeAndExtractImports(data_row):
    filename = data_row['fileNames']
    data = data_row['raw']
    cells = data['cells']
    language = data_row['language']
    import_strings = []
    print(f'Processing {filename}')
    if language is not None and 'py' in language:
        for cell in cells:
            if cell['cell_type'] == 'code':
                src = cell['source']
                try:
                    import_list = get_imports(src)
                    for import_item in import_list:
                        modules = import_item.module
                        imported_names = import_item.name
                        import_str = ".".join(modules) + " " + ".".join(imported_names)
                        import_strings.append(import_str)
                except:
                    pass

    return json.dumps(import_strings)

In [5]:
def hasMatplotlib(line: "str"):
    if line.strip().startswith("#"):
        return False

    # fr matplotlib
    if '.subplots(' in line:
        return True
    
    return False


def hasBokeh(line: "str"):
    if line.strip().startswith("#"):
        return False

    # for bokeh
    if 'gridplot' in line:
        return True
    if 'column(' in line:
        return True
    if 'row(' in line:
        return True
    if 'layout(' in line:
        return True
    
    return False


def hasPlotly(line: "str"):
    if line.strip().startswith("#"):
        return False
    
    # for plotly
    if 'facet_' in line: 
        return True
    
    return False


def count_factory(check):
    def fn(row):
        filename = row['fileNames']
        data = row['raw']
        cells = data['cells']
        count = 0
        for cell in cells:
            if cell['cell_type'] == 'code':
                src = cell['source']
                try:
                    for line in src.split('\n'):
                        if check(line):
                            count += 1
                except:
                    pass
            
        return count
    return fn

In [6]:
# get base64 image metadata from each cell  
def storeBase64ImagesAndExtractImageMetadata(row):
    x = row['raw']
    cells = x['cells']
    base64Images = []
    image_metadata = []
    for cell in cells:
        if cell['cell_type'] == 'code':
            # check if cell contains display type as images or plots
            if 'outputs' in cell.keys():
                outputs = cell['outputs']
                for output in outputs:
                    if 'data' in output.keys():
                        data = output['data']
                        if 'image/png' in data.keys():
                            base64Images.append(data['image/png'])
                            image_metadata.append('png')
                        elif 'image/jpeg' in data.keys():
                            base64Images.append(data['image/jpeg'])
                            image_metadata.append('jpeg')
                        elif 'image/svg+xml' in data.keys():
                            base64Images.append(data['image/svg+xml'])
                            image_metadata.append('svg')
                            
    # Store the images into a directory
    for i, image in enumerate(base64Images):
        image_file_name = f"{row['fileNames']}-{i}.png"
        try:
            with open(f"{DATA_DIRECTORY}{BASE64_IMG_DIRECTORY}{image_file_name}", "w") as f:
                f.write(image)
        except Exception as e:
            continue
    
    return json.dumps(image_metadata)

In [7]:
def fileToNbNode(x):
    a = None
    try:
        # print("loading file: data-1k/" + x)
        a = nbformat.read(f"{DATA_DIRECTORY}{x}", as_version=4)
    except Exception as e:
        # print(x)
        logger.exception("Exception occurred for file '"+ x + "': "+ repr(e))
        
        a = None
    return a

 ## load the text file into a pandas dataframe

In [8]:
fnames = []
with open(f"{SAMPLE_DATA}") as f:
    for line in f:
        fnames.append(line.strip())

# load fnames into a pandas dataframe with column "fileNames"
df = pd.DataFrame(fnames, columns=["fileNames"])
validFiles = df.dropna()
# print("number of valid files")

# for each fileName in df, load it using nbformat and store it in a new column "raw"
df['raw'] = df['fileNames'].apply(lambda x: fileToNbNode(x))
validFiles = df.dropna()
print("number of valid files")
print(validFiles.count())
# validFiles.set_option('display.width', 999)
pd.options.display.max_colwidth = 999
# print(validFiles.count())

number of valid files
fileNames    995
raw          995
dtype: int64


## Now get the kernel type and language for each file and store it in the "language" column of this dataframe.

In [9]:
validFiles['language'] = validFiles['raw'].apply(lambda x: getLanguage(x) )
print(f'Shape of validfiles with language: {validFiles.shape}')
# logger.info(validFiles[['fileNames','language']].head(20))
print(f'Validfiles with drop na after language : {validFiles.shape}')
validFiles['language'].value_counts()

Shape of validfiles with language: (995, 3)
Validfiles with drop na after language : (995, 3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  validFiles['language'] = validFiles['raw'].apply(lambda x: getLanguage(x) )


python3                                                              809
python2                                                               84
Python [Root]                                                          6
conda-root-py                                                          6
ir                                                                     3
python36                                                               3
tensorflow                                                             3
sagemath                                                               2
python                                                                 2
julia-0.5                                                              2
java                                                                   2
julia-1.0                                                              2
pythondata                                                             2
julia-0.4                                          

 ## Get the subplot information

In [10]:

# validFiles['subplot_count'] = validFiles.apply(lambda nb: getSourceCodeAndExtractSubplotCount(nb), axis=1)
validFiles['matplotlib_count'] = validFiles.apply(lambda nb: count_factory(hasMatplotlib)(nb), axis=1)
validFiles['plotly_count'] = validFiles.apply(lambda nb: count_factory(hasPlotly)(nb), axis=1)
validFiles['bokeh_count'] = validFiles.apply(lambda nb: count_factory(hasBokeh)(nb), axis=1)
print(validFiles.head(50))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  validFiles['matplotlib_count'] = validFiles.apply(lambda nb: count_factory(hasMatplotlib)(nb), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  validFiles['plotly_count'] = validFiles.apply(lambda nb: count_factory(hasPlotly)(nb), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  validFi

                                         fileNames  \
0   3a086a2a7f2c39cc7ba96591be9421d4c039fd15.ipynb   
1   f5ebce0eaaaafb97c083d50ed2d92eb042ac8ade.ipynb   
2   1b33f8f963e34d5c7c312bf60f0aaea3186b5185.ipynb   
3   6e344a48a45ab3a54916ce37345bdb66e820a8b4.ipynb   
4   32e931355b30b9581621d4404c80957d2f1b1487.ipynb   
5   d6113cd4137f7d6938e8437e5a6f101928e9ef59.ipynb   
6   c2339e43d4779fe62bfeed4a032c8314a77783fd.ipynb   
7   cc09fe32ad90745050896c45693d23a740217246.ipynb   
8   a406d8d85481c1b72c51ff954f73c1c4d9552a8c.ipynb   
9   5ab864003202c4d0a91612b48b04fce393272d73.ipynb   
10  28841b67b469b1750d52036282ca16180b7cf1fd.ipynb   
11  d2cbf2e23d59a4ba3599529919f8a1fcfe172813.ipynb   
12  0c40263a43c1db2deba7983cbd958b7e1cbab6de.ipynb   
13  a86b1f2c0cf0b09f5c210c39aa9b68c724b84f46.ipynb   
14  baf8f117962a742cd3f2f3f32b73208d9e4f51c4.ipynb   
15  00ea6c5f3f3b1b9418466cd680b19ad7cbb1b672.ipynb   
16  c06dcd817a6a482b81e969dba329babec858cbe2.ipynb   
17  7315f26819a7b29bb7cec850

In [11]:
validFiles['imports'] = validFiles.apply(lambda nb: getSourceCodeAndExtractImports(nb), axis=1)
# languageGroups = validFiles.groupby('language').count()
# logger.info(languageGroups.head(10))
logger.info(" number of files after dropping values with no language_info is")
logger.info(validFiles.count())

# validFiles['source'] = validFiles['raw'].apply(lambda x: getSourceFromCells(x))
# validFiles['output'] = validFiles['raw'].apply(lambda x: getOutputFromCells(x))
# validFiles= validFiles.dropna()
# logger.info("valid files with code cells with outputs")
# logger.info(validFiles.count())
validFiles['images'] = validFiles.apply(lambda row: storeBase64ImagesAndExtractImageMetadata(row), axis=1)
validFiles = validFiles.dropna()
logger.info("valid files with base64 image outputs")
logger.info(validFiles.count())
validFiles['numImages'] = validFiles['images'].apply(lambda x: len(x))
logger.info("sum of numImages column")
logger.info(validFiles['numImages'].sum())

logger.info(validFiles[['fileNames','language','numImages']].head(5))

Processing 3a086a2a7f2c39cc7ba96591be9421d4c039fd15.ipynb
Processing f5ebce0eaaaafb97c083d50ed2d92eb042ac8ade.ipynb
Processing 1b33f8f963e34d5c7c312bf60f0aaea3186b5185.ipynb
Processing 6e344a48a45ab3a54916ce37345bdb66e820a8b4.ipynb
Processing 32e931355b30b9581621d4404c80957d2f1b1487.ipynb
Processing d6113cd4137f7d6938e8437e5a6f101928e9ef59.ipynb
Processing c2339e43d4779fe62bfeed4a032c8314a77783fd.ipynb
Processing cc09fe32ad90745050896c45693d23a740217246.ipynb
Processing a406d8d85481c1b72c51ff954f73c1c4d9552a8c.ipynb
Processing 5ab864003202c4d0a91612b48b04fce393272d73.ipynb
Processing 28841b67b469b1750d52036282ca16180b7cf1fd.ipynb
Processing d2cbf2e23d59a4ba3599529919f8a1fcfe172813.ipynb
Processing 0c40263a43c1db2deba7983cbd958b7e1cbab6de.ipynb
Processing a86b1f2c0cf0b09f5c210c39aa9b68c724b84f46.ipynb
Processing baf8f117962a742cd3f2f3f32b73208d9e4f51c4.ipynb
Processing 00ea6c5f3f3b1b9418466cd680b19ad7cbb1b672.ipynb
Processing c06dcd817a6a482b81e969dba329babec858cbe2.ipynb
Processing 731

Processing e3d955308acd9bd1a127ecc1097b683a01c4f4e3.ipynb
Processing 48a2a338552a4d839fe5aa6f2485b027a74dc177.ipynb
Processing 75edaca7f51ba6ece0bd3a285495ac044ae3dc07.ipynb
Processing 7f47832a17323376cc9cece682a298bba4b56ebf.ipynb
Processing c476bedfa5f61c94a3e95c9914d32fe9444648ce.ipynb
Processing 8b25323fc93588322d5b4afcc85af59fdce4c982.ipynb
Processing d68749eb2a43d645181e584371d422c3de373d60.ipynb
Processing ccb42ec3538e1cf0c2afe5cb9693935fa64e15da.ipynb
Processing 314c8e81ac8d52a55f058c5b480b011d5336583b.ipynb
Processing 0a3813447a5ceb9c0d7a350a6eebba3071bdb571.ipynb
Processing 86d48ee6eabbb959f013d011f8c631ab51fb5e44.ipynb
Processing a704c4348e266ed8509cd891956e4b9ac829e90e.ipynb
Processing 94591902ae89a06ec9881937a93d3db8b09266c3.ipynb
Processing b5d957eece4b7b1a30158412570d9ba2e4f977d9.ipynb
Processing 513adaf1fb2dae10bcfe316abe2cdbad6cdb1d04.ipynb
Processing 6fa79ba65b83a5bddee23956887d92c79785a11f.ipynb
Processing 2ef4b049c66eb945650b2df422b6614fdeb77031.ipynb
Processing 6d8

Processing 9bc0b8f59288d7064eedda250f4f21021d50a1ad.ipynb
Processing 231d45d2d9d11c081bdee294370df1dbc227a0bb.ipynb
Processing 7955aac6e302f7cad4e0c8f10837df264e077587.ipynb
Processing 26977a3fc296859f833f3064b917be75ba139bb0.ipynb
Processing c183414af3f0ce44c269fa955d4c3a309bc72422.ipynb
Processing 270df91e2c76c91ea68df276a9d1b25351a545bf.ipynb
Processing f824f6352df9e69b01d50ce71ca19b9f95d49b04.ipynb
Processing 926da025102d2570706425251f289d2887b1377f.ipynb
Processing 62481993dca5257591dabc56f607dd1576f94db4.ipynb
Processing a8b5a08422f8e9b1f8b4edf33f4a814282254f38.ipynb
Processing 8494a29720ebe57faf413118d51a24834482857c.ipynb
Processing 11b6083146d2f2a16da1d0a30925294092788780.ipynb
Processing 8d67dd63a950f2e96775ee6e83d13101c894b113.ipynb
Processing 50b14f4e488dafcaca04307d5a284e2345f8fe2c.ipynb
Processing 88e11955f36a7ca0a92a4f34163a3dd4d3072392.ipynb
Processing fa272858113fcce54237a44c0c6a3839d40c55f1.ipynb
Processing 82df3a0037bef24b031a41c824ef17cb28990310.ipynb
Processing 6a3

Processing d836bcc6c97424f388327505fcde0782e52cc2df.ipynb
Processing a17b09e5a82c61c2f66597f2c749478af15af6b5.ipynb
Processing f75f1d04f89b4b14857e91fe3af47df875a32790.ipynb
Processing 868130b99d282128eda6e712d6b6064c80744212.ipynb
Processing bce297484b17445688f64448eed43928bd8af001.ipynb
Processing d95d18fef52379ee37f68dea6470b6e411440dee.ipynb
Processing 9a2dc2c57f96e5d8c1362c296bd391e5a09a1487.ipynb
Processing e44a89731024535a4059044c55079cfec7c2146e.ipynb
Processing 8be0786b9075d2c473512c4fafae27536afff3aa.ipynb
Processing 005d7f32d836da3742220ecbb55e972986da9487.ipynb
Processing 35862a317ab7d821ab3bbe8b5a92150c318a219d.ipynb
Processing bf1ad55e5cbb484be0fd49e11bfb0518c300bf15.ipynb
Processing c23a58f4393245f6f0738757a9a21f10522f5c24.ipynb
Processing 97500d644a6e9195a8bc1c1ac4e309eecf6c85a3.ipynb
Processing 83b860b14cdef82ce27209bbca18e789b5a7929b.ipynb
Processing e902b1e194e98247c22f8e99f49037404871f395.ipynb
Processing ac914b53f3779bd0ee3ba86dbd8e950f4bc0de2c.ipynb
Processing 92a

Processing 4524ee75fd2773ef169745d973d91b7f705ed74f.ipynb
Processing 41c736fc667c4170fad2eaed2dd69f3cfd149edd.ipynb
Processing 0d27f0e13b065c33c4b2ab0d66980c0a589e9446.ipynb
Processing 06923261e9881b25bbbe2a21b822891678e776f6.ipynb
Processing a7531ae46e3ef870dab1ed50ca534830244248f4.ipynb
Processing 3eb6417ae446d4f4bd211deef1fee6ee56506739.ipynb
Processing ea6f03fde0d22d9b3913f9b5523d5d24498f6f35.ipynb
Processing 189c72d301474e03a95771238a26ba41b5da3350.ipynb
Processing f3fc51a16c42052b2120e60bb7cb184d85e57db5.ipynb
Processing a3f92f9e89e4acb7a91844e4a9922917e492356a.ipynb
Processing 6dd0cc514bcc7d0d1486e4834c10aea0377b6715.ipynb
Processing 38250d8f74e29c61565ac37a7cb061e3558a246a.ipynb
Processing c7fc4d8a7705eadba512a1c8144c9883f8afecf3.ipynb
Processing 83202849f24cf2ffc8841e6c90228014afa502e3.ipynb
Processing 063fb36152c85884719c2aeec2ac1b14b95a16a0.ipynb
Processing 0f52599ce40772b22544de124b74940ca1591d78.ipynb
Processing e8028212e0b6093793bee90c476c67892013493d.ipynb
Processing 1a3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  validFiles['imports'] = validFiles.apply(lambda nb: getSourceCodeAndExtractImports(nb), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  validFiles['images'] = validFiles.apply(lambda row: storeBase64ImagesAndExtractImageMetadata(row), axis=1)


 # Cleanup and save.

In [12]:
validFiles = validFiles.drop(['raw'], axis=1)
print(validFiles)

validFiles.to_csv('nb_subplots.csv', header=True, index=False)

# write text into a text file
# with open('nbText.txt', 'w') as f:
#     f.write(text)

# validFilesRdd = spark.createDataFrame(validFiles).rdd
# logger.info(validFilesRdd.show())

                                          fileNames       language  \
0    3a086a2a7f2c39cc7ba96591be9421d4c039fd15.ipynb        python3   
1    f5ebce0eaaaafb97c083d50ed2d92eb042ac8ade.ipynb        python3   
2    1b33f8f963e34d5c7c312bf60f0aaea3186b5185.ipynb        python2   
3    6e344a48a45ab3a54916ce37345bdb66e820a8b4.ipynb        python3   
4    32e931355b30b9581621d4404c80957d2f1b1487.ipynb        python3   
..                                              ...            ...   
994  d248dad57e2b0e1e4fbcde4d92bfcf94ebd02849.ipynb        python3   
995  7a9ff5dde3b92e4ea187b21fcb8be7b796bf1a1a.ipynb        python3   
996  5f08adade4134503fd17af71c6b2c78520e6189f.ipynb        python3   
997  5695ccca8e533b53edd1745a19d5eb4c16a3a140.ipynb        python3   
998  d5482abc8dafee3f2f116c46e2798717b47311f4.ipynb  Python [Root]   

     matplotlib_count  plotly_count  bokeh_count  \
0                   0             0            0   
1                   0             0            0   
2  