In [128]:
# Configuration: edit both paths before running the rest of the notebook.
# source_FileLocation is the .dbc archive to convert; the extraction step
# opens it as a zip file.
# Subsequent downloads might be called 'Databricks (1).dbc', etc., so check
# that the file name matches the download you actually want to convert.
source_FileLocation = r'path_to_your_dbc_file.dbc'
# output_FileLocation is the directory under which the converted .ipynb files
# are written.
output_FileLocation = r'path_to_output_directory'

In [129]:
# Extract the .dbc file (a zip archive) into a fresh 'tmp_dbc' directory
import os
import shutil
import zipfile

# Cleanup from prior run.
# Only a missing directory is ignored. Any other failure (for example a file
# that is still locked) is raised, because leftovers from an earlier archive
# would otherwise be found by the later directory walk and converted together
# with the new content.
try:
    shutil.rmtree('tmp_dbc')
except FileNotFoundError:
    pass

# exist_ok makes the "already there" case explicit instead of swallowing
# every OSError (permissions, read-only location, ...).
os.makedirs('tmp_dbc', exist_ok=True)
with zipfile.ZipFile(source_FileLocation, 'r') as z:
    z.extractall('tmp_dbc')

print('*** Contents from the .dbc file (usually one file or a directory) ***\n')
print(os.listdir('tmp_dbc'))

*** Contents from the .dbc file (usually one file or a directory) ***

['introduction-to-python-for-data-science-and-data-engineering-english', 'manifest.mf']


In [130]:
# Collect every '*.python' entry found anywhere below the extraction folder.
# Each element of filesToParse is a (directory, file name) pair.
import fnmatch

filesToParse = []
for root, dirNames, fileNames in os.walk('tmp_dbc'):
    filesToParse.extend(
        (root, matched) for matched in fnmatch.filter(fileNames, '*.python')
    )

def getIpynbName(path, fileName, outputDir=None, extractDir='tmp_dbc'):
    """Return the output .ipynb path for one extracted notebook source file.

    Parameters
    ----------
    path : str
        Directory that contains the source file, as yielded by walking
        ``extractDir``.
    fileName : str
        Name of the source file; its extension is replaced by
        ``_export.ipynb``.
    outputDir : str, optional
        Root directory for the converted notebooks. Defaults to the
        notebook-level ``output_FileLocation`` setting.
    extractDir : str, optional
        Extraction root that is stripped from ``path`` so that only the
        archive-internal sub-directory is recreated under ``outputDir``.

    Returns
    -------
    str
        ``outputDir`` / <sub-directory inside the archive> / <name>_export.ipynb
    """
    if outputDir is None:
        outputDir = output_FileLocation
    newFileName = os.path.splitext(fileName)[0] + '_export.ipynb'
    # relpath strips the extraction root with whatever separator the platform
    # uses, and only as a leading prefix (a plain string replace of
    # 'tmp_dbc\\' did neither).
    subDir = os.path.relpath(path, extractDir)
    if subDir == os.curdir:
        # File sits directly in the extraction root: no sub-directory.
        subDir = ''
    baseDir = os.path.join(outputDir, subDir)
    return os.path.join(baseDir, newFileName)

# Preview the target path of every notebook that the conversion step will write
print("*** Files to be created on the specified output_FileLocation path ***")
print("(Warning: files will be overwritten!)\n")
for sourceEntry in filesToParse:
    # sourceEntry is a (directory, file name) pair
    print(getIpynbName(*sourceEntry))

*** Files to be created (relative to your current working directory) ***

C:\Users\nemet\OneDrive\Dokumentumok\Databricks\introduction-to-python-for-data-science-and-data-engineering-english\AGENDA_export.ipynb
C:\Users\nemet\OneDrive\Dokumentumok\Databricks\introduction-to-python-for-data-science-and-data-engineering-english\ITP 00 - Databricks Environment_export.ipynb
C:\Users\nemet\OneDrive\Dokumentumok\Databricks\introduction-to-python-for-data-science-and-data-engineering-english\ITP 01 - Data Types and Variables_export.ipynb
C:\Users\nemet\OneDrive\Dokumentumok\Databricks\introduction-to-python-for-data-science-and-data-engineering-english\ITP 02 - Control Flow_export.ipynb
C:\Users\nemet\OneDrive\Dokumentumok\Databricks\introduction-to-python-for-data-science-and-data-engineering-english\ITP 03 - Functions_export.ipynb
C:\Users\nemet\OneDrive\Dokumentumok\Databricks\introduction-to-python-for-data-science-and-data-engineering-english\ITP 04 - Collection Types and Methods_export.

In [131]:
# Create the IPython Notebooks
# Convert .python files to .ipynb files
import codecs
import nbformat
from nbformat.v3.nbpy import PyReader
import json
import re

# Text fragments of the nbformat v3 ".py" notebook representation that
# convertToIpynb emits and PyReader parses.
# File preamble: encoding declaration followed by the nbformat version marker.
_header = u'# -*- coding: utf-8 -*-\n# <nbformat>3.0</nbformat>\n'
# Marker emitted in front of every markdown cell.
_markdownCell = u'\n\n# <markdowncell>\n\n'
# Marker emitted in front of every code cell.
_codeCell = u'\n\n# <codecell>\n\n'

def _commandsToPySource(commandList):
    """Render (position, command) pairs as nbformat v3 ``.py`` notebook text."""
    # The preamble is followed by one (empty) code cell marker, as before.
    parts = [_header, _codeCell]

    for position, command in commandList:
        if re.match(r'\s*%md', command):
            # Add any custom texts which need to be replaced.
            # Strip only the leading magic ('%md' or '%md-sandbox'); the body
            # of the cell may legitimately contain '-sandbox' or a line that
            # starts with '%md', and those must survive the conversion.
            command = re.sub(r'^\s*%md(?:-sandbox)?', '', command, count=1)
            # Rewrite the '%( ... )%' and '%[ ... ]%' delimiters as '$' / '$$'.
            command = re.sub(r'(%\(|\)%)', '$', command)
            command = re.sub(r'(%\[|\]%)', '$$', command)

            parts.append(_markdownCell)
            # In the .py representation markdown is stored as comment lines.
            parts.append('# ' + '\n# '.join(command.split('\n')))
        else:
            # Point any 'baseDir = ...' assignment at the local 'data' folder.
            command = re.sub(r'^\s*baseDir\s*=.*$', 'baseDir = \'data\'',
                             command, flags=re.MULTILINE)
            parts.append(_codeCell)
            parts.append(command)

    return u''.join(parts)


def convertToIpynb(fileToParse):
    """Convert one extracted Databricks '.python' file into a v4 .ipynb file.

    Parameters
    ----------
    fileToParse : tuple
        ``(directory, fileName)`` pair locating the extracted source file.
        The file is read as JSON; its ``'commands'`` list supplies one cell
        per entry, taken from each entry's ``'position'`` and ``'command'``.

    Side effects
    ------------
    Creates the target directory if needed, writes (overwrites) the notebook
    at the path given by ``getIpynbName`` and prints that path.
    """
    with codecs.open(os.path.join(*fileToParse), encoding="utf-8") as fp:
        jsonData = json.load(fp)

    # Order cells by position only; the stable sort keeps the file order for
    # equal positions instead of falling back to comparing the command text.
    commandList = sorted(
        ((x['position'], x['command']) for x in jsonData['commands']),
        key=lambda item: item[0])

    # Parse straight from memory: no 'tmp_ipynb.py' scratch file in the
    # working directory that could be left behind when parsing fails.
    nb = PyReader().reads(_commandsToPySource(commandList))

    outputName = getIpynbName(fileToParse[0], fileToParse[1])
    baseDirectory = os.path.split(outputName)[0]

    if baseDirectory:
        os.makedirs(baseDirectory, exist_ok=True)

    with codecs.open(outputName, 'w', encoding="utf-8") as output:
        nbformat.write(nbformat.convert(nb, 4), output)
    print('Created: {0}'.format(outputName))

In [132]:
# Convert every discovered source file; each entry is a (directory, file name) pair
for sourceEntry in filesToParse:
    convertToIpynb(sourceEntry)

Created: C:\Users\nemet\OneDrive\Dokumentumok\Databricks\introduction-to-python-for-data-science-and-data-engineering-english\AGENDA_export.ipynb
Created: C:\Users\nemet\OneDrive\Dokumentumok\Databricks\introduction-to-python-for-data-science-and-data-engineering-english\ITP 00 - Databricks Environment_export.ipynb
Created: C:\Users\nemet\OneDrive\Dokumentumok\Databricks\introduction-to-python-for-data-science-and-data-engineering-english\ITP 01 - Data Types and Variables_export.ipynb
Created: C:\Users\nemet\OneDrive\Dokumentumok\Databricks\introduction-to-python-for-data-science-and-data-engineering-english\ITP 02 - Control Flow_export.ipynb
Created: C:\Users\nemet\OneDrive\Dokumentumok\Databricks\introduction-to-python-for-data-science-and-data-engineering-english\ITP 03 - Functions_export.ipynb
Created: C:\Users\nemet\OneDrive\Dokumentumok\Databricks\introduction-to-python-for-data-science-and-data-engineering-english\ITP 04 - Collection Types and Methods_export.ipynb
Created: C:\Us

Created: C:\Users\nemet\OneDrive\Dokumentumok\Databricks\introduction-to-python-for-data-science-and-data-engineering-english\Solutions\Includes\_dataset_index_export.ipynb
Created: C:\Users\nemet\OneDrive\Dokumentumok\Databricks\introduction-to-python-for-data-science-and-data-engineering-english\Solutions\Labs\ITP 01L - Data Types and Variables Lab_export.ipynb
Created: C:\Users\nemet\OneDrive\Dokumentumok\Databricks\introduction-to-python-for-data-science-and-data-engineering-english\Solutions\Labs\ITP 02L - Control Flow Lab_export.ipynb
Created: C:\Users\nemet\OneDrive\Dokumentumok\Databricks\introduction-to-python-for-data-science-and-data-engineering-english\Solutions\Labs\ITP 03L - Functions Lab_export.ipynb
Created: C:\Users\nemet\OneDrive\Dokumentumok\Databricks\introduction-to-python-for-data-science-and-data-engineering-english\Solutions\Labs\ITP 04L - Collection Types and Methods Lab_export.ipynb
Created: C:\Users\nemet\OneDrive\Dokumentumok\Databricks\introduction-to-pytho

In [133]:
# Cleanup from current run: remove the temporary extraction folder
directory_path = os.getcwd()

tmpDir = os.path.join(directory_path, 'tmp_dbc')

if os.path.isdir(tmpDir):
    shutil.rmtree(tmpDir)
    # The folder is called 'tmp_dbc' (no leading dot); report its real name
    # and the full path that was removed.
    print('*** The tmp_dbc folder has been removed ***\n')
    print(tmpDir)
else:
    print('*** The tmp_dbc folder does not exist ***\n')

*** The .tmp_dbc folder has been removed ***

C:\Users\nemet\OneDrive\Dokumentumok\GitHub\dbc_to_ipynb_converter\tmp_dbc
