# Topic Modelling of used functions
## Function that collects all the calls

In [1]:
import ast

class CallCollector(ast.NodeVisitor):
    def __init__(self):
        self.calls = []

    def visit_Call(self, node):
        self.calls.append(node.func.attr)
    
    def get_calls(self):
        return(self.calls) 

## Test of the function

In [15]:
tree = ast.parse('''
connection = pymysql.connect()
with connection.cursor() as cursor:
    sql = ""
    cursor.execute(sql)
    test = cursor.fetchall()
    test = pd.DataFrame(test)
''')
cc = CallCollector()
cc.visit(tree)
cc.get_calls()

['connect', 'cursor', 'execute', 'fetchall', 'DataFrame']

## Get all the code blocks from a notebook

In [2]:
from nbformat import reads, NO_CONVERT
def code_extractor(jpt):
    notebook = reads(jpt, NO_CONVERT)
    cells = notebook.cells
    code_cells = [c for c in cells if c.cell_type == 'code']
    code = []
    for cell in code_cells:
        source = cell.source
        code.append(source)
    return code

## Extract all used functions 

In [6]:
import csv
# incresing the csv field size
import ctypes
csv.field_size_limit(int(ctypes.c_ulong(-1).value // 2))

calls = list()
with open('../data/jupyter_files.csv',"r", encoding="utf8") as csvfile:
    data = csv.DictReader(csvfile)
    for row in data:
        jpt = row['content']
        try:
            code = code_extractor(jpt)
            for c in code:
                try:
                    tree = ast.parse(c)
                    cc = CallCollector()
                    cc.visit(tree)
                    call = cc.get_calls()
                    calls.append(call)
                except:
                    continue
        except KeyboardInterrupt:
            break
        except:
            continue

### Write the list to disk

In [None]:
import pickle

with open('../data/calls', 'wb') as fp:
    pickle.dump(calls, fp)

### Read the list from disk

In [6]:
import pickle

with open('../data/calls', 'rb') as fp:
    calls = pickle.load(fp)

## Inspect the data and clean up

In [7]:
len(calls)

230263

### Empty Blocks

In [8]:
calls.count([])

91319

In [9]:
#Function to remove all occurences of a specific item
def remove_values_from_list(the_list, val):
    return [value for value in the_list if value != val]
#remove all empty blocks
new_calls = remove_values_from_list(calls, [])
len(new_calls)

138944

## How many function calls are there per block?

In [10]:
import pandas as pd
c = []
for i in new_calls:
    c.append(len(i))
c = pd.Series(c)
c.describe()

count    138944.000000
mean          2.474076
std           3.346581
min           1.000000
25%           1.000000
50%           1.000000
75%           3.000000
max         194.000000
dtype: float64

In [11]:
len(c[c > 1])

58764

In [12]:
c[c > 1].describe()

count    58764.000000
mean         4.485365
std          4.412580
min          2.000000
25%          2.000000
50%          3.000000
75%          5.000000
max        194.000000
dtype: float64