# Topic Modelling of used functions
## Function that collects all the calls

In [16]:
import ast

class CallCollector(ast.NodeVisitor):
    def __init__(self):
        self.calls = []

    def visit_Call(self, node):
        self.calls.append(node.func.attr)
    
    def get_calls(self):
        return(self.calls) 

## Test of the function

In [15]:
tree = ast.parse('''
connection = pymysql.connect()
with connection.cursor() as cursor:
    sql = ""
    cursor.execute(sql)
    test = cursor.fetchall()
    test = pd.DataFrame(test)
''')
cc = CallCollector()
cc.visit(tree)
cc.get_calls()

['connect', 'cursor', 'execute', 'fetchall', 'DataFrame']

## Get all the code blocks from a notebook

In [17]:
from nbformat import reads, NO_CONVERT
def code_extractor(jpt):
    notebook = reads(jpt, NO_CONVERT)
    cells = notebook.cells
    code_cells = [c for c in cells if c.cell_type == 'code']
    code = []
    for cell in code_cells:
        source = cell.source
        code.append(source)
    return code

## Extract all used functions 

In [31]:
import csv
# incresing the csv field size
import ctypes
csv.field_size_limit(int(ctypes.c_ulong(-1).value // 2))

calls = list()
with open('../data/jupyter_files.csv',"r", encoding="utf8") as csvfile:
    data = csv.DictReader(csvfile)
    for row in data:
        jpt = row['content']
        try:
            code = code_extractor(jpt)
            for c in code:
                tree = ast.parse(c)
                cc = CallCollector()
                cc.visit(tree)
                call = cc.get_calls()
                calls.append(call)
        except KeyboardInterrupt:
            break
        except:
            continue

Wall time: 0 ns


### Write the list to disk

In [None]:
import pickle

with open('../data/calls', 'wb') as fp:
    pickle.dump(calls, fp)

### Read the list from disk

In [6]:
import pickle

with open('../data/calls', 'rb') as fp:
    calls = pickle.load(fp)

## Inspect the data and clean up

In [7]:
len(calls)

30214

### Empty Blocks

In [5]:
calls.count([])

15802

In [8]:
#Function to remove all occurences of a specific item
def remove_values_from_list(the_list, val):
    return [value for value in the_list if value != val]
#remove all empty blocks
new_calls = remove_values_from_list(calls, [])
len(new_calls)

14412

## How many function calls are there per block?

In [7]:
import pandas as pd
c = []
for i in new_calls:
    c.append(len(i))
c = pd.Series(c)
c.describe()

count    14412.000000
mean         1.931030
std          2.260795
min          1.000000
25%          1.000000
50%          1.000000
75%          2.000000
max         65.000000
dtype: float64

In [10]:
len(c[c > 1])

5161

In [12]:
c[c > 1].describe()

count    5161.000000
mean        3.599884
std         3.151988
min         2.000000
25%         2.000000
50%         3.000000
75%         4.000000
max        65.000000
dtype: float64

# Topic Modelling of the functions with code blocks as a document

In [2]:
#Import all necessary libraries
import glob
import pandas as pd
import numpy as np

from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore

import pyLDAvis
import pyLDAvis.gensim
import warnings
warnings.filterwarnings('ignore')
import _pickle as pickle

import re
import random
import collections

from gensim import corpora, models
import gensim

from wordcloud import WordCloud
import matplotlib.pyplot as plt
from scipy import stats

pyLDAvis.enable_notebook()
random.seed(1234)

from sklearn.metrics.cluster import adjusted_rand_score
pd.options.display.max_columns = None



In [9]:
%%time
#build gensim corpus
texts = new_calls
dictionary = corpora.Dictionary(texts)
dictionary.filter_extremes(no_below=3, no_above=0.4)
dictionary.compactify()
corpus = [dictionary.doc2bow(text) for text in texts]

Wall time: 273 ms


In [10]:
len(dictionary)

1176

In [11]:
%%time
ldamodelnormal = gensim.models.ldamodel.LdaModel(corpus, num_topics=7, id2word = dictionary, passes=10, chunksize=100, update_every=0, alpha=1/15, random_state=1)

Wall time: 31.9 s


In [12]:
# helper to create descriptive tables (doc-topic probabilities) and visualizations for LDA models 
def getModelResults(ldamodel, corpus, dictionary):
    vis = pyLDAvis.gensim.prepare(ldamodel,corpus, dictionary, sort_topics=False)
    transformed = ldamodel.get_document_topics(corpus)
    df = pd.DataFrame.from_records([{v:k for v, k in row} for row in transformed])
    return vis, df    

In [13]:
# get the top topic per document into a list
def maxTop(x):
    mx = max(x,key=lambda item:item[1])
    if (mx[1]>0.0):
        return(mx[0])
    else:
        return 99

In [14]:
%%time
# get descriptive stuff for all models
normalv, dfnormal = getModelResults(ldamodelnormal, corpus, dictionary)

Wall time: 13.8 s


In [15]:
#print regular topics (top word probabilities)
ldamodelnormal.print_topics(num_words=8)

[(0,
  '0.063*"plot" + 0.055*"show" + 0.028*"ylabel" + 0.027*"xlabel" + 0.022*"append" + 0.020*"init" + 0.019*"figure" + 0.019*"title"'),
 (1,
  '0.058*"join" + 0.051*"extension" + 0.050*"apply" + 0.024*"execute" + 0.024*"head" + 0.018*"Series" + 0.017*"basicConfig" + 0.016*"connect"'),
 (2,
  '0.042*"count" + 0.040*"insert" + 0.037*"arange" + 0.026*"sqrt" + 0.025*"sin" + 0.024*"mean" + 0.021*"join" + 0.020*"cos"'),
 (3,
  '0.222*"read_csv" + 0.056*"add" + 0.030*"head" + 0.028*"opts" + 0.025*"plot" + 0.024*"linspace" + 0.019*"chdir" + 0.017*"json"'),
 (4,
  '0.094*"append" + 0.043*"load" + 0.023*"astype" + 0.023*"set" + 0.022*"run" + 0.022*"extension" + 0.019*"seed" + 0.019*"conv2d"'),
 (5,
  '0.121*"array" + 0.054*"DataFrame" + 0.030*"placeholder" + 0.027*"read_csv" + 0.027*"Variable" + 0.023*"drop" + 0.019*"head" + 0.018*"append"'),
 (6,
  '0.102*"head" + 0.093*"load_data" + 0.043*"reshape" + 0.038*"get" + 0.035*"split" + 0.034*"show" + 0.027*"describe" + 0.017*"DataFrame"')]

In [16]:
# inspect regular topics - ATTENTION: ALL TOPICS ARE SHIFTED WITH ID +1 w.r.t. GENSIM
normalv