In [26]:
import ast
import gensim
import json
import pandas as pd
import pickle as tastes_good
import shutil
import tensorflow as tf
import tokenize, io

from keras.models import load_model, Model
from end2end.seq2seq import load_text_processor, load_decoder_inputs, load_encoder_inputs, Seq2Seq_Inference 

<h3> 1. Load training file</h3>

In [27]:
seq2seq_Model = tf.keras.models.load_model('bilstm_seq2seq_model.h5')
num_encoder_tokens, funct_pp = load_text_processor('data/cell_pp.dpkl')
num_decoder_tokens, comts_pp = load_text_processor('data/comments_pp.dpkl')
seq2seq_inf = Seq2Seq_Inference(encoder_preprocessor=funct_pp,
                                 decoder_preprocessor=comts_pp,
                                 seq2seq_model=seq2seq_Model, model_option='bilstm')

Size of vocabulary for data/cell_pp.dpkl: 8,002
Size of vocabulary for data/comments_pp.dpkl: 4,502


In [28]:
# load the data
df_nb = pd.read_csv('data/csv/notebooks_sample.csv', nrows = 20)
df_nb = df_nb.drop(columns=['html_url', 'max_filesize', 'min_filesize', 'query_page', 'path', 'name', 'repo_id'])

<h3>Find how many cells a notebook has, and then append a new column to the dataframe</h3>

In [29]:
def get_num_cells(nb_id):
    
    nb_name = 'data/notebooks/nb_' + str(nb_id) + '.ipynb'
    
    with open(nb_name) as nb_file:
        
        try:
            # get the nb as a JSON file
            data = json.load(nb_file)
            if isinstance(data, dict): 
                keys = data.keys()
            else:
                keys = []
            
            # get the number of cells
            if 'cells' in keys:
                return len(data['cells'])
            elif 'worksheets' in keys:
                num_cells = 0
                for w in data['worksheets']:
                    num_cells += len(w['cells'])
                return num_cells
        
        except:
            return None
        
        
df_nb['num_cells'] = df_nb['nb_id'].apply(get_num_cells)

<h3>Only keep notebooks with more than 0 cells. We do not even consider notebooks that have no cells</h3>

In [30]:
df_nb = df_nb.query("num_cells > 0")
df_nb = df_nb.reset_index(drop=True)

<h3>Append a new column -- 'cells'. A list of [cell_type, cell_content]</h3>

In [31]:
def keep_code_and_markdown(row):
    if row.get('cell_type') == "code" or row.get('cell_type') == "markdown":
        return True
    return False

def keep_source_code(row):
    if row.get('source') == None:
        return [row.get('cell_type'), row.get('input')]
    return [row.get('cell_type'), row.get('source')]


# is the markdown cells helpful?
def get_codes(nb_id):
    
    nb_name = 'data/notebooks/nb_' + str(nb_id) + '.ipynb'
    
    with open(nb_name) as nb_file:
        
        try:
            # get the nb as a JSON file
            data = json.load(nb_file)
            if isinstance(data, dict): 
                keys = data.keys()
            else:
                keys = []
            
            # get the number of cells
            if 'cells' in keys:
                iterable = data['cells']
                itor = list(filter(keep_code_and_markdown, iterable))
                itor = list(map(keep_source_code, itor))
                return itor
            elif 'worksheets' in keys:
                cells = []
                for w in data['worksheets']:
                    cells.append(w['cells'])
                flattened_list = [y for x in cells for y in x]
                itor = list(filter(keep_code_and_markdown, flattened_list))
                itor = list(map(keep_source_code, itor))
                return itor
        
        except:
            return None
        
df_nb['cells'] = df_nb['nb_id'].apply(get_codes)

<h3>Remove notebooks with cells == None</h3>

In [32]:
df_nb = df_nb[df_nb.cells != None]

<h3>Drop the column num_cells since it's not useful anymore.</h3>

In [33]:
df_nb = df_nb.drop(columns=['num_cells'])

<h3>Expand notebooks based on column 'cells'</h3>

In [34]:
df_nb = df_nb.explode('cells').reset_index(drop=True)

<h3>Remove cells that has nothing inside.</h3>

In [35]:
df_nb = df_nb[df_nb.cells != None]

<h3>Append new columns 'markdown_cell' and 'code_cell' to the dataframe.</h3>

In [36]:
def get_codecell(a_cell):
    try:
        if a_cell[0] == 'markdown':
            return None
        return a_cell[1]
    except:
        return None
def get_markdowncell(a_cell):
    try:
        if a_cell[0] == 'code':
            return None
        return a_cell[1]
    except:
        return None

<h3>For 'markdown_cell' column, it contains content if the current row is a markdown cell, o/w None.</h3>
<h3>Similar for the 'code_cell' column.</h3>

In [37]:
df_nb['markdown_cell'] = df_nb['cells'].apply(get_markdowncell)
df_nb['code_cell'] = df_nb['cells'].apply(get_codecell)

<h3>2. convert code to list of vectors</h3>

<h3>Add new column 'code_cell_no_comments' which does not have comments, just source code of each code cell.</h3>

In [38]:
# get pure code 
def remove_comments(lst):
    try:
        if lst == None:
            return None
        elif lst == []:
            return ''
        the_whole_cell = ''
    
        for li in lst:
            the_whole_cell += li
       
        buf = io.StringIO(the_whole_cell)
        ans = ''
        for line in tokenize.generate_tokens(buf.readline):
            if line.type != tokenize.COMMENT:
                ans += line.string + ' '
        return ans
    except:
        return 'Syntax_error'
    
df_nb['code_cell_no_comments'] = df_nb['code_cell'].apply(remove_comments)

<h3>This cell contains function definition which will be used later for extracting comments from code_cell </h3>

In [39]:
# just a way to detect if we run into a scenario mentioned above
def is_valid_python(code):
    try:
        ast.parse(code)
    except SyntaxError:
        return False
    return True

# Use list.append in getting comments... but string concatenation in removing comments
# Reasonable, need to check English comments again, so string concatenation may not work
def get_comments(lst):
    try:
        if lst == None:
            return None
        elif lst == []:
            return []
        the_whole_cell = ''
        for li in lst:
            the_whole_cell += li
       
        buf = io.StringIO(the_whole_cell)
        ans = []
        for line in tokenize.generate_tokens(buf.readline):
            if line.type == tokenize.COMMENT:
                # check if you have things like "#for variable in field.getchildren():"
                if (is_valid_python(line.string.strip("#").strip(" ").strip("#"))):
                    continue
                else:
                    ans.append(line.string)
        return ans
    except:
        # your code has syntax errors...
        return "Syntax_error"

<h3>Remove cells that has syntax error in code...</h3>

In [40]:
# drop those rows have "Syntax_error,srsly?"
df_nb = df_nb[df_nb.code_cell_no_comments != 'Syntax_error']
df_nb = df_nb.reset_index(drop=True)

<h3>Extract comments from cells</h3>

In [41]:
df_nb['code_cell_comments'] = df_nb['code_cell'].apply(get_comments)

<h3>Also removes cells with syntax error.</h3>

In [42]:
df_nb = df_nb[df_nb.code_cell_comments != 'Syntax_error']
df_nb = df_nb.reset_index(drop=True)

<h3>Remove comments with non-ascii characters.</h3>

In [43]:
def is_English(s):
    try:
        s.encode(encoding='utf-8').decode('ascii')
    except UnicodeDecodeError:
        return False
    else:
        return True

def remove_non_ascii_comments(lst):
    if lst == None:
        return None
    ans = []
    for cmt in lst:
        if (is_English(cmt)):
            ans.append(cmt)
    return ans

df_nb['code_cell_comments'] = df_nb['code_cell_comments'].apply(remove_non_ascii_comments)

<h3> Concatenate comments into a new column 'conc_comment' </h3>

In [44]:
def concatenate_valid_comments(lst):
    if lst == None:
        return None
    if lst == []:
        return ''
    ans = ''
    for cmt in lst:
        ans += cmt + ' '
    return ans

df_nb['conc_comment'] = df_nb['code_cell_comments'].apply(concatenate_valid_comments)

<h3>Remove cells = None</h3>

In [45]:
realdf = df_nb[df_nb['cells'].map(type) != float].reset_index(drop=True)

<h3>Remove cells = [Mkd/code, None]</h3>

In [46]:
def check_useless_cell(a_cell):
    if a_cell != None:
        if a_cell[1] == None:
            return True
    return False

actualdf = realdf[realdf['cells'].map(check_useless_cell) != True].reset_index(drop=True)

<h3>Remove code_cell_no_comments = ""</h3>

In [47]:
def check_empty_code(a_cell):
    if a_cell == '':
        return True
    return False

actualdf = actualdf[actualdf['code_cell_no_comments'].map(check_empty_code) != True].reset_index(drop=True)

<h3>emb_vecs_code_middle is a list contains all code descriptors.</h3>

<h3>If the current cell is markdown, emb_vecs_code_middle will append a '0' to it
<h3>If the current code cell contains original comment, then use the original</h3>
<h3>If the current code cell did not have descriptor, use the predicted one.</h3>

In [48]:
emb_vecs_code_middle = []
for index, row in actualdf.iterrows():
    if row['code_cell_no_comments'] == None:
        # markdown cell
        emb_vecs_code_middle.append(0)
    else:
        if row['conc_comment'] != '':
            emb_vecs_code_middle.append(row['conc_comment'])
        else:
            try:
                emb_vecs_code_middle.append((seq2seq_inf.generate_comments(row['code_cell_no_comments']))[1])
            except:
                # change this to 0????
                emb_vecs_code_middle.append('ERR_predict_false')
emb_vecs_code_middle.count('ERR_predict_false')

210

In [49]:
with open('data/all_code_middle_embeddings.txt', 'wb') as file:
    tastes_good.dump(emb_vecs_code_middle, file)
the_lan_model = gensim.models.doc2vec.Doc2Vec.load('my_model.doc2vec')

<h3>Convert emb_vecs_code_middle into list of vectors.</h3>
<h3>If emb_vecs_code_middle[i] == 0, then emb_vecs_code[i] = 0.</h3>

In [50]:
%%time
emb_vecs_code = []
for pred in emb_vecs_code_middle:
    try:
        if pred == 0:
            emb_vecs_code.append(0)
        else:
            emb_vecs_code.append(the_lan_model.infer_vector(gensim.utils.simple_preprocess(pred)))
    except:
        print(pred)
        break
with open('data/all_code_embeddings.txt', 'wb') as file:
    tastes_good.dump(emb_vecs_code, file)

CPU times: user 705 ms, sys: 3.84 ms, total: 709 ms
Wall time: 722 ms


<h3>3. convert mkd to list of vectors</h3>
<h3>If the current cell is not markdown, append 0 to it.</h3>
<h3>Else use our doc2vec model to predict the vector.</h3>

In [51]:
emb_vecs_mkd = []
for index, row in actualdf.iterrows():
    if row['markdown_cell'] == None:
        emb_vecs_mkd.append(0)
    elif row['markdown_cell'] == []:
        emb_vecs_mkd.append(the_lan_model.infer_vector(gensim.utils.simple_preprocess('')))
    else:
        emb_vecs_mkd.append(the_lan_model.infer_vector(gensim.utils.simple_preprocess(row['markdown_cell'][0])))

In [52]:
with open('data/all_markdown_embeddings.txt', 'wb') as file:
    tastes_good.dump(emb_vecs_mkd, file)

<h3>4. store the relationships</h3>

<h3>Create a dictionary which stores information about contribution of markdown cells to code cells</h3>
<h3>Specifically, key : value pairs => markdown_cell_index : [array of indexes that from code cells impacted by the markdown cell]</h3>

In [53]:
dict_relationships = {}
# helper function 
def find_contribution(index, max_row):
    the_id = actualdf.iloc[index]['nb_id']
    arr = []
    loop = True
    curr_index = index + 1
    while curr_index < max_row:
        if actualdf.iloc[curr_index]['nb_id'] == the_id:
            # same notebook
            if actualdf.iloc[curr_index]['markdown_cell'] == None:
                # A consequtive code cell
                arr.append(curr_index)
                curr_index = curr_index + 1
            else:
                # same nb but a markdown cell, time to stop the loop
                break
        else:
            # different nb, time to stop
            break
        
    return arr
    
total_rows = actualdf.shape[0]
for index, row in actualdf.iterrows():
    if row['markdown_cell'] != None:
        arr = find_contribution(index, total_rows)
        dict_relationships[str(index)] = arr

<h3>After applying the find_contribution function.</h3>
<h3>E.g. index 0 is a markdown cell, it has impact on code cells from 1 to 27</h3>

In [54]:
print(dict_relationships)

{'0': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27], '28': [29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44], '45': [], '46': [], '47': [48, 49], '50': [], '51': [], '52': [53, 54, 55, 56, 57, 58, 59], '60': [61, 62, 63, 64], '65': [], '66': [], '67': [68, 69, 70, 71, 72, 73, 74], '75': [76, 77, 78, 79, 80, 81, 82], '83': [84, 85, 86, 87], '88': [], '106': [107, 108, 109, 110, 111, 112], '113': [114, 115, 116, 117, 118, 119, 120, 121, 122], '123': [124, 125], '126': [127, 128, 129, 130], '131': [132, 133, 134], '135': [], '136': [], '137': [138], '139': [140, 141, 142], '143': [144, 145], '146': [], '147': [148], '149': [150], '151': [152, 153], '154': [155], '156': [157], '158': [159], '160': [161], '162': [163, 164], '165': [166, 167], '168': [169], '170': [171], '172': [173], '174': [175], '176': [177, 178], '179': [180], '181': [182], '183': [184], '185': [186], '187': [188, 189], '190': [191, 192], '193': [], '19

<h3>Testing cell.</h3>

In [55]:
the_lan_model.infer_vector(gensim.utils.simple_preprocess('Test'))[:2]

array([-0.015332  , -0.01260937], dtype=float32)

<h3>Create an array which has length = number of rows in our dataframe, i.e. each element is representing a cell</h3>
<h3>An element is None if the corresponding cell is markdown, o/w index of the markdown has impact on me.</h3>


In [56]:
relationship_arr = [None] * len(emb_vecs_code)
for dic in dict_relationships:
    child_code_arr = dict_relationships[str(dic)]
    for indx in child_code_arr:
        relationship_arr[indx] = int(dic)

<h3>For example, remember previously, we said index 0 contains a markdown cell, so relationship_arr[0] = None, and all code cells from 1 to 27 are impacted by cell 0.</h3>

In [57]:
print(relationship_arr)

[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, None, None, None, 47, 47, None, None, None, 52, 52, 52, 52, 52, 52, 52, None, 60, 60, 60, 60, None, None, None, 67, 67, 67, 67, 67, 67, 67, None, 75, 75, 75, 75, 75, 75, 75, None, 83, 83, 83, 83, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 106, 106, 106, 106, 106, 106, None, 113, 113, 113, 113, 113, 113, 113, 113, 113, None, 123, 123, None, 126, 126, 126, 126, None, 131, 131, 131, None, None, None, 137, None, 139, 139, 139, None, 143, 143, None, None, 147, None, 149, None, 151, 151, None, 154, None, 156, None, 158, None, 160, None, 162, 162, None, 165, 165, None, 168, None, 170, None, 172, None, 174, None, 176, 176, None, 179, None, 181, None, 183, None, 185, None, 187, 187, None, 190, 190, None, None, None, None, 196, None, 198, None, 200, None, 202, None, 204,

In [58]:
with open("data/child_relationships.txt", "wb") as fp:
    tastes_good.dump(relationship_arr, fp)
actualdf.to_csv('data/stored_df.csv', index=False)
with open("data/dict.pkl", "wb") as fp:
    tastes_good.dump(dict_relationships, fp)

<h3>End of 1-4 steps (above)

This section is to make sure our lists do not contain dangerous values
</h3>

In [59]:
# clear markdown
emb_vecs_code_no_zero = []
for vec in emb_vecs_code:
    if type(vec) != int:
        emb_vecs_code_no_zero.append(vec)
        
emb_vecs_mkd_no_zero = []
for vec in emb_vecs_mkd:
    if type(vec) != int:
        emb_vecs_mkd_no_zero.append(vec)     

In [60]:
# clear markdown
df_nb_no_mkd = actualdf[actualdf['code_cell_no_comments'].map(type) != type(None)].reset_index(drop=True)

In [61]:
counter_aa = 0
for index, row in actualdf.iterrows():
    if row['code_cell_no_comments'] == '':
        counter_aa = counter_aa + 1
print(counter_aa)

0


In [62]:
seq2seq_inf.generate_comments("from sklearn . linear_model import LinearRegression")[1]

InvalidArgumentError: 2 root error(s) found.
  (0) Invalid argument:  indices[0,99] = 1737 is not in [0, 547)
	 [[node Encoder-Model/Cell-Word-Embedding/embedding_lookup (defined at home/na/codes/NBSearch/end2end/seq2seq.py:223) ]]
	 [[Encoder-Model/Cell-Word-Embedding/embedding_lookup/_6]]
  (1) Invalid argument:  indices[0,99] = 1737 is not in [0, 547)
	 [[node Encoder-Model/Cell-Word-Embedding/embedding_lookup (defined at home/na/codes/NBSearch/end2end/seq2seq.py:223) ]]
0 successful operations.
0 derived errors ignored. [Op:__inference_predict_function_164121]

Errors may have originated from an input operation.
Input Source operations connected to node Encoder-Model/Cell-Word-Embedding/embedding_lookup:
 Encoder-Model/Cell-Word-Embedding/embedding_lookup/163208 (defined at home/na/miniconda3/envs/nbsearch-gpu/lib/python3.7/contextlib.py:112)

Input Source operations connected to node Encoder-Model/Cell-Word-Embedding/embedding_lookup:
 Encoder-Model/Cell-Word-Embedding/embedding_lookup/163208 (defined at home/na/miniconda3/envs/nbsearch-gpu/lib/python3.7/contextlib.py:112)

Function call stack:
predict_function -> predict_function


In [63]:
def get_files_recursively(start_directory, filter_extension=None):
    for root, _, files in os.walk(start_directory):
        for file in files:
            if filter_extension is None or file.lower().endswith(filter_extension):
                yield os.path.join(root, file)

def selective_copy(source, target, file_extension=None):
    for file in get_files_recursively(source, file_extension):
        shutil.copy(file, target)
        print("The following file has been copied", file)
for extension in ['txt', 'csv', 'pkl', 'h5', 'doc2vec']:
  selective_copy("data/","resource", extension)

The following file has been copied data/all_code_embeddings.txt
The following file has been copied data/corpus.txt
The following file has been copied data/all_markdown_embeddings.txt
The following file has been copied data/child_relationships.txt
The following file has been copied data/all_code_middle_embeddings.txt
The following file has been copied data/train_rows.csv
The following file has been copied data/df_test_rows.csv
The following file has been copied data/Seq2Seq_pred_comments.csv
The following file has been copied data/stored_df.csv
The following file has been copied data/predict_rows.csv
The following file has been copied data/csv/notebooks_sample.csv
The following file has been copied data/sample_data/data/csv/repositories_sample.csv
The following file has been copied data/sample_data/data/csv/readmes_sample.csv
The following file has been copied data/sample_data/data/csv/notebooks_sample.csv
The following file has been copied data/dict.pkl
The following file has been copi