Skip to content

Commit 30440dd

Browse files
committed
preprocess refactor for token error
1 parent 7645fa5 commit 30440dd

File tree

1 file changed

+11
-2
lines changed

1 file changed

+11
-2
lines changed

python150k/preprocess.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -229,8 +229,13 @@ def collect_data(filename: str,
229229
start_def = function_code.find("def")
230230
function_code = function_code[start_def:]
231231

232-
function_code, tokens, comments, docstring, stopwords_count = \
233-
get_tokens(function_code)
232+
function_code, tokens, comments, docstring, stopwords_count, \
233+
is_tokenizable = get_tokens(function_code)
234+
235+
if not is_tokenizable:
236+
error_counter += 1
237+
function_code = ""
238+
tokens = []
234239

235240
# print(f"In filename = {filename}, fun_ind = {fun_ind}")
236241
# print(f"Found {stopwords_count} stopwords.")
@@ -313,6 +318,10 @@ def retrieve_functions_docstrings(
313318
def convert_tokens_to_ast(functions):
314319
ast_tokens = []
315320
for function in functions:
321+
if len(function) == 0:
322+
# Happens after a TokenError was returned
323+
# Empty function should be skipped
324+
continue
316325
ast_fun_tokens = json.loads(parse_python3.parse_file(function, "code"))
317326
ast_fun_sequential = get_dfs(convert(ast_fun_tokens))
318327
ast_tokens.append(ast_fun_sequential)

0 commit comments

Comments
 (0)