In [1]:
import spacy
import json

In [2]:
# Load the training split: the file is JSON Lines, i.e. one JSON object per line.
# The encoding is pinned to UTF-8 instead of the platform default because the
# records hold non-ASCII text (box-drawing characters appear in the docstrings).
with open('../data/python_train_0.jsonl', encoding='utf-8') as f:
    # Iterate the handle lazily rather than materialising readlines() first, and
    # skip whitespace-only lines so a trailing blank line cannot crash json.loads.
    docs = [json.loads(line) for line in f if line.strip()]

In [3]:
# Display the first parsed record to see which fields each entry carries.
docs[0]

{'repo': 'ageitgey/face_recognition',
 'path': 'examples/face_recognition_knn.py',
 'func_name': 'train',
 'original_string': 'def train(train_dir, model_save_path=None, n_neighbors=None, knn_algo=\'ball_tree\', verbose=False):\n    """\n    Trains a k-nearest neighbors classifier for face recognition.\n\n    :param train_dir: directory that contains a sub-directory for each known person, with its name.\n\n     (View in source code to see train_dir example tree structure)\n\n     Structure:\n        <train_dir>/\n        ├── <person1>/\n        │   ├── <somename1>.jpeg\n        │   ├── <somename2>.jpeg\n        │   ├── ...\n        ├── <person2>/\n        │   ├── <somename1>.jpeg\n        │   └── <somename2>.jpeg\n        └── ...\n\n    :param model_save_path: (optional) path to save model on disk\n    :param n_neighbors: (optional) number of neighbors to weigh in classification. Chosen automatically if not specified\n    :param knn_algo: (optional) underlying data structure to support

In [4]:
import itertools
from collections import Counter

In [5]:
# Frequency table over every pre-tokenised docstring token in the corpus.
docstring_tokens_counter = Counter(
    token
    for record in docs
    for token in record['docstring_tokens']
)
# Vocabulary size first, then the ten most frequent tokens as the cell's result.
print(len(docstring_tokens_counter))
docstring_tokens_counter.most_common(10)

21313


[('.', 32802),
 ('the', 22992),
 ('a', 15056),
 (':', 10058),
 ('to', 9578),
 ('of', 9210),
 ('and', 5694),
 ('for', 5290),
 ('in', 4516),
 ('is', 4390)]

In [14]:
# Print the second record's raw docstring followed by its pre-tokenised form,
# so the two can be compared side by side.
second_record = docs[1]
print(second_record['docstring'], second_record['docstring_tokens'])

Recognizes faces in given image using a trained KNN classifier

    :param X_img_path: path to image to be recognized
    :param knn_clf: (optional) a knn classifier object. if not specified, model_save_path must be specified.
    :param model_path: (optional) path to a pickled knn classifier. if not specified, model_save_path must be knn_clf.
    :param distance_threshold: (optional) distance threshold for face classification. the larger it is, the more chance
           of mis-classifying an unknown person as a known one.
    :return: a list of names and face locations for the recognized faces in the image: [(name, bounding box), ...].
        For faces of unrecognized persons, the name 'unknown' will be returned. ['Recognizes', 'faces', 'in', 'given', 'image', 'using', 'a', 'trained', 'KNN', 'classifier']


In [15]:
# Load the spaCy pipeline named 'en_core_web_sm'; the resulting `nlp` callable is
# what the following cells use to tokenise and lemmatise docstrings.
nlp = spacy.load('en_core_web_sm')

In [30]:
# Parse one lower-cased docstring and list (surface form, lemma) pairs for the
# tokens that are not stop words, not punctuation and not pure whitespace.
doc = nlp(docs[7]['docstring'].lower())
[
    (token.text, token.lemma_)
    for token in doc
    if not (token.is_stop or token.is_punct) and token.text.strip()
]

[('returns', 'return'),
 ('array', 'array'),
 ('bounding', 'bound'),
 ('boxes', 'box'),
 ('human', 'human'),
 ('faces', 'face'),
 ('image', 'image'),
 ('param', 'param'),
 ('img', 'img'),
 ('image', 'image'),
 ('numpy', 'numpy'),
 ('array', 'array'),
 ('param', 'param'),
 ('number_of_times_to_upsample', 'number_of_times_to_upsample'),
 ('times', 'time'),
 ('upsample', 'upsample'),
 ('image', 'image'),
 ('looking', 'look'),
 ('faces', 'face'),
 ('higher', 'high'),
 ('numbers', 'number'),
 ('find', 'find'),
 ('smaller', 'small'),
 ('faces', 'face'),
 ('param', 'param'),
 ('model', 'model'),
 ('face', 'face'),
 ('detection', 'detection'),
 ('model', 'model'),
 ('use', 'use'),
 ('hog', 'hog'),
 ('accurate', 'accurate'),
 ('faster', 'faster'),
 ('cpus', 'cpus'),
 ('cnn', 'cnn'),
 ('accurate', 'accurate'),
 ('deep', 'deep'),
 ('learning', 'learn'),
 ('model', 'model'),
 ('gpu', 'gpu'),
 ('cuda', 'cuda'),
 ('accelerated', 'accelerate'),
 ('available', 'available'),
 ('default', 'default'),
 (

In [35]:
# Lemmatise a small sample of docstrings and count the lemmas that survive filtering.
SAMPLE_SIZE = 20  # number of leading records to preprocess in this experiment

preprocessed_docstrings = []
for record in docs[:SAMPLE_SIZE]:
    # The raw record and the parsed spaCy Doc get separate names. Rebinding one
    # loop variable from dict to Doc hid the record and also overwrote the
    # notebook-level `doc` created by an earlier cell.
    parsed = nlp(record['docstring'].lower())
    tokens = [
        token.lemma_
        for token in parsed
        if not token.is_stop and len(token.text.strip()) > 0 and not token.is_punct
    ]
    preprocessed_docstrings.append(tokens)

# NOTE(review): the recorded top-10 still contains symbols such as '>', '`', '─'
# and '<', so `is_punct` alone does not remove markup/doctest characters.
# Consider an additional alphabetic filter if those are unwanted.
preprocessed_docstrings_counter = Counter(itertools.chain.from_iterable(preprocessed_docstrings))
print(len(preprocessed_docstrings_counter))
preprocessed_docstrings_counter.most_common(10)

290


[('>', 89),
 ('face', 42),
 ('param', 33),
 ('image', 28),
 ('`', 26),
 ('return', 22),
 ('─', 16),
 ('<', 12),
 ('array', 11),
 ('traceback', 11)]