In [9]:
import os
import pickle
import time
from time import time
from gitignore_parser import parse_gitignore
from pydriller import RepositoryMining, GitRepository
from textblob import TextBlob, Word
from tqdm import tqdm
from multiprocessing import Pool

In [10]:
class Index:
    """Inverted index from lemmatized commit-message words to file paths.

    Building the index takes every file reported by the repository object
    (minus those matched by the repository's top-level ``.gitignore``, when
    that file exists) and, for each commit that ``RepositoryMining`` yields
    for that file, records the file under every lemmatized word of the
    commit message.
    """

    def __init__(self, repo_path):
        """Open the repository at ``repo_path`` and build the index right away.

        Args:
            repo_path: Path to the git repository, written with ``/``
                separators.
        """
        self.repo_path = repo_path
        # Last path component. Trailing slashes are stripped first so that
        # "path/to/repo/" yields "repo" instead of an empty string.
        self.repo_stub = repo_path.rstrip('/').rsplit('/', 1)[-1]
        self.repo_obj = GitRepository(repo_path)
        # Maps a lemmatized word to the set of file paths recorded for it.
        self.index = {}
        self.build_index()

    @classmethod
    def load_index(cls, filepath):
        """Unpickle and return the object stored at ``filepath``.

        SECURITY: ``pickle.load`` can execute arbitrary code while loading.
        Only open index files that you produced yourself.
        """
        with open(filepath, "rb") as file:
            return pickle.load(file)

    def build_index(self):
        """Populate ``self.index`` from the commit history of every listed file."""
        file_list = self.get_file_list()
        for filepath in tqdm(file_list):
            # One mining pass per file, restricted to that file's commits.
            for commit in RepositoryMining(self.repo_path, filepath=filepath).traverse_commits():
                self.add(commit.msg, filepath)

    def add(self, message, file):
        """Record ``file`` under every lemmatized word of ``message``.

        Args:
            message: Commit message text.
            file: File path to associate with the words of the message.
        """
        for token in TextBlob(message).words.lemmatize():
            # setdefault creates the set the first time a word is seen.
            self.index.setdefault(token, set()).add(file)

    def search(self, query):
        """Return the set of files recorded for the lemma of ``query``.

        The whole query is treated as a single word.

        Raises:
            KeyError: If the lemmatized query is not present in the index.
        """
        lemma = Word(query).lemmatize()
        return self.index[lemma]

    def get_file_list(self):
        """Return the repository's files, excluding ``.gitignore`` matches.

        If the repository has no top-level ``.gitignore``, the unfiltered
        result of ``self.repo_obj.files()`` is returned.
        """
        file_list = self.repo_obj.files()
        try:
            matches = parse_gitignore(os.path.join(self.repo_path, '.gitignore'))
        except FileNotFoundError:
            return file_list
        return [x for x in file_list if not matches(x)]

    def save_index(self, filepath):
        """Pickle this whole object to ``filepath``.

        NOTE(review): the entire instance is dumped, including
        ``self.repo_obj``, so this relies on that object being picklable.
        TODO confirm for the installed pydriller version.
        """
        with open(filepath, "wb") as file:
            pickle.dump(self, file)

In [12]:

def build_and_save_index(stub, base_dir='/Users/kapilan/githome/for_analysis/'):
    """Build the Index for one project and pickle it to ``<stub>_index.p``.

    The pickle is written relative to the current working directory and a
    one-line timing message is printed when the work is done.

    Args:
        stub: Name of the project directory inside ``base_dir``.
        base_dir: Directory that holds the project checkouts. Defaults to
            the location originally hard-coded here, so existing
            single-argument calls behave as before.
    """
    started = time()
    index = Index(os.path.join(base_dir, stub))
    index.save_index(stub + "_index.p")
    elapsed_minutes = round((time() - started) / 60, 2)
    print("Index saved for {} in {} minutes".format(stub, elapsed_minutes))

# Projects to index. Every sub-directory of the analysis folder could be
# listed with os.walk instead; this run is limited to two named projects.
projects = ["crdt-canvas", "pintos-for-students"]

# Hand map() the function object itself. Writing build_and_save_index()
# would call it with no arguments and raise TypeError before any work
# reached the pool.
with Pool(8) as p:
    p.map(build_and_save_index, projects)




ModuleNotFoundError: No module named 'multiprocessing.pool'