In [1]:
import pickle
from pydriller import RepositoryMining, GitRepository
from tqdm import tqdm
from textblob import TextBlob, Word
from gitignore_parser import parse_gitignore

In [1]:
class Index:
    def __init__(self, repo_path):
        self.repo_path = repo_path
        self.repo_stub = repo_path.rsplit('/', 1)[-1]
        self.repo_obj = GitRepository(repo_path)
        self.index = {}
        self.build_index()

    @classmethod
    def load_index(cls, filepath):
        with open(filepath, "rb") as file:
            return pickle.load(file)

    def build_index(self):
        file_list = self.get_file_list()
        for filepath in tqdm(file_list):
            for commit in RepositoryMining(self.repo_path, filepath=filepath).traverse_commits():
                self.add(commit.msg, filepath)

    def add(self, message, file):
        tokens = TextBlob(message).words.lemmatize()
        for token in tokens:
            try:
                self.index[token].add(file)
            except KeyError:
                self.index[token] = {file}

    def search(self, query):
        lemma = Word(query).lemmatize()
        return self.index[lemma]

    def get_file_list(self):
        file_list = self.repo_obj.files()
        try:
            matches = parse_gitignore(self.repo_path + '/.gitignore')
        except FileNotFoundError:
            return file_list
        return [x for x in file_list if not matches(x)]

    def save_index(self, filepath):
        with open(filepath, "wb") as file:
            pickle.dump(self, file)

In [7]:
crdt_index = Index("/Users/kapilan/githome/for_analysis/crdt-canvas")
crdt_index.search("call")

100%|██████████| 60/60 [00:06<00:00,  9.92it/s]


Generating Index for crdt-canvas


{'/Users/kapilan/githome/for_analysis/crdt-canvas/Canvas.xcodeproj/project.pbxproj',
 '/Users/kapilan/githome/for_analysis/crdt-canvas/Canvas/AutomergeJavaScript.swift',
 '/Users/kapilan/githome/for_analysis/crdt-canvas/Canvas/DrawView.swift',
 '/Users/kapilan/githome/for_analysis/crdt-canvas/Canvas/ViewController.swift'}

In [35]:
crdt_index.save_index("crdt_canvas_index.p")

In [31]:
signal_android_index = Index.load_index("Signal_Android_Index.p")
signal_android_index.search("refresh")

{'/Users/kapilan/githome/Signal-Android/app/src/main/AndroidManifest.xml',
 '/Users/kapilan/githome/Signal-Android/app/src/main/java/org/thoughtcrime/securesms/ApplicationContext.java',
 '/Users/kapilan/githome/Signal-Android/app/src/main/java/org/thoughtcrime/securesms/ApplicationPreferencesActivity.java',
 '/Users/kapilan/githome/Signal-Android/app/src/main/java/org/thoughtcrime/securesms/ContactSelectionActivity.java',
 '/Users/kapilan/githome/Signal-Android/app/src/main/java/org/thoughtcrime/securesms/ContactSelectionListFragment.java',
 '/Users/kapilan/githome/Signal-Android/app/src/main/java/org/thoughtcrime/securesms/NewConversationActivity.java',
 '/Users/kapilan/githome/Signal-Android/app/src/main/java/org/thoughtcrime/securesms/contacts/ContactsCursorLoader.java',
 '/Users/kapilan/githome/Signal-Android/app/src/main/java/org/thoughtcrime/securesms/contacts/sync/DirectoryHelper.java',
 '/Users/kapilan/githome/Signal-Android/app/src/main/java/org/thoughtcrime/securesms/conversa

In [6]:
import os
projects = next(os.walk('/Users/kapilan/githome/for_analysis'))[1]

for stub in projects:
    index = Index('/Users/kapilan/githome/for_analysis/' + stub)
    index.save_index(stub + "_index.p")
    print("Index saved for " + stub)

100%|██████████| 60/60 [00:04<00:00, 12.46it/s]
100%|██████████| 761/761 [01:53<00:00,  6.70it/s]
100%|██████████| 4289/4289 [46:52<00:00,  1.52it/s]
100%|██████████| 525/525 [02:26<00:00,  3.57it/s]
 23%|██▎       | 1131/4933 [35:11<1:58:17,  1.87s/it]


Generating Index for crdt-canvas
Index saved for crdt-canvas
Generating Index for pintos-for-students
Index saved for pintos-for-students
Generating Index for Signal-Android
Index saved for Signal-Android
Generating Index for vue
Index saved for vue
Generating Index for flutter


KeyboardInterrupt: 