# Matching the functions

In [None]:
from pydriller import RepositoryMining
import difflib
import pandas as pd

def repo_matching(repo, commit, path):
    for c in RepositoryMining(repo, single = commit).traverse_commits():
        for modified_files in c.modifications:
            if modified_files.new_path == path:
                try:
                    before = code_extractor(modified_files.source_code_before)
                except TypeError:
                    before = ""
                lines_b = before.strip().splitlines()
                new = code_extractor(modified_files.source_code)
                lines_n = new.strip().splitlines()
                
                diff = difflib.unified_diff(lines_b, lines_n, fromfile='before', tofile='new', lineterm='', n=0)
                lines = list(diff)[2:]
                lineno = 0
                changes = pd.DataFrame(columns = ['repo','path','author','commit','line','code'])
                for line in lines:
                    prefix = '@@'
                    if line.startswith(prefix):
                        s = line[line.find("+"):]
                        try:
                            lineno = int(s[1:s.find(",")])
                        except:
                            lineno = int(s[1:s.find(" ")])
                    else:
                        if line.startswith("+"):
                            changes = changes.append({"repo": c.project_name, 
                                                      "path": modified_files.new_path,
                                                      "author": c.author.email,
                                                      "commit": c.hash,
                                                      "line": lineno,
                                                      "code": line[1:]}, ignore_index= True)
                            lineno += 1
                matches = function_matching(new)
                result = pd.merge(matches, changes, how = 'inner', on= "line")
                return result

# Test

In [None]:
from pydriller import RepositoryMining, GitRepository
repo = "/home/ubuntu/repos/abulbasar/machine-learning"
for commit in RepositoryMining(repo).traverse_commits():
    for file in commit.modifications:
        print(commit.hash + ' at ' + file.new_path)

In [None]:
repo = "/home/ubuntu/repos/abulbasar/machine-learning"
path = "01 Neural Network using Numpy.ipynb"
commit = "df41ae028af5a445ec41a94925ac98ad2ccdea01"

m = repo_matching(repo = repo,path = path,commit = commit)


In [None]:
m.head()

# Running through all commits

In [None]:
import pandas as pd
import ast
import csv
# incresing the csv field size
import ctypes
csv.field_size_limit(int(ctypes.c_ulong(-1).value // 2))
failed = []
fields = ['line','function','package','class','repo','path', 'author', 'commit','code']
repo_path = "/home/ubuntu/repos/"
with open('../data/commits_bigquery.csv',"r", encoding="utf8") as csvfile:
    with open("../data/repo_function_matched.csv", "w", encoding="utf8") as newfile:
        data = csv.DictReader(csvfile)
        writer = csv.DictWriter(newfile, fieldnames=fields)
        writer.writeheader()
        for row in data:
            try:
                commit = row['commit']
                repo = row['repo']
                path = row['path']
                path = path.replace("/", "\\")
                repo = repo_path + repo
                matched = repo_matching(repo, commit, path)
                matched.to_csv(newfile, header=False, index= False)
            except KeyboardInterrupt:
                break
            except Exception as e: 
                failed.append(e)
                #print(e)
                #break
import pickle

with open('../data/fails', 'wb') as fp:
    pickle.dump(failed, fp)

# Prepare data

In [23]:
import pandas as pd
import numpy as np

df = pd.read_csv('../data/commits_bigquery.csv')
df = df.drop(df.columns[0], axis = 1)
repos = df.repo.unique()
add_path = np.vectorize(lambda x: "/home/ubuntu/repos/" +x)
repos = add_path(repos)
repos = repos.tolist()
commits = df.commit.unique().tolist()
paths = df[['commit','path']].groupby('commit')['path'].apply(list)

# Get all changes and write them to disc

In [34]:
from nbformat import reads, NO_CONVERT
from nbconvert import PythonExporter
def code_extractor(jpt):
    nb = reads(jpt, NO_CONVERT)
    exporter = PythonExporter()
    source, meta = exporter.from_notebook_node(nb)
    return source

In [33]:
import imp
Matchmaker = imp.load_source('name', '../matchmaker.py')
import ast
def function_matching(nb):
    tree = ast.parse(nb)
    mm = Matchmaker.Matchmaker()
    mm.visit(tree)
    matchs = mm.matching()
    return matchs

In [36]:
from pydriller import RepositoryMining
import difflib
import pandas as pd
#inspect all repos and commits
failed = []
fields = ['line','function','package','class','repo','path', 'author', 'commit','code']
newfile = open("../data/repo_function_matched.csv", "w", encoding="utf8")
writer = csv.DictWriter(newfile, fieldnames=fields)
writer.writeheader()
    
for c in RepositoryMining(repos, only_no_merge=True, only_commits=commits).traverse_commits():
    for modified_files in c.modifications:
        try:
            if modified_files.new_path in paths[c.hash]:
                try:
                    before = code_extractor(modified_files.source_code_before)
                except TypeError:
                    before = ""
                lines_b = before.strip().splitlines()
                new = code_extractor(modified_files.source_code)
                lines_n = new.strip().splitlines()
                
                diff = difflib.unified_diff(lines_b, lines_n, fromfile='before', tofile='new', lineterm='', n=0)
                lines = list(diff)[2:]
                lineno = 0
                changes = pd.DataFrame(columns = ['repo','path','author','commit','line','code'])
                for line in lines:
                    prefix = '@@'
                    if line.startswith(prefix):
                        s = line[line.find("+"):]
                        try:
                            lineno = int(s[1:s.find(",")])
                        except:
                            lineno = int(s[1:s.find(" ")])
                    else:
                        if line.startswith("+"):
                            changes = changes.append({"repo": c.project_name, 
                                                      "path": modified_files.new_path,
                                                      "author": c.author.email,
                                                      "commit": c.hash,
                                                      "line": lineno,
                                                      "code": line[1:]}, ignore_index= True)
                            lineno += 1
                matches = function_matching(new)
                result = pd.merge(matches, changes, how = 'inner', on= "line")
                result.to_csv(newfile, header=False, index= False)
        except KeyboardInterrupt:
            break
        except Exception as e: 
            failed.append([c.project_name,modified_files.new_path,e])

newfile.close()
                
import pickle

with open('../data/fails', 'wb') as fp:
    pickle.dump(failed, fp)

Missing parentheses in call to 'print' (<unknown>, line 29)
Missing parentheses in call to 'print' (<unknown>, line 41)
cells
cells
cells
cells
cells
cells
cells
Missing parentheses in call to 'print' (<unknown>, line 29)
cells
cells
cells
Notebook does not appear to be JSON: '{\n  "worksheets": [\n    {\n      "cel...
cells
cells
cells
cells
cells
cells
Missing parentheses in call to 'print' (<unknown>, line 42)
cells
Missing parentheses in call to 'print' (<unknown>, line 43)
Missing parentheses in call to 'print' (<unknown>, line 43)
Missing parentheses in call to 'print' (<unknown>, line 43)
Missing parentheses in call to 'print' (<unknown>, line 43)
Missing parentheses in call to 'print' (<unknown>, line 44)
'Tuple' object has no attribute 'id'
'Tuple' object has no attribute 'id'
Missing parentheses in call to 'print' (<unknown>, line 44)
'Tuple' object has no attribute 'id'
Missing parentheses in call to 'print' (<unknown>, line 45)
'Tuple' object has no attribute 'id'
'Tuple' o

KeyboardInterrupt: 