In [103]:
from git import Repo, Tree
import re
import os
import subprocess
import json
import time
from data_generation.config import REPO_PATH, REPO, REF_COMMIT
from data_generation.git_extraction_helper import get_all_blobs, file_commits, blob_file_str, remove_comments
from data_generation.theorem_extraction_helper import theorem_names_from_blob, THEOREM_NAME_REGEX

In [128]:
# Local filesystem locations used by the cells below.
# NOTE(review): these are absolute, machine-specific paths; REPO_COPY_PATH is
# reassigned by later cells in this notebook.
REPO_COPY_PATH = "/Users/timmonkey/Desktop/Imperial/Summer Term/Individual Project/Repos/mathlib4_copy"
REPO_COPY_DIR = "/Users/timmonkey/Desktop/Imperial/Summer Term/Individual Project/Repos/mathlib4_copies"
DATASET_PATH = "/Users/timmonkey/Desktop/Imperial/Summer Term/Individual Project/Repos/proof-repair/dataset"

# Get the most up-to-date list of all theorems

## Define functions for retrieving the names of the theorems from the most recent commit.

In [4]:
def get_commit(all_commits, commit_sha=None):
    """
    Look up a commit by its hexsha in a sequence of commit objects.

    Args:
        all_commits: indexable sequence of objects exposing a ``hexsha`` attribute.
        commit_sha: hexsha to search for. When omitted, the first element of
            ``all_commits`` is returned (presumably the most recent commit;
            this depends on how the caller ordered the sequence).

    Returns:
        The element of ``all_commits`` whose ``hexsha`` equals ``commit_sha``.

    Raises:
        ValueError: ``commit_sha`` was given but no element carries that hexsha.
        IndexError: ``commit_sha`` was omitted and ``all_commits`` is empty.
    """
    if commit_sha is None:
        return all_commits[0]

    known_shas = [entry.hexsha for entry in all_commits]
    return all_commits[known_shas.index(commit_sha)]


def all_theorem_names(commit, dir_path=None):
    """
    Collect theorem names per file for one commit.

    Args:
        commit: commit object whose ``tree`` can be indexed by a repository path.
        dir_path: repository path of a directory or of a single file; when
            omitted, the top-level ``Mathlib`` entry is used.

    Returns:
        dict mapping each blob's ``path`` to the value returned by
        ``theorem_names_from_blob`` for that blob.
    """
    target = commit.tree[dir_path if dir_path is not None else 'Mathlib']

    # a Tree entry is expanded into its blobs; any other entry is treated as one file
    blobs = get_all_blobs(target) if isinstance(target, Tree) else [target]

    names_by_path = {}
    for blob in blobs:
        names_by_path[blob.path] = theorem_names_from_blob(blob)
    return names_by_path
    

## Define functions to retrieve the theorem body

In [109]:
def file_str_from_commit(commit, file_path):
    """Returns the file from a specific file path and commit."""
    try:
        blob = commit.tree[file_path]
        return blob_file_str(blob)
    except KeyError:
        return
    
def theorem_body_regex(theorem_name):
    """
    Build the pattern that captures the full text of one theorem or lemma.

    The pattern starts at the ``theorem``/``lemma`` keyword followed by the
    (escaped) name and a whitespace character, then lazily consumes text up
    to the first run of newlines that is followed by a non-whitespace
    character, or up to the end of the input. ``theorem_body`` applies it
    with ``re.DOTALL`` so that the lazy part can span several lines.
    """
    escaped_name = re.escape(theorem_name)
    pieces = [
        r'((?:theorem|lemma)\s+',  # keyword and separating whitespace
        escaped_name,              # literal theorem name
        r'\s.*?)',                 # rest of the body, as short as possible
        r'(?=\n+?\S|$)',           # stop before the next unindented line / end
    ]
    return ''.join(pieces)
    
def theorem_body(file_str, theorem_name):
    """
    Extract the text of the named theorem from ``file_str``.

    Returns:
        The text of the first match of ``theorem_body_regex(theorem_name)`` in
        ``file_str``, searched with ``re.DOTALL``.

    Raises:
        AttributeError: nothing matched, so ``re.search`` returned ``None``.
    """
    pattern = theorem_body_regex(theorem_name)
    found = re.search(pattern, file_str, flags=re.DOTALL)
    return found.group()


def theorem_statement_proof(file_str, theorem_name):
    """
    Split the named theorem into its statement and its proof.

    The first line of the theorem body always belongs to the statement.
    Following lines are added to the statement until a line indented by
    exactly two whitespace characters is met; that line and everything after
    it form the proof. If the last statement line contains ``:=``, the text
    from its last ``:=`` onwards is moved to the front of the proof.

    Returns:
        dict with keys ``'statement'`` and ``'proof'``, each a newline-joined string.
    """
    lines = theorem_body(file_str, theorem_name).splitlines()
    proof_start = re.compile(r"^\s{2}(?!\s)")

    # find the first line (after the opening one) that looks like proof indentation
    split_at = 1
    while split_at < len(lines) and not proof_start.match(lines[split_at]):
        split_at += 1
    statement, proof = lines[:split_at], lines[split_at:]

    # the proof may already begin on the last statement line, after ':='
    last_line = statement[-1]
    marker = last_line.rfind(':=')
    if marker >= 0:
        proof.insert(0, last_line[marker:])
        statement[-1] = last_line[:marker]

    return {'statement': '\n'.join(statement), 'proof': '\n'.join(proof)}


def theorem_proof_regex(thm_statement):
    """
    Build the pattern that matches the proof following ``thm_statement``.

    The (escaped) statement is used as a look-behind anchor; the match itself
    is the shortest text after it that is followed by one or two newlines and
    a non-whitespace character, or by the end of the input.
    """
    anchor = re.escape(thm_statement)
    return rf'(?<={anchor})(.*?)(?=\n\n\S|\n\S|$)'


def file_theorem_info(commit, file_path):
    """
    Gather statement and proof for every theorem found in one file.

    Args:
        commit: commit object passed on to ``all_theorem_names`` and
            ``file_str_from_commit``.
        file_path: repository path of the file to inspect.

    Returns:
        dict of four parallel lists under the keys ``"filepath"``, ``"name"``,
        ``"statement"`` and ``"proof"``; position ``i`` of each list describes
        the same theorem.
    """
    names = all_theorem_names(commit, file_path)[file_path]

    # work on the file text with comments/annotations stripped
    source = remove_comments(file_str_from_commit(commit, file_path))

    info = {"filepath": [], "name": [], "statement": [], "proof": []}
    for thm_name in names:
        parts = theorem_statement_proof(source, thm_name)
        info["filepath"].append(file_path)
        info["name"].append(thm_name)
        info["statement"].append(parts['statement'])
        info["proof"].append(parts['proof'])

    return info


## Trial functions

In [120]:
def all_directories():
    """
    Return the names of the Mathlib sub-directories included in the dataset.

    A new list is built on every call. Per the original note, this belongs in
    a config file in the future, and the following directories are
    deliberately excluded: 'tactic', 'Init', 'Testing', 'Util', 'Lean',
    'Mathport', 'Data', 'Control', 'Deprecated'.
    """
    return [
        "Algebra", "Analysis", "Computability",
        "FieldTheory", "InformationTheory", "LinearAlgebra",
        "MeasureTheory", "Order", "RingTheory",
        "AlgebraicGeometry", "CategoryTheory", "Condensed",
        "Geometry", "Logic", "ModelTheory",
        "Probability", "SetTheory", "AlgebraicTopology",
        "Combinatorics", "Dynamics", "GroupTheory",
        "NumberTheory", "RepresentationTheory", "Topology",
    ]

def get_file_num_thm():
    """
    Count theorems per file across the directories from ``all_directories()``.

    Returns:
        list of ``(blob.path, number_of_theorems)`` tuples, read from the tree
        of ``REF_COMMIT``; files without any theorem are left out.
    """
    counts = []
    for name in all_directories():
        subtree = REF_COMMIT.tree[f'Mathlib/{name}']
        for blob in get_all_blobs(subtree):
            n_theorems = len(theorem_names_from_blob(blob))
            if n_theorems <= 0:
                continue
            counts.append((blob.path, n_theorems))
    return counts

def get_groups_by_thm_num(n_groups, file_num_thm):
    """
    Greedily split files into ``n_groups`` groups of similar total size.

    Args:
        n_groups: number of groups to produce.
        file_num_thm: iterable of ``(item, size)`` pairs.

    Returns:
        list of ``n_groups`` lists of items. Pairs are handled from largest to
        smallest size, and each item goes to the group whose running total is
        currently smallest (the lowest-indexed one on ties).
    """
    ordered = sorted(file_num_thm, key=lambda pair: pair[1], reverse=True)

    buckets = [[] for _ in range(n_groups)]
    totals = [0 for _ in range(n_groups)]

    for name, weight in ordered:
        # index of the first bucket holding the minimal running total
        lightest = totals.index(min(totals))
        buckets[lightest].append(name)
        totals[lightest] += weight

    return buckets

In [148]:
# Partition the files returned by get_file_num_thm() into 800 groups, then
# reverse the order of the resulting list of groups.
groups = get_groups_by_thm_num(800, get_file_num_thm())[::-1]

In [112]:
# Spot-check: split one theorem of RegularExpressions.lean into statement and proof.
# NOTE(review): NEWEST_COMMIT is not assigned in the visible cells — confirm where it is defined.
k = theorem_statement_proof(remove_comments(file_str_from_commit(NEWEST_COMMIT, "Mathlib/Computability/RegularExpressions.lean")), "matches'_map")

In [113]:
# Only the last expression of a cell is displayed, so just k['proof'] is shown below.
k['statement']
k['proof']

"  | 0 => (map_zero _).symm\n  | 1 => (map_one _).symm\n  | char a => by\n    rw [eq_comm]\n    exact image_singleton\n    | R + S => by simp only [matches'_map, map, matches'_add]; rw [map_add]\n  | comp R S => by simp only [matches'_map, map, matches'_mul]; erw [map_mul]\n  | star R => by\n    simp_rw [map, matches', matches'_map]\n    rw [Language.kstar_eq_iSup_pow, Language.kstar_eq_iSup_pow]\n    simp_rw [← map_pow]\n    exact image_iUnion.symm"

In [57]:
# NOTE(review): theorem_statement (and NEWEST_COMMIT) are not defined in the visible
# cells; this cell presumably relies on leftover kernel state — confirm or remove.
print(theorem_statement(remove_comments(file_str_from_commit(NEWEST_COMMIT, "Mathlib/LinearAlgebra/QuadraticForm/Basic.lean")), "_root_.QuadraticForm.polarBilin_injective"))

theorem  _root_.QuadraticForm.polarBilin_injective (h : IsUnit (2 : R)) :
    Function.Injective (polarBilin : QuadraticForm R M → _) :=
  fun Q₁ Q₂ h₁₂ => QuadraticForm.ext fun x => h.mul_left_cancel <| by
    simpa using DFunLike.congr_fun (congr_arg toQuadraticForm h₁₂) x

variable [CommRing S] [Algebra S R] [Module S M] [IsScalarTower S R M]
variable [AddCommGroup N] [Module R N]

theorem _root_.QuadraticForm.polarBilin_comp (Q : QuadraticForm R N) (f : M →ₗ[R] N) :
    polarBilin (Q.comp f) = compl₁₂ (polarBilin Q) f f :=
  ext₂ fun x y => by simp [polar]

theorem compQuadraticForm_polar (f : R →ₗ[S] S) (Q : QuadraticForm R M) (x y : M) :
    polar (f.compQuadraticForm Q) x y = f (polar Q x y)
theorem  _root_.QuadraticForm.polarBilin_injective (h : IsUnit (2 : R)) :
    Function.Injective (polarBilin : QuadraticForm R M → _) :=
  fun Q₁ Q₂ h₁₂ => QuadraticForm.ext fun x => h.mul_left_cancel <| by
    simpa using DFunLike.congr_fun (congr_arg toQuadraticForm h₁₂) x

variable [CommR

In [47]:
# Inspect the statement extracted for the last theorem listed for this file.
file_theorem_info(NEWEST_COMMIT, "Mathlib/LinearAlgebra/QuadraticForm/Basic.lean")['statement'][-1]

'theorem basisRepr_eq_of_iIsOrtho {R M} [CommRing R] [AddCommGroup M] [Module R M]\n    [Invertible (2 : R)] (Q : QuadraticForm R M) (v : Basis ι R M)\n    (hv₂ : (associated (R'

In [7]:
all_commits = file_commits() # get the list of all commits from git history
newest_commit = get_commit(all_commits) # no hexsha given, so the first entry of all_commits is returned
file_path = "Mathlib/Algebra/AddTorsor.lean"
file_str = file_str_from_commit(newest_commit, file_path)
thm_name = "AddTorsor.subsingleton_iff"
# NOTE(review): theorem_statement is not defined in the visible cells — confirm or remove.
theorem_statement(file_str, thm_name)

'theorem AddTorsor.subsingleton_iff (G P : Type*) [AddGroup G] [AddTorsor G P] :\n    Subsingleton G ↔ Subsingleton P := '

## Run functions

In [46]:
all_commits = file_commits() # get the list of all commits from git history
newest_commit = get_commit(all_commits) # retrieve the git.Commit object of most recent commit
# NOTE(review): the call below uses NEWEST_COMMIT rather than the newest_commit computed
# above, and dir_path is only assigned in a later cell of this notebook, so this cell
# depends on previous kernel state — confirm.
all_thm_names = all_theorem_names(NEWEST_COMMIT, dir_path=dir_path) # retrieve the name of all theorems from the most recent commit
# retrieve the body of all the theorems (in one specific file)
# newest_theorem_body = file_theorem_body(commit=newest_commit, 
#                                         file_path=dir_path, 
#                                         file_theorem_names=all_thm_names[dir_path])

In [124]:
# Display the mapping of file path to theorem names.
all_thm_names

{'Mathlib/Combinatorics/Colex.lean': ['ofColex_toColex',
  'toColex_inj',
  'ofColex_inj',
  'ofColex_ne_ofColex',
  'ofColex_injective',
  'trans_aux',
  'antisymm_aux',
  'le_def',
  'toColex_le_toColex',
  'toColex_lt_toColex',
  'toColex_mono',
  'toColex_strictMono',
  'toColex_le_toColex_of_subset',
  'toColex_lt_toColex_of_ssubset',
  'toColex_empty',
  'ofColex_bot',
  'forall_le_mono',
  'forall_lt_mono',
  'toColex_le_singleton',
  'toColex_lt_singleton',
  'singleton_le_toColex',
  'singleton_le_singleton',
  'singleton_lt_singleton',
  'le_iff_sdiff_subset_lowerClosure',
  'toColex_sdiff_le_toColex_sdiff',
  'toColex_sdiff_lt_toColex_sdiff',
  "toColex_sdiff_le_toColex_sdiff'",
  "toColex_sdiff_lt_toColex_sdiff'",
  'max_mem_aux',
  'toColex_lt_toColex_iff_exists_forall_lt',
  'lt_iff_exists_forall_lt',
  "toColex_le_toColex_iff_max'_mem",
  "le_iff_max'_mem",
  "toColex_lt_toColex_iff_max'_mem",
  "lt_iff_max'_mem",
  'toColex_image_le_toColex_image',
  'toColex_image_lt_t

In [176]:
# Load one file as of a specific commit, with comments stripped, for manual inspection.
c = get_commit(file_commits(), 'a261710852a957a7d20d89b962e4b59887549f21')
t = remove_comments(file_str_from_commit(c, file_path='Mathlib/Algebra/Category/ModuleCat/Limits.lean'))
# theorem_statement_proof(t, 'Exact.map')['proof']

In [177]:
# Show the comment-stripped file text loaded into t.
print(t)

import Mathlib.Algebra.Category.ModuleCat.Basic
import Mathlib.Algebra.Category.Grp.Limits
import Mathlib.Algebra.DirectLimit




open CategoryTheory

open CategoryTheory.Limits

universe v w u t

noncomputable section

namespace ModuleCat

variable {R : Type u} [Ring R]
variable {J : Type v} [Category.{t} J] (F : J ⥤ ModuleCat.{w} R)

instance addCommGroupObj (j) :
    AddCommGroup ((F ⋙ forget (ModuleCat R)).obj j) :=
  inferInstanceAs <| AddCommGroup (F.obj j)
set_option linter.uppercaseLean3 false

instance moduleObj (j) :
    Module.{u, w} R ((F ⋙ forget (ModuleCat R)).obj j) :=
  inferInstanceAs <| Module R (F.obj j)

def sectionsSubmodule : Submodule R (∀ j, F.obj j) :=
  { AddGrp.sectionsAddSubgroup.{v, w}
      (F ⋙ forget₂ (ModuleCat R) AddCommGrp.{w} ⋙
          forget₂ AddCommGrp AddGrp.{w}) with
    carrier := (F ⋙ forget (ModuleCat R)).sections
    smul_mem' := fun r s sh j j' f => by
      simp only [forget_map, Functor.comp_map, Pi.smul_apply, map_smul]
      dsimp [Fun

# Iterate over each theorem to examine whether any change has been made to it over the commit history of the file it lives in.

In [66]:
def substituted_file(to_replace_file_str, old_thm_proof, thm_statement):
    """
    Return ``to_replace_file_str`` with the proof of one theorem swapped out.

    Args:
        to_replace_file_str: text of the file to modify (the most recent version).
        old_thm_proof: proof text from an older commit, inserted verbatim.
        thm_statement: statement of the theorem; used to locate its proof via
            ``theorem_proof_regex``.

    Returns:
        The file text in which every match of the proof pattern has been
        replaced by ``old_thm_proof``.
    """
    thm_proof_regex = theorem_proof_regex(thm_statement)
    # A callable replacement is inserted as-is. Passing the proof as a plain
    # string would make ``re.sub`` parse it as a template, so sequences such as
    # ``\n``, ``\1`` or ``\g<...>`` inside a proof would be rewritten, and
    # unknown letter escapes would raise ``re.error``.
    replaced_file_str = re.sub(
        thm_proof_regex,
        lambda _match: old_thm_proof,
        to_replace_file_str,
        flags=re.DOTALL,
    )
    return replaced_file_str

In [158]:
# Run `lake exe cache get` inside the repository copy; the return code is not checked.
subprocess.run(['lake', 'exe', 'cache', 'get'], cwd=REPO_COPY_PATH)

No files to download
Decompressing 4626 file(s)
Unpacked in 145 ms
Completed successfully!


CompletedProcess(args=['lake', 'exe', 'cache', 'get'], returncode=0)

In [27]:
# Trial build of a single module inside the repository copy.
file_path = "Mathlib.Algebra.DualNumber"
# NOTE(review): file_path holds a dotted module name here, so this joined value is
# not a path to a .lean file; it is not used by the rest of this cell.
path = os.path.join(REPO_COPY_PATH, file_path)

subprocess.run(['lake', 'exe', 'cache', 'get!'], cwd=REPO_COPY_PATH)

# TODO NEED TO INVESTIGATE
# x holds the captured stdout of a failed build; it stays None when the build
# succeeds, so the final expression never refers to an undefined or stale name.
x = None
try:
    subprocess.run(['lake', 'build', file_path],
                        cwd=REPO_COPY_PATH,
                        capture_output=True,
                        text=True,
                        check=True
                    )

except subprocess.CalledProcessError as e:
    x = e.stdout

x

info: Qq: cloning https://github.com/leanprover-community/quote4 to '././.lake/packages/Qq'
info: importGraph: cloning https://github.com/leanprover-community/import-graph.git to '././.lake/packages/importGraph'
✔ [2/10] Built Cache.IO
✔ [3/10] Built Cache.Hashing


KeyboardInterrupt: 

# Generate Dataset

In [95]:
# Target of this run; all_theorem_names accepts a single file or a directory path.
dir_path = "Mathlib/Algebra/Bounds.lean"
# keep only the files for which at least one theorem name was found
all_thm_names = {fp: thm_ls for fp, thm_ls in all_theorem_names(REF_COMMIT, dir_path).items() if thm_ls}
# column-oriented store: each datapoint appends one value to every list
dataset = {
    'filepath': [],
    'thm_name': [],
    'commit': [],
    'failed_proof': [],
    'error_msg': [],
    }

In [96]:
# flatten the target path into a directory-name suffix:
# drop a trailing ".lean" (if any) and turn every "/" into "_"
no_lean_dir_path = (dir_path[:-5] if dir_path.endswith(".lean") else dir_path).replace("/", "_")

# create a working copy of the original repository under REPO_COPY_DIR
REPO_COPY_PATH = os.path.join(REPO_COPY_DIR, 'mathlib4_' + no_lean_dir_path)
subprocess.run(args=['cp', '-r', REPO_PATH, REPO_COPY_PATH], check=True)

CompletedProcess(args=['cp', '-r', '/Users/timmonkey/Desktop/Imperial/Summer Term/Individual Project/Repos/mathlib4', '/Users/timmonkey/Desktop/Imperial/Summer Term/Individual Project/Repos/mathlib4_copies/mathlib4_Mathlib_Algebra_Bounds'], returncode=0)

In [97]:
# get cache
# Runs `lake exe cache get!` inside the freshly created copy; the return code is not checked here.
subprocess.run(['lake', 'exe', 'cache', 'get!'], cwd=REPO_COPY_PATH)

Attempting to download 4626 file(s)


Downloaded: 4626 file(s) [attempted 4626/4626 = 100%] (100% success)


Decompressing 4626 file(s)
Unpacked in 4526 ms
Completed successfully!


CompletedProcess(args=['lake', 'exe', 'cache', 'get!'], returncode=0)

In [101]:
# generate dataset
#
# For every theorem of every selected file, walk the file's commit history,
# substitute each differing older proof into the newest file text, build the
# module with `lake build`, and record the attempts whose build fails.

for file_path, thm_names_ls in all_thm_names.items():
    print(f"Working on {file_path}")
    newest_file_str = remove_comments(file_str_from_commit(NEWEST_COMMIT, file_path))
    commits_ls = file_commits(file_path)
    sub_file_path = os.path.join(REPO_COPY_PATH, file_path)

    # name passed to `lake build`: the path without ".lean", "/" replaced by "."
    # (computed once per file; it does not depend on the theorem or commit)
    if file_path[-5:] == ".lean":
        file_path_module = file_path[:-5].replace("/", ".")
    else:
        file_path_module = file_path.replace("/", ".")

    # keep the current version of the file for restoration later
    with open(sub_file_path, 'r') as sub_file:
        existing_file_str = sub_file.read()

    try:
        for thm_name in thm_names_ls:

            ### NOTE try-except block for debugging and to identify theorems that are not parsed
            try:
                thm_body = theorem_statement_proof(newest_file_str, thm_name)
            except Exception:
                print(f'PROBLEM: {thm_name} ({file_path})')
                # skip this theorem; otherwise the previous theorem's body
                # (or an undefined name) would be used below
                continue

            thm_statement = thm_body['statement']
            newest_thm_proof = thm_body['proof']

            for commit in commits_ls:

                # skip commits that are newer than the reference commit
                if commit.committed_date > NEWEST_COMMIT.committed_date:
                    continue

                # retrieve the old file
                old_file_str = file_str_from_commit(commit, file_path)

                # skip commit if the file cannot be read from it
                if old_file_str is None:
                    continue
                old_file_str = remove_comments(old_file_str)

                # stop walking the history once the theorem statement differs
                if thm_statement not in old_file_str:
                    break

                old_thm_proof = theorem_statement_proof(old_file_str, thm_name)['proof']

                # skip commit if there has been no change to proof
                if newest_thm_proof == old_thm_proof:
                    continue

                subbed_file_str = substituted_file(newest_file_str, old_thm_proof, thm_statement)

                # overwrite file with changes
                with open(sub_file_path, 'w') as sub_file:
                    sub_file.write(subbed_file_str)

                # attempt to compile the code in lean; a non-zero exit status
                # raises CalledProcessError, which marks a failed proof
                try:
                    subprocess.run(['lake', 'build', file_path_module],
                                        cwd=REPO_COPY_PATH,
                                        capture_output=True,
                                        text=True,
                                        check=True
                                    )
                except subprocess.CalledProcessError as e:
                    dataset['filepath'].append(file_path)
                    dataset['thm_name'].append(thm_name)
                    dataset['commit'].append(commit.hexsha)
                    dataset['failed_proof'].append(old_thm_proof)
                    dataset['error_msg'].append(e.stdout)
    finally:
        # restore file to existing condition, even if an error interrupted the loop
        with open(sub_file_path, 'w') as sub_file:
            sub_file.write(existing_file_str)


Working on Mathlib/Algebra/Bounds.lean


In [82]:
# Load the file as of entry 38 of its commit list, for manual inspection.
f = file_str_from_commit(file_commits("Mathlib/CategoryTheory/Comma/StructuredArrow.lean")[38], "Mathlib/CategoryTheory/Comma/StructuredArrow.lean")

In [76]:
# Show the captured build output stored for the first recorded datapoint.
print(dataset['error_msg'][0])

✖ [570/570] Building Mathlib.CategoryTheory.Comma.Presheaf
trace: .> LEAN_PATH=././.lake/packages/batteries/.lake/build/lib:././.lake/packages/Qq/.lake/build/lib:././.lake/packages/aesop/.lake/build/lib:././.lake/packages/proofwidgets/.lake/build/lib:././.lake/packages/Cli/.lake/build/lib:././.lake/packages/importGraph/.lake/build/lib:././.lake/build/lib DYLD_LIBRARY_PATH=././.lake/build/lib /Users/timmonkey/.elan/toolchains/leanprover--lean4---v4.9.0-rc1/bin/lean -Dpp.unicode.fun=true -DautoImplicit=false -DrelaxedAutoImplicit=false ././././Mathlib/CategoryTheory/Comma/Presheaf.lean -R ./././. -o ././.lake/build/lib/Mathlib/CategoryTheory/Comma/Presheaf.olean -i ././.lake/build/lib/Mathlib/CategoryTheory/Comma/Presheaf.ilean -c ././.lake/build/ir/Mathlib/CategoryTheory/Comma/Presheaf.c --json
error: ././././Mathlib/CategoryTheory/Comma/Presheaf.lean:203:141: unsolved goals
C : Type u
inst✝ : Category.{v, u} C
A : Cᵒᵖ ⥤ Type v
F : (CostructuredArrow yoneda A)ᵒᵖ ⥤ Type v
X : C
G : (Cost

In [120]:
# save dataset as json
# DATASET_PATH carries no trailing separator, so plain string concatenation
# would produce ".../dataset<name>.json" next to the dataset directory instead
# of a file inside it. Join the components and make sure the directory exists.
os.makedirs(DATASET_PATH, exist_ok=True)
json_path = os.path.join(DATASET_PATH, no_lean_dir_path + '.json')
with open(json_path, 'w') as file:
    json.dump(dataset, file)

In [102]:
# remove mathlib4 copy
# check=True makes this raise CalledProcessError when `rm` exits with a non-zero status.
subprocess.run(
    # args=['rm', '-rf', f'mathlib4_copies/mathlib4_{no_lean_dir_path}'], 
    args=['rm', '-rf', REPO_COPY_PATH], 
    check=True
    )

rm: /Users/timmonkey/Desktop/Imperial/Summer Term/Individual Project/Repos/mathlib4_copies/mathlib4_Mathlib_Algebra_Bounds/.lake/build: Permission denied
rm: /Users/timmonkey/Desktop/Imperial/Summer Term/Individual Project/Repos/mathlib4_copies/mathlib4_Mathlib_Algebra_Bounds/.lake: Permission denied
rm: /Users/timmonkey/Desktop/Imperial/Summer Term/Individual Project/Repos/mathlib4_copies/mathlib4_Mathlib_Algebra_Bounds: Permission denied


CalledProcessError: Command '['rm', '-rf', '/Users/timmonkey/Desktop/Imperial/Summer Term/Individual Project/Repos/mathlib4_copies/mathlib4_Mathlib_Algebra_Bounds']' returned non-zero exit status 1.

# New Dataset Generation

In [150]:
# Select one group of files (index 7 of groups) to process in this run.
# NOTE(review): joining with '|' and splitting again below yields the same entries
# as groups[7] as long as no path contains '|'.
dir_paths_str = '|'.join(groups[7])
# name the repository copy after the first file of the group:
# "/" becomes "_" and a trailing ".lean" is dropped
suffix_name = groups[7][0].replace("/", "_")
suffix_name = suffix_name[:-5] if suffix_name[-5:] == ".lean" else suffix_name

dir_paths = dir_paths_str.split('|')

# notebook-level globals; REPO and REPO_COPY_PATH are read by construct_dataset
# NOTE(review): REPO and THEOREM_NAME_REGEX are also imported in the first cell.
# Rebinding them here only changes this notebook's namespace, not the modules they
# were imported from — confirm the imported helpers see the intended values.
REPO_COPY_PATH = "/Users/timmonkey/Desktop/Imperial/Summer Term/Individual Project/Repos/mathlib4_copies/mathlib4_" + suffix_name
REPO = Repo(REPO_PATH)
THEOREM_NAME_REGEX = re.compile(r'^\s*(?:.*\s)?(?:theorem|lemma)\s+(\S+)', re.MULTILINE)

In [151]:
# Display the suffix used to name the repository copy.
suffix_name

'Mathlib_Analysis_BoxIntegral_Partition_Split'

In [152]:
def construct_dataset(dir_path):
    """
    Build the failed-proof dataset for one file or directory.

    For every theorem found under ``dir_path`` at ``REPO.head.commit``, the
    commit history of its file is walked. Each older proof that differs from
    the newest one is substituted into the newest (comment-stripped) file
    text, written into the repository copy at ``REPO_COPY_PATH``, and the
    module is built with ``lake build``. A build that exits with a non-zero
    status is recorded as one datapoint.

    Args:
        dir_path: repository path of a file or directory, as accepted by
            ``all_theorem_names``.

    Returns:
        dict of parallel lists under the keys ``'filepath'``, ``'thm_name'``,
        ``'commit'``, ``'failed_proof'`` and ``'error_msg'`` (the captured
        stdout of the failed build).

    Side effects:
        Temporarily overwrites files inside ``REPO_COPY_PATH``; each file is
        restored to its previous content afterwards, also when an exception
        interrupts processing. Prints the number of datapoints.
    """
    head_commit = REPO.head.commit
    all_thm_names = {fp: thm_ls for fp, thm_ls in all_theorem_names(head_commit, dir_path).items() if thm_ls}
    dataset = {
        'filepath': [],
        'thm_name': [],
        'commit': [],
        'failed_proof': [],
        'error_msg': [],
        }

    count = 0

    # generate dataset
    for file_path, thm_names_ls in all_thm_names.items():
        newest_file_str = remove_comments(file_str_from_commit(head_commit, file_path))
        commits_ls = file_commits(file_path)
        sub_file_path = os.path.join(REPO_COPY_PATH, file_path)

        # name passed to `lake build`: the path without ".lean", "/" replaced by "."
        # (computed once per file; it does not depend on the theorem or commit)
        if file_path[-5:] == ".lean":
            file_path_module = file_path[:-5].replace("/", ".")
        else:
            file_path_module = file_path.replace("/", ".")

        # keep the current version of the file for restoration later
        with open(sub_file_path, 'r') as sub_file:
            existing_file_str = sub_file.read()

        try:
            for thm_name in thm_names_ls:

                ### NOTE try-except block for debugging and to identify theorems that are not parsed
                try:
                    thm_body = theorem_statement_proof(newest_file_str, thm_name)
                except Exception:
                    print(f'PROBLEM: {thm_name} ({file_path})', flush=True)
                    # skip this theorem; otherwise the previous theorem's body
                    # (or an unbound name) would be used below
                    continue

                thm_statement = thm_body['statement']
                newest_thm_proof = thm_body['proof']

                for commit in commits_ls:

                    # skip commits that are newer than the reference commit
                    if commit.committed_date > head_commit.committed_date:
                        continue

                    # retrieve the old file
                    old_file_str = file_str_from_commit(commit, file_path)

                    # skip commit if the file cannot be read from it
                    if old_file_str is None:
                        continue
                    old_file_str = remove_comments(old_file_str)

                    # stop walking the history once the theorem statement differs
                    if thm_statement not in old_file_str:
                        break

                    old_thm_proof = theorem_statement_proof(old_file_str, thm_name)['proof']

                    # skip commit if there has been no change to proof
                    if newest_thm_proof == old_thm_proof:
                        continue

                    subbed_file_str = substituted_file(newest_file_str, old_thm_proof, thm_statement)

                    # overwrite file with changes
                    with open(sub_file_path, 'w') as sub_file:
                        sub_file.write(subbed_file_str)

                    # attempt to compile the code in lean; a non-zero exit status
                    # raises CalledProcessError, which marks a failed proof
                    try:
                        subprocess.run(['lake', 'build', file_path_module],
                                            cwd=REPO_COPY_PATH,
                                            capture_output=True,
                                            text=True,
                                            check=True
                                        )
                    except subprocess.CalledProcessError as e:
                        count += 1
                        dataset['filepath'].append(file_path)
                        dataset['thm_name'].append(thm_name)
                        dataset['commit'].append(commit.hexsha)
                        dataset['failed_proof'].append(old_thm_proof)
                        dataset['error_msg'].append(e.stdout)
        finally:
            # restore file to existing condition, even if an error interrupted the loop
            with open(sub_file_path, 'w') as sub_file:
                sub_file.write(existing_file_str)

    print("Number of datapoints = " + str(count))
    return dataset

def save_data_to_json(dataset, dir_path):
    """
    Write ``dataset`` as JSON to ``<Data directory>/<dir_path>.json``.

    NOTE(review): the output directory is an absolute, machine-specific path.
    """
    out_dir = '/Users/timmonkey/Desktop/Imperial/Summer Term/Individual Project/Repos/Data/'
    target = out_dir + dir_path + '.json'
    with open(target, 'w') as handle:
        json.dump(dataset, handle)

In [153]:
# copy mathlib4
# check=True aborts the cell with CalledProcessError if the copy fails.
print('STARTING: copy mathlib4', flush=True)
subprocess.run(
    args=['cp', '-r', REPO_PATH, REPO_COPY_PATH], 
    check=True
    ) 
print('DONE: copy mathlib4', flush=True)

# retrieve cache that will be used for building lean later
print('STARTING: retrieve cache', flush=True)
subprocess.run(
    args=['lake', 'exe', 'cache', 'get!'], 
    cwd=REPO_COPY_PATH, 
    check=True 
)
print('DONE: retrieve cache', flush=True)


# build and save one dataset per selected path
for dir_path in dir_paths:
    
    # dir_path = 'Mathlib/' + dir_path
    # flatten the path into a file-name stem: drop a trailing ".lean", "/" -> "_"
    if dir_path[-5:] == ".lean":
        no_lean_dir_path = dir_path[:-5].replace("/", "_")
    else:
        no_lean_dir_path = dir_path.replace("/", "_")

    print(f'STARTING: {dir_path}')
    dataset = construct_dataset(dir_path)
    save_data_to_json(dataset, no_lean_dir_path) 
    print(f'DONE: {dir_path}')

# NOTE(review): the copy is removed only when every iteration above finishes; an
# exception raised earlier in this cell leaves the copy on disk.
print('STARTING: removing mathlib4 copy', flush=True)
subprocess.run(
    args=['rm', '-rf', REPO_COPY_PATH], 
    check=True
    )
print('DONE: removing mathlib4 copy', flush=True)
print('COMPLETED', flush=True)


STARTING: copy mathlib4
DONE: copy mathlib4
STARTING: retrieve cache
Attempting to download 4626 file(s)


Downloaded: 4626 file(s) [attempted 4626/4626 = 100%] (100% success)


Decompressing 4626 file(s)
Unpacked in 4218 ms
Completed successfully!
DONE: retrieve cache
STARTING: Mathlib/Analysis/BoxIntegral/Partition/Split.lean
Number of datapoints = 52
DONE: Mathlib/Analysis/BoxIntegral/Partition/Split.lean
STARTING: Mathlib/Geometry/Manifold/BumpFunction.lean
