## Preprocess and save training data

In [1]:
import sys
sys.path.append("../../src")

from tree_parser import file_contents, meta_math_database

import json

from expanding import construct_proof

from tqdm import tqdm

import time

from sklearn.model_selection import train_test_split

import os

from theorem_database import TheoremDatabase

In [2]:
# Theorem database constructed from the path ../../data/tdb. Later cells index
# it by theorem label (tdb[label]) and call tdb.save().
tdb = TheoremDatabase("../../data/tdb")

In [13]:
def get_theorem_complexity(theorem_name):
    """Return the complexity score of a theorem, caching it on the theorem record.

    A theorem with no steps scores 1; any other theorem scores the sum of the
    scores of the theorems referenced by its steps. The result is stored under
    the ``"complexity"`` key of the record, so it is computed only once per
    record.

    Args:
        theorem_name: Label used to look the theorem up in the global ``tdb``.

    Returns:
        The complexity score, or 0 when ``tdb`` yields ``None`` for the label.
    """
    theorem = tdb[theorem_name]

    # Identity test on purpose: a record type with a permissive ``__eq__``
    # must not be mistaken for a missing entry.
    if theorem is None:
        return 0

    if "complexity" not in theorem:
        steps = theorem["steps"]

        if len(steps) == 0:
            theorem["complexity"] = 1
        else:
            # NOTE(review): assumes the step graph is acyclic; a cycle would
            # recurse until RecursionError.
            theorem["complexity"] = sum(
                get_theorem_complexity(step["theorem"]) for step in steps
            )

    return theorem["complexity"]

def get_theorem_complexity_and_save(theorem_name):
    """Return the complexity score of ``theorem_name``.

    Despite the name, nothing is persisted here: the call is delegated to
    ``get_theorem_complexity`` and its result is returned unchanged. The
    ``tdb.save()`` step that the name refers to is not performed.
    """
    return get_theorem_complexity(theorem_name)

In [5]:
%%time

# Load the contents of the Metamath source file (presumably as text — TODO confirm).
text = file_contents("../../data/set_mod.mm")
# Parse it into the proposition database used by every later cell.
# NOTE(review): n=6000 presumably caps the number of propositions loaded (the
# recorded output reports "proposition: 6000") — confirm against tree_parser.
database = meta_math_database(text,n=6000)

# Print an empty line after the parser's own output.
print()

included 3651825 tokens from ../../data/set_mod.mm
proposition: 6000
CPU times: user 41 s, sys: 2.41 s, total: 43.4 s
Wall time: 45.3 s


In [2]:
# Proposition labels held out as the test set: they are excluded from the
# train/validation pool and written to test.txt when the dataset is saved.
test_props = [
    "dfss3",
    "dfss2",
    "dfss",
    "t1lucas",
    "t2lucas",
    "ssun1",
    "t4lucas",
    "t5lucas",
    "opoe",
    "omoe",
    "pwin",
    "inidm",
    "in0",
    "sstr",
    "ssequn1",
    "pwunss",
    "epee",
    "emee",
    "oddp1eveni",
    "dvdsadd2b",
    "opoeALTV",
    "omoeALTV"
]

# Report every held-out label that the loaded database does not contain.
for t in test_props:
    if t in database.propositions.keys():
        continue
    print(f"{t} not in database.")

NameError: name 'database' is not defined

In [15]:
# Candidate pool for training/validation: every proposition in the database
# that is not held out for testing.
train_props = [
    label
    for label in database.propositions.keys()
    if label not in test_props
]
print(len(train_props))

5986


In [16]:
# Rebuild the candidate pool inside this cell rather than splitting
# ``train_props`` in place: feeding ``train_props`` back into the split would
# shrink the training set again every time the cell is re-run.
trainval_props = [p for p in database.propositions.keys() if p not in test_props]

# 80/20 split; the fixed random_state keeps the partition reproducible.
train_props, valid_props = train_test_split(trainval_props, test_size=0.2, random_state=2)
print(len(train_props), len(valid_props))

4788 1198


In [17]:
def _expand_all_nodes_with_depth_and_filter(root, target_depth, filter_list):
    """Expand every node at ``target_depth`` whose label is in ``filter_list``.

    The tree is walked from ``root`` through each node's ``inputs``.

    Args:
        root: Node exposing ``depth``, ``label``, ``inputs`` and ``expand()``.
        target_depth: Depth a node must have to be expanded.
        filter_list: Collection of labels that are eligible for expansion.

    Returns:
        The result of ``root.expand()`` when the root itself matched,
        otherwise ``root``.
    """
    if root.depth == target_depth and root.label in filter_list:
        root = root.expand()

    for child in root.inputs:
        # Recurse under the name this function is actually defined with; the
        # un-prefixed name was unbound and raised NameError on the first child.
        # NOTE(review): the child's return value is discarded, so this relies
        # on expand() updating the tree in place — confirm.
        _expand_all_nodes_with_depth_and_filter(child, target_depth, filter_list)

    return root


# Un-prefixed alias, kept so callers using the earlier public name still resolve.
expand_all_nodes_with_depth_and_filter = _expand_all_nodes_with_depth_and_filter

In [18]:
def get_prop_dataset_batch(prop_label, filter_list):
    """Build one dataset entry per expandable proof step of a proposition.

    For every step of the proof whose label is in ``filter_list``, the proof
    is rebuilt from scratch, that single step is expanded, and the resulting
    proof is serialised into a dict with these keys:

    * ``'id'``    - ``prop.number``
    * ``'prop'``  - ``prop_label``
    * ``'steps'`` - tuples ``(step_num, label, raw_statement,
      raw_prop_statement, lemma_complexity, is_depth_zero)``
    * ``'links'`` - tuples ``(child_step_num, parent_step_num)``

    Entries in which every step has ``statement_depth == 0`` (all labels
    positive) are dropped, because they carry no negative example.

    Args:
        prop_label: Key of the proposition in ``database.propositions``.
        filter_list: Iterable of hashable step labels that may be expanded.

    Returns:
        List of dataset dicts; empty when no step qualifies.
    """
    prop = database.propositions[prop_label]

    # Membership is tested once per proof step; a set makes each test O(1)
    # instead of a linear scan over the whole label list.
    expandable_labels = set(filter_list)

    # Collect the steps to expand from an untouched copy of the proof.
    # NOTE(review): if the same (label, statement) pair occurs more than once
    # in the proof it is listed repeatedly here, and each pass below expands
    # the first match, giving duplicate entries — confirm this is intended.
    steps2expand = [
        (s.label, s.statement)
        for s in construct_proof(prop).get_steps_df()
        if s.label in expandable_labels
    ]

    prop_batch = []

    for s_label, s_statement in steps2expand:
        # Start from a fresh proof so each entry contains exactly one expansion.
        prop_proof = construct_proof(prop)
        for s in prop_proof.get_steps_df():
            if s_label == s.label and s_statement == s.statement:
                s.expand()
                break

        prop_dataset = {
            'id': prop.number,
            'prop': prop_label,
            'steps': [],
            'links': []
        }

        # Number the steps 1..N in traversal order so links can refer to them.
        for step_num, s in enumerate(prop_proof.get_steps_df(), start=1):
            s._step_num = step_num

        # Becomes True as soon as one step belongs to the negative class
        # (statement_depth > 0).
        has_negative = False

        for s in prop_proof.get_steps_df():
            if s.statement_depth > 0:
                has_negative = True

            prop_dataset["steps"].append((
                s._step_num,
                s.label,
                s.raw_statement,
                s.raw_prop_statement,
                get_theorem_complexity(s.label),  # Lemma complexity
                int(s.statement_depth == 0)
            ))

            for child_s in s.inputs:
                prop_dataset["links"].append((child_s._step_num, s._step_num))

        # Keep only entries that contain at least one negative example.
        if has_negative:
            prop_batch.append(prop_dataset)

    return prop_batch

In [19]:
%%time
# Try the batch builder on a single proposition ("dfss2" is one of the
# held-out labels in test_props) to check that it runs and to time it.
prop_batch = get_prop_dataset_batch("dfss2", train_props)

CPU times: user 16.4 ms, sys: 15.1 ms, total: 31.5 ms
Wall time: 42.7 ms


In [20]:
def _get_prop_dataset_with_filter(prop_label, filter_list):
    """Build a single dataset entry with all depth-0 filtered steps expanded.

    The proof of the proposition is constructed, every node at depth 0 whose
    label is in ``filter_list`` is expanded, and the result is serialised into
    a dict with these keys:

    * ``'id'``    - ``prop.number``
    * ``'prop'``  - ``prop_label``
    * ``'steps'`` - tuples ``(step_num, label, raw_statement,
      raw_prop_statement, is_depth_zero)``; unlike ``get_prop_dataset_batch``
      there is no lemma-complexity field
    * ``'links'`` - tuples ``(child_step_num, parent_step_num)``

    Args:
        prop_label: Key of the proposition in ``database.propositions``.
        filter_list: Collection of step labels that may be expanded.

    Returns:
        The dataset dict for the proposition.

    Raises:
        Exception: When every step has ``statement_depth == 0``, i.e. the
            entry would contain only positive labels.
    """
    prop = database.propositions[prop_label]

    prop_dataset = {
        'id': prop.number,
        'prop': prop_label,
        'steps': [],
        'links': []
    }

    prop_proof = construct_proof(prop)
    # Call the helper under the name it is defined with; the un-prefixed name
    # used here before was unbound and raised NameError.
    prop_proof = _expand_all_nodes_with_depth_and_filter(prop_proof, 0, filter_list)

    # Number the steps 1..N in traversal order so links can refer to them.
    for step_num, s in enumerate(prop_proof.get_steps_df(), start=1):
        s._step_num = step_num

    # Becomes True as soon as one step belongs to the negative class
    # (statement_depth > 0).
    has_negative = False

    for s in prop_proof.get_steps_df():
        if s.statement_depth > 0:
            has_negative = True

        prop_dataset["steps"].append((
            s._step_num,
            s.label,
            s.raw_statement,
            s.raw_prop_statement,
            int(s.statement_depth == 0)
        ))

        for child_s in s.inputs:
            prop_dataset["links"].append((child_s._step_num, s._step_num))

    # An entry whose labels are all positive adds little to train/validation.
    if not has_negative:
        raise Exception(f"Only positive classes in {prop_label}.")

    return prop_dataset

In [21]:
%%time

## Process training data

train_props_data = []
train_failed_props = []

for i, p in enumerate(train_props):
    print(f"\r{i+1}/{len(train_props)}", end="")

    # Skip propositions whose entails_proof_steps is empty.
    if len(database.propositions[p].entails_proof_steps) == 0:
        continue

    try:
        train_props_data.extend(get_prop_dataset_batch(p, train_props))
    except Exception:
        # Catch Exception only: a bare ``except`` also swallows
        # KeyboardInterrupt, which made this long-running loop impossible to
        # interrupt from the notebook.
        train_failed_props.append(p)

print()
print(len(train_props_data))
print(len(train_failed_props))

4788/4788
24769
104
CPU times: user 4min 43s, sys: 36.8 s, total: 5min 20s
Wall time: 1h 34min 21s


In [23]:
%%time

## Process validation data

# Accumulators: serialised examples, and labels whose processing raised.
valid_props_data, valid_failed_props = [], []

for i, p in enumerate(valid_props):
    print(f"\r{i+1}/{len(valid_props)}", end="")

    # Only propositions that have at least one proof step are processed.
    if len(database.propositions[p].entails_proof_steps) != 0:
        try:
            valid_props_data.extend(get_prop_dataset_batch(p, valid_props))
        except Exception:
            # Record the failing proposition and carry on with the next one.
            valid_failed_props.append(p)

print()
print(len(valid_props_data))
print(len(valid_failed_props))

1198/1198
1802
7
CPU times: user 23.3 s, sys: 3.4 s, total: 26.7 s
Wall time: 5min 29s


In [24]:
# Each run is written to its own directory named after the current Unix timestamp.
ts = time.time()
print(ts)

out_dir = f"../../data/datasets/{ts}"
# makedirs also creates ../../data/datasets when it is missing, and still
# raises if the timestamped directory itself already exists.
os.makedirs(out_dir)

# Context managers make sure every file is flushed and closed.
with open(f"{out_dir}/train.json", "w") as train_file:
    json.dump(train_props_data, train_file)

with open(f"{out_dir}/valid.json", "w") as valid_file:
    json.dump(valid_props_data, valid_file)

with open(f"{out_dir}/test.txt", "w") as test_file:
    test_file.write("\n".join(test_props))

1660253531.516595


148

In [39]:
# Presumably writes the theorem database back to disk — TODO confirm.
# NOTE(review): the recorded run of this cell failed with FileNotFoundError
# for '../../data/tdb.pkl'; check that the target location exists.
tdb.save()

FileNotFoundError: [Errno 2] No such file or directory: '../../data/tdb.pkl'