Esse notebook irá fazer um baseline utilizando redes neurais para computar a probabilidade de um nó ser ocultado baseando-se apenas na expressão da descrição. Os dados de treinamento serão criados expandindo-se apenas um nível da árvore de provas, chamando os novos nós de 1 e os originais 0. Para verificar se os nós são os mesmos, vamos utilizar as expressões apenas, já que uma vez que um nó é expandido a sua proposição original se quebra em diversas proposições que constituem sua prova.

https://www.youtube.com/watch?v=8owQBFAHw7E

https://www.youtube.com/watch?v=uF53xsT7mjc

In [1]:
import sys
sys.path.append("..")

import random

from collections import Counter, defaultdict

import json

import glob

import os

import numpy as np

#from tree_parser import file_contents, meta_math_database

#from my_utils import print_proof_props_graph, get_proof_steps, print_ident_proof, print_proof_linear_steps
#from my_utils import get_proof_steps_graph, print_proof_steps_graph
#from expanding import construct_proof, expand_proof_step_ps, expand_all_nodes_with_depth
#from expanding import PStep, construct_proof

from tqdm import tqdm

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline, make_union
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split

In [2]:
all_props = json.load(open("prop_dataset_1659467564.761641.json"))
print(len(all_props))

7476


In [3]:
train_props, valid_props = train_test_split(all_props, test_size=0.2, random_state=2)
print(len(train_props), len(valid_props))

5980 1496


In [15]:
train_props[0]["steps"][0][4]

1

In [16]:
class BaseClf:
    def score_auc(self, p):
        try:
            predictions = self.predict(p["steps"])
            labels = [s[4] for s in p["steps"]]
            prop_auc = roc_auc_score(labels, predictions)
        except Exception as e:
            print(p)
            raise e
        return prop_auc

In [17]:
%%time

class RandomClassifier(BaseClf):
    def predict(self, X):
        return np.random.random(len(X))
    
rnd_clf = RandomClassifier()

auc_values = []
for p in tqdm(valid_props):
    auc_values.append(rnd_clf.score_auc(p))

np.mean(auc_values)

100%|█████████████████████████████████████████████████████████████████████████████| 1496/1496 [00:00<00:00, 1835.00it/s]

CPU times: user 719 ms, sys: 65.2 ms, total: 784 ms
Wall time: 819 ms





0.49916475980219005

In [18]:
class WeightClassifier(BaseClf):   
    def predict(self, X):
        predictions = list()
        for s in X:
            predictions.append(len(s[2].split(" ")))
        
        predictions = np.array(predictions)
        
        #Get max value and normalize by it
        max_pred = predictions.max()
        predictions = predictions / max_pred
        
        # The way it is the largest statement will get the value of 1
        # Here we will treat the negative class as the statement to be removed
        # So we need to invert the prediction with 1-pred     
        return 1 - predictions
    
weight_clf = WeightClassifier()

auc_values = []
for p in tqdm(valid_props):
    auc_values.append(weight_clf.score_auc(p))

np.mean(auc_values)

100%|█████████████████████████████████████████████████████████████████████████████| 1496/1496 [00:00<00:00, 1624.88it/s]


0.6277139343466421

In [19]:
class WeightDistClassifier(BaseClf):   
    def predict(self, X):
        predictions = list()
        for s in X:
            predictions.append(len(set(s[2].split(" "))))
        
        predictions = np.array(predictions)
        
        #Get max value and normalize by it
        max_pred = predictions.max()
        predictions = predictions / max_pred
        
        # The way it is the largest statement will get the value of 1
        # Here we will treat the negative class as the statement to be removed
        # So we need to invert the prediction with 1-pred     
        return 1 - predictions
    
weight_dist_clf = WeightDistClassifier()

auc_values = []
for p in tqdm(valid_props):
    auc_values.append(weight_dist_clf.score_auc(p))

np.mean(auc_values)

100%|█████████████████████████████████████████████████████████████████████████████| 1496/1496 [00:01<00:00, 1391.22it/s]


0.5518099511704142

In [67]:
def get_prop_parameters(target_prop):
    steps_dict = {s[0]:{"prop": s[1], "statement":s[2], "y":s[3]} for s in target_prop["steps"]}

    #Add neighbors refs
    for s in steps_dict.values():
        s["inputs"] = []
        s["output"] = None
    for s, d in target_prop["links"]:
        steps_dict[d]["inputs"].append(s)
        steps_dict[s]["output"] = d
        
    #Add number of neighbors
    for s in steps_dict.values():   
        s["n_neighbors"] = len(s["inputs"]) + (1 if s["output"] != None else 0)
        
    #Add number of symbols in the statement
    for s in steps_dict.values():
        s["ns"] = len(s["statement"].split(" "))

    #Add number of distinct symbols in the statement
    for s in steps_dict.values():
        s["nds"] = len(set(s["statement"].split(" ")))
        
    #Add ratio between # of symbols and # of distinct symbols
    for s in steps_dict.values():
        s["ns_nds_ratio"] = s["ns"] / s["nds"]
        
    #Add number of symbols in the statement normalize with the largest
    largest_ns = max([s["ns"] for s in steps_dict.values()])

    for s in steps_dict.values():
        s["ns_norm"] = s["ns"] / largest_ns

    #Add number of distinct symbols in the statement normalize with the largest
    largest_nds = max([s["nds"] for s in steps_dict.values()])

    for s in steps_dict.values():
        s["nds_norm"] = s["nds"] / largest_nds
        
    #Add neighbors summarized parameters
    neighbors_params = ["ns", "nds", "ns_norm", "nds_norm", "ns_nds_ratio"]

    for s in steps_dict.values():
        for p in neighbors_params:
            neighbors_values = []
            if s["output"] != None:
                neighbors_values.append(steps_dict[s["output"]][p])
            for inp in s["inputs"]:
                neighbors_values.append(steps_dict[inp][p])

            s[f"neighbor_{p}_max"] = max(neighbors_values)
            s[f"neighbor_{p}_min"] = min(neighbors_values)
            s[f"neighbor_{p}_avg"] = np.mean(neighbors_values)
            
            s[f"neighbor_{p}_max_ratio"] = s[p] / s[f"neighbor_{p}_max"]
            s[f"neighbor_{p}_min_ratio"] = s[p] / s[f"neighbor_{p}_min"]
            s[f"neighbor_{p}_avg_ratio"] = s[p] / s[f"neighbor_{p}_avg"]
            

    return steps_dict

In [68]:
train_features = []

get_prop_parameters(target_prop)

{1: {'prop': 'syl',
  'statement': '( ph -> U_ x e. A B = U_ x e. A C )',
  'y': 1,
  'inputs': [2, 6],
  'output': None,
  'n_neighbors': 2,
  'ns': 15,
  'nds': 11,
  'ns_nds_ratio': 1.3636363636363635,
  'ns_norm': 0.7142857142857143,
  'nds_norm': 1.0,
  'neighbor_ns_max': 21,
  'neighbor_ns_min': 11,
  'neighbor_ns_avg': 16.0,
  'neighbor_ns_max_ratio': 0.7142857142857143,
  'neighbor_ns_min_ratio': 1.3636363636363635,
  'neighbor_ns_avg_ratio': 0.9375,
  'neighbor_nds_max': 11,
  'neighbor_nds_min': 11,
  'neighbor_nds_avg': 11.0,
  'neighbor_nds_max_ratio': 1.0,
  'neighbor_nds_min_ratio': 1.0,
  'neighbor_nds_avg_ratio': 1.0,
  'neighbor_ns_norm_max': 1.0,
  'neighbor_ns_norm_min': 0.5238095238095238,
  'neighbor_ns_norm_avg': 0.7619047619047619,
  'neighbor_ns_norm_max_ratio': 0.7142857142857143,
  'neighbor_ns_norm_min_ratio': 1.3636363636363635,
  'neighbor_ns_norm_avg_ratio': 0.9375000000000001,
  'neighbor_nds_norm_max': 1.0,
  'neighbor_nds_norm_min': 1.0,
  'neighbor_nds

In [144]:
make_pipeline(
    FunctionTransformer(lambda a: list(map(lambda b:b[1:], a))),
    #CountVectorizer(tokenizer=lambda a: a.split(" "))
).transform(train_X[:2])

[[47, 20, 0.5053763440860215, 0.9523809523809523],
 [91, 19, 0.978494623655914, 0.9047619047619048]]

In [98]:
FunctionTransformer(lambda a: np.vectorize(len)(a)).transform(["Lucas", "oi", "ae"])

array([5, 2, 2])

In [104]:
FunctionTransformer(lambda a: np.expand_dims(np.vectorize(len)(a), axis=1)).transform(train_X[:100]).shape

(100, 1)

In [92]:
CountVectorizer(tokenizer=lambda a: a.split(" ")f).fit_transform(train_X[:100])

<100x32 sparse matrix of type '<class 'numpy.int64'>'
	with 1499 stored elements in Compressed Sparse Row format>

In [163]:
class ExtractFirstElement(FunctionTransformer):
    def __init__(self):
        super(ExtractFirstElement, self).__init__(lambda a: list(map(lambda b:b[0], a)))
        
        
class RemoveFirstElement(FunctionTransformer):
    def __init__(self):
        super(RemoveFirstElement, self).__init__(lambda a: list(map(lambda b:b[1:], a)))

In [171]:
from sklearn.tree import DecisionTreeClassifier

In [186]:
clf = make_pipeline(
    make_union(
#         make_pipeline(
#             #FunctionTransformer(lambda a: list(map(lambda b:b[0], a))),
#             ExtractFirstElement(),
#             CountVectorizer(tokenizer=lambda a: a.split(" "))
#         ),
        #FunctionTransformer(lambda a: list(map(lambda b:b[1:], a)))
        RemoveFirstElement()
    ),
    RandomForestClassifier(n_estimators=100, random_state=2)
    #DecisionTreeClassifier()
)  

In [187]:
%%time
clf.fit(train_X[:10000], train_y[:10000])

CPU times: user 804 ms, sys: 11.2 ms, total: 815 ms
Wall time: 816 ms


In [188]:
clf.score(valid_X, valid_y)

0.7020472335585737

In [189]:
def get_prop_auc(prop_label):
    prop_X, prop_y = get_prop_dataset(prop_label)
    prop_pred = clf.predict(prop_X)
    return roc_auc_score(prop_y, prop_pred)

In [190]:
get_prop_auc("dfss2")

0.53125

In [191]:
theorem_list = [
    "dfss3",
    "dfss2",
    "dfss",
    "t1lucas",
    "t2lucas",
    "ssun1",
    "t4lucas",
    "t5lucas",
    "opoe",
    "omoe",
    "pwin",
    "inidm",
    "in0",
    "sstr",
    "ssequn1",
    "pwunss",
    #"epee",
    #"emee",
    #"oddp1eveni",
    "dvdsadd2b",
    #"opoeALTV",
    #"omoeALTV"
]

#Ensure everything exists
for t in theorem_list:
    if t not in database.propositions.keys():
        print(f"{t} not in database.")

opoe not in database.
omoe not in database.
dvdsadd2b not in database.


In [202]:
np.array(prop_X)[:, 3]

array(['0.2698412698412698', '0.36507936507936506', '0.20634920634920634',
       '0.20634920634920634', '0.2698412698412698', '0.42857142857142855',
       '0.3968253968253968', '0.30158730158730157', '0.746031746031746',
       '0.5555555555555556', '0.6507936507936508', '0.1746031746031746',
       '0.5555555555555556', '0.6825396825396826', '0.6190476190476191',
       '0.42857142857142855', '0.49206349206349204',
       '0.49206349206349204', '1.0', '0.42857142857142855',
       '0.8095238095238095', '0.23809523809523808', '0.42857142857142855',
       '0.8095238095238095'], dtype='<U125')

In [212]:
def get_prop_auc_weight(prop_label):
    prop_X, prop_y = get_prop_dataset(prop_label)
    prop_pred = 1 - np.array(prop_X)[:, 4].astype(float)#clf.predict(prop_X)
    return roc_auc_score(prop_y, prop_pred)

In [213]:
get_prop_auc_weight("dfss2")

0.43359375

In [226]:
get_prop_auc_weight(valid_props[8])

IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed

In [227]:
valid_props[8]

'df-lim'

In [229]:
get_prop_dataset(valid_props[8])

([], [])

In [228]:
get_prop_auc_weight(valid_props[8])

IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed

In [235]:
auc_vals = []
failed_theorems = 0
for t in tqdm(valid_props[:1000]):
    try:
        #auc_val = get_prop_auc(t)
        auc_val = get_prop_auc_weight(t)
        auc_vals.append(auc_val)
    except:
        failed_theorems += 1
        
    
print(np.mean(auc_vals))
print(f"Failed theorems: {failed_theorems}")

100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:07<00:00, 142.25it/s]

0.5537687336330955
Failed theorems: 43





In [214]:
auc_vals = []
for t in tqdm(theorem_list):
    if t in database.propositions.keys():
        #auc_val = get_prop_auc(t)
        auc_val = get_prop_auc_weight(t)
        auc_vals.append(auc_val)
    
print(np.mean(auc_vals))

100%|██████████████████████████████████████████████████████████████████████████████████| 17/17 [00:00<00:00, 146.58it/s]

0.6806973829417059





In [None]:
keep working on the machine learning baseline