In [1]:
import ast
import json

import numpy as np
import pandas as pd

### Read data from the individual CSV files

In [2]:
# Positive class for the classifier: formulas from the algebra-precalculus file.
algebra_precalc_csv = "data/formula_data_algebra-precalculus.csv"
algebra_precalc_df = pd.read_csv(algebra_precalc_csv)
algebra_precalc_df.head()

Unnamed: 0,fid,mtype,exprstr,mention,tokens,tags
0,6327106106a69a3f488d6eb8_f_1_0_0,FUNC,"f(x,y)=x/y",the function,"['f', '(', 'x', ',', 'y', ')', '=', 'x', '/', ...",['algebra-precalculus']
1,6327106106a69a3f488d6f55_f_3_0_0,SCAL,\frac{a+bi}{c+di},two complex numbers,"['\\frac', '{', 'a', '+', 'b', 'i', '}', '{', ...",['algebra-precalculus']
2,6327106106a69a3f488d6f5c_f_5_0_0,FUNC,g,the functions,['g'],"['algebra-precalculus', 'functional-equations']"
3,6327106106a69a3f488d6f5c_f_5_4_0,FUNC,g,the function,['g'],"['algebra-precalculus', 'functional-equations']"
4,6327106106a69a3f488d6f5c_f_5_7_0,FUNC,g,every function f_5_7_0,['g'],"['algebra-precalculus', 'functional-equations']"


In [3]:
# Negative class for the classifier: one CSV per non-target topic.
neg_csv_paths = [
    "data/formula_data_analytic-geometry.csv",
    "data/formula_data_elementary-functions.csv",
    "data/formula_data_elementary-number-theory.csv",
    "data/formula_data_elementary-set-theory.csv",
    "data/formula_data_euclidean-geometry.csv",
    "data/formula_data_trigonometry.csv",
]
neg_df_list = [pd.read_csv(csv_path) for csv_path in neg_csv_paths]

# Keep a named handle on every topic frame as well (same order as the paths).
(
    a_geometry_df,
    elem_func_df,
    elem_num_th_df,
    elem_set_th_df,
    euc_geom_df,
    trig_df,
) = neg_df_list

# Row count per topic, in the same order as neg_df_list.
lengths_list = [len(topic_df) for topic_df in neg_df_list]
print(lengths_list)

[234, 182, 6253, 10000, 330, 1705]


#### Merge the negative-example datasets while preserving their occurrence frequencies, then filter

In [4]:
def merge_dfs(df_list, return_size, random_state=None):
    """Draw ``return_size`` rows from several frames, proportionally to their sizes.

    Every frame contributes a share of the result that matches its share of
    the combined row count. Shares are computed with integer arithmetic and
    the rows lost to rounding down are handed out one at a time to the frames
    with the largest fractional remainders (ties go to the earlier frame).
    Each frame is sampled exactly once and without replacement, so no row can
    be drawn twice and no frame is asked for more rows than it holds.

    Parameters
    ----------
    df_list : list of pandas.DataFrame
        Frames to sample from.
    return_size : int
        Total number of rows in the returned frame.
    random_state : optional
        Forwarded unchanged to every ``DataFrame.sample`` call. With the
        default ``None`` the draws are not seeded by this function. Note that
        an integer seed is reused for each frame.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, concatenated in ``df_list`` order. Rows keep the
        index labels they had in their source frame.

    Raises
    ------
    ValueError
        If ``return_size`` is negative or larger than the combined row count,
        or (from ``pd.concat``) if ``df_list`` is empty.
    """
    return_size = int(return_size)
    r_nums = [d_frame.shape[0] for d_frame in df_list]
    r_nums_sum = sum(r_nums)

    if return_size < 0:
        raise ValueError("return_size must be non-negative")
    if return_size > r_nums_sum:
        raise ValueError(
            f"cannot draw {return_size} rows without replacement "
            f"from {r_nums_sum} available rows"
        )

    if r_nums_sum == 0:
        # Only reachable with return_size == 0; avoids dividing by zero below.
        quotas = [0] * len(r_nums)
    else:
        # Exact integer floor of each proportional share (no float rounding).
        quotas = [(r_num * return_size) // r_nums_sum for r_num in r_nums]
        remainders = [(r_num * return_size) % r_nums_sum for r_num in r_nums]

        # Largest-remainder method: a frame with a non-zero remainder always
        # has at least one unsampled row left, so the +1 can never overdraw.
        shortfall = return_size - sum(quotas)
        by_remainder = sorted(
            range(len(r_nums)), key=lambda pos: (-remainders[pos], pos)
        )
        for pos in by_remainder[:shortfall]:
            quotas[pos] += 1

    sel_neg_dfs = [
        d_frame.sample(n=quota, random_state=random_state)
        for d_frame, quota in zip(df_list, quotas)
    ]
    return pd.concat(sel_neg_dfs, axis=0)

In [5]:
def filter_data(train_df, is_pos):
    """Reduce a formula frame to its kept columns and attach a class label.

    The ``fid``, ``exprstr``, ``mention`` and ``tags`` columns are dropped;
    every other column is kept. Cells of the ``tokens`` column that hold a
    bare token instead of a list string are rewritten as the string form of a
    one-element list, and a constant ``label`` column is appended.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Must contain the four dropped columns and a ``tokens`` column.
        The frame passed in is not modified.
    is_pos : bool
        Truthy labels every row ``1``; falsy labels every row ``0``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the normalised ``tokens`` column and ``label`` last.

    Raises
    ------
    KeyError
        If one of the dropped columns or ``tokens`` is missing.
    """
    def cell_to_str(cell_val):
        # Numbers are treated as a single token whose text is str(value).
        if isinstance(cell_val, (int, float)):
            return repr([str(cell_val)])
        # A one-character string is a bare token, not a list string. repr()
        # escapes quote and backslash characters, which plain concatenation
        # with "['" and "']" would leave as a malformed list literal.
        if isinstance(cell_val, str) and len(cell_val) == 1:
            return repr([cell_val])
        # Longer strings (presumably already list strings) and any other
        # type pass through untouched.
        return cell_val

    return (
        train_df
        .drop(columns=["fid", "exprstr", "mention", "tags"])
        .assign(
            tokens=lambda frame: frame["tokens"].map(cell_to_str),
            label=1 if is_pos else 0,
        )
    )

In [6]:
# Seed NumPy's global random state so the draws below, and therefore the
# dataset written at the end of the notebook, are the same on every run.
# DataFrame.sample uses that global state when no random_state is passed.
SAMPLING_SEED = 42
np.random.seed(SAMPLING_SEED)

intended_train_size = 11000
# Balanced classes: half of the rows are negatives, half positives.
samples_per_class = intended_train_size // 2

neg_samples_df = merge_dfs(neg_df_list, samples_per_class)
print("negative examples size: ", neg_samples_df.shape[0])

pos_samples_df = algebra_precalc_df.sample(n=samples_per_class)
print("positive examples size: ", pos_samples_df.shape[0])


negative examples size:  5500
positive examples size:  5500


In [7]:
# Strip both samples down to the columns filter_data keeps and tag each with
# its class label (1 = positive, 0 = negative).
# NOTE: the *_samples_df names are rebound to the filtered frames, so running
# this cell twice without re-running the sampling cell fails, because the
# columns that filter_data drops are already gone.
pos_samples_df = filter_data(pos_samples_df, is_pos=True)
neg_samples_df = filter_data(neg_samples_df, is_pos=False)

# Stack positives on top of negatives; rows keep their original index labels.
data_df = pd.concat([pos_samples_df, neg_samples_df])
print("total size: ", len(data_df))
data_df.head()

total size:  11000


Unnamed: 0,mtype,tokens,label
870,SET,"['\\{', 'a', '\\}']",1
5187,SCAL,"['3', 'r']",1
5222,FUNC,['f'],1
4940,SCAL,"['\\binom', '{', '4', '}', '{', '4', '}']",1
1740,SCAL,['1'],1


In [8]:
# Persist the combined, labelled dataset; the row index is not written.
bin_class_csv = "data/bin_class_data.csv"
data_df.to_csv(bin_class_csv, index=False)