In [14]:
# Gas charged per measured microsecond: run times are converted to gas as
# gas_factor * ceil(run_microseconds) in correct_for_parsing_and_miller.
gas_factor = 30

In [15]:
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
import numpy as np
import math

# !!! Determinism !!!
# Seed NumPy's global RNG so repeated runs of the notebook start from the same state.
np.random.seed(42)

# Smaller measurement set, kept for reference:
# file_list = [("MNT4", "./mnt4/final_exp_parallel_1500.csv"),
#             ("MNT6", "./mnt6/final_exp_parallel_1500.csv")]

# (label, csv path) pairs consumed by get_dfs; the order fixes the indices of
# the resulting list (0 -> MNT4, 1 -> MNT6).
file_list = [("MNT4", "./mnt4/final_exp_parallel_50000.csv"),
            ("MNT6", "./mnt6/final_exp_parallel_50000.csv")]

# Multiplier for the Miller-loop prediction subtracted in apply_correction.
num_pairs = 2
# Fixed values substituted by evaluate_model for the "x_bit_length" and
# "x_hamming_weight" features of the Miller-loop model.
x_bits = 1
x_hamming = 1

def get_dfs(files):
    """Load every measurement CSV and strip the columns not used for fitting.

    files: iterable of (name, path) pairs; only the path is used.
    Returns a list of DataFrames in the same order as `files`.
    """
    unused_columns = [
        "x_is_negative",
        "group_limbs",
        "x_bit_length",
        "x_hamming_weight",
        "num_pairs",
        "exp_w0_is_negative",
    ]
    # Experiments left disabled in earlier revisions: keeping only rows with
    # x_is_negative == 1.0, and scaling num_pairs by 0.5.
    return [pd.read_csv(path).drop(columns=unused_columns) for _name, path in files]

In [16]:
# One cleaned DataFrame per entry of file_list, in the same order.
dataframes = get_dfs(file_list)

In [17]:
# Preview of the first frame (first entry of file_list, labelled "MNT4").
dataframes[0].head(15)

Unnamed: 0,modulus_limbs,exp_w0_bit_length,exp_w0_hamming,exp_w1_bit_length,exp_w1_hamming,run_microseconds
0,4,342,322,120,34,325
1,4,342,153,120,6,289
2,4,342,171,120,7,297
3,4,342,26,120,63,317
4,10,342,322,120,34,1490
5,11,342,322,120,34,1738
6,4,342,237,120,19,299
7,4,342,7,120,119,289
8,13,342,322,120,34,2511
9,4,342,175,120,39,325


In [19]:
# Preview of the second frame (second entry of file_list, labelled "MNT6").
dataframes[1].head(15)

Unnamed: 0,modulus_limbs,exp_w0_bit_length,exp_w0_hamming,exp_w1_bit_length,exp_w1_hamming,run_microseconds
0,4,63,40,195,18,327
1,4,63,8,195,150,278
2,4,63,52,195,132,305
3,4,63,26,195,93,292
4,4,63,41,195,160,335
5,4,63,41,195,6,331
6,4,63,41,195,183,316
7,4,63,15,195,34,330
8,4,63,53,195,99,324
9,4,63,22,195,192,355


In [20]:
from sklearn.model_selection import train_test_split

def split_df(df):
    """Split `df` into train and test parts with a fixed seed.

    10% of the rows go to the test part (test_size=0.10, random_state=42).
    Prints the size of both parts and returns them as a (train, test) tuple.
    """
    train_part, test_part = train_test_split(df, test_size=0.10, random_state=42)

    summary = "Train samples {}, test samples {}".format(len(train_part), len(test_part))
    print(summary)

    return (train_part, test_part)

In [21]:
from sklearn import linear_model
from sklearn.linear_model import Lasso

from sklearn.metrics import max_error, mean_absolute_error, r2_score

def pretty_print_polynomial(poly, model, variable_names):
    """Print the fitted polynomial as a human-readable sum of monomials.

    poly: object exposing `powers_`, a 2-D (terms x variables) exponent matrix.
    model: object exposing `coef_`, indexed in the same term order.
    variable_names: one label per column of `poly.powers_`.

    Terms whose coefficient is exactly zero are left out; the remaining
    coefficients are rounded to 6 decimals for display only.
    """
    def render_factor(label, exponent):
        # A first power is printed as the bare label, anything else as label^exponent.
        if exponent == 1:
            return '{}'.format(label)
        return '{}^{}'.format(label, exponent)

    monomials = []
    for term_idx, exponents in enumerate(poly.powers_):
        weight = model.coef_[term_idx]
        if weight == 0:
            continue
        pieces = ["{}".format(np.around(weight, decimals=6))]
        pieces.extend(
            render_factor(variable_names[variable_idx], exponent)
            for variable_idx, exponent in enumerate(exponents)
            if exponent != 0
        )
        monomials.append(" * ".join(pieces))

    print(" + ".join(monomials))

In [22]:
# One-off gas cost per modulus_limbs value for MNT4; apply_correction
# subtracts the matching entry from each measured gas value.
mnt4_one_off = pd.read_csv("./mnt4/one_off_results.csv")

mnt4_one_off.head(15)

Unnamed: 0,modulus_limbs,gas
0,4,2400
1,5,3540
2,6,4800
3,7,6540
4,8,8670
5,9,11130
6,10,13860
7,11,17130
8,12,21030
9,13,24930


In [23]:
# One-off gas cost per modulus_limbs value for MNT6; apply_correction
# subtracts the matching entry from each measured gas value.
mnt6_one_off = pd.read_csv("./mnt6/one_off_results.csv")

mnt6_one_off.head(15)

Unnamed: 0,modulus_limbs,gas
0,4,2430
1,5,3570
2,6,4860
3,7,6570
4,8,8910
5,9,11160
6,10,13890
7,11,17160
8,12,21300
9,13,25080


In [24]:
from joblib import load

# Previously fitted Miller-loop models. Each description is indexed with the
# keys "model" and "features" by evaluate_model.
# NOTE(review): joblib.load unpickles arbitrary Python objects - only load
# files produced by a trusted run of the Miller-loop notebook.
mnt4_model_description = load("mnt4_miller.joblib")
mnt6_model_description = load("mnt6_miller.joblib")

In [25]:
def evaluate_model(x, model_description, degree = 2):
    """Predict with a stored polynomial model for a single sample.

    x: row/mapping providing a value for every feature name except
        "x_bit_length" and "x_hamming_weight"; those two are taken from the
        notebook-level constants `x_bits` and `x_hamming` instead.
    model_description: dict with "model" (estimator exposing `predict`) and
        "features" (sequence of (name, max_power) pairs).
    degree: degree handed to PolynomialFeatures (interaction terms only, no bias).

    Returns the first (only) element of the model's prediction.
    """
    estimator = model_description["model"]
    feature_spec = model_description["features"]

    def base_value(feature_name):
        # The two x_* features are pinned to notebook constants, not read from the row.
        if feature_name == "x_bit_length":
            return x_bits
        if feature_name == "x_hamming_weight":
            return x_hamming
        return x[feature_name]

    # One single-row column per (feature, power) pair, in declaration order.
    columns = {}
    for feature_name, max_power in feature_spec:
        for exponent in range(1, max_power + 1):
            column_label = "{}^{}".format(feature_name, exponent)
            columns[column_label] = [base_value(feature_name) ** exponent]
    single_row = pd.DataFrame(columns)

    expansion = PolynomialFeatures(degree = degree, interaction_only=True, include_bias = False)
    design_matrix = expansion.fit_transform(single_row)

    return estimator.predict(design_matrix)[0]

In [26]:
def apply_correction(x, one_offs, miller_model_description):
    """Remove the one-off and Miller-loop shares from a row's gas value.

    x: row with "gas" and "modulus_limbs" entries (plus whatever
        evaluate_model reads for the Miller-loop features).
    one_offs: DataFrame with "modulus_limbs" and "gas" columns; the first row
        whose limb count equals the sample's supplies the one-off cost.
    miller_model_description: forwarded unchanged to evaluate_model.

    Returns gas - one_off - num_pairs * miller_prediction.
    """
    same_limbs = one_offs["modulus_limbs"] == x["modulus_limbs"]
    one_off_gas = one_offs[same_limbs]["gas"].array[0]
    miller_gas = num_pairs * evaluate_model(x, miller_model_description)

    return x["gas"] - one_off_gas - miller_gas
    

In [29]:
def correct_for_parsing_and_miller(df, one_offs, miller_model_description):
    """Turn measured run times into gas net of one-off and Miller-loop costs.

    Every "run_microseconds" value becomes gas_factor * ceil(value); then
    apply_correction is evaluated row by row. The result is a copy of `df`
    with "run_microseconds" removed and "gas_corrected" appended; `df` itself
    is left untouched.
    """
    def to_gas(microseconds):
        return gas_factor * math.ceil(microseconds)

    def corrected_gas(row):
        return apply_correction(row, one_offs, miller_model_description)

    priced = df.copy()
    # The temporary "gas" column must exist before the row-wise pass, because
    # apply_correction reads it from each row.
    priced["gas"] = priced["run_microseconds"].apply(to_gas)
    priced["gas_corrected"] = priced.apply(corrected_gas, axis = 1)

    return priced.drop(columns=["run_microseconds", "gas"])

In [30]:
# MNT4 measurements with run time replaced by "gas_corrected"
# (one-off and Miller-loop contributions subtracted).
mnt4 = correct_for_parsing_and_miller(dataframes[0], mnt4_one_off, mnt4_model_description)
mnt4.head(15)

Unnamed: 0,modulus_limbs,exp_w0_bit_length,exp_w0_hamming,exp_w1_bit_length,exp_w1_hamming,gas_corrected
0,4,342,322,120,34,6870.188987
1,4,342,153,120,6,5790.188987
2,4,342,171,120,7,6030.188987
3,4,342,26,120,63,6630.188987
4,10,342,322,120,34,28881.245707
5,11,342,322,120,34,32716.178705
6,4,342,237,120,19,6090.188987
7,4,342,7,120,119,5790.188987
8,13,342,322,120,34,47360.122025
9,4,342,175,120,39,6870.188987


In [31]:
# MNT6 measurements with run time replaced by "gas_corrected"
# (one-off and Miller-loop contributions subtracted).
mnt6 = correct_for_parsing_and_miller(dataframes[1], mnt6_one_off, mnt6_model_description)
mnt6.head(15)

Unnamed: 0,modulus_limbs,exp_w0_bit_length,exp_w0_hamming,exp_w1_bit_length,exp_w1_hamming,gas_corrected
0,4,63,40,195,18,6357.609447
1,4,63,8,195,150,4887.609447
2,4,63,52,195,132,5697.609447
3,4,63,26,195,93,5307.609447
4,4,63,41,195,160,6597.609447
5,4,63,41,195,6,6477.609447
6,4,63,41,195,183,6027.609447
7,4,63,15,195,34,6447.609447
8,4,63,53,195,99,6267.609447
9,4,63,22,195,192,7197.609447


In [32]:
def _print_test_metrics(y_true, y_pred):
    """Print max/mean absolute error and R2 of `y_pred` against `y_true`."""
    # NOTE: the "microseconds" label is kept for continuity with recorded
    # outputs; the numbers are in the units of the target column (gas when the
    # target is "gas_corrected").
    print("Max absolute error {} microseconds".format(max_error(y_true, y_pred)))
    print("Mean absolute error {} microseconds".format(mean_absolute_error(y_true, y_pred)))
    print("R2 score = {}".format(r2_score(y_true, y_pred)))


def analyze_manual_poly(df, features_description, target, trunc_limit = 0.001, degree = 3):
    """Fit a non-negative Lasso on hand-built power features and report its accuracy.

    df: DataFrame holding every base feature column and the `target` column.
    features_description: sequence of (column_name, max_power) pairs; for each
        pair the columns "name^1" .. "name^max_power" are generated.
    target: name of the column to predict.
    trunc_limit: coefficients below this value are zeroed after fitting.
    degree: degree for PolynomialFeatures (interaction-only, no bias column).

    Prints the generated feature names, the split sizes, train/test scores,
    test metrics before and after truncation, and the resulting polynomial.
    Returns the fitted Lasso whose `coef_` has been truncated.
    """
    new_df = df.copy()
    features = []
    for name, max_power in features_description:
        for i in range(1, max_power+1):
            subname = "{}^{}".format(name, i)
            # Vectorized power: avoids a Python call per row and a lambda that
            # closes over the loop variable.
            new_df[subname] = new_df[name] ** i
            features.append(subname)

    print(features)

    poly = PolynomialFeatures(degree = degree, interaction_only=True, include_bias = False)

    train, test = split_df(new_df)

    X_train = poly.fit_transform(train[features])
    Y_train = train[target]

    # No intercept and positive=True: the model is a sum of non-negative terms.
    lin = Lasso(alpha=0.0001,precompute=True, max_iter=100000, fit_intercept=False,
                positive=True, random_state=9999, selection='random')
    lin.fit(X_train, Y_train)

    print("Intercept = {}".format(lin.intercept_))

    print("score on training set {}".format(lin.score(X_train, Y_train)))

    # Reuse the transformer fitted on the training split; never refit on test data.
    X_test = poly.transform(test[features])
    Y_test = test[target]

    print("score on test set {}".format(lin.score(X_test, Y_test)))

    print("Model accuracy before manual truncation of coefficients")
    _print_test_metrics(Y_test, lin.predict(X_test))

    # Zero every coefficient below the limit. With positive=True all
    # coefficients are >= 0, so this only removes negligible terms.
    coeffs = lin.coef_.copy()
    coeffs[coeffs < trunc_limit] = 0.0
    lin.coef_ = coeffs

    print("Truncating coefficients lower than {}".format(trunc_limit))
    print("Model accuracy after manual truncation of coefficients")
    _print_test_metrics(Y_test, lin.predict(X_test))

    pretty_print_polynomial(poly, lin, features)

    return lin

In [33]:
def analyze_mnt_final_exp(df, trunc_limit = 0.001, modulus_power = 6):
    """Fit the final-exponentiation gas model on `df`.

    The w0/w1 bit lengths and Hamming weights enter to the first power and
    "modulus_limbs" up to `modulus_power`; pairwise interactions (degree 2)
    are fitted against the "gas_corrected" column.
    Returns whatever analyze_manual_poly returns (the fitted model).
    """
    print("Fitting final exp price")

    first_power_columns = [
        "exp_w0_bit_length",
        "exp_w0_hamming",
        "exp_w1_bit_length",
        "exp_w1_hamming",
    ]
    feature_spec = [(column, 1) for column in first_power_columns]
    feature_spec.append(("modulus_limbs", modulus_power))

    return analyze_manual_poly(df, feature_spec, "gas_corrected", trunc_limit = trunc_limit, degree = 2)

In [34]:
# MNT4 fit with modulus_limbs powers up to 4; the feature list passed to
# serialize_mnt_model for MNT4 must use the same power.
mnt4_final_exp = analyze_mnt_final_exp(mnt4, trunc_limit = 0.001, modulus_power = 4)

Fitting final exp price
['exp_w0_bit_length^1', 'exp_w0_hamming^1', 'exp_w1_bit_length^1', 'exp_w1_hamming^1', 'modulus_limbs^1', 'modulus_limbs^2', 'modulus_limbs^3', 'modulus_limbs^4']
Train samples 585000, test samples 65000
Intercept = 0.0
score on training set 0.9673525597120936
score on test set 0.9672808940991768
Model accuracy before manual truncation of coefficients
Max absolute error 172717.2970761515 microseconds
Mean absolute error 11249.268585637396 microseconds
R2 score = 0.967280894099177
Truncating coefficients lower than 0.001
Model accuracy after manual truncation of coefficients
Max absolute error 172717.2970761515 microseconds
Mean absolute error 11249.268585637396 microseconds
R2 score = 0.967280894099177
1.086688 * modulus_limbs^1 + 2.768829 * exp_w0_bit_length^1 * modulus_limbs^1 + 0.328426 * exp_w0_bit_length^1 * modulus_limbs^2 + 2.232078 * exp_w1_bit_length^1 * modulus_limbs^1 + 0.377451 * exp_w1_bit_length^1 * modulus_limbs^2


In [35]:
# MNT6 fit with modulus_limbs powers up to 6; the feature list passed to
# serialize_mnt_model for MNT6 must use the same power.
mnt6_final_exp = analyze_mnt_final_exp(mnt6, trunc_limit = 0.001, modulus_power = 6)

Fitting final exp price
['exp_w0_bit_length^1', 'exp_w0_hamming^1', 'exp_w1_bit_length^1', 'exp_w1_hamming^1', 'modulus_limbs^1', 'modulus_limbs^2', 'modulus_limbs^3', 'modulus_limbs^4', 'modulus_limbs^5', 'modulus_limbs^6']
Train samples 584964, test samples 64997
Intercept = 0.0
score on training set 0.9668305266231816
score on test set 0.9668235174641858
Model accuracy before manual truncation of coefficients
Max absolute error 294616.02239582967 microseconds
Mean absolute error 23037.463194195116 microseconds
R2 score = 0.9668235174641858
Truncating coefficients lower than 0.001
Model accuracy after manual truncation of coefficients
Max absolute error 294616.02239582967 microseconds
Mean absolute error 23037.463194195116 microseconds
R2 score = 0.9668235174641858
20.435127 * modulus_limbs^1 + 2.367142 * modulus_limbs^2 + 4.984185 * exp_w0_bit_length^1 * modulus_limbs^1 + 0.645117 * exp_w0_bit_length^1 * modulus_limbs^2 + 4.822542 * exp_w1_bit_length^1 * modulus_limbs^1 + 0.654884 *

In [36]:
# Scale applied to coefficients and one-off gas values before they are rounded
# up to integers for the JSON export; also stored there under "multiplier".
model_multiplication_factor = 1000

In [37]:
import json

def process_polynomial_model(model, features_description):
    """Convert a fitted degree-2 interaction model into integer-coded terms.

    model: object exposing `coef_`, ordered like the output of
        PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
        over the expanded feature list.
    features_description: sequence of (name, max_power) pairs; each pair
        expands to the encoded variables (feature_index, 1..max_power).

    Returns a list of (coeff, factors) tuples where coeff is
    ceil(coef * model_multiplication_factor) and factors is a list of one or
    two (feature_index, power) tuples. Terms with a zero scaled coefficient
    are dropped, products of two powers of the same feature are folded into a
    single power, and identical single-factor terms are summed.
    """
    # Only 2nd-order cross terms are expected.
    encoded_variables = []
    for feature_index, (_name, max_power) in enumerate(features_description):
        encoded_variables.extend((feature_index, power) for power in range(1, max_power + 1))

    # Fit on a dummy row purely to obtain the term layout (powers_).
    poly = PolynomialFeatures(degree = 2, interaction_only=True, include_bias = False)
    poly.fit_transform([[0.0] * len(encoded_variables)])
    exponent_table = poly.powers_

    # Pass 1: scale/round each coefficient and list the variables in its term.
    raw_terms = []
    for term_idx, exponents in enumerate(exponent_table):
        scaled = math.ceil(model.coef_[term_idx] * model_multiplication_factor)
        if scaled == 0:
            continue
        factors = []
        for variable_idx, exponent in enumerate(exponents):
            if exponent == 1:
                factors.append(encoded_variables[variable_idx])
            elif exponent != 0:
                # Terms like x*x cannot occur with interaction-only features.
                assert False
        if factors:
            raw_terms.append((scaled, factors))

    # Pass 2: fold name^a * name^b (same feature index) into name^(a+b).
    # There are at most two factors per term.
    folded_terms = []
    for scaled, factors in raw_terms:
        if len(factors) >= 2 and factors[0][0] == factors[1][0]:
            merged_factor = (factors[0][0], factors[0][1] + factors[1][1])
            folded_terms.append((scaled, [merged_factor]))
        else:
            folded_terms.append((scaled, factors))

    # Pass 3: sum coefficients of identical single-factor terms.
    result_terms = []
    for scaled, factors in folded_terms:
        was_merged = False
        if len(factors) == 1:
            for position, (seen_coeff, seen_factors) in enumerate(result_terms):
                if len(seen_factors) != 1:
                    continue
                same_feature = factors[0][0] == seen_factors[0][0]
                same_power = factors[0][1] == seen_factors[0][1]
                if same_feature and same_power:
                    print("Dedup")
                    result_terms[position] = (seen_coeff + scaled, seen_factors)
                    was_merged = True
        if not was_merged:
            result_terms.append((scaled, factors))

    return result_terms

def serialize_mnt_model(one_offs, miller_model, miller_features, final_exp_model, final_exp_features, filename):
    """Write the one-off table and both polynomial models to `filename` as JSON.

    one_offs: DataFrame with "modulus_limbs" and "gas" columns; each row is
        stored as [floor(modulus_limbs), ceil(gas * model_multiplication_factor)].
    miller_model / final_exp_model: fitted models handed to process_polynomial_model.
    miller_features / final_exp_features: the matching (name, max_power) lists,
        stored alongside the processed terms.
    filename: output path, overwritten if it exists.
    """
    one_off_table = [
        [int(math.floor(row["modulus_limbs"])), math.ceil(row["gas"]*model_multiplication_factor)]
        for _, row in one_offs.iterrows()
    ]

    payload = {
        "one_off": one_off_table,
        "multiplier": model_multiplication_factor,
        "miller_features": miller_features,
        "miller": process_polynomial_model(miller_model, miller_features),
        "final_exp_features": final_exp_features,
        "final_exp": process_polynomial_model(final_exp_model, final_exp_features),
    }

    with open(filename, 'w') as outfile:
        json.dump(payload, outfile)

In [38]:
# Export the MNT4 model. The final-exp feature list repeats the one built in
# analyze_mnt_final_exp and must match the modulus_power=4 used for the fit.
serialize_mnt_model(mnt4_one_off, mnt4_model_description["model"], mnt4_model_description["features"], mnt4_final_exp,[
        ("exp_w0_bit_length", 1), 
        ("exp_w0_hamming", 1), 
        ("exp_w1_bit_length", 1),
        ("exp_w1_hamming", 1),
        ("modulus_limbs", 4)], "mnt4_model.json")

In [39]:
# Export the MNT6 model. The final-exp feature list repeats the one built in
# analyze_mnt_final_exp and must match the modulus_power=6 used for the fit.
serialize_mnt_model(mnt6_one_off, mnt6_model_description["model"], mnt6_model_description["features"], mnt6_final_exp,[
        ("exp_w0_bit_length", 1), 
        ("exp_w0_hamming", 1), 
        ("exp_w1_bit_length", 1),
        ("exp_w1_hamming", 1),
        ("modulus_limbs", 6)], "mnt6_model.json")