In [1]:
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
import numpy as np
import math

# Previous measurement set, kept for reference:
# file_list = [("MNT4", "./mnt4/final_exp_parallel_1500.csv"),
#             ("MNT6", "./mnt6/final_exp_parallel_1500.csv")]

# (label, csv path) pairs read by get_dfs; the resulting list keeps this order,
# so index 0 is the MNT4 frame and index 1 is the MNT6 frame.
file_list = [("MNT4", "./mnt4/final_exp_parallel_alt_2000.csv"),
            ("MNT6", "./mnt6/final_exp_parallel_alt_2000.csv")]


# Multiplier applied to ceil(run_microseconds) to obtain "gas"
# (see correct_for_parsing_and_miller).
gas_factor = 15

# Multiplier for the Miller model prediction that apply_correction subtracts.
num_pairs = 2
# Fixed values substituted by evaluate_model for the "x_bit_length" and
# "x_hamming_weight" features (get_dfs drops those columns from the data).
x_bits = 1
x_hamming = 1

def get_dfs(files):
    """Load every CSV listed in ``files`` and strip the columns not used downstream.

    ``files`` is an iterable of ``(label, path)`` pairs; the label is ignored.
    One DataFrame per entry is returned, in input order, without the columns
    ``x_is_negative``, ``x_bit_length``, ``x_hamming_weight``, ``num_pairs``
    and ``exp_w0_is_negative``. A missing column raises ``KeyError``.
    """
    unused_columns = [
        "x_is_negative",
        "x_bit_length",
        "x_hamming_weight",
        "num_pairs",
        "exp_w0_is_negative",
    ]
    return [
        pd.read_csv(csv_path).drop(columns=unused_columns)
        for (_label, csv_path) in files
    ]

In [2]:
from sklearn import linear_model
from sklearn.linear_model import Lasso
from scipy.optimize import nnls

def factor_out_final_exp(df, group_by = 3, skip_bad_fits = False):
    """Fit ``run_microseconds`` against ``num_pairs`` for each block of rows.

    ``df`` is cut into consecutive blocks of ``group_by`` rows (by position).
    Each block is fitted with a Lasso model constrained to non-negative
    coefficients. The slope is stored as ``single_pair_miller_time`` and the
    intercept as ``final_exp_time``, each next to the remaining columns of the
    block's first row.

    A block is discarded when its slope or intercept is <= 1 and, if
    ``skip_bad_fits`` is true, when the score of the fit is below 0.85.
    The lowest score seen over all blocks is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``num_pairs`` and ``run_microseconds``.
        Note that ``get_dfs`` in this notebook drops ``num_pairs``.
    group_by : int
        Number of consecutive rows per fitted block.
    skip_bad_fits : bool
        Discard blocks whose fit scores below 0.85.

    Returns
    -------
    (df_miller_loops, df_final_exps) : tuple of pandas.DataFrame
        Both are indexed by the label of the first row of every kept block.

    Raises
    ------
    KeyError
        If ``num_pairs`` or ``run_microseconds`` is missing.
    ValueError
        If two kept blocks start on rows carrying the same index label.
    """
    # Index.drop raises KeyError on a missing label, like the column drops it replaces.
    kept_columns = list(df.columns.drop(["num_pairs", "run_microseconds"]))

    def _collect(rows, value_column):
        # DataFrame.append was removed in pandas 2.0, so build the frame in one go.
        frame = pd.DataFrame(rows, columns=kept_columns + [value_column])
        if not frame.index.is_unique:
            # Same guarantee as append(..., verify_integrity=True).
            raise ValueError("Indexes have overlapping values")
        return frame

    miller_rows = []
    final_exp_rows = []
    min_score = 1.0

    for _, g in df.groupby(np.arange(len(df))//group_by):
        # Convert to NumPy before adding an axis: Series[:, np.newaxis] is no
        # longer supported by pandas.
        pairs = g["num_pairs"].to_numpy()[:, np.newaxis]
        timings = g["run_microseconds"].to_numpy()[:, np.newaxis]

        model = Lasso(alpha=0.0001,precompute=True,max_iter=1000,
            positive=True, random_state=9999, selection='random')
        model.fit(pairs, timings)

        score = model.score(pairs, timings)
        if score < min_score:
            min_score = score

        if score < 0.85 and skip_bad_fits:
            continue

        slope = model.coef_[0]
        intercept = model.intercept_[0]

        if slope <= 1 or intercept <= 1:
            continue

        first_row = g.iloc[0].drop(["run_microseconds", "num_pairs"])

        g_miller = first_row.copy()
        g_miller["single_pair_miller_time"] = slope
        miller_rows.append(g_miller)

        g_final_exp = first_row.copy()
        g_final_exp["final_exp_time"] = intercept
        final_exp_rows.append(g_final_exp)

    print("Minimal final exp fitting score = {}".format(min_score))

    df_miller_loops = _collect(miller_rows, "single_pair_miller_time")
    df_final_exps = _collect(final_exp_rows, "final_exp_time")

    return (df_miller_loops, df_final_exps)

In [3]:
from sklearn import linear_model
from sklearn.linear_model import Lasso
from scipy.optimize import nnls

def factor_out_final_exp_non_negative(df, group_by = 3, skip_bad_fits = False):
    """Fit ``run_microseconds`` against ``num_pairs`` per block with NNLS.

    ``df`` is cut into consecutive blocks of ``group_by`` rows (by position).
    Each block is solved as a non-negative least squares problem
    ``run_microseconds ~ slope * num_pairs + intercept``. The slope is stored
    as ``single_pair_miller_time`` and the intercept as ``final_exp_time``,
    each next to the remaining columns of the block's first row.

    The R^2 score of every block is computed from the NNLS residual and the
    lowest one is printed. If ``skip_bad_fits`` is true, blocks scoring below
    0.85 are discarded (same threshold as ``factor_out_final_exp``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``num_pairs`` and ``run_microseconds``.
        Note that ``get_dfs`` in this notebook drops ``num_pairs``.
    group_by : int
        Number of consecutive rows per fitted block.
    skip_bad_fits : bool
        Discard blocks whose fit scores below 0.85.

    Returns
    -------
    (df_miller_loops, df_final_exps) : tuple of pandas.DataFrame
        Both are indexed by the label of the first row of every kept block.

    Raises
    ------
    KeyError
        If ``num_pairs`` or ``run_microseconds`` is missing.
    ValueError
        If two kept blocks start on rows carrying the same index label.
    """
    # Index.drop raises KeyError on a missing label, like the column drops it replaces.
    kept_columns = list(df.columns.drop(["num_pairs", "run_microseconds"]))

    def _collect(rows, value_column):
        # DataFrame.append was removed in pandas 2.0, so build the frame in one go.
        frame = pd.DataFrame(rows, columns=kept_columns + [value_column])
        if not frame.index.is_unique:
            # Same guarantee as append(..., verify_integrity=True).
            raise ValueError("Indexes have overlapping values")
        return frame

    miller_rows = []
    final_exp_rows = []
    min_score = 1.0

    for _, g in df.groupby(np.arange(len(df))//group_by):
        pairs = g["num_pairs"].to_numpy(dtype=float)
        timings = g["run_microseconds"].to_numpy(dtype=float)

        # Columns of the design matrix: [num_pairs, constant term].
        design = np.column_stack([pairs, np.ones_like(pairs)])
        solution, residual_norm = nnls(design, timings)

        # R^2 = 1 - SS_res / SS_tot; nnls returns sqrt(SS_res).
        ss_res = float(residual_norm) ** 2
        ss_tot = float(np.sum((timings - timings.mean()) ** 2))
        if ss_tot > 0.0:
            score = 1.0 - ss_res / ss_tot
        else:
            # Constant target: the fit is either exact or not explained at all.
            score = 1.0 if np.isclose(ss_res, 0.0) else 0.0

        if score < min_score:
            min_score = score

        if score < 0.85 and skip_bad_fits:
            continue

        first_row = g.iloc[0].drop(["run_microseconds", "num_pairs"])

        g_miller = first_row.copy()
        g_miller["single_pair_miller_time"] = solution[0]
        miller_rows.append(g_miller)

        g_final_exp = first_row.copy()
        g_final_exp["final_exp_time"] = solution[1]
        final_exp_rows.append(g_final_exp)

    print("Minimal final exp fitting score = {}".format(min_score))

    df_miller_loops = _collect(miller_rows, "single_pair_miller_time")
    df_final_exps = _collect(final_exp_rows, "final_exp_time")

    return (df_miller_loops, df_final_exps)

In [4]:
# One frame per entry of file_list, in the same order (0: MNT4, 1: MNT6).
dataframes = get_dfs(file_list)

In [5]:
# Preview of the MNT4 measurements (first rows, at most 15).
dataframes[0].head(15)

Unnamed: 0,modulus_limbs,group_limbs,exp_w0_bit_length,exp_w0_hamming,exp_w1_bit_length,exp_w1_hamming,run_microseconds
0,4,1,342,322,120,34,892
1,4,1,342,176,120,2,872
2,4,1,342,254,120,33,951
3,4,1,342,103,120,19,793
4,4,1,342,275,120,70,884
5,4,1,342,163,120,5,995
6,4,1,342,209,120,78,828
7,4,1,342,148,120,106,843
8,4,1,342,177,120,18,821
9,4,1,342,337,120,55,823


In [6]:
# Preview of the MNT6 measurements (first rows, at most 15).
dataframes[1].head(15)

Unnamed: 0,modulus_limbs,group_limbs,exp_w0_bit_length,exp_w0_hamming,exp_w1_bit_length,exp_w1_hamming,run_microseconds
0,4,1,191,114,1792,488,2513
1,4,1,191,163,1792,52,2397
2,4,1,191,139,1792,1304,2475
3,4,1,191,61,1792,1535,2465
4,4,1,191,162,1792,792,2433
5,4,1,191,79,1792,600,2351
6,4,1,191,105,1792,264,2306
7,4,1,191,65,1792,1666,2291
8,4,1,191,160,1792,99,2410
9,4,1,191,72,1792,967,2349


In [7]:
from sklearn.model_selection import train_test_split

def split_df(df):
    """Split ``df`` into a training part and a 10% test part.

    The split uses a fixed ``random_state`` of 42. The sizes of both parts
    are printed, and ``(train, test)`` is returned.
    """
    train_part, test_part = train_test_split(df, test_size=0.10, random_state=42)

    sizes_message = "Train samples {}, test samples {}".format(len(train_part), len(test_part))
    print(sizes_message)

    return (train_part, test_part)

In [8]:
from sklearn import linear_model
from sklearn.linear_model import Lasso

from sklearn.metrics import max_error, mean_absolute_error, r2_score

def pretty_print_polynomial(poly, model, variable_names):
    """Print the fitted polynomial as ``c1 * a * b + c2 * c^2 + ...``.

    ``poly.powers_`` supplies one row of exponents per term, ``model.coef_``
    the coefficient of each term and ``variable_names`` the label of each
    input variable. Terms with a coefficient equal to zero are omitted;
    coefficients are rounded to six decimals.
    """
    rendered_terms = []

    for row_idx, exponents in enumerate(poly.powers_):
        weight = model.coef_[row_idx]
        if weight == 0:
            continue

        # The rounded coefficient always leads the product.
        factors = ["{}".format(np.around(weight, decimals=6))]
        for col_idx, exponent in enumerate(exponents):
            if exponent == 0:
                continue
            label = variable_names[col_idx]
            if exponent == 1:
                factors.append('{}'.format(label))
            else:
                factors.append('{}^{}'.format(label, exponent))

        rendered_terms.append(" * ".join(factors))

    print(" + ".join(rendered_terms))

In [9]:
# MNT4 one-off table: a "gas" value per "modulus_limbs"; apply_correction
# subtracts the entry matching each measurement's modulus_limbs.
mnt4_one_off = pd.read_csv("./mnt4/one_off_results.csv")

mnt4_one_off.head(15)

Unnamed: 0,modulus_limbs,gas
0,4,1455
1,5,2250
2,6,3135
3,7,4515
4,8,6435
5,9,8625
6,10,11145
7,11,14130
8,12,18210
9,13,22410


In [10]:
# MNT6 one-off table: a "gas" value per "modulus_limbs"; apply_correction
# subtracts the entry matching each measurement's modulus_limbs.
mnt6_one_off = pd.read_csv("./mnt6/one_off_results.csv")

mnt6_one_off.head(15)

Unnamed: 0,modulus_limbs,gas
0,4,2085
1,5,3855
2,6,4815
3,7,7290
4,8,9855
5,9,14070
6,10,18255
7,11,22095
8,12,29310
9,13,34710


In [11]:
from joblib import load

# Previously saved model descriptions; evaluate_model reads their "model" and
# "features" entries.
# NOTE: joblib.load is pickle-based and can execute arbitrary code, so only
# load files that come from a trusted source.
mnt4_model_description = load("mnt4_miller.joblib")
mnt6_model_description = load("mnt6_miller.joblib")

In [12]:
def evaluate_model(x, model_description, degree = 2):
    """Predict a single value from a stored model description.

    ``model_description["features"]`` is a list of ``(name, max_power)``
    pairs. For every pair the powers ``1..max_power`` of the feature value
    are placed in a one-row frame under the column ``"<name>^<power>"``.
    The value is taken from ``x[name]``, except for ``x_bit_length`` and
    ``x_hamming_weight``, which use the module-level ``x_bits`` and
    ``x_hamming``. The row is expanded with interaction-only polynomial
    features of the given ``degree`` and passed to
    ``model_description["model"].predict``; the first prediction is returned.
    """
    fitted_model = model_description["model"]
    feature_specs = model_description["features"]

    def base_value(feature_name):
        # The globals are looked up only when the corresponding feature occurs.
        if feature_name == "x_bit_length":
            return x_bits
        if feature_name == "x_hamming_weight":
            return x_hamming
        return x[feature_name]

    columns = {}
    for feature_name, highest_power in feature_specs:
        for exponent in range(1, highest_power + 1):
            column_label = "{}^{}".format(feature_name, exponent)
            columns[column_label] = [base_value(feature_name) ** exponent]

    single_row = pd.DataFrame(columns)

    expander = PolynomialFeatures(degree = degree, interaction_only=True, include_bias = False)
    predictions = fitted_model.predict(expander.fit_transform(single_row))

    return predictions[0]

In [13]:
def apply_correction(x, one_offs, miller_model_description):
    """Return the gas of row ``x`` minus the one-off and Miller model parts.

    The one-off part is the ``gas`` of the first ``one_offs`` row whose
    ``modulus_limbs`` equals ``x["modulus_limbs"]``. The Miller part is
    ``num_pairs`` times the prediction of ``evaluate_model`` for ``x``.
    """
    measured_gas = x["gas"]

    same_limbs = one_offs["modulus_limbs"] == x["modulus_limbs"]
    one_off_gas = one_offs[same_limbs]["gas"].array[0]
    after_one_off = measured_gas - one_off_gas

    miller_gas = num_pairs * evaluate_model(x, miller_model_description)

    return after_one_off - miller_gas
    

In [14]:
def correct_for_parsing_and_miller(df, one_offs, miller_model_description):
    """Return a copy of ``df`` with run time replaced by a ``gas_corrected`` column.

    Gas is ``gas_factor * ceil(run_microseconds)``. ``apply_correction`` is
    applied to every row to subtract the one-off and Miller model parts. The
    helper columns ``run_microseconds``, ``group_limbs`` and ``gas`` are not
    part of the result. ``df`` itself is left untouched.
    """
    priced = df.copy()
    priced["gas"] = priced["run_microseconds"].apply(
        lambda micros: gas_factor * math.ceil(micros)
    )

    def residual_gas(row):
        return apply_correction(row, one_offs, miller_model_description)

    priced["gas_corrected"] = priced.apply(residual_gas, axis = 1)

    return priced.drop(columns=["run_microseconds", "group_limbs", "gas"])

In [15]:
# MNT4 measurements with the one-off and Miller model gas subtracted.
mnt4 = correct_for_parsing_and_miller(dataframes[0], mnt4_one_off, mnt4_model_description)
mnt4.head(15)

Unnamed: 0,modulus_limbs,exp_w0_bit_length,exp_w0_hamming,exp_w1_bit_length,exp_w1_hamming,gas_corrected
0,4,342,322,120,34,3744.8856
1,4,342,176,120,2,3444.8856
2,4,342,254,120,33,4629.8856
3,4,342,103,120,19,2259.8856
4,4,342,275,120,70,3624.8856
5,4,342,163,120,5,5289.8856
6,4,342,209,120,78,2784.8856
7,4,342,148,120,106,3009.8856
8,4,342,177,120,18,2679.8856
9,4,342,337,120,55,2709.8856


In [None]:
# MNT6 measurements with the one-off and Miller model gas subtracted.
mnt6 = correct_for_parsing_and_miller(dataframes[1], mnt6_one_off, mnt6_model_description)
mnt6.head(15)

In [None]:
def analyze_manual_poly(df, features_description, target, trunc_limit = 0.001, degree = 3):
    """Fit a non-negative Lasso model on interaction features and report its accuracy.

    For every ``(name, max_power)`` in ``features_description`` the columns
    ``"<name>^1" .. "<name>^<max_power>"`` are added to a copy of ``df``.
    These columns are expanded with interaction-only polynomial features of
    the given ``degree``, and a Lasso model without intercept and with
    non-negative coefficients is fitted on the training part of ``split_df``.

    Accuracy on the test part is printed twice: for the fitted model, and
    again after every coefficient below ``trunc_limit`` has been set to zero.
    The printed labels say "microseconds" whatever the unit of ``target`` is.
    Finally the truncated polynomial is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; not modified.
    features_description : list of (str, int)
        Column names and the highest power generated for each.
    target : str
        Name of the column to predict.
    trunc_limit : float
        Coefficients strictly below this value are zeroed.
    degree : int
        Degree passed to ``PolynomialFeatures``.

    Returns
    -------
    The fitted Lasso model, carrying the truncated coefficients.
    """
    new_df = df.copy()
    features = []
    for name, max_power in features_description:
        for i in range(1, max_power+1):
            subname = "{}^{}".format(name, i)
            # Bind the exponent explicitly so the lambda does not depend on the loop variable.
            new_df[subname] = new_df[name].apply(lambda x, power=i: x**power)
            features.append(subname)

    print(features)

    poly = PolynomialFeatures(degree = degree, interaction_only=True, include_bias = False)

    train, test = split_df(new_df)

    X_train = poly.fit_transform(train[features])
    Y_train = train[target]

    lin = Lasso(alpha=0.0001,precompute=True, max_iter=100000, fit_intercept=False,
                positive=True, random_state=9999, selection='random')
    lin.fit(X_train, Y_train)

    print("Intercept = {}".format(lin.intercept_))

    print("score on training set {}".format(lin.score(X_train, Y_train)))

    # Reuse the transformer fitted on the training data; the test set must
    # never be used to fit a preprocessing step.
    X_test = poly.transform(test[features])
    Y_test = test[target]

    print("score on test set {}".format(lin.score(X_test, Y_test)))

    def _print_test_metrics():
        # Accuracy of the model's current coefficients on the held-out rows.
        y_pred = lin.predict(X_test)
        print("Max absolute error {} microseconds".format(max_error(Y_test, y_pred)))
        print("Mean absolute error {} microseconds".format(mean_absolute_error(Y_test, y_pred)))
        print("R2 score = {}".format(r2_score(Y_test, y_pred)))

    print("Model accuracy before manual truncation of coefficients")
    _print_test_metrics()

    # Zero out negligible coefficients on a copy, then install them on the model.
    coeffs = lin.coef_.copy()
    coeffs[coeffs < trunc_limit] = 0.0
    lin.coef_ = coeffs

    print("Truncating coefficients lower than {}".format(trunc_limit))
    print("Model accuracy after manual truncation of coefficients")
    _print_test_metrics()

    pretty_print_polynomial(poly, lin, features)

    return lin

In [None]:
def analyze_mnt_final_exp(df, trunc_limit = 0.001, modulus_power = 6):
    """Fit the ``gas_corrected`` column of ``df`` with ``analyze_manual_poly``.

    The four ``exp_w*`` columns enter at power 1 and ``modulus_limbs`` at
    powers ``1..modulus_power``; interactions are limited to degree 2.
    The fitted model is returned.
    """
    print("Fitting final exp price")

    feature_specs = [
        ("exp_w0_bit_length", 1),
        ("exp_w0_hamming", 1),
        ("exp_w1_bit_length", 1),
        ("exp_w1_hamming", 1),
        ("modulus_limbs", modulus_power),
    ]

    return analyze_manual_poly(
        df, feature_specs, "gas_corrected", trunc_limit = trunc_limit, degree = 2
    )

In [None]:
# MNT4 fit: modulus_limbs enters with powers 1..4.
mnt4_final_exp = analyze_mnt_final_exp(mnt4, trunc_limit = 0.001, modulus_power = 4)

In [None]:
# MNT6 fit: modulus_limbs enters with powers 1..6.
mnt6_final_exp = analyze_mnt_final_exp(mnt6, trunc_limit = 0.001, modulus_power = 6)

In [None]:
# Scale applied to coefficients and one-off gas values before they are rounded
# up to integers for serialization; written to the JSON as "multiplier".
model_multiplication_factor = 1000

In [None]:
import json

def process_polynomial_model(model, features_description):
    """Turn a fitted degree-2 interaction model into integer-coded terms.

    Every ``(name, max_power)`` in ``features_description`` is unrolled into
    the variables ``(feature_index, power)`` for ``power`` in
    ``1..max_power``, matching the column layout the model was trained on.
    Each coefficient is multiplied by ``model_multiplication_factor`` and
    rounded up; terms whose scaled coefficient is zero are skipped.

    A product of two powers of the same feature is merged into a single
    power, and single-variable terms that end up identical have their
    coefficients summed ("Dedup" is printed for each such merge).

    Returns a list of ``(coefficient, [(feature_index, power), ...])``.
    """
#     expect only 2nd order cross terms
    encoded_variables = [
        (feature_index, power)
        for feature_index, (_name, max_power) in enumerate(features_description)
        for power in range(1, max_power + 1)
    ]

    poly = PolynomialFeatures(degree = 2, interaction_only=True, include_bias = False)

    # Fitting on a dummy row is only needed to populate poly.powers_.
    poly.fit_transform([[0.0] * len(encoded_variables)])

    unrolled = []
    for term_idx, exponents in enumerate(poly.powers_):
        scaled = math.ceil(model.coef_[term_idx] * model_multiplication_factor)
        if scaled == 0:
            continue
        factors = []
        for variable_idx, exponent in enumerate(exponents):
            if exponent == 0:
                continue
#             we do not expect terms like x*x due to features structure
            assert exponent == 1
            factors.append(encoded_variables[variable_idx])
        if factors:
            unrolled.append((scaled, factors))

    # A term has at most two factors; two powers of one feature collapse
    # into a single, higher power.
    compressed = []
    for scaled, factors in unrolled:
        if len(factors) >= 2 and factors[0][0] == factors[1][0]:
            merged_factor = (factors[0][0], factors[0][1] + factors[1][1])
            compressed.append((scaled, [merged_factor]))
        else:
            compressed.append((scaled, factors))

    deduped_terms = []
    for scaled, factors in compressed:
        merged = False
        if len(factors) == 1:
            for position, (known_coeff, known_factors) in enumerate(deduped_terms):
                if len(known_factors) != 1:
                    continue
                if factors[0][0] == known_factors[0][0] and factors[0][1] == known_factors[0][1]:
                    print("Dedup")
                    deduped_terms[position] = (known_coeff + scaled, known_factors)
                    merged = True
        if not merged:
            deduped_terms.append((scaled, factors))

    return deduped_terms

def serialize_mnt_model(one_offs, miller_model, miller_features, final_exp_model, final_exp_features, filename):
    """Write the one-off table and both polynomial models to ``filename`` as JSON.

    The JSON object holds ``one_off`` (pairs of ``modulus_limbs`` and gas
    scaled by ``model_multiplication_factor`` and rounded up),
    ``multiplier``, the two feature descriptions and the two models as
    produced by ``process_polynomial_model``.
    """
    one_off_table = [
        [int(math.floor(row["modulus_limbs"])), math.ceil(row["gas"]*model_multiplication_factor)]
        for (_index, row) in one_offs.iterrows()
    ]

    result = {
        "one_off": one_off_table,
        "multiplier": model_multiplication_factor,
        "miller_features": miller_features,
        "miller": process_polynomial_model(miller_model, miller_features),
        "final_exp_features": final_exp_features,
        "final_exp": process_polynomial_model(final_exp_model, final_exp_features),
    }

    with open(filename, 'w') as outfile:
        json.dump(result, outfile)

In [None]:
# The feature list must stay identical to the one analyze_mnt_final_exp used
# for mnt4_final_exp (modulus_power = 4), otherwise the coefficients are
# matched to the wrong terms.
serialize_mnt_model(mnt4_one_off, mnt4_model_description["model"], mnt4_model_description["features"], mnt4_final_exp,[
        ("exp_w0_bit_length", 1), 
        ("exp_w0_hamming", 1), 
        ("exp_w1_bit_length", 1),
        ("exp_w1_hamming", 1),
        ("modulus_limbs", 4)], "mnt4_model.json")

In [None]:
# The feature list must stay identical to the one analyze_mnt_final_exp used
# for mnt6_final_exp (modulus_power = 6), otherwise the coefficients are
# matched to the wrong terms.
serialize_mnt_model(mnt6_one_off, mnt6_model_description["model"], mnt6_model_description["features"], mnt6_final_exp,[
        ("exp_w0_bit_length", 1), 
        ("exp_w0_hamming", 1), 
        ("exp_w1_bit_length", 1),
        ("exp_w1_hamming", 1),
        ("modulus_limbs", 6)], "mnt6_model.json")