# Estimate Curves #
In this notebook we use data obtained by benchmarking the main solver to look for correlations between instance size, time limit, and hyperparameter values, so that the hyperparameters can be tuned to their best values.

In [None]:
import numpy as np
from matplotlib import pyplot as plt
import re
from scipy import signal
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# Wide array printing: up to 30 edge items per dimension, 400-character lines,
# and floats rendered with five significant digits.
np.set_printoptions(
    edgeitems=30,
    linewidth=400,
    formatter={"float": lambda value: "%.5g" % value},
)

### Loading csv data

In [None]:
# Benchmark CSV handed to read_csv() below. The path is relative, so it is
# resolved against the kernel's current working directory.
cost_csv_fname = "../runs/em_almostbest_cost.csv"

def read_csv(fname: str) -> "tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]":
    """Load a semicolon-separated benchmark table into numpy arrays.

    Layout, as parsed by this function:

    * Header line: ``ncols;<label 1>;...;<label ncols>``. The first field is
      the number of hyperparameter columns. The first unsigned number found
      in each label (e.g. ``0.25`` or ``10.5``) is that column's
      hyperparameter value.
    * Data lines: ``<instance>;<v 1>;...;<v ncols>;<last>``. The first run of
      digits in ``<instance>`` is stored as the node count. A value cell equal
      to ``null`` is stored as ``+inf``. ``<last>`` is the field right after
      the hyperparameter columns and is collected in ``runtimes``.

    Blank lines (for instance a trailing empty line) are skipped.

    Args:
        fname: Path of the file to read.

    Returns:
        A tuple ``(rows, nodes_num_list, hyperparam_vals, runtimes)``:
        ``rows`` holds one row of ``ncols`` floats per data line,
        ``nodes_num_list`` and ``runtimes`` hold one float per data line, and
        ``hyperparam_vals`` holds one float per hyperparameter column.

    Raises:
        IndexError: If a label or instance name contains no number, or a line
            has fewer fields than the header announces.
        ValueError: If a field cannot be converted to a number.
    """
    # Unsigned number: integer part, optional fraction, optional exponent.
    # The dot is escaped so that only a real decimal point is accepted.
    number_pattern = r'\d+(?:\.\d+)?(?:[eE][-+]?\d+)?'

    hyperparam_vals = []
    nodes_num_list = []
    rows = []
    runtimes = []
    with open(fname, 'r') as csvfile:
        firstline = csvfile.readline().strip().split(';')
        ncols = int(firstline[0])
        for i in range(ncols):
            hyperparam_vals.append(float(re.findall(number_pattern, firstline[i + 1])[0]))

        for line in csvfile:
            line = line.strip()
            if not line:
                # Tolerate empty lines instead of failing on the missing fields.
                continue
            row = line.split(';')
            nodes_num_list.append(float(re.findall(r'\d+', row[0])[0]))
            # 'null' marks a missing result; +inf keeps it out of any minimum search.
            rows.append([
                np.inf if row[j + 1] == 'null' else float(row[j + 1])
                for j in range(ncols)
            ])
            runtimes.append(float(row[ncols + 1]))

    return (
        np.array(rows),
        np.array(nodes_num_list),
        np.array(hyperparam_vals),
        np.array(runtimes),
    )

# Parse the benchmark table: cost matrix, node count per row, hyperparameter
# value per column, and the trailing per-row column.
cost_data, nNodes, param_values, runtime_data = read_csv(cost_csv_fname)

# For every row, find the column with the lowest cost and look up the
# hyperparameter value belonging to that column.
best_cost_pos = cost_data.argmin(axis=1)
best_param_values = np.take(param_values, best_cost_pos)

### Use linear regression (in log–log space) to find a good polynomial

In [None]:
# Fit log(best hyperparameter) as a polynomial in log(number of nodes).
# With degree 1 and a bias term this is a power law:
#     param ~= exp(c0) * nNodes ** c1
kernel_nNodes = np.log(nNodes)

useBias = True

poly = PolynomialFeatures(degree=1, include_bias=useBias)
poly_features = poly.fit_transform(kernel_nNodes.reshape(-1, 1))
# The constant column produced by PolynomialFeatures already acts as the
# intercept (when useBias is True), so the regressor must not add its own.
model = LinearRegression(fit_intercept=False)
model.fit(poly_features, np.log(best_param_values))
print('Model coeffs\n' + str(model.coef_))
# `predicted` is in log space because the model was fitted on log targets;
# map it back with exp before comparing against the linear-scale true values.
predicted = model.predict(poly_features)
print('Avg Pred/True ratio = ' + str(np.average(np.exp(predicted) / best_param_values)))

# Evaluate the fitted curve through the model itself, so it stays correct for
# any `degree` / `useBias` setting (for degree 1 this equals the closed form above).
xstart = np.min(nNodes)
x = np.arange(start=xstart, stop=100000, dtype=np.float32)
fx = np.exp(model.predict(poly.transform(np.log(x).reshape(-1, 1))))

fig, ax = plt.subplots(figsize=(12, 6), dpi=200)

ax.plot(x, fx, c='blue', marker='')
ax.scatter(nNodes, best_param_values, c='red', marker='.', s=5)

ax.set_xlabel('Number of Nodes')
ax.set_ylabel('Grasp Chance')

# Switch to the log axis first so the minor grid follows the log-scale minor ticks.
ax.set_xscale('log')
ax.grid(True, 'major')
ax.grid(True, 'minor', color='lightgrey', linestyle='--')