## Predicting SAP Sales & Distribution Benchmark Results from cint_rate_base2006 Results
Simply read the pickled model back from disk and make some predictions.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import validation_curve
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn import ensemble, neural_network
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import pickle

# Show full column contents when displaying frames and turn off pandas'
# chained-assignment warning.
pd.options.display.max_colwidth = None
pd.options.mode.chained_assignment = None

# File holding the pickled model used for every prediction below.
pkl_filename = "cint_rate_base2006_to_saps.pkl"

# NOTE: unpickling can execute arbitrary code; only load a file you trust.
with open(pkl_filename, mode='rb') as pkl_file:
    model = pickle.load(pkl_file)

# This is the prediction entry point; if predictions are all you need, you can stop right here.
def spec2saps(spec: float) -> float:
    """Return the SAPS value the loaded model predicts for *spec*.

    *spec* is a single cint_rate_base2006 result. It is passed to the
    module-level ``model`` as a 1x1 feature matrix, and the first (only)
    prediction is rounded to the nearest ten.
    """
    features = np.array([[spec]])  # one sample, one feature
    prediction = model.predict(features)[0]
    return round(prediction, -1)

## Analysis

Let's compare the polynomial fit with the ensemble model. Over the (small) manual validation set, the ensemble model provides qualitatively better results than the polynomial fit, except for the single very large sample (M10-4S). Above degree=3 the polynomial fit starts overfitting.

In [None]:
# Generate sample data: model predictions for cint_rate_base2006 values
# 0, 5, 10, ..., 9995.
# warning: this will take a long time (one model.predict call per sample)!
cols = ['cint_rate_base2006', 'SAPS']

# Collect all rows first and build the frame once. DataFrame.append was
# removed in pandas 2.0, and calling it in a loop copied the whole frame on
# every iteration. Building the frame in one go also gives each row its own
# index label instead of a repeated 0.
data = [[c, spec2saps(c)] for c in range(0, 10000, 5)]
df = pd.DataFrame(data, columns=cols)

df

In [None]:
# Feature (first column) and target (second column) as (n_samples, 1) arrays.
X = df.iloc[:, 0].to_numpy().reshape(-1, 1)
y = df.iloc[:, 1].to_numpy().reshape(-1, 1)

# Evenly spaced x values spanning the sampled range, one per row of df,
# used later to draw the fitted curve.
X_seq = np.linspace(X.min(), X.max(), num=len(df))[:, np.newaxis]

def PolynomialRegression(degree=2, **kwargs):
    """Return a pipeline: polynomial feature expansion, then linear regression.

    *degree* is passed to ``PolynomialFeatures``; any extra keyword
    arguments are forwarded to ``LinearRegression``.
    """
    expansion = PolynomialFeatures(degree)
    regressor = LinearRegression(**kwargs)
    return make_pipeline(expansion, regressor)

# Score polynomial degrees 0..9 with 7-fold cross-validation.
degree = np.arange(0, 10)
# param_name and param_range must be passed by keyword: they are keyword-only
# in current scikit-learn releases, so the positional form raises a TypeError.
train_score, val_score = validation_curve(PolynomialRegression(), X, y,
                                          param_name='polynomialfeatures__degree',
                                          param_range=degree, cv=7)

# Plot the median score along axis 1 of the returned score arrays, per degree.
plt.plot(degree, np.median(train_score, 1), color='blue', label='training score')
plt.plot(degree, np.median(val_score, 1), color='red', label='validation score')
plt.legend(loc='best')
plt.ylim(0, 1)
plt.xlabel('degree')
plt.ylabel('score');

In [None]:
# Fit a polynomial of the chosen degree to the sampled predictions.
degree = 3

polyreg = make_pipeline(PolynomialFeatures(degree), LinearRegression())
polyreg.fit(X, y)

# Scatter the samples and overlay the fitted curve evaluated on X_seq.
y_fit = polyreg.predict(X_seq)
plt.figure()
plt.scatter(X, y)
plt.plot(X_seq, y_fit, color="black")
plt.title(f"Polynomial regression with degree {degree}")
plt.show()

# Print the coefficients of the same-degree fit as computed by numpy
# (np.polyfit returns them highest power first).
c = np.polyfit(df['cint_rate_base2006'], df['SAPS'], degree)
print(c)

The plot above makes it clear that the estimate starts to fall apart beyond a cint_rate_base2006 of ~5000.

In [None]:
# Reference system: Fujitsu M10-4S, 836550 SAPS at 13625.00 cint_rate_base2006.
print(spec2saps(13625))

In [None]:
# Reference system: Sun Fire V490 (UltraSPARC IV), 6750 SAPS at 71.70 cint_rate_base2006.
print(spec2saps(71.70))

In [None]:
# Reference system: Intel Xeon 7140M, 10380 SAPS at 76.9 cint_rate_base2006.
print(spec2saps(76.9))

In [None]:
# Reference system: Sun M9000 (2.88GHz), 175600 SAPS at 2400 cint_rate_base2006.
print(spec2saps(2400))

In [None]:
# Reference system: Sun M3000 (2.52GHz), 4130 SAPS at 25.7 cint_rate_base2006.
print(spec2saps(25.7))

In [None]:
# Reference system: IBM POWER 730, 47600 SAPS at 515 cint_rate_base2006.
# The error is large here because the model is mostly influenced by
# Intel Xeon data points.
print(spec2saps(515))

In [None]:
# An Intel Xeon data point, which gives a good fit.
# Reference system: Cisco UCS C260 M2 (Intel Xeon E7-2870, 2.40 GHz), 36600 SAPS at 526 cint_rate_base2006.
print(spec2saps(526))

In [None]:
# Another Intel Xeon data point, this time a rather large system.
# Reference system: Cisco UCS B200 M5 (Intel Xeon Platinum 8276, 2.20GHz), 131170 SAPS at 2868.71 cint_rate_base2006.
print(spec2saps(2868.71))

In [None]:
# Reference system: Sun Fire X4200, 2800 SAPS at 23.1 cint_rate_base2006.
print(spec2saps(23.1))