# ML2CPP

## Preparing the dataset

In [1]:
from sklearn import datasets
import numpy as np
import pandas as pd

def populate_table(tablename, feature_names):
    """Dump the iris dataset to a CSV file.

    The written frame has an ``idx`` column (row number), one column per
    entry of ``feature_names`` (in order) and a ``TGT`` column with the
    class label. The DataFrame index is written as well, as an unnamed
    leading column of the CSV.

    Parameters
    ----------
    tablename : str
        Path of the CSV file to create or overwrite.
    feature_names : list of str
        Names given to the feature columns; needs one name per column of
        ``iris.data`` and must be a list (it is concatenated with lists).
    """
    bunch = datasets.load_iris()
    features = bunch.data
    n_rows = features.shape[0]
    row_ids = np.arange(n_rows).reshape(n_rows, 1)
    labels = bunch.target.reshape(n_rows, 1)
    # The concatenated array has a single dtype, so the idx and TGT columns
    # are cast back to int once they are in the frame.
    stacked = np.concatenate((row_ids, features, labels), axis=1)
    frame = pd.DataFrame(stacked)
    frame.columns = ['idx'] + feature_names + ['TGT']
    for int_column in ('TGT', 'idx'):
        frame[int_column] = frame[int_column].apply(int)
    frame.to_csv(tablename, float_format='%.14g')



In [2]:
# Description of the iris table used by the cells below.
# NOTE(review): "primary_key" is "KEY", but populate_table names the key
# column 'idx'; this entry is not read by any visible cell - confirm the
# intended value.
metadata = {
    "primary_key": "KEY",
    "features": [
        'sepal_length_cm',
        'sepal_width_cm',
        'petal_length_cm',
        'petal_width_cm',
    ],
    "targets": ["TGT"],
    "table": "iris",
}

In [3]:
# Write the iris table to disk using the feature column names from `metadata`.
populate_table("/tmp/iris.csv" , metadata["features"])


In [4]:
# Read the CSV back. populate_table wrote the DataFrame index too, so the
# file's unnamed first column is loaded as an extra "Unnamed: 0" column.
df = pd.read_csv("/tmp/iris.csv")
# Fixed random_state so the displayed sample is the same on every run.
df.sample(12, random_state=1960)

Unnamed: 0.1,Unnamed: 0,idx,sepal_length_cm,sepal_width_cm,petal_length_cm,petal_width_cm,TGT
114,114,114,5.8,2.8,5.1,2.4,2
74,74,74,6.4,2.9,4.3,1.3,1
9,9,9,4.9,3.1,1.5,0.1,0
88,88,88,5.6,3.0,4.1,1.3,1
25,25,25,5.0,3.0,1.6,0.2,0
5,5,5,5.4,3.9,1.7,0.4,0
48,48,48,5.3,3.7,1.5,0.2,0
117,117,117,7.7,3.8,6.7,2.2,2
83,83,83,6.0,2.7,5.1,1.6,1
105,105,105,7.6,3.0,6.6,2.1,2


## Training a Model

In [5]:


# train any scikit model on the iris dataset
# Here the "model" is a degree-3 PolynomialFeatures preprocessing step fitted
# on the four feature columns listed in `metadata`.
# NOTE(review): the target column is passed to fit() as well; presumably this
# transformer does not use it - confirm.
from sklearn.preprocessing import PolynomialFeatures
clf = PolynomialFeatures(degree=3)
clf.fit(df[metadata['features']].values, df[metadata['targets']].values)


PolynomialFeatures(degree=3)

In [6]:
# Show the polynomial degree configured on the fitted transformer.
clf.degree

3

## Deploying the Model

In [7]:

def generate_cpp_for_model(model, feature_names=None, timeout=120):
    """Ask the sklearn2sql web service to translate a fitted model into C++.

    The model is pickled, base64-encoded and POSTed to the service, so the
    serialized model leaves this machine.

    Parameters
    ----------
    model : object
        Fitted, picklable scikit-learn object.
    feature_names : sequence of str, optional
        Input feature names sent as "FeatureNames". Defaults to the
        notebook-level ``metadata['features']``.
    timeout : float, optional
        Seconds to wait for the service before giving up. Without a timeout
        ``requests`` waits indefinitely.

    Returns
    -------
    str
        The generated code, read from the "SQL" field of the first
        generation result in the response.

    Raises
    ------
    requests.exceptions.HTTPError
        If the service answers with an HTTP error status.
    requests.exceptions.Timeout
        If the service does not answer within ``timeout`` seconds.
    """
    import pickle, requests, base64
    if feature_names is None:
        feature_names = metadata['features']
    b64_data = base64.b64encode(pickle.dumps(model)).decode('utf-8')
    # send the model to the web service
    json_data = {"Name": "model_cpp_sample",
                 "PickleData": b64_data,
                 "SQLDialect": "CPP",
                 "FeatureNames": list(feature_names)}
    r = requests.post("https://sklearn2sql.herokuapp.com/model",
                      json=json_data, timeout=timeout)
    # Fail on an HTTP error here rather than on a confusing JSON/Key error below.
    r.raise_for_status()
    content = r.json()
    # "SQLGenrationResult" (sic) is the key spelling this code has always read.
    lCPP = content["model"]["SQLGenrationResult"][0]["SQL"]
    return lCPP


# Request the C++ translation of the fitted transformer (network call).
lCPPCode = generate_cpp_for_model(clf);


In [8]:
# Show the generated C++ source returned by the web service.
print(lCPPCode)

namespace  {

	std::vector<std::string> get_input_names(){
		std::vector<std::string> lFeatures = { "Feature_0", "Feature_1", "Feature_2", "Feature_3" };

		return lFeatures;
	}

	std::vector<std::string> get_output_names(){
		std::vector<std::string> lOutputs = { "CST", "Feature_0__1", "Feature_1__1", "Feature_2__1", "Feature_3__1", "Feature_0__2", "Feature_0__1_mul_Feature_1__1", "Feature_0__1_mul_Feature_2__1", "Feature_0__1_mul_Feature_3__1", "Feature_1__2", "Feature_1__1_mul_Feature_2__1", "Feature_1__1_mul_Feature_3__1", "Feature_2__2", "Feature_2__1_mul_Feature_3__1", "Feature_3__2", "Feature_0__3", "Feature_0__2_mul_Feature_1__1", "Feature_0__2_mul_Feature_2__1", "Feature_0__2_mul_Feature_3__1", "Feature_0__1_mul_Feature_1__2", "Feature_0__1_mul_Feature_1__1_mul_Feature_2__1", "Feature_0__1_mul_Feature_1__1_mul_Feature_3__1", "Feature_0__1_mul_Feature_2__2", "Feature_0__1_mul_Feature_2__1_mul_Feature_3__1", "Feature_0__1_mul_Feature_3__2", "Feature_1__3", "Feature_1__2_mul_Feat

In [9]:
    def write_text_to_file(iCPPCode, oCPPFile):
        """Write the string ``iCPPCode`` to the file ``oCPPFile``.

        The file is opened in text write mode, so existing content is
        replaced.
        """
        with open(oCPPFile, "w") as lOutputFile:
            lOutputFile.write(iCPPCode)

    def add_cpp_main_function(iCPPCode, iCSVFile):
        """Wrap generated model code into a complete C++ program.

        The result is an ``#include "Generic.i"`` line, then ``iCPPCode``
        unchanged, then a ``main`` that calls ``score_csv_file`` on
        ``iCSVFile`` (``score_csv_file`` is presumably provided by
        ``Generic.i``).

        Parameters
        ----------
        iCPPCode : str
            C++ source to embed before ``main``.
        iCSVFile : str
            Path of the CSV file to score; embedded as a C++ string literal.

        Returns
        -------
        str
            The full C++ source text.
        """
        # Escape backslashes and double quotes so that any path (for example
        # a Windows path) still forms a valid C++ string literal.
        lEscapedFile = iCSVFile.replace("\\", "\\\\").replace("\"", "\\\"")
        lParts = [
            "#include \"Generic.i\"\n\n",
            iCPPCode,
            "\tint main() {\n",
            "\t\tscore_csv_file(\"" + lEscapedFile + "\");\n",
            "\treturn 0;\n}\n",
        ]
        return "".join(lParts)

    def compile_cpp_code_as_executable(iName):
        """Compile ``iName + ".cpp"`` into ``iName + ".exe"`` with g++.

        The command line is printed before it runs. On a failed build the
        captured compiler output is printed and the error is re-raised.

        Parameters
        ----------
        iName : str
            Path of the source file without its ``.cpp`` extension.

        Raises
        ------
        subprocess.CalledProcessError
            If g++ exits with a non-zero status.
        """
        import subprocess
        lCommand = ["g++", "-Wall", "-Wno-unused-function", "-std=c++17" , "-g" ,  "-o", iName + ".exe",  iName + ".cpp"]
        print("EXECUTING" , "'" + " ".join(lCommand) + "'")
        try:
            # Merge stderr into the captured output: compiler diagnostics are
            # written there and would otherwise never reach the notebook.
            subprocess.check_output(lCommand, stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError as lError:
            print(lError.output.decode(errors="replace"))
            raise

    def execute_cpp_model(iName, iCSVFile):
        """Run ``iName + ".exe"`` with ``iCSVFile`` as its only argument.

        Prints the first and the last 1000 characters of the program's
        standard output and returns the whole output as decoded text.
        """
        import subprocess
        lArguments = [iName + ".exe", iCSVFile]
        lText = subprocess.check_output(lArguments).decode()
        # Show only the head and the tail; the full text is returned.
        for lExcerpt in (lText[:1000], lText[-1000:]):
            print(lExcerpt)
        return lText
        
    def execute_cpp_code(iCPPCode, iCSVFile):
        """Build, run and capture the output of generated C++ model code.

        Adds a ``main`` that scores ``iCSVFile``, writes the source to a
        scratch ``.cpp`` file under ``/tmp``, compiles it, runs the resulting
        executable on ``iCSVFile`` and stores its standard output in a
        ``.out`` file next to it.

        Parameters
        ----------
        iCPPCode : str
            Generated C++ code, without a ``main`` function.
        iCSVFile : str
            Path of the CSV file to score.

        Returns
        -------
        str
            Path of the ``.out`` file holding the program's output.
        """
        # Name the scratch files after the code string being compiled rather
        # than the notebook-global `clf`, so the function does not depend on
        # state defined in another cell.
        lName = "/tmp/sklearn2sql_cpp_" + str(id(iCPPCode))
        lCPPCode = add_cpp_main_function(iCPPCode, iCSVFile)
        write_text_to_file(lCPPCode, lName + ".cpp")
        compile_cpp_code_as_executable(lName)
        lOutput = execute_cpp_model(lName, iCSVFile)
        write_text_to_file(str(lOutput), lName + ".out")
        return lName + ".out"


In [10]:
# Write a second copy of the iris table whose feature columns are named
# Feature_0..Feature_3, the input names listed by the generated code's
# get_input_names().
populate_table("/tmp/iris2.csv" , ["Feature_0", "Feature_1", "Feature_2", "Feature_3"])
# Compile and run the generated C++ on that file; the call returns the path
# of the file holding the program's output, which is then parsed as CSV.
lCPPOutput = execute_cpp_code(lCPPCode , "/tmp/iris2.csv")
cpp_output = pd.read_csv(lCPPOutput)

EXECUTING 'g++ -Wall -Wno-unused-function -std=c++17 -g -o /tmp/sklearn2sql_cpp_140221356142256.exe /tmp/sklearn2sql_cpp_140221356142256.cpp'
idx,CST,Feature_0__1,Feature_1__1,Feature_2__1,Feature_3__1,Feature_0__2,Feature_0__1_mul_Feature_1__1,Feature_0__1_mul_Feature_2__1,Feature_0__1_mul_Feature_3__1,Feature_1__2,Feature_1__1_mul_Feature_2__1,Feature_1__1_mul_Feature_3__1,Feature_2__2,Feature_2__1_mul_Feature_3__1,Feature_3__2,Feature_0__3,Feature_0__2_mul_Feature_1__1,Feature_0__2_mul_Feature_2__1,Feature_0__2_mul_Feature_3__1,Feature_0__1_mul_Feature_1__2,Feature_0__1_mul_Feature_1__1_mul_Feature_2__1,Feature_0__1_mul_Feature_1__1_mul_Feature_3__1,Feature_0__1_mul_Feature_2__2,Feature_0__1_mul_Feature_2__1_mul_Feature_3__1,Feature_0__1_mul_Feature_3__2,Feature_1__3,Feature_1__2_mul_Feature_2__1,Feature_1__2_mul_Feature_3__1,Feature_1__1_mul_Feature_2__2,Feature_1__1_mul_Feature_2__1_mul_Feature_3__1,Feature_1__1_mul_Feature_3__2,Feature_2__3,Feature_2__2_mul_Feature_3__1,Feature_2

In [11]:
# Show a reproducible sample of the values computed by the C++ program.
cpp_output.sample(12, random_state=1960)

Unnamed: 0,idx,CST,Feature_0__1,Feature_1__1,Feature_2__1,Feature_3__1,Feature_0__2,Feature_0__1_mul_Feature_1__1,Feature_0__1_mul_Feature_2__1,Feature_0__1_mul_Feature_3__1,...,Feature_1__3,Feature_1__2_mul_Feature_2__1,Feature_1__2_mul_Feature_3__1,Feature_1__1_mul_Feature_2__2,Feature_1__1_mul_Feature_2__1_mul_Feature_3__1,Feature_1__1_mul_Feature_3__2,Feature_2__3,Feature_2__2_mul_Feature_3__1,Feature_2__1_mul_Feature_3__2,Feature_3__3
114,114,1.0,5.8,2.8,5.1,2.4,33.64,16.24,29.58,13.92,...,21.952,39.984,18.816,72.828,34.272,16.128,132.651,62.424,29.376,13.824
74,74,1.0,6.4,2.9,4.3,1.3,40.96,18.56,27.52,8.32,...,24.389,36.163,10.933,53.621,16.211,4.901,79.507,24.037,7.267,2.197
9,9,1.0,4.9,3.1,1.5,0.1,24.01,15.19,7.35,0.49,...,29.791,14.415,0.961,6.975,0.465,0.031,3.375,0.225,0.015,0.001
88,88,1.0,5.6,3.0,4.1,1.3,31.36,16.8,22.96,7.28,...,27.0,36.9,11.7,50.43,15.99,5.07,68.921,21.853,6.929,2.197
25,25,1.0,5.0,3.0,1.6,0.2,25.0,15.0,8.0,1.0,...,27.0,14.4,1.8,7.68,0.96,0.12,4.096,0.512,0.064,0.008
5,5,1.0,5.4,3.9,1.7,0.4,29.16,21.06,9.18,2.16,...,59.319,25.857,6.084,11.271,2.652,0.624,4.913,1.156,0.272,0.064
48,48,1.0,5.3,3.7,1.5,0.2,28.09,19.61,7.95,1.06,...,50.653,20.535,2.738,8.325,1.11,0.148,3.375,0.45,0.06,0.008
117,117,1.0,7.7,3.8,6.7,2.2,59.29,29.26,51.59,16.94,...,54.872,96.748,31.768,170.582,56.012,18.392,300.763,98.758,32.428,10.648
83,83,1.0,6.0,2.7,5.1,1.6,36.0,16.2,30.6,9.6,...,19.683,37.179,11.664,70.227,22.032,6.912,132.651,41.616,13.056,4.096
105,105,1.0,7.6,3.0,6.6,2.1,57.76,22.8,50.16,15.96,...,27.0,59.4,18.9,130.68,41.58,13.23,287.496,91.476,29.106,9.261


In [12]:
# Reference result: apply the fitted scikit-learn transformer to the same
# feature values and label the columns like the C++ output.
# NOTE(review): `skl_outputs` (with a trailing "s") is not used by any visible
# cell; it looks like a leftover of `skl_output` - confirm and remove.
skl_outputs = pd.DataFrame()
X = df[metadata['features']].values
# Row numbers 0..N-1 serve as the key column.
skl_output_key = pd.DataFrame(list(range(X.shape[0])), columns=['idx']);

# cpp_output.columns[1:] skips the leading 'idx' column, leaving the names of
# the transformed features.
skl_output_transform = pd.DataFrame(clf.transform(X), columns=cpp_output.columns[1:]);
skl_output = pd.concat([skl_output_key, skl_output_transform] , axis=1)
skl_output.sample(12, random_state=1960)

Unnamed: 0,idx,CST,Feature_0__1,Feature_1__1,Feature_2__1,Feature_3__1,Feature_0__2,Feature_0__1_mul_Feature_1__1,Feature_0__1_mul_Feature_2__1,Feature_0__1_mul_Feature_3__1,...,Feature_1__3,Feature_1__2_mul_Feature_2__1,Feature_1__2_mul_Feature_3__1,Feature_1__1_mul_Feature_2__2,Feature_1__1_mul_Feature_2__1_mul_Feature_3__1,Feature_1__1_mul_Feature_3__2,Feature_2__3,Feature_2__2_mul_Feature_3__1,Feature_2__1_mul_Feature_3__2,Feature_3__3
114,114,1.0,5.8,2.8,5.1,2.4,33.64,16.24,29.58,13.92,...,21.952,39.984,18.816,72.828,34.272,16.128,132.651,62.424,29.376,13.824
74,74,1.0,6.4,2.9,4.3,1.3,40.96,18.56,27.52,8.32,...,24.389,36.163,10.933,53.621,16.211,4.901,79.507,24.037,7.267,2.197
9,9,1.0,4.9,3.1,1.5,0.1,24.01,15.19,7.35,0.49,...,29.791,14.415,0.961,6.975,0.465,0.031,3.375,0.225,0.015,0.001
88,88,1.0,5.6,3.0,4.1,1.3,31.36,16.8,22.96,7.28,...,27.0,36.9,11.7,50.43,15.99,5.07,68.921,21.853,6.929,2.197
25,25,1.0,5.0,3.0,1.6,0.2,25.0,15.0,8.0,1.0,...,27.0,14.4,1.8,7.68,0.96,0.12,4.096,0.512,0.064,0.008
5,5,1.0,5.4,3.9,1.7,0.4,29.16,21.06,9.18,2.16,...,59.319,25.857,6.084,11.271,2.652,0.624,4.913,1.156,0.272,0.064
48,48,1.0,5.3,3.7,1.5,0.2,28.09,19.61,7.95,1.06,...,50.653,20.535,2.738,8.325,1.11,0.148,3.375,0.45,0.06,0.008
117,117,1.0,7.7,3.8,6.7,2.2,59.29,29.26,51.59,16.94,...,54.872,96.748,31.768,170.582,56.012,18.392,300.763,98.758,32.428,10.648
83,83,1.0,6.0,2.7,5.1,1.6,36.0,16.2,30.6,9.6,...,19.683,37.179,11.664,70.227,22.032,6.912,132.651,41.616,13.056,4.096
105,105,1.0,7.6,3.0,6.6,2.1,57.76,22.8,50.16,15.96,...,27.0,59.4,18.9,130.68,41.58,13.23,287.496,91.476,29.106,9.261


In [13]:
# Put the scikit-learn and C++ results side by side; every column shared by
# both frames gets a "_skl" / "_cpp" suffix.
# NOTE(review): DataFrame.join(on='idx') matches skl_output['idx'] against the
# *index* of cpp_output, not its 'idx' column. This assumes cpp_output's
# default row index equals its idx values (rows in idx order) - confirm.
cpp_skl_join = skl_output.join(cpp_output , how='left', on='idx', lsuffix='_skl', rsuffix='_cpp')

In [14]:
# Show a reproducible sample of the joined scikit-learn / C++ table.
cpp_skl_join.sample(12, random_state=1960)

Unnamed: 0,idx_skl,CST_skl,Feature_0__1_skl,Feature_1__1_skl,Feature_2__1_skl,Feature_3__1_skl,Feature_0__2_skl,Feature_0__1_mul_Feature_1__1_skl,Feature_0__1_mul_Feature_2__1_skl,Feature_0__1_mul_Feature_3__1_skl,...,Feature_1__3_cpp,Feature_1__2_mul_Feature_2__1_cpp,Feature_1__2_mul_Feature_3__1_cpp,Feature_1__1_mul_Feature_2__2_cpp,Feature_1__1_mul_Feature_2__1_mul_Feature_3__1_cpp,Feature_1__1_mul_Feature_3__2_cpp,Feature_2__3_cpp,Feature_2__2_mul_Feature_3__1_cpp,Feature_2__1_mul_Feature_3__2_cpp,Feature_3__3_cpp
114,114,1.0,5.8,2.8,5.1,2.4,33.64,16.24,29.58,13.92,...,21.952,39.984,18.816,72.828,34.272,16.128,132.651,62.424,29.376,13.824
74,74,1.0,6.4,2.9,4.3,1.3,40.96,18.56,27.52,8.32,...,24.389,36.163,10.933,53.621,16.211,4.901,79.507,24.037,7.267,2.197
9,9,1.0,4.9,3.1,1.5,0.1,24.01,15.19,7.35,0.49,...,29.791,14.415,0.961,6.975,0.465,0.031,3.375,0.225,0.015,0.001
88,88,1.0,5.6,3.0,4.1,1.3,31.36,16.8,22.96,7.28,...,27.0,36.9,11.7,50.43,15.99,5.07,68.921,21.853,6.929,2.197
25,25,1.0,5.0,3.0,1.6,0.2,25.0,15.0,8.0,1.0,...,27.0,14.4,1.8,7.68,0.96,0.12,4.096,0.512,0.064,0.008
5,5,1.0,5.4,3.9,1.7,0.4,29.16,21.06,9.18,2.16,...,59.319,25.857,6.084,11.271,2.652,0.624,4.913,1.156,0.272,0.064
48,48,1.0,5.3,3.7,1.5,0.2,28.09,19.61,7.95,1.06,...,50.653,20.535,2.738,8.325,1.11,0.148,3.375,0.45,0.06,0.008
117,117,1.0,7.7,3.8,6.7,2.2,59.29,29.26,51.59,16.94,...,54.872,96.748,31.768,170.582,56.012,18.392,300.763,98.758,32.428,10.648
83,83,1.0,6.0,2.7,5.1,1.6,36.0,16.2,30.6,9.6,...,19.683,37.179,11.664,70.227,22.032,6.912,132.651,41.616,13.056,4.096
105,105,1.0,7.6,3.0,6.6,2.1,57.76,22.8,50.16,15.96,...,27.0,59.4,18.9,130.68,41.58,13.23,287.496,91.476,29.106,9.261


In [15]:
# For every column of the C++ output (including 'idx'), print summary
# statistics of the element-wise difference between the scikit-learn value
# and the C++ value. Statistics at or near zero mean the two implementations
# agree for that column.
for col in cpp_output.columns:
    # The "_skl" / "_cpp" suffixes are the ones used when cpp_skl_join was built.
    lDiff = cpp_skl_join[col + "_skl"] - cpp_skl_join[col + "_cpp"]
    print(lDiff.describe())

count    150.0
mean       0.0
std        0.0
min        0.0
25%        0.0
50%        0.0
75%        0.0
max        0.0
dtype: float64
count    150.0
mean       0.0
std        0.0
min        0.0
25%        0.0
50%        0.0
75%        0.0
max        0.0
dtype: float64
count    150.0
mean       0.0
std        0.0
min        0.0
25%        0.0
50%        0.0
75%        0.0
max        0.0
dtype: float64
count    150.0
mean       0.0
std        0.0
min        0.0
25%        0.0
50%        0.0
75%        0.0
max        0.0
dtype: float64
count    150.0
mean       0.0
std        0.0
min        0.0
25%        0.0
50%        0.0
75%        0.0
max        0.0
dtype: float64
count    150.0
mean       0.0
std        0.0
min        0.0
25%        0.0
50%        0.0
75%        0.0
max        0.0
dtype: float64
count    1.500000e+02
mean     0.000000e+00
std      2.667514e-15
min     -7.105427e-15
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      7.105427e-15
dtype: float64