# ML2CPP

## Preparing the dataset

In [1]:
from sklearn import datasets
import numpy as np
import pandas as pd

def populate_table(tablename, feature_names):
    iris = datasets.load_iris()
    X = iris.data  
    N = X.shape[0]
    y = iris.target.reshape(N,1)
    k = np.arange(N).reshape(N, 1)
    k_X_y = np.concatenate((k, X, y) , axis=1)
    lTable=pd.DataFrame(k_X_y)
    # print(lTable.head())
    lTable.columns = ['idx'] + feature_names + ['TGT'];
    lTable['TGT'] = lTable['TGT'].apply(int)
    lTable['idx'] = lTable['idx'].apply(int)
    lTable.to_csv(tablename , float_format='%.14g')



In [2]:
metadata = {"primary_key" : "KEY",
            "features" : ['sepal_length_cm', 'sepal_width_cm', 'petal_length_cm', 'petal_width_cm'],
            "targets" : ["TGT"],
            "table" : "iris"}

In [3]:
populate_table("/tmp/iris.csv" , metadata["features"])


In [4]:
df = pd.read_csv("/tmp/iris.csv")
df.sample(12, random_state=1960)

Unnamed: 0.1,Unnamed: 0,idx,sepal_length_cm,sepal_width_cm,petal_length_cm,petal_width_cm,TGT
114,114,114,5.8,2.8,5.1,2.4,2
74,74,74,6.4,2.9,4.3,1.3,1
9,9,9,4.9,3.1,1.5,0.1,0
88,88,88,5.6,3.0,4.1,1.3,1
25,25,25,5.0,3.0,1.6,0.2,0
5,5,5,5.4,3.9,1.7,0.4,0
48,48,48,5.3,3.7,1.5,0.2,0
117,117,117,7.7,3.8,6.7,2.2,2
83,83,83,6.0,2.7,5.1,1.6,1
105,105,105,7.6,3.0,6.6,2.1,2


## Training a Model

In [5]:


# train any scikit model on the iris dataset
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(n_estimators = 512, random_state=1960)
clf.fit(df[metadata['features']].values, df[metadata['targets']].values)


  return f(**kwargs)


GradientBoostingClassifier(n_estimators=512, random_state=1960)

## Deploying the Model

In [6]:

def generate_cpp_for_model(model):
    import pickle, json, requests, base64
    b64_data = base64.b64encode(pickle.dumps(model)).decode('utf-8')
    # send the model th the web service
    json_data={"Name":"model_cpp_sample", 
               "PickleData":b64_data , 
               "SQLDialect":"CPP",
               "FeatureNames" : metadata['features']}
    r = requests.post("https://sklearn2sql.herokuapp.com/model", json=json_data)
    content = r.json()
    lCPP = content["model"]["SQLGenrationResult"][0]["SQL"]
    # print(lCPP);
    return lCPP


lCPPCode = generate_cpp_for_model(clf);


In [7]:
print(lCPPCode)

namespace  {

	namespace SubModel_0_0 {
	
		typedef std::vector<double> tNodeData;
		std::map<int, tNodeData> Decision_Tree_Node_data = {
				{ 1 ,  {2. }} ,
				{ 3 ,  {-1. }} ,
				{ 5 ,  {-1. }} ,
				{ 6 ,  {-1. }} 
		};
		
	
		int get_decision_tree_node_index(std::any Feature_0, std::any Feature_1, std::any Feature_2, std::any Feature_3) {
			int lNodeIndex = (Feature_3 <= 0.800000011920929) ? ( 1 ) : ( (Feature_3 <= 1.550000011920929) ? ( 3 ) : ( (Feature_2 <= 6.5) ? ( 5 ) : ( 6 ) ) );
		
			return lNodeIndex;
		}
		
	
		std::vector<std::string> get_input_names(){
			std::vector<std::string> lFeatures = { "Feature_0", "Feature_1", "Feature_2", "Feature_3" };
	
			return lFeatures;
		}
	
		std::vector<std::any> get_classes(){
			std::vector<std::any> lClasses = { 0, 1, 2 };
	
			return lClasses;
		}
	
		std::vector<std::string> get_output_names(){
			std::vector<std::string> lOutputs = { 
				"Score_0", "Score_1", "Score_2",
				"Proba_0", "Proba_1", "Proba_2",
				"LogProba_0", "

In [8]:
    def write_text_to_file(iCPPCode, oCPPFile):          
        with open(oCPPFile, "w") as text_file:
            text_file.write(iCPPCode)

    def add_cpp_main_function(iCPPCode, iCSVFile):
        lCPPCode = "#include \"Generic.i\"\n\n"
        lCPPCode = lCPPCode + iCPPCode
        lCPPCode = lCPPCode + "\tint main() {\n"
        lCPPCode = lCPPCode + "\t\tscore_csv_file(\"" + iCSVFile +"\");\n"
        lCPPCode = lCPPCode + "\treturn 0;\n}\n"
        return lCPPCode

    def compile_cpp_code_as_executable(iName):
        import subprocess
        lCommand = ["g++", "-Wall", "-Wno-unused-function", "-std=c++17" , "-g" ,  "-o", iName + ".exe",  iName + ".cpp"]
        print("EXECUTING" , "'" + " ".join(lCommand) + "'")
        result = subprocess.check_output(lCommand)
        # print(result)

    def execute_cpp_model(iName, iCSVFile):
        import subprocess
        result2 = subprocess.check_output([iName + ".exe",  iCSVFile])
        result2 = result2.decode()
        print(result2[:1000])
        print(result2[-1000:])
        return result2
        
    def execute_cpp_code(iCPPCode, iCSVFile):
        lName = "/tmp/sklearn2sql_cpp_" + str(id(clf));
        lCPPCode = add_cpp_main_function(iCPPCode, iCSVFile)
        write_text_to_file(lCPPCode, lName + ".cpp")
        compile_cpp_code_as_executable(lName)
        result = execute_cpp_model(lName, iCSVFile)
        write_text_to_file(str(result), lName + ".out")
        return lName + ".out"


In [9]:
populate_table("/tmp/iris2.csv" , ["Feature_0", "Feature_1", "Feature_2", "Feature_3"])
lCPPOutput = execute_cpp_code(lCPPCode , "/tmp/iris2.csv")
cpp_output = pd.read_csv(lCPPOutput)

EXECUTING 'g++ -Wall -Wno-unused-function -std=c++17 -g -o /tmp/sklearn2sql_cpp_140539469663200.exe /tmp/sklearn2sql_cpp_140539469663200.cpp'
idx,Score_0,Score_1,Score_2,Proba_0,Proba_1,Proba_2,LogProba_0,LogProba_1,LogProba_2,Decision,DecisionProba
0,0.33333333333333,0.08652017333333,-0.05751198666667,0.40687265824134,0.31788417836929,0.27524316338936,-0.89925502151029,-1.14606818151029,-1.29010034151029,0,0.40687265824134
1,0.33333333333333,0.08652017333333,-0.05751198666667,0.40687265824134,0.31788417836929,0.27524316338936,-0.89925502151029,-1.14606818151029,-1.29010034151029,0,0.40687265824134
2,0.33333333333333,0.08652017333333,-0.05751198666667,0.40687265824134,0.31788417836929,0.27524316338936,-0.89925502151029,-1.14606818151029,-1.29010034151029,0,0.40687265824134
3,0.33333333333333,0.08652017333333,-0.05751198666667,0.40687265824134,0.31788417836929,0.27524316338936,-0.89925502151029,-1.14606818151029,-1.29010034151029,0,0.40687265824134
4,0.33333333333333,0.08652017333333,-0

In [10]:
cpp_output.sample(12, random_state=1960)

Unnamed: 0,idx,Score_0,Score_1,Score_2,Proba_0,Proba_1,Proba_2,LogProba_0,LogProba_1,LogProba_2,Decision,DecisionProba
114,114,0.333333,0.087103,-0.057022,0.406742,0.317968,0.27529,-0.899575,-1.145806,-1.289931,0,0.406742
74,74,0.270833,0.036869,-0.099321,0.402898,0.318849,0.278253,-0.909072,-1.143036,-1.279226,0,0.402898
9,9,0.333333,0.08652,-0.057512,0.406873,0.317884,0.275243,-0.899255,-1.146068,-1.2901,0,0.406873
88,88,0.270833,0.036869,-0.102023,0.403201,0.319089,0.27771,-0.908321,-1.142285,-1.281177,0,0.403201
25,25,0.333333,0.08652,-0.057512,0.406873,0.317884,0.275243,-0.899255,-1.146068,-1.2901,0,0.406873
5,5,0.333333,0.08652,-0.057512,0.406873,0.317884,0.275243,-0.899255,-1.146068,-1.2901,0,0.406873
48,48,0.333333,0.08652,-0.057512,0.406873,0.317884,0.275243,-0.899255,-1.146068,-1.2901,0,0.406873
117,117,0.333333,0.085525,-0.059784,0.407256,0.317867,0.274877,-0.898314,-1.146122,-1.291431,0,0.407256
83,83,-0.666667,-0.645379,-0.625453,0.326414,0.333438,0.340148,-1.119588,-1.0983,-1.078374,2,0.340148
105,105,0.333333,0.085525,-0.059784,0.407256,0.317867,0.274877,-0.898314,-1.146122,-1.291431,0,0.407256


In [11]:
skl_outputs = pd.DataFrame()
X = df[metadata['features']].values
skl_output_key = pd.DataFrame(list(range(X.shape[0])), columns=['idx']);

skl_output_score = pd.DataFrame(columns=['Score_0', 'Score_1', 'Score_2']);
skl_output_proba = pd.DataFrame(clf.predict_proba(X), columns=['Proba_0', 'Proba_1', 'Proba_2'])
skl_output_log_proba = pd.DataFrame(clf.predict_log_proba(X), columns=['LogProba_0', 'LogProba_1', 'LogProba_2'])
skl_output_decision = pd.DataFrame(clf.predict(X), columns=['Decision'])
skl_output = pd.concat([skl_output_key, skl_output_score, skl_output_proba, skl_output_log_proba, skl_output_decision] , axis=1)
skl_output.sample(12, random_state=1960)

Unnamed: 0,idx,Score_0,Score_1,Score_2,Proba_0,Proba_1,Proba_2,LogProba_0,LogProba_1,LogProba_2,Decision
114,114,,,,2.303171e-27,1.309959e-18,1.0,-61.335511,-41.17654,0.0,2
74,74,,,,6.596766e-21,1.0,2.067148e-15,-46.467707,-1.776357e-15,-33.812607,1
9,9,,,,1.0,2.493157e-15,1.384112e-25,0.0,-33.62523,-57.239569,0
88,88,,,,2.0947349999999998e-19,1.0,4.948223e-15,-43.00969,-4.440892e-15,-32.939748,1
25,25,,,,1.0,3.19032e-15,1.7735450000000002e-25,0.0,-33.37866,-56.991647,0
5,5,,,,1.0,1.506137e-15,3.9810309999999997e-26,0.0,-34.12923,-58.485672,0
48,48,,,,1.0,2.130823e-15,8.201153999999999e-26,0.0,-33.78227,-57.762938,0
117,117,,,,2.964277e-26,2.627665e-18,1.0,-58.780579,-40.48044,0.0,2
83,83,,,,2.945826e-21,1.0,1.088228e-13,-47.273898,-1.083578e-13,-29.849056,1
105,105,,,,3.662639e-27,2.6517439999999997e-20,1.0,-60.871614,-45.07648,0.0,2


In [12]:
cpp_skl_join = skl_output.join(cpp_output , how='left', on='idx', lsuffix='_skl', rsuffix='_cpp')

In [13]:
cpp_skl_join.sample(12, random_state=1960)

Unnamed: 0,idx_skl,Score_0_skl,Score_1_skl,Score_2_skl,Proba_0_skl,Proba_1_skl,Proba_2_skl,LogProba_0_skl,LogProba_1_skl,LogProba_2_skl,...,Score_1_cpp,Score_2_cpp,Proba_0_cpp,Proba_1_cpp,Proba_2_cpp,LogProba_0_cpp,LogProba_1_cpp,LogProba_2_cpp,Decision_cpp,DecisionProba
114,114,,,,2.303171e-27,1.309959e-18,1.0,-61.335511,-41.17654,0.0,...,0.087103,-0.057022,0.406742,0.317968,0.27529,-0.899575,-1.145806,-1.289931,0,0.406742
74,74,,,,6.596766e-21,1.0,2.067148e-15,-46.467707,-1.776357e-15,-33.812607,...,0.036869,-0.099321,0.402898,0.318849,0.278253,-0.909072,-1.143036,-1.279226,0,0.402898
9,9,,,,1.0,2.493157e-15,1.384112e-25,0.0,-33.62523,-57.239569,...,0.08652,-0.057512,0.406873,0.317884,0.275243,-0.899255,-1.146068,-1.2901,0,0.406873
88,88,,,,2.0947349999999998e-19,1.0,4.948223e-15,-43.00969,-4.440892e-15,-32.939748,...,0.036869,-0.102023,0.403201,0.319089,0.27771,-0.908321,-1.142285,-1.281177,0,0.403201
25,25,,,,1.0,3.19032e-15,1.7735450000000002e-25,0.0,-33.37866,-56.991647,...,0.08652,-0.057512,0.406873,0.317884,0.275243,-0.899255,-1.146068,-1.2901,0,0.406873
5,5,,,,1.0,1.506137e-15,3.9810309999999997e-26,0.0,-34.12923,-58.485672,...,0.08652,-0.057512,0.406873,0.317884,0.275243,-0.899255,-1.146068,-1.2901,0,0.406873
48,48,,,,1.0,2.130823e-15,8.201153999999999e-26,0.0,-33.78227,-57.762938,...,0.08652,-0.057512,0.406873,0.317884,0.275243,-0.899255,-1.146068,-1.2901,0,0.406873
117,117,,,,2.964277e-26,2.627665e-18,1.0,-58.780579,-40.48044,0.0,...,0.085525,-0.059784,0.407256,0.317867,0.274877,-0.898314,-1.146122,-1.291431,0,0.407256
83,83,,,,2.945826e-21,1.0,1.088228e-13,-47.273898,-1.083578e-13,-29.849056,...,-0.645379,-0.625453,0.326414,0.333438,0.340148,-1.119588,-1.0983,-1.078374,2,0.340148
105,105,,,,3.662639e-27,2.6517439999999997e-20,1.0,-60.871614,-45.07648,0.0,...,0.085525,-0.059784,0.407256,0.317867,0.274877,-0.898314,-1.146122,-1.291431,0,0.407256


In [14]:
condition = (cpp_skl_join.Decision_cpp != cpp_skl_join.Decision_skl)
cpp_skl_join[condition]


Unnamed: 0,idx_skl,Score_0_skl,Score_1_skl,Score_2_skl,Proba_0_skl,Proba_1_skl,Proba_2_skl,LogProba_0_skl,LogProba_1_skl,LogProba_2_skl,...,Score_1_cpp,Score_2_cpp,Proba_0_cpp,Proba_1_cpp,Proba_2_cpp,LogProba_0_cpp,LogProba_1_cpp,LogProba_2_cpp,Decision_cpp,DecisionProba
50,50,,,,4.861960e-22,1.000000e+00,6.019265e-17,-49.075430,0.000000e+00,-3.734898e+01,...,0.036869,-0.102023,0.403201,0.319089,0.277710,-0.908321,-1.142285,-1.281177,0,0.403201
51,51,,,,5.438182e-21,1.000000e+00,1.636978e-15,-46.660842,-1.776357e-15,-3.404592e+01,...,0.036869,-0.102023,0.403201,0.319089,0.277710,-0.908321,-1.142285,-1.281177,0,0.403201
52,52,,,,4.648410e-22,1.000000e+00,3.066197e-15,-49.120347,-3.552714e-15,-3.341834e+01,...,0.036869,-0.102023,0.403201,0.319089,0.277710,-0.908321,-1.142285,-1.281177,0,0.403201
53,53,,,,1.096339e-19,1.000000e+00,3.023902e-15,-43.657140,-3.552714e-15,-3.343223e+01,...,0.038446,-0.099321,0.402695,0.319192,0.278113,-0.909575,-1.141962,-1.279729,0,0.402695
54,54,,,,8.367237e-21,1.000000e+00,3.041173e-15,-46.229963,-3.552714e-15,-3.342653e+01,...,0.038446,-0.099321,0.402695,0.319192,0.278113,-0.909575,-1.141962,-1.279729,0,0.402695
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,145,,,,3.433953e-27,6.561614e-20,1.000000e+00,-60.936085,-4.417047e+01,0.000000e+00,...,0.085525,-0.059784,0.407256,0.317867,0.274877,-0.898314,-1.146122,-1.291431,0,0.407256
146,146,,,,1.092477e-25,6.964033e-15,1.000000e+00,-57.476180,-3.259802e+01,-7.105427e-15,...,0.087103,-0.057022,0.406742,0.317968,0.275290,-0.899575,-1.145806,-1.289931,0,0.406742
147,147,,,,3.433953e-27,1.006186e-19,1.000000e+00,-60.936085,-4.374295e+01,0.000000e+00,...,0.085525,-0.059784,0.407256,0.317867,0.274877,-0.898314,-1.146122,-1.291431,0,0.407256
148,148,,,,2.454827e-26,1.888848e-16,1.000000e+00,-58.969156,-3.620539e+01,0.000000e+00,...,0.085525,-0.059784,0.407256,0.317867,0.274877,-0.898314,-1.146122,-1.291431,0,0.407256
