# ML2CPP

## Preparing the dataset

In [1]:
from sklearn import datasets
import numpy as np
import pandas as pd

def populate_table(tablename, feature_names):
    iris = datasets.load_iris()
    X = iris.data  
    N = X.shape[0]
    y = iris.target.reshape(N,1)
    k = np.arange(N).reshape(N, 1)
    k_X_y = np.concatenate((k, X, y) , axis=1)
    lTable=pd.DataFrame(k_X_y)
    # print(lTable.head())
    lTable.columns = ['idx'] + feature_names + ['TGT'];
    lTable['TGT'] = lTable['TGT'].apply(int)
    lTable['idx'] = lTable['idx'].apply(int)
    lTable.to_csv(tablename , float_format='%.14g')



In [2]:
metadata = {"primary_key" : "KEY",
            "features" : ['sepal_length_cm', 'sepal_width_cm', 'petal_length_cm', 'petal_width_cm'],
            "targets" : ["TGT"],
            "table" : "iris"}

In [3]:
populate_table("/tmp/iris.csv" , metadata["features"])


In [4]:
df = pd.read_csv("/tmp/iris.csv")
df.sample(12, random_state=1960)

Unnamed: 0.1,Unnamed: 0,idx,sepal_length_cm,sepal_width_cm,petal_length_cm,petal_width_cm,TGT
114,114,114,5.8,2.8,5.1,2.4,2
74,74,74,6.4,2.9,4.3,1.3,1
9,9,9,4.9,3.1,1.5,0.1,0
88,88,88,5.6,3.0,4.1,1.3,1
25,25,25,5.0,3.0,1.6,0.2,0
5,5,5,5.4,3.9,1.7,0.4,0
48,48,48,5.3,3.7,1.5,0.2,0
117,117,117,7.7,3.8,6.7,2.2,2
83,83,83,6.0,2.7,5.1,1.6,1
105,105,105,7.6,3.0,6.6,2.1,2


## Training a Model

In [5]:


# train any scikit model on the iris dataset
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators = 512, random_state=1960)
clf.fit(df[metadata['features']].values, df[metadata['targets']].values)


  clf.fit(df[metadata['features']].values, df[metadata['targets']].values)


## Deploying the Model

In [6]:

def generate_cpp_for_model(model):
    import pickle, json, requests, base64
    b64_data = base64.b64encode(pickle.dumps(model)).decode('utf-8')
    # send the model th the web service
    json_data={"Name":"model_cpp_sample", 
               "PickleData":b64_data , 
               "SQLDialect":"CPP",
               "FeatureNames" : metadata['features']}
    r = requests.post("http://127.88.88.88:1888/model", json=json_data)
    content = r.json()
    lCPP = content["model"]["SQLGenrationResult"][0]["SQL"]
    # print(lCPP);
    return lCPP


lCPPCode = generate_cpp_for_model(clf);


In [7]:
print(lCPPCode[:10000] + "\n\n\n .... \n\n\n" + lCPPCode[-10000:])

namespace  {

	std::vector<std::any> get_classes(){
		std::vector<std::any> lClasses = { 0, 1, 2 };

		return lClasses;
	}

	namespace RF_Tree_0 {
	
		std::vector<std::any> get_classes(){
			std::vector<std::any> lClasses = { 0, 1, 2 };
	
			return lClasses;
		}
	
		typedef std::vector<double> tNodeData;
		std::map<int, tNodeData> Decision_Tree_Node_data = {
				{ 2 ,  {1.0, 0.0, 0.0 }} ,
				{ 4 ,  {0.0, 1.0, 0.0 }} ,
				{ 5 ,  {0.0, 0.0, 1.0 }} ,
				{ 8 ,  {1.0, 0.0, 0.0 }} ,
				{ 9 ,  {0.0, 1.0, 0.0 }} ,
				{ 13 ,  {0.0, 0.0, 1.0 }} ,
				{ 15 ,  {0.0, 1.0, 0.0 }} ,
				{ 16 ,  {0.0, 0.0, 1.0 }} ,
				{ 17 ,  {0.0, 1.0, 0.0 }} ,
				{ 18 ,  {0.0, 0.0, 1.0 }} 
		};
		
	
		int get_decision_tree_node_index(std::any Feature_0, std::any Feature_1, std::any Feature_2, std::any Feature_3) {
			int lNodeIndex = (Feature_0 <= 5.450000047683716) ? ( (Feature_3 <= 0.7000000029802322) ? ( 2 ) : ( (Feature_3 <= 1.600000023841858) ? ( 4 ) : ( 5 ) ) ) : ( (Feature_2 <= 4.75) ? ( (Feature_3 <= 

In [8]:
    def write_text_to_file(iCPPCode, oCPPFile):          
        with open(oCPPFile, "w") as text_file:
            text_file.write(iCPPCode)

    def add_cpp_boost_layer(iModelName):
        lCPPCode = "#include \"Generic.i\"\n"
        lCPPCode = lCPPCode + "#include \"/tmp/" + iModelName + "_model_specific.i\"\n\n"
        lCPPCode = lCPPCode + "#include <boost/python.hpp>\n"
        lCPPCode = lCPPCode + "using namespace boost::python;\n\n"
        lCPPCode = lCPPCode + "BOOST_PYTHON_MODULE(" + iModelName + ") {\n"
        lCPPCode = lCPPCode + "\tdef(\"score_csv_file\", score_csv_file); \n"
        lCPPCode = lCPPCode + "}\n"
        lCPPCode = lCPPCode + "\n"
        return lCPPCode 
         

    def compile_cpp_code_as_shared_lib(iName):
        import subprocess
        # g++ -I$(PYTHON_INCLUDE) -I$(BOOST_INC) -fPIC -c $(TARGET).C
        lCommand = ["g++", "-I/usr/include/python3.10", "-Wno-unused-function", 
                    "-fPIC", "-std=c++17" , "-g" ,  
                    "-o",  iName + ".o",
                    "-c",  iName + ".cpp"]
        print("EXECUTING" , "'" + " ".join(lCommand) + "'")
        result = subprocess.check_output(lCommand)
        # 	g++ -shared -Wl,--export-dynamic $(TARGET).o -L$(BOOST_LIB) -lboost_python-$(PYTHON_VERSION) -L/usr/lib/python$(PYTHON_VERSION)/config -lpython$(PYTHON_VERSION) -o $(TARGET).so
        lCommand2 = ["g++" , iName + ".o", "-shared",  "-Wl,--export-dynamic",  "-lboost_python310", "-L/usr/lib/python3.10/config" , "-lpython3.10" ,  "-o",  iName + ".so" ]
        print("EXECUTING" , "'" + " ".join(lCommand2) + "'")
        result2 = subprocess.check_output(lCommand2)
        # print(result)

    def execute_boost_python_model(iModelName, iCSVFile):
        import sys
        sys.path = sys.path + ['/tmp']
        import importlib
        lModelPythonModule = importlib.import_module(iModelName)
        result2 = lModelPythonModule.score_csv_file(iCSVFile)
        print(result2[:10])
        print(result2[-10:])
        return result2
        
    def deploy_cpp_code_in_python(iCPPCode, iCSVFile):
        lModelName = "sklearn2sql_cpp_" + str(id(clf))
        lName = "/tmp/" + lModelName;
        write_text_to_file(iCPPCode, lName + "_model_specific.i")
        lCPPCode = add_cpp_boost_layer(lModelName)
        print("BOOST_PYTHON_LAYER_START")
        print(lCPPCode)
        print("BOOST_PYTHON_LAYER_END")
        write_text_to_file(lCPPCode, lName + ".cpp")
        compile_cpp_code_as_shared_lib(lName)
        result = execute_boost_python_model(lModelName, iCSVFile)
        write_text_to_file(str(result), lName + ".out")
        return lName + ".out"


In [9]:
populate_table("/tmp/iris2.csv" , ["Feature_0", "Feature_1", "Feature_2", "Feature_3"])
lCPPOutput = deploy_cpp_code_in_python(lCPPCode , "/tmp/iris2.csv")
cpp_output = pd.read_csv(lCPPOutput)

BOOST_PYTHON_LAYER_START
#include "Generic.i"
#include "/tmp/sklearn2sql_cpp_140201627744480_model_specific.i"

#include <boost/python.hpp>
using namespace boost::python;

BOOST_PYTHON_MODULE(sklearn2sql_cpp_140201627744480) {
	def("score_csv_file", score_csv_file); 
}


BOOST_PYTHON_LAYER_END
EXECUTING 'g++ -I/usr/include/python3.10 -Wno-unused-function -fPIC -std=c++17 -g -o /tmp/sklearn2sql_cpp_140201627744480.o -c /tmp/sklearn2sql_cpp_140201627744480.cpp'


In file included from /usr/include/boost/smart_ptr/detail/sp_thread_sleep.hpp:22,
                 from /usr/include/boost/smart_ptr/detail/yield_k.hpp:23,
                 from /usr/include/boost/smart_ptr/detail/spinlock_gcc_atomic.hpp:14,
                 from /usr/include/boost/smart_ptr/detail/spinlock.hpp:42,
                 from /usr/include/boost/smart_ptr/detail/spinlock_pool.hpp:25,
                 from /usr/include/boost/smart_ptr/shared_ptr.hpp:29,
                 from /usr/include/boost/shared_ptr.hpp:17,
                 from /usr/include/boost/python/converter/shared_ptr_to_python.hpp:12,
                 from /usr/include/boost/python/converter/arg_to_python.hpp:15,
                 from /usr/include/boost/python/call.hpp:15,
                 from /usr/include/boost/python/object_core.hpp:14,
                 from /usr/include/boost/python/args.hpp:22,
                 from /usr/include/boost/python.hpp:11,
                 from /tmp/sklearn2sql_cpp_140201627744480.c

EXECUTING 'g++ /tmp/sklearn2sql_cpp_140201627744480.o -shared -Wl,--export-dynamic -lboost_python310 -L/usr/lib/python3.10/config -lpython3.10 -o /tmp/sklearn2sql_cpp_140201627744480.so'
idx,Score_0,Score_1,Score_2,Proba_0,Proba_1,Proba_2,LogProba_0,LogProba_1,LogProba_2,Decision,DecisionProba
0,,,,1.00000000000000,0.00000000000000,0.00000000000000,0.00000000000000,-32.23619130191664,-32.23619130191664,0,1.00000000000000
1,,,,0.99804687500000,0.00195312500000,0.00000000000000,-0.00195503483580,-6.23832462503951,-32.23619130191664,0,0.99804687500000
2,,,,1.00000000000000,0.00000000000000,0.00000000000000,0.00000000000000,-32.23619130191664,-32.23619130191664,0,1.00000000000000
3,,,,1.00000000000000,0.00000000000000,0.00000000000000,0.00000000000000,-32.23619130191664,-32.23619130191664,0,1.00000000000000
4,,,,1.00000000000000,0.00000000000000,0.00000000000000,0.00000000000000,-32.23619130191664,-32.23619130191664,0,1.00000000000000
5,,,,1.00000000000000,0.00000000000000,0.00000000000000

COLUMN_GUESSED_AS_INT  150
COLUMN_GUESSED_AS_INT idx 150
COLUMN_GUESSED_AS_DOUBLE Feature_0 150
COLUMN_GUESSED_AS_DOUBLE Feature_1 150
COLUMN_GUESSED_AS_DOUBLE Feature_2 150
COLUMN_GUESSED_AS_DOUBLE Feature_3 150
COLUMN_GUESSED_AS_INT TGT 150


00,0.94531250000000,-32.23619130191664,-2.90612011486430,-0.05623971832288,2,0.94531250000000
122,,,,0.00000000000000,0.00000000000000,1.00000000000000,-32.23619130191664,-32.23619130191664,0.00000000000000,2,1.00000000000000
123,,,,0.00000000000000,0.04492187500000,0.95507812500000,-32.23619130191664,-3.10283040911036,-0.04596213556464,2,0.95507812500000
124,,,,0.00000000000000,0.00195312500000,0.99804687500000,-32.23619130191664,-6.23832462503951,-0.00195503483580,2,0.99804687500000
125,,,,0.00000000000000,0.00781250000000,0.99218750000000,-32.23619130191664,-4.85203026391962,-0.00784317746103,2,0.99218750000000
126,,,,0.00000000000000,0.09570312500000,0.90429687500000,-32.23619130191664,-2.34650432692888,-0.10059757095327,2,0.90429687500000
127,,,,0.00000000000000,0.02734375000000,0.97265625000000,-32.23619130191664,-3.59926729542425,-0.02772454801486,2,0.97265625000000
128,,,,0.00000000000000,0.00000000000000,1.00000000000000,-32.23619130191664,-32.23619130191664,0.00000000000000,2

In [10]:
cpp_output.sample(12, random_state=1960)

Unnamed: 0,idx,Score_0,Score_1,Score_2,Proba_0,Proba_1,Proba_2,LogProba_0,LogProba_1,LogProba_2,Decision,DecisionProba
114,114,,,,0.0,0.0,1.0,-32.236191,-32.236191,0.0,2,1.0
74,74,,,,0.0,1.0,0.0,-32.236191,0.0,-32.236191,1,1.0
9,9,,,,1.0,0.0,0.0,0.0,-32.236191,-32.236191,0,1.0
88,88,,,,0.0,1.0,0.0,-32.236191,0.0,-32.236191,1,1.0
25,25,,,,0.996094,0.003906,0.0,-0.003914,-5.545177,-32.236191,0,0.996094
5,5,,,,1.0,0.0,0.0,0.0,-32.236191,-32.236191,0,1.0
48,48,,,,1.0,0.0,0.0,0.0,-32.236191,-32.236191,0,1.0
117,117,,,,0.0,0.0,1.0,-32.236191,-32.236191,0.0,2,1.0
83,83,,,,0.0,0.697266,0.302734,-32.236191,-0.360589,-1.1949,1,0.697266
105,105,,,,0.0,0.0,1.0,-32.236191,-32.236191,0.0,2,1.0


In [11]:
skl_outputs = pd.DataFrame()
X = df[metadata['features']].values
skl_output_key = pd.DataFrame(list(range(X.shape[0])), columns=['idx']);

skl_output_score = pd.DataFrame(columns=['Score_0', 'Score_1', 'Score_2']);
skl_output_proba = pd.DataFrame(clf.predict_proba(X), columns=['Proba_0', 'Proba_1', 'Proba_2'])
skl_output_log_proba = pd.DataFrame(clf.predict_log_proba(X), columns=['LogProba_0', 'LogProba_1', 'LogProba_2'])
skl_output_decision = pd.DataFrame(clf.predict(X), columns=['Decision'])
skl_output = pd.concat([skl_output_key, skl_output_score, skl_output_proba, skl_output_log_proba, skl_output_decision] , axis=1)
skl_output.sample(12, random_state=1960)

  return np.log(proba)


Unnamed: 0,idx,Score_0,Score_1,Score_2,Proba_0,Proba_1,Proba_2,LogProba_0,LogProba_1,LogProba_2,Decision
114,114,,,,0.0,0.0,1.0,-inf,-inf,0.0,2
74,74,,,,0.0,1.0,0.0,-inf,0.0,-inf,1
9,9,,,,1.0,0.0,0.0,0.0,-inf,-inf,0
88,88,,,,0.0,1.0,0.0,-inf,0.0,-inf,1
25,25,,,,0.996094,0.003906,0.0,-0.003914,-5.545177,-inf,0
5,5,,,,1.0,0.0,0.0,0.0,-inf,-inf,0
48,48,,,,1.0,0.0,0.0,0.0,-inf,-inf,0
117,117,,,,0.0,0.0,1.0,-inf,-inf,0.0,2
83,83,,,,0.0,0.699219,0.300781,-inf,-0.357792,-1.201372,1
105,105,,,,0.0,0.0,1.0,-inf,-inf,0.0,2


In [12]:
cpp_skl_join = skl_output.join(cpp_output , how='left', on='idx', lsuffix='_skl', rsuffix='_cpp')

In [13]:
cpp_skl_join.sample(12, random_state=1960)

Unnamed: 0,idx_skl,Score_0_skl,Score_1_skl,Score_2_skl,Proba_0_skl,Proba_1_skl,Proba_2_skl,LogProba_0_skl,LogProba_1_skl,LogProba_2_skl,...,Score_1_cpp,Score_2_cpp,Proba_0_cpp,Proba_1_cpp,Proba_2_cpp,LogProba_0_cpp,LogProba_1_cpp,LogProba_2_cpp,Decision_cpp,DecisionProba
114,114,,,,0.0,0.0,1.0,-inf,-inf,0.0,...,,,0.0,0.0,1.0,-32.236191,-32.236191,0.0,2,1.0
74,74,,,,0.0,1.0,0.0,-inf,0.0,-inf,...,,,0.0,1.0,0.0,-32.236191,0.0,-32.236191,1,1.0
9,9,,,,1.0,0.0,0.0,0.0,-inf,-inf,...,,,1.0,0.0,0.0,0.0,-32.236191,-32.236191,0,1.0
88,88,,,,0.0,1.0,0.0,-inf,0.0,-inf,...,,,0.0,1.0,0.0,-32.236191,0.0,-32.236191,1,1.0
25,25,,,,0.996094,0.003906,0.0,-0.003914,-5.545177,-inf,...,,,0.996094,0.003906,0.0,-0.003914,-5.545177,-32.236191,0,0.996094
5,5,,,,1.0,0.0,0.0,0.0,-inf,-inf,...,,,1.0,0.0,0.0,0.0,-32.236191,-32.236191,0,1.0
48,48,,,,1.0,0.0,0.0,0.0,-inf,-inf,...,,,1.0,0.0,0.0,0.0,-32.236191,-32.236191,0,1.0
117,117,,,,0.0,0.0,1.0,-inf,-inf,0.0,...,,,0.0,0.0,1.0,-32.236191,-32.236191,0.0,2,1.0
83,83,,,,0.0,0.699219,0.300781,-inf,-0.357792,-1.201372,...,,,0.0,0.697266,0.302734,-32.236191,-0.360589,-1.1949,1,0.697266
105,105,,,,0.0,0.0,1.0,-inf,-inf,0.0,...,,,0.0,0.0,1.0,-32.236191,-32.236191,0.0,2,1.0


In [14]:
condition = (cpp_skl_join.Decision_cpp != cpp_skl_join.Decision_skl)
cpp_skl_join[condition]


Unnamed: 0,idx_skl,Score_0_skl,Score_1_skl,Score_2_skl,Proba_0_skl,Proba_1_skl,Proba_2_skl,LogProba_0_skl,LogProba_1_skl,LogProba_2_skl,...,Score_1_cpp,Score_2_cpp,Proba_0_cpp,Proba_1_cpp,Proba_2_cpp,LogProba_0_cpp,LogProba_1_cpp,LogProba_2_cpp,Decision_cpp,DecisionProba
