# ML2CPP

## Preparing the dataset

In [1]:
from sklearn import datasets
import numpy as np
import pandas as pd

def populate_table(tablename, feature_names):
    """Dump the scikit-learn iris dataset to a CSV file.

    The written table has an integer ``idx`` column (the row number), one
    column per entry of ``feature_names`` (in the order of ``iris.data``)
    and an integer ``TGT`` column holding ``iris.target``.

    Parameters
    ----------
    tablename : str
        Path of the CSV file to write.
    feature_names : list of str
        Names given to the feature columns; must be a list because it is
        concatenated with the ``idx`` / ``TGT`` column names.

    Notes
    -----
    ``to_csv`` is called without ``index=False``, so the DataFrame index is
    written too, as a leading column with an empty header.
    """
    lIris = datasets.load_iris()
    lFeatures = lIris.data
    lRowCount = lFeatures.shape[0]
    # Key and target are turned into column vectors so they can be stacked
    # side by side with the 2-D feature matrix.
    lKeys = np.arange(lRowCount).reshape(lRowCount, 1)
    lTargets = lIris.target.reshape(lRowCount, 1)
    lTable = pd.DataFrame(np.hstack((lKeys, lFeatures, lTargets)))
    lTable.columns = ['idx'] + feature_names + ['TGT']
    # Stacking with the float features promoted everything to float;
    # restore integer values for the label and the key.
    for lColumn in ('TGT', 'idx'):
        lTable[lColumn] = lTable[lColumn].apply(int)
    lTable.to_csv(tablename, float_format='%.14g')



In [2]:
# Description of the table used throughout the notebook: key column name,
# feature column names, target column names and table name.
metadata = dict(
    primary_key="KEY",
    features=['sepal_length_cm', 'sepal_width_cm', 'petal_length_cm', 'petal_width_cm'],
    targets=["TGT"],
    table="iris",
)

In [3]:
# Write the iris table to disk, naming the feature columns as listed in `metadata`.
populate_table("/tmp/iris.csv", metadata["features"])


In [4]:
# Load the CSV written above and show a reproducible random sample of it
# (fixed random_state, so the same rows come back on every run).
df = pd.read_csv("/tmp/iris.csv")
df.sample(n=12, random_state=1960)

Unnamed: 0.1,Unnamed: 0,idx,sepal_length_cm,sepal_width_cm,petal_length_cm,petal_width_cm,TGT
114,114,114,5.8,2.8,5.1,2.4,2
74,74,74,6.4,2.9,4.3,1.3,1
9,9,9,4.9,3.1,1.5,0.1,0
88,88,88,5.6,3.0,4.1,1.3,1
25,25,25,5.0,3.0,1.6,0.2,0
5,5,5,5.4,3.9,1.7,0.4,0
48,48,48,5.3,3.7,1.5,0.2,0
117,117,117,7.7,3.8,6.7,2.2,2
83,83,83,6.0,2.7,5.1,1.6,1
105,105,105,7.6,3.0,6.6,2.1,2


## Training a Model

In [5]:


# Train any scikit-learn model on the iris dataset.
from sklearn.svm import SVC
clf = SVC()
# metadata['targets'] is a one-element list, so df[...] selects an (n, 1)
# column; flatten it to the 1-D label vector that fit() expects. Passing the
# column vector makes scikit-learn emit a DataConversionWarning.
clf.fit(df[metadata['features']].values, df[metadata['targets']].values.ravel())


  return f(**kwargs)


SVC()

## Deploying the Model

In [6]:

def generate_cpp_for_model(model, feature_names=None, timeout=120):
    """Ask the sklearn2sql web service to translate a model into C++ code.

    The model is pickled, base64-encoded and POSTed as JSON together with
    the feature names; the generated code is read from the JSON reply.

    Parameters
    ----------
    model : object
        Picklable estimator to convert.
    feature_names : list of str, optional
        Names sent as ``"FeatureNames"``. When omitted, the notebook-level
        ``metadata['features']`` is used (the original behaviour).
    timeout : float, optional
        Seconds passed to ``requests.post`` as its ``timeout`` so that an
        unresponsive service cannot block the notebook forever.

    Returns
    -------
    str
        The C++ source found at ``["model"]["SQLGenrationResult"][0]["SQL"]``
        in the service reply.

    Raises
    ------
    requests.HTTPError
        If the service answers with an error status (4xx / 5xx).
    requests.Timeout
        If the service does not answer within ``timeout`` seconds.
    """
    import pickle, requests, base64
    if feature_names is None:
        feature_names = metadata['features']
    b64_data = base64.b64encode(pickle.dumps(model)).decode('utf-8')
    # send the model to the web service
    json_data = {"Name": "model_cpp_sample",
                 "PickleData": b64_data,
                 "SQLDialect": "CPP",
                 "FeatureNames": feature_names}
    r = requests.post("https://sklearn2sql.herokuapp.com/model", json=json_data, timeout=timeout)
    # Fail with the HTTP error itself rather than with an opaque JSON or
    # KeyError further down when the service rejects the request.
    r.raise_for_status()
    content = r.json()
    # "SQLGenrationResult" (sic) is the key name used by the service reply.
    lCPP = content["model"]["SQLGenrationResult"][0]["SQL"]
    return lCPP


# Convert the trained classifier; the returned C++ source is kept as a string.
lCPPCode = generate_cpp_for_model(clf)


In [7]:
# Show the generated C++ source returned by the web service.
print(lCPPCode)

namespace  {

	namespace pb_0_1 {
	
		std::vector<std::any> lProblem_data_dual_0_1 = { 
		0.0 ,0.3169291682042101 ,1.0 ,1.0 ,0.07471753254958248 ,1.0 ,1.0 ,-0.0 ,-0.0 ,-0.0 ,-0.0 ,-0.0 ,-0.0 ,-1.0 ,-0.0 ,-0.0 ,-0.39164670075379254 ,-0.0 ,-0.0 ,-0.0 ,-0.0 ,-0.0 ,-0.0 ,-0.0 ,-0.0 ,-0.0 ,-1.0 ,-0.0 ,-0.0 ,-0.0 ,-0.0 ,-0.0 ,-0.0 ,-0.0 ,-1.0 ,-1.0 , };
		
	
		std::vector<std::vector<std::any> > lProblem_data_sv_0_1 = { 
		 { 5.7, 3.8, 1.7, 0.3 },
		 { 5.4, 3.4, 1.7, 0.2 },
		 { 5.1, 3.3, 1.7, 0.5 },
		 { 4.8, 3.4, 1.9, 0.2 },
		 { 5.0, 3.0, 1.6, 0.2 },
		 { 4.5, 2.3, 1.3, 0.3 },
		 { 5.1, 3.8, 1.9, 0.4 },
		 { 7.0, 3.2, 4.7, 1.4 },
		 { 6.4, 3.2, 4.5, 1.5 },
		 { 6.9, 3.1, 4.9, 1.5 },
		 { 6.5, 2.8, 4.6, 1.5 },
		 { 5.7, 2.8, 4.5, 1.3 },
		 { 6.3, 3.3, 4.7, 1.6 },
		 { 4.9, 2.4, 3.3, 1.0 },
		 { 6.6, 2.9, 4.6, 1.3 },
		 { 6.1, 2.9, 4.7, 1.4 },
		 { 5.6, 2.9, 3.6, 1.3 },
		 { 5.6, 3.0, 4.5, 1.5 },
		 { 6.2, 2.2, 4.5, 1.5 },
		 { 5.9, 3.2, 4.8, 1.8 },
		 { 6.3, 2.5, 4.9, 1.5 },
		 { 6.1, 2.8,

In [8]:
    def write_text_to_file(iCPPCode, oCPPFile):
        """Write the text ``iCPPCode`` to the file ``oCPPFile``.

        The file is opened in ``"w"`` mode, so existing content is replaced.
        """
        with open(oCPPFile, "w") as lOutput:
            lOutput.write(iCPPCode)

    def add_cpp_main_function(iCPPCode, iCSVFile):
        """Wrap generated model code into a complete C++ program.

        The result is, in order: an ``#include "Generic.i"`` line, the code
        given in ``iCPPCode``, and a ``main`` function that calls
        ``score_csv_file`` on ``iCSVFile`` and returns 0.

        Returns the assembled source as a string.
        """
        lPieces = [
            "#include \"Generic.i\"\n\n",
            iCPPCode,
            "\tint main() {\n",
            # The CSV path is embedded as-is inside a C++ string literal.
            "\t\tscore_csv_file(\"" + iCSVFile + "\");\n",
            "\treturn 0;\n}\n",
        ]
        return "".join(lPieces)

    def compile_cpp_code_as_executable(iName):
        """Compile ``iName + ".cpp"`` into ``iName + ".exe"`` with g++.

        The command line is printed before it is run.  ``check_output``
        raises ``subprocess.CalledProcessError`` when g++ exits with a
        non-zero status; the captured compiler output is otherwise discarded.
        """
        import subprocess
        lSource = iName + ".cpp"
        lBinary = iName + ".exe"
        lCommand = ["g++", "-Wall", "-Wno-unused-function", "-std=c++17", "-g", "-o", lBinary, lSource]
        print("EXECUTING", "'" + " ".join(lCommand) + "'")
        subprocess.check_output(lCommand)

    def execute_cpp_model(iName, iCSVFile):
        """Run ``iName + ".exe"`` with ``iCSVFile`` as its only argument.

        The program's standard output is decoded to text; its first and last
        1000 characters are printed as a preview, and the full text is
        returned.  A non-zero exit status raises
        ``subprocess.CalledProcessError`` (from ``check_output``).
        """
        import subprocess
        lRawOutput = subprocess.check_output([iName + ".exe", iCSVFile])
        lText = lRawOutput.decode()
        # Preview only the head and the tail of a potentially long output.
        for lExcerpt in (lText[:1000], lText[-1000:]):
            print(lExcerpt)
        return lText
        
    def execute_cpp_code(iCPPCode, iCSVFile):
        """Build, compile and run the generated C++ model on a CSV file.

        Steps: add a ``main`` function to ``iCPPCode``, write the program to
        a ``.cpp`` file, compile it, run the executable on ``iCSVFile`` and
        save what it printed to a ``.out`` file.

        All files share the stem ``/tmp/sklearn2sql_cpp_<id>``, where ``<id>``
        is ``id(clf)`` of the notebook-level ``clf`` object (so ``clf`` must
        exist when this is called).

        Returns the path of the ``.out`` file.
        """
        lStem = "/tmp/sklearn2sql_cpp_" + str(id(clf))
        lSourcePath = lStem + ".cpp"
        lOutputPath = lStem + ".out"
        write_text_to_file(add_cpp_main_function(iCPPCode, iCSVFile), lSourcePath)
        compile_cpp_code_as_executable(lStem)
        lScores = execute_cpp_model(lStem, iCSVFile)
        write_text_to_file(str(lScores), lOutputPath)
        return lOutputPath


In [9]:
# Write a second copy of the iris table whose feature columns carry generic
# Feature_N names (presumably the names the generated C++ reads -- TODO confirm),
# score it with the compiled model and load the scores it produced.
populate_table("/tmp/iris2.csv", ["Feature_0", "Feature_1", "Feature_2", "Feature_3"])
lCPPOutput = execute_cpp_code(lCPPCode, "/tmp/iris2.csv")
cpp_output = pd.read_csv(lCPPOutput)

EXECUTING 'g++ -Wall -Wno-unused-function -std=c++17 -g -o /tmp/sklearn2sql_cpp_139862809705632.exe /tmp/sklearn2sql_cpp_139862809705632.cpp'
idx,Score_0,Score_1,Score_2,Proba_0,Proba_1,Proba_2,LogProba_0,LogProba_1,LogProba_2,Decision,DecisionProba
0,2.23543749285287,1.16096255367151,-0.25650709255536,,,,,,,0,
1,2.23070832213714,1.16512869379274,-0.25452710603067,,,,,,,0,
2,2.23621785787935,1.13797871623364,-0.25279155284187,,,,,,,0,
3,2.23026120044212,1.15798503052157,-0.25271988795884,,,,,,,0,
4,2.23674113247089,1.15296311743861,-0.25579565287686,,,,,,,0,
5,2.22570596272710,1.18956408640365,-0.25784401501336,,,,,,,0,
6,2.23505058479494,1.14061214489691,-0.25245053759740,,,,,,,0,
7,2.23250762285382,1.16911424296031,-0.25645479377281,,,,,,,0,
8,2.23033229367838,1.14089627089005,-0.24933602966973,,,,,,,0,
9,2.23053932295177,1.16983920742397,-0.25547973200357,,,,,,,0,
10,2.23260251173577,1.17638899924110,-0.25814046717415,,,,,,,0,
11,2.23057054079204,1.16939414188246,-0.25539691052344,,

In [10]:
# Preview the C++ scores; the fixed random_state makes the sampled rows reproducible.
cpp_output.sample(12, random_state=1960)

Unnamed: 0,idx,Score_0,Score_1,Score_2,Proba_0,Proba_1,Proba_2,LogProba_0,LogProba_1,LogProba_2,Decision,DecisionProba
114,114,-0.241222,1.09845,2.229156,,,,,,,2,
74,74,-0.222127,2.237579,0.891333,,,,,,,1,
9,9,2.230539,1.169839,-0.25548,,,,,,,0,
88,88,-0.213071,2.240024,0.851788,,,,,,,1,
25,25,2.224047,1.186375,-0.256142,,,,,,,0,
5,5,2.225706,1.189564,-0.257844,,,,,,,0,
48,48,2.233387,1.172744,-0.257762,,,,,,,0,
117,117,-0.206625,0.812894,2.248088,,,,,,,2,
83,83,-0.240594,1.187573,2.188873,,,,,,,2,
105,105,-0.217309,0.816423,2.252009,,,,,,,2,


In [11]:
# Build, from scikit-learn itself, a frame with the same column names as the
# C++ output (idx, Score_*, Proba_*, LogProba_*, Decision) for comparison.
skl_outputs = pd.DataFrame()
X = df[metadata['features']].values
skl_output_key = pd.DataFrame({'idx': list(range(X.shape[0]))})

skl_output_score = pd.DataFrame(
    clf.decision_function(X),
    columns=['Score_0', 'Score_1', 'Score_2'],
)
# The probability frames have column names only and no rows, so after the
# column-wise concat below these columns contain only missing values.
skl_output_proba = pd.DataFrame(columns=['Proba_0', 'Proba_1', 'Proba_2'])
skl_output_log_proba = pd.DataFrame(columns=['LogProba_0', 'LogProba_1', 'LogProba_2'])
skl_output_decision = pd.DataFrame(clf.predict(X), columns=['Decision'])
skl_output = pd.concat(
    [
        skl_output_key,
        skl_output_score,
        skl_output_proba,
        skl_output_log_proba,
        skl_output_decision,
    ],
    axis=1,
)
skl_output.sample(n=12, random_state=1960)

Unnamed: 0,idx,Score_0,Score_1,Score_2,Proba_0,Proba_1,Proba_2,LogProba_0,LogProba_1,LogProba_2,Decision
114,114,-0.241222,1.09845,2.229156,,,,,,,2
74,74,-0.222127,2.237579,0.891333,,,,,,,1
9,9,2.230539,1.169839,-0.25548,,,,,,,0
88,88,-0.213071,2.240024,0.851788,,,,,,,1
25,25,2.224047,1.186375,-0.256142,,,,,,,0
5,5,2.225706,1.189564,-0.257844,,,,,,,0
48,48,2.233387,1.172744,-0.257762,,,,,,,0
117,117,-0.206625,0.812894,2.248088,,,,,,,2
83,83,-0.240594,1.187573,2.188873,,,,,,,2
105,105,-0.217309,0.816423,2.252009,,,,,,,2


In [12]:
# Left-join the scikit-learn results with the C++ results; columns present in
# both frames get the _skl / _cpp suffixes.
# NOTE(review): DataFrame.join(..., on='idx') matches skl_output's 'idx' column
# against cpp_output's *index*, not its 'idx' column. That lines up only while
# cpp_output keeps the default 0..N-1 index in idx order -- confirm.
cpp_skl_join = skl_output.join(cpp_output , how='left', on='idx', lsuffix='_skl', rsuffix='_cpp')

In [13]:
# Preview the joined frame on a reproducible random sample of rows.
cpp_skl_join.sample(12, random_state=1960)

Unnamed: 0,idx_skl,Score_0_skl,Score_1_skl,Score_2_skl,Proba_0_skl,Proba_1_skl,Proba_2_skl,LogProba_0_skl,LogProba_1_skl,LogProba_2_skl,...,Score_1_cpp,Score_2_cpp,Proba_0_cpp,Proba_1_cpp,Proba_2_cpp,LogProba_0_cpp,LogProba_1_cpp,LogProba_2_cpp,Decision_cpp,DecisionProba
114,114,-0.241222,1.09845,2.229156,,,,,,,...,1.09845,2.229156,,,,,,,2,
74,74,-0.222127,2.237579,0.891333,,,,,,,...,2.237579,0.891333,,,,,,,1,
9,9,2.230539,1.169839,-0.25548,,,,,,,...,1.169839,-0.25548,,,,,,,0,
88,88,-0.213071,2.240024,0.851788,,,,,,,...,2.240024,0.851788,,,,,,,1,
25,25,2.224047,1.186375,-0.256142,,,,,,,...,1.186375,-0.256142,,,,,,,0,
5,5,2.225706,1.189564,-0.257844,,,,,,,...,1.189564,-0.257844,,,,,,,0,
48,48,2.233387,1.172744,-0.257762,,,,,,,...,1.172744,-0.257762,,,,,,,0,
117,117,-0.206625,0.812894,2.248088,,,,,,,...,0.812894,2.248088,,,,,,,2,
83,83,-0.240594,1.187573,2.188873,,,,,,,...,1.187573,2.188873,,,,,,,2,
105,105,-0.217309,0.816423,2.252009,,,,,,,...,0.816423,2.252009,,,,,,,2,


In [14]:
# Rows where the C++ decision differs from the scikit-learn decision;
# an empty result means both implementations predict the same class everywhere.
condition = cpp_skl_join["Decision_cpp"].ne(cpp_skl_join["Decision_skl"])
cpp_skl_join.loc[condition]


Unnamed: 0,idx_skl,Score_0_skl,Score_1_skl,Score_2_skl,Proba_0_skl,Proba_1_skl,Proba_2_skl,LogProba_0_skl,LogProba_1_skl,LogProba_2_skl,...,Score_1_cpp,Score_2_cpp,Proba_0_cpp,Proba_1_cpp,Proba_2_cpp,LogProba_0_cpp,LogProba_1_cpp,LogProba_2_cpp,Decision_cpp,DecisionProba
