# ML2CPP

## Preparing the dataset

In [1]:
from sklearn import datasets
import numpy as np
import pandas as pd

def populate_table(tablename, feature_names):
    """Export the scikit-learn iris dataset as a CSV file.

    The written table has an integer 'idx' column (row number), one column
    per entry of ``feature_names`` holding ``iris.data``, and an integer
    'TGT' column holding ``iris.target``.  The DataFrame index is written
    too (``to_csv`` is called without ``index=False``), so the file starts
    with an extra unnamed column.

    Parameters
    ----------
    tablename : str
        Path of the CSV file to write.
    feature_names : list of str
        Column names for the feature columns; must be a list because it is
        concatenated with the 'idx' and 'TGT' names.
    """
    iris = datasets.load_iris()
    features = iris.data
    n_rows = features.shape[0]
    # Row ids and labels are reshaped to column vectors so that they can be
    # stacked side by side with the feature matrix.
    row_ids = np.arange(n_rows).reshape(n_rows, 1)
    labels = iris.target.reshape(n_rows, 1)
    table = pd.DataFrame(np.concatenate((row_ids, features, labels), axis=1))
    table.columns = ['idx'] + feature_names + ['TGT']
    # The concatenation above produced floats everywhere; restore integers
    # for the label and the key columns.
    for int_column in ('TGT', 'idx'):
        table[int_column] = table[int_column].apply(int)
    table.to_csv(tablename, float_format='%.14g')



In [2]:
# Description of the modelling table used throughout the notebook.
# NOTE(review): 'primary_key' is "KEY" while populate_table names the row-id
# column 'idx'; only the 'features' and 'targets' entries are read in the
# cells of this notebook -- confirm whether "KEY" is still intended.
metadata = {
    "primary_key": "KEY",
    "features": [
        'sepal_length_cm',
        'sepal_width_cm',
        'petal_length_cm',
        'petal_width_cm',
    ],
    "targets": ["TGT"],
    "table": "iris",
}

In [3]:
# Write the iris table to /tmp/iris.csv, naming the feature columns after metadata["features"].
populate_table("/tmp/iris.csv" , metadata["features"])


In [4]:
# Read the exported table back; the index column written by to_csv comes back as an unnamed column.
df = pd.read_csv("/tmp/iris.csv")
# Display a random sample of rows; the fixed random_state makes the sample repeatable.
df.sample(12, random_state=1960)

Unnamed: 0.1,Unnamed: 0,idx,sepal_length_cm,sepal_width_cm,petal_length_cm,petal_width_cm,TGT
114,114,114,5.8,2.8,5.1,2.4,2
74,74,74,6.4,2.9,4.3,1.3,1
9,9,9,4.9,3.1,1.5,0.1,0
88,88,88,5.6,3.0,4.1,1.3,1
25,25,25,5.0,3.0,1.6,0.2,0
5,5,5,5.4,3.9,1.7,0.4,0
48,48,48,5.3,3.7,1.5,0.2,0
117,117,117,7.7,3.8,6.7,2.2,2
83,83,83,6.0,2.7,5.1,1.6,1
105,105,105,7.6,3.0,6.6,2.1,2


## Training a Model

In [5]:

from sklearn.svm import OneClassSVM

# One-class SVM with default hyper-parameters; `clf` is reused by the later cells.
clf = OneClassSVM()

# train any scikit model on the iris dataset
# NOTE(review): OneClassSVM is an outlier detector, so the target array passed
# as second argument is presumably ignored by fit() -- confirm against the
# scikit-learn documentation. It is also a 2-D (N, 1) array, which a
# supervised estimator may warn about.
clf.fit(df[metadata['features']].values, df[metadata['targets']].values)


OneClassSVM()

## Deploying the Model

In [6]:

def generate_cpp_for_model(model, feature_names=None):
    """Ask the sklearn2sql web service to translate a model into C++ source.

    The model is pickled, base64-encoded and POSTed as JSON to
    https://sklearn2sql.herokuapp.com/model with the "CPP" dialect.

    Parameters
    ----------
    model : object
        Picklable model (a fitted scikit-learn estimator in this notebook).
    feature_names : list of str, optional
        Names sent as "FeatureNames".  Defaults to ``metadata['features']``,
        which is what this function always used before the parameter existed.

    Returns
    -------
    str
        The generated code, read from
        ``content["model"]["SQLGenrationResult"][0]["SQL"]`` of the JSON reply.

    Raises
    ------
    requests.HTTPError
        If the service answers with a 4xx/5xx status.
    """
    import pickle, requests, base64
    if feature_names is None:
        feature_names = metadata['features']
    b64_data = base64.b64encode(pickle.dumps(model)).decode('utf-8')
    # send the model to the web service
    # NOTE: the pickled model leaves this machine and is handed to a
    # third-party service; do not use this with confidential models.
    json_data={"Name":"model_cpp_sample",
               "PickleData":b64_data ,
               "SQLDialect":"CPP",
               "FeatureNames" : feature_names}
    r = requests.post("https://sklearn2sql.herokuapp.com/model", json=json_data)
    # Surface a service failure as an explicit HTTP error instead of an
    # obscure JSON-decoding error or KeyError further down.
    r.raise_for_status()
    content = r.json()
    # "SQLGenrationResult" (sic) is the key name as spelled by the service.
    lCPP = content["model"]["SQLGenrationResult"][0]["SQL"]
    return lCPP


# Generate the C++ source for the trained model; `lCPPCode` is used by the cells below.
lCPPCode = generate_cpp_for_model(clf);


In [7]:
# Show the generated C++ source as plain text.
print(lCPPCode)

namespace  {

	std::vector<std::any> lProblem_data_dual = { 
	 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 0.9345504106360158 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 0.06544958936398419 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,};
	

	std::vector<std::vector<std::any> > lProblem_data_sv = { 
	 { 5.1, 3.5, 1.4, 0.2 },
	 { 4.9, 3.0, 1.4, 0.2 },
	 { 4.7, 3.2, 1.3, 0.2 },
	 { 4.6, 3.1, 1.5, 0.2 },
	 { 5.0, 3.6, 1.4, 0.2 },
	 { 4.6, 3.4, 1.4, 0.3 },
	 { 4.4, 2.9, 1.4, 0.2 },
	 { 4.9, 3.1, 1.5, 0.1 },
	 { 5.4, 3.7, 1.5, 0.2 },
	 { 4.8, 3.4, 1.6, 0.2 },
	 { 4.8, 3.0, 1.4, 0.1 },
	 { 4.3, 3.0, 1.1, 0.1 },
	 { 5.8, 4.0, 1.2, 0.2 },
	 { 5.7, 4.4, 1.5, 0.4 },
	 { 5.4, 3.9, 1.3, 0.4

In [8]:
    def write_text_to_file(iCPPCode, oCPPFile):
        """Write the text ``iCPPCode`` to the file ``oCPPFile``.

        The file is opened in "w" mode, so existing content is replaced.
        """
        lHandle = open(oCPPFile, "w")
        try:
            lHandle.write(iCPPCode)
        finally:
            # Always release the file, even if the write fails.
            lHandle.close()

    def add_cpp_main_function(iCPPCode, iCSVFile):
        """Wrap generated C++ code into a complete program.

        The result is an ``#include "Generic.i"`` line, the code in
        ``iCPPCode``, and a ``main`` function that calls
        ``score_csv_file`` with ``iCSVFile`` as a string literal.

        Parameters
        ----------
        iCPPCode : str
            C++ source to embed before ``main``.
        iCSVFile : str
            Path of the CSV file passed to ``score_csv_file``.

        Returns
        -------
        str
            The complete C++ source text.
        """
        # The path ends up inside a C++ string literal: escape backslashes
        # and double quotes so that such paths still give valid C++ that
        # denotes the same path.  Ordinary paths are left unchanged.
        lEscapedCSVFile = iCSVFile.replace("\\", "\\\\").replace("\"", "\\\"")
        lParts = [
            "#include \"Generic.i\"\n\n",
            iCPPCode,
            "\tint main() {\n",
            "\t\tscore_csv_file(\"" + lEscapedCSVFile + "\");\n",
            "\treturn 0;\n}\n",
        ]
        return "".join(lParts)

    def compile_cpp_code_as_executable(iName):
        """Compile ``iName + ".cpp"`` into ``iName + ".exe"`` with g++.

        The command line (C++17, debug info, warnings on) is printed before
        it is run.  ``subprocess.check_output`` raises
        ``subprocess.CalledProcessError`` if g++ exits with a non-zero status.
        """
        import subprocess
        lSource = iName + ".cpp"
        lBinary = iName + ".exe"
        lCommand = ["g++", "-Wall", "-Wno-unused-function", "-std=c++17" , "-g" ,  "-o", lBinary, lSource]
        print("EXECUTING" , "'%s'" % " ".join(lCommand))
        # The compiler's standard output is captured and discarded.
        subprocess.check_output(lCommand)

    def execute_cpp_model(iName, iCSVFile):
        """Run the executable ``iName + ".exe"`` on ``iCSVFile`` and return its output.

        The program's standard output is decoded to text; its first and last
        1000 characters are printed as a preview and the full text is returned.
        """
        import subprocess
        lRawOutput = subprocess.check_output([iName + ".exe",  iCSVFile])
        lText = lRawOutput.decode()
        # Preview only the head and the tail to keep the cell output short.
        for lExcerpt in (lText[:1000], lText[-1000:]):
            print(lExcerpt)
        return lText
        
    def execute_cpp_code(iCPPCode, iCSVFile):
        """Compile and run generated C++ code on a CSV file.

        The code is completed with a ``main`` function, written to a ``.cpp``
        file under /tmp, compiled, and executed with ``iCSVFile`` as argument.
        The program's output is saved next to it in a ``.out`` file.

        Parameters
        ----------
        iCPPCode : str
            Generated C++ source (without ``main``).
        iCSVFile : str
            Path of the CSV file to score.

        Returns
        -------
        str
            Path of the ``.out`` file holding the program's output.
        """
        # Derive the work-file name from the argument rather than from the
        # notebook-global `clf`, so the function does not depend on (or fail
        # without) a variable defined in another cell.
        lName = "/tmp/sklearn2sql_cpp_" + str(id(iCPPCode))
        lCPPCode = add_cpp_main_function(iCPPCode, iCSVFile)
        write_text_to_file(lCPPCode, lName + ".cpp")
        compile_cpp_code_as_executable(lName)
        result = execute_cpp_model(lName, iCSVFile)
        write_text_to_file(str(result), lName + ".out")
        return lName + ".out"


In [9]:
# Export the iris table again, this time with generic feature column names.
populate_table("/tmp/iris2.csv" , ["Feature_0", "Feature_1", "Feature_2", "Feature_3"])
# Compile and run the generated C++ on that file; the return value is the path of the saved output.
lCPPOutput = execute_cpp_code(lCPPCode , "/tmp/iris2.csv")
# Load the program's output (CSV text with idx, AnomalyScore and OutlierIndicator columns).
cpp_output = pd.read_csv(lCPPOutput)

EXECUTING 'g++ -Wall -Wno-unused-function -std=c++17 -g -o /tmp/sklearn2sql_cpp_140625027087232.exe /tmp/sklearn2sql_cpp_140625027087232.cpp'
idx,AnomalyScore,OutlierIndicator
0,-0.34323780111765,-1
1,-0.76620153358461,-1
2,-1.39860335283394,-1
3,-1.02110917929168,-1
4,-0.57323878814235,-1
5,0.25354493881046,1
6,-1.13691136835318,-1
7,0.02613937420360,1
8,-2.46755916466066,-1
9,-0.45538736610649,-1
10,-0.26124668409760,-1
11,-0.00627556327994,-1
12,-1.17793271027331,-1
13,-4.10280702926477,-1
14,-3.21288283440612,-1
15,-3.17108134207523,-1
16,-1.37581488301602,-1
17,-0.17801968917610,-1
18,-0.00000031297016,-1
19,-0.33890384040303,-1
20,0.88336069368934,1
21,0.02479113206537,1
22,-3.25215304605427,-1
23,1.28981112379763,1
24,0.80029316795027,1
25,0.14815288416468,1
26,0.69558526880331,1
27,0.11027363512144,1
28,-0.22081487754398,-1
29,-0.28498808876470,-1
30,-0.09809518364322,-1
31,0.45673637603277,1
32,-1.71654677463784,-1
33,-2.48649679258551,-1
34,-0.23238236435215,-1
35,-1.27995437

In [10]:
# Display a repeatable random sample (fixed random_state) of the C++ scores.
cpp_output.sample(12, random_state=1960)

Unnamed: 0,idx,AnomalyScore,OutlierIndicator
114,114,-1.272349,-1
74,74,2.705024,1
9,9,-0.455387,-1
88,88,3.237676,1
25,25,0.148153,1
5,5,0.253545,1
48,48,-0.210193,-1
117,117,-11.010502,-1
83,83,0.746726,1
105,105,-8.752248,-1


In [11]:
# Score the same rows with the scikit-learn model to get reference values.
# (The never-used `skl_outputs` placeholder frame was removed.)
X = df[metadata['features']].values
skl_output_key = pd.DataFrame(list(range(X.shape[0])), columns=['idx'])

skl_output_outlier_indicator = pd.DataFrame(clf.predict(X), columns=['OutlierIndicator'])
skl_output_outlier_score = pd.DataFrame(clf.decision_function(X), columns=['AnomalyScore'])
# Same column layout as the C++ output: idx, AnomalyScore, OutlierIndicator.
skl_output = pd.concat([skl_output_key, skl_output_outlier_score, skl_output_outlier_indicator] , axis=1)
skl_output.sample(12, random_state=1960)

Unnamed: 0,idx,AnomalyScore,OutlierIndicator
114,114,-1.272349,-1
74,74,2.705024,1
9,9,-0.455387,-1
88,88,3.237676,1
25,25,0.148153,1
5,5,0.253545,1
48,48,-0.210193,-1
117,117,-11.010502,-1
83,83,0.746726,1
105,105,-8.752248,-1


In [12]:
# Put the scikit-learn and C++ results side by side; overlapping column names get _skl / _cpp suffixes.
# NOTE(review): with DataFrame.join, `on='idx'` matches skl_output's 'idx' column against
# cpp_output's *index* (not its 'idx' column); this lines up only while cpp_output keeps
# its default 0..N-1 index -- confirm if the C++ output is ever filtered or reordered.
cpp_skl_join = skl_output.join(cpp_output , how='left', on='idx', lsuffix='_skl', rsuffix='_cpp')

In [13]:
# Display a repeatable random sample (fixed random_state) of the joined results.
cpp_skl_join.sample(12, random_state=1960)

Unnamed: 0,idx_skl,AnomalyScore_skl,OutlierIndicator_skl,idx_cpp,AnomalyScore_cpp,OutlierIndicator_cpp
114,114,-1.272349,-1,114,-1.272349,-1
74,74,2.705024,1,74,2.705024,1
9,9,-0.455387,-1,9,-0.455387,-1
88,88,3.237676,1,88,3.237676,1
25,25,0.148153,1,25,0.148153,1
5,5,0.253545,1,5,0.253545,1
48,48,-0.210193,-1,48,-0.210193,-1
117,117,-11.010502,-1,117,-11.010502,-1
83,83,0.746726,1,83,0.746726,1
105,105,-8.752248,-1,105,-8.752248,-1


In [14]:
# Select the rows where the C++ outlier indicator differs from scikit-learn's.
condition = (cpp_skl_join.OutlierIndicator_cpp != cpp_skl_join.OutlierIndicator_skl)
# An empty frame here means both implementations agree on every row.
cpp_skl_join[condition]


Unnamed: 0,idx_skl,AnomalyScore_skl,OutlierIndicator_skl,idx_cpp,AnomalyScore_cpp,OutlierIndicator_cpp
