# ML2CPP

## Preparing the dataset

In [1]:
from sklearn import datasets
import numpy as np
import pandas as pd

def populate_table(tablename, feature_names):
    iris = datasets.load_iris()
    X = iris.data  
    N = X.shape[0]
    y = iris.target.reshape(N,1)
    k = np.arange(N).reshape(N, 1)
    k_X_y = np.concatenate((k, X, y) , axis=1)
    lTable=pd.DataFrame(k_X_y)
    # print(lTable.head())
    lTable.columns = ['idx'] + feature_names + ['TGT'];
    lTable['TGT'] = lTable['TGT'].apply(int)
    lTable['idx'] = lTable['idx'].apply(int)
    lTable.to_csv(tablename , float_format='%.14g')



In [2]:
metadata = {"primary_key" : "KEY",
            "features" : ['sepal_length_cm', 'sepal_width_cm', 'petal_length_cm', 'petal_width_cm'],
            "targets" : ["TGT"],
            "table" : "iris"}

In [3]:
populate_table("/tmp/iris.csv" , metadata["features"])


In [4]:
df = pd.read_csv("/tmp/iris.csv")
df.sample(12, random_state=1960)

Unnamed: 0.1,Unnamed: 0,idx,sepal_length_cm,sepal_width_cm,petal_length_cm,petal_width_cm,TGT
114,114,114,5.8,2.8,5.1,2.4,2
74,74,74,6.4,2.9,4.3,1.3,1
9,9,9,4.9,3.1,1.5,0.1,0
88,88,88,5.6,3.0,4.1,1.3,1
25,25,25,5.0,3.0,1.6,0.2,0
5,5,5,5.4,3.9,1.7,0.4,0
48,48,48,5.3,3.7,1.5,0.2,0
117,117,117,7.7,3.8,6.7,2.2,2
83,83,83,6.0,2.7,5.1,1.6,1
105,105,105,7.6,3.0,6.6,2.1,2


## Training a Model

In [5]:

from sklearn.covariance import EllipticEnvelope

clf = EllipticEnvelope()

# train any scikit model on the iris dataset
clf.fit(df[metadata['features']].values, df[metadata['targets']].values)


EllipticEnvelope()

## Deploying the Model

In [6]:

def generate_cpp_for_model(model):
    import pickle, json, requests, base64
    b64_data = base64.b64encode(pickle.dumps(model)).decode('utf-8')
    # send the model th the web service
    json_data={"Name":"model_cpp_sample", 
               "PickleData":b64_data , 
               "SQLDialect":"CPP",
               "FeatureNames" : metadata['features']}
    r = requests.post("https://sklearn2sql.herokuapp.com/model", json=json_data)
    content = r.json()
    lCPP = content["model"]["SQLGenrationResult"][0]["SQL"]
    # print(lCPP);
    return lCPP


lCPPCode = generate_cpp_for_model(clf);


In [7]:
print(lCPPCode)

namespace  {

	std::vector<std::string> get_input_names(){
		std::vector<std::string> lFeatures = { "Feature_0", "Feature_1", "Feature_2", "Feature_3" };

		return lFeatures;
	}

	std::vector<std::string> get_output_names(){
		std::vector<std::string> lOutputs = { 
			"AnomalyScore","OutlierIndicator" };

		return lOutputs;
	}

	tTable compute_outlier_scores(std::any Feature_0, std::any Feature_1, std::any Feature_2, std::any Feature_3) {
		std::any Feature_0_c = Feature_0 - 5.754411764705883;

		std::any Feature_1_c = Feature_1 - 3.054411764705883;

		std::any Feature_2_c = Feature_2 - 3.6080882352941193;

		std::any Feature_3_c = Feature_3 - 1.1500000000000004;

		std::any lMahalanobis = 10.874533961614407 * Feature_0_c * Feature_0_c + -7.523594252028072 * Feature_0_c * Feature_1_c + -8.670095822562605 * Feature_0_c * Feature_2_c + 8.302222841507668 * Feature_0_c * Feature_3_c + -7.523594252028072 * Feature_1_c * Feature_0_c + 13.824125187366795 * Feature_1_c * Feature_1_c + 10.30781

In [8]:
    def write_text_to_file(iCPPCode, oCPPFile):          
        with open(oCPPFile, "w") as text_file:
            text_file.write(iCPPCode)

    def add_cpp_main_function(iCPPCode, iCSVFile):
        lCPPCode = "#include \"Generic.i\"\n\n"
        lCPPCode = lCPPCode + iCPPCode
        lCPPCode = lCPPCode + "\tint main() {\n"
        lCPPCode = lCPPCode + "\t\tscore_csv_file(\"" + iCSVFile +"\");\n"
        lCPPCode = lCPPCode + "\treturn 0;\n}\n"
        return lCPPCode

    def compile_cpp_code_as_executable(iName):
        import subprocess
        lCommand = ["g++", "-Wall", "-Wno-unused-function", "-std=c++17" , "-g" ,  "-o", iName + ".exe",  iName + ".cpp"]
        print("EXECUTING" , "'" + " ".join(lCommand) + "'")
        result = subprocess.check_output(lCommand)
        # print(result)

    def execute_cpp_model(iName, iCSVFile):
        import subprocess
        result2 = subprocess.check_output([iName + ".exe",  iCSVFile])
        result2 = result2.decode()
        print(result2[:1000])
        print(result2[-1000:])
        return result2
        
    def execute_cpp_code(iCPPCode, iCSVFile):
        lName = "/tmp/sklearn2sql_cpp_" + str(id(clf));
        lCPPCode = add_cpp_main_function(iCPPCode, iCSVFile)
        write_text_to_file(lCPPCode, lName + ".cpp")
        compile_cpp_code_as_executable(lName)
        result = execute_cpp_model(lName, iCSVFile)
        write_text_to_file(str(result), lName + ".out")
        return lName + ".out"


In [9]:
populate_table("/tmp/iris2.csv" , ["Feature_0", "Feature_1", "Feature_2", "Feature_3"])
lCPPOutput = execute_cpp_code(lCPPCode , "/tmp/iris2.csv")
cpp_output = pd.read_csv(lCPPOutput)

EXECUTING 'g++ -Wall -Wno-unused-function -std=c++17 -g -o /tmp/sklearn2sql_cpp_139760826463904.exe /tmp/sklearn2sql_cpp_139760826463904.cpp'
idx,AnomalyScore,OutlierIndicator
0,8.24915334711441,1
1,7.03733865038413,1
2,7.99500550586754,1
3,7.73849212676779,1
4,7.83754456393621,1
5,6.27179138643661,1
6,7.31302726310739,1
7,8.49199378937914,1
8,6.39844743207702,1
9,7.86479806317459,1
10,6.88862729152132,1
11,7.23464053421681,1
12,7.61570653822403,1
13,6.11221598950551,1
14,1.48820159416802,1
15,0.03227725878276,1
16,4.11656630069086,1
17,7.89103961421618,1
18,5.55097011712715,1
19,6.79483365249443,1
20,7.53017881733020,1
21,7.31534580204309,1
22,6.44141939353085,1
23,7.30523129500270,1
24,3.34440799344498,1
25,7.80256626152535,1
26,8.35458702064837,1
27,8.13323437177804,1
28,7.86584517723186,1
29,7.70088781980230,1
30,8.26768298784379,1
31,4.90127340425317,1
32,-0.74845884481820,-1
33,2.11513815972678,1
34,8.26366128403188,1
35,6.21667729203274,1
36,4.88788747581053,1
37,6.3130852860941

In [10]:
cpp_output.sample(12, random_state=1960)

Unnamed: 0,idx,AnomalyScore,OutlierIndicator
114,114,-6.71691,-1
74,74,8.171262,1
9,9,7.864798,1
88,88,8.282556,1
25,25,7.802566,1
5,5,6.271791,1
48,48,7.115684,1
117,117,-12.967743,-1
83,83,6.776288,1
105,105,-0.767654,-1


In [11]:
skl_outputs = pd.DataFrame()
X = df[metadata['features']].values
skl_output_key = pd.DataFrame(list(range(X.shape[0])), columns=['idx']);

skl_output_outlier_indicator = pd.DataFrame(clf.predict(X), columns=['OutlierIndicator'])
skl_output_outlier_score = pd.DataFrame(clf.decision_function(X), columns=['AnomalyScore'])
skl_output = pd.concat([skl_output_key, skl_output_outlier_score, skl_output_outlier_indicator] , axis=1)
skl_output.sample(12, random_state=1960)

Unnamed: 0,idx,AnomalyScore,OutlierIndicator
114,114,-6.71691,-1
74,74,8.171262,1
9,9,7.864798,1
88,88,8.282556,1
25,25,7.802566,1
5,5,6.271791,1
48,48,7.115684,1
117,117,-12.967743,-1
83,83,6.776288,1
105,105,-0.767654,-1


In [12]:
cpp_skl_join = skl_output.join(cpp_output , how='left', on='idx', lsuffix='_skl', rsuffix='_cpp')

In [13]:
cpp_skl_join.sample(12, random_state=1960)

Unnamed: 0,idx_skl,AnomalyScore_skl,OutlierIndicator_skl,idx_cpp,AnomalyScore_cpp,OutlierIndicator_cpp
114,114,-6.71691,-1,114,-6.71691,-1
74,74,8.171262,1,74,8.171262,1
9,9,7.864798,1,9,7.864798,1
88,88,8.282556,1,88,8.282556,1
25,25,7.802566,1,25,7.802566,1
5,5,6.271791,1,5,6.271791,1
48,48,7.115684,1,48,7.115684,1
117,117,-12.967743,-1,117,-12.967743,-1
83,83,6.776288,1,83,6.776288,1
105,105,-0.767654,-1,105,-0.767654,-1


In [14]:
condition = (cpp_skl_join.OutlierIndicator_cpp != cpp_skl_join.OutlierIndicator_skl)
cpp_skl_join[condition]


Unnamed: 0,idx_skl,AnomalyScore_skl,OutlierIndicator_skl,idx_cpp,AnomalyScore_cpp,OutlierIndicator_cpp
