In [None]:
# Comment / uncomment if needed: adds the Symetry Python directory to the
# module search path used by later imports.
import sys

SYMETRY_PYTHON_DIR = "/opt/symetry/python"
# Guarded so that re-running this cell does not keep appending duplicate entries.
if SYMETRY_PYTHON_DIR not in sys.path:
    sys.path.append(SYMETRY_PYTHON_DIR)

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pathlib
import py4j.java_gateway as py4jjg
import sys
import time

# Announce the start of the notebook run.
print("irisExample.ipynb start")

# Gateway helper module used throughout the notebook (server start-up, client
# creation, project creation, pandas -> SML JSON conversion).
# NOTE(review): presumably resolved through the "/opt/symetry/python" entry
# appended to sys.path in the first cell -- confirm.
import SMLPy4JGateway as smlgw

In [None]:
# Start the Py4J server, pointing its Java classpath at the Symetry libraries.
SML_JAVA_CLASSPATH = '/opt/symetry/lib/*'
gateway_server = smlgw.SMLPy4JGatewayServer(java_classpath=SML_JAVA_CLASSPATH)

In [None]:
# Fixed grace period (seconds) given to the Py4J gateway server to become ready
# before the client is requested.
GATEWAY_STARTUP_DELAY_SECONDS = 2

print('Getting the Python client.')
time.sleep(GATEWAY_STARTUP_DELAY_SECONDS)
gateway = smlgw.get_python_client()

# Shorthand for the gateway's JVM view; later cells reach Java classes through it.
sml = gateway.jvm

In [None]:
# Settings for a local, unpersisted SML project.
user, prj_name, prj_type, persist = 'c1', 'my_prj', 0, False

# The project handle `prj` is what every later cell learns from and queries.
prj = smlgw.createSMLProject(gateway, user, prj_name, prj_type, persist)

In [None]:
dataset = "/datasets/c/IrisFiles/Iris_data.csv"
CHUNK_SIZE = 50
print("Learn dataset %s." % dataset)

# Stream the CSV in chunks of CHUNK_SIZE rows; every column is read as object
# dtype, so no numeric parsing happens on the pandas side.
chunk_reader = pd.read_csv(dataset, sep=',', header=0, dtype=object, chunksize=CHUNK_SIZE)
row_count = 0
for chunk_no, chunk in enumerate(chunk_reader):
    if chunk_no == 0:
        # First chunk only: capture the column names and let SML guess the
        # attribute types from an untyped copy of this chunk.
        attr_names = list(chunk.columns)

        probe_json = smlgw.pandas_df_to_sml_json(chunk)
        probe_frame = sml.DataFrame()
        probe_frame.fromJSON(probe_json)
        guessed_type_chars = sml.com.rtlm.util.AttributeTypes.guessTypes(probe_frame)
        attr_types = ",".join(str(type_char) for type_char in guessed_type_chars)

        print("SML Attribute Names: [%r]" % attr_names)
        print("SML Attribute Types: [%r]" % attr_types)

    # Important: go through the 'pandas_df_to_sml_json' utility so the pandas
    # chunk is serialized to the JSON string that an SML DataFrame is built from.
    chunk_json = smlgw.pandas_df_to_sml_json(chunk, attr_types)
    sml_frame = sml.DataFrame()
    sml_frame.fromJSON(chunk_json)
    prj.learn(sml_frame)  # feed this chunk to the project

    row_count += sml_frame.getSize()
    print("Rows processed: %d" % row_count)

In [None]:
print("Exploring the data.")
print("Total number of attributes: %d" % len(attr_names))

# Query the univariate stats once by attribute index and once by attribute
# name; the second message states the two results are expected to match.
stats_by_index, stats_by_name = prj.univariate(7), prj.univariate("sepal_width_b2")
print("Attribute 7 univariate stats: %r" % stats_by_index)
print("Attribute sepal_width_b2 univariate stats (should be the same): %r" % stats_by_name)

In [None]:
print("Measuring some univariate stats.")

# Fetch each attribute's univariate stats once and read both fields from the
# same result, instead of issuing one gateway call for the variance and a
# second one for the mean.
# NOTE(review): the covered indices are 0 .. len(attr_names) - 2, exactly as
# before, so the last attribute is skipped -- confirm that this is intentional.
attr_stats = [prj.univariate(attr_idx) for attr_idx in range(len(attr_names) - 1)]
attr_variance = [stats["variance"] for stats in attr_stats]
attr_mean = [stats["mean"] for stats in attr_stats]

print("Attribute variances: %r" % attr_variance)
print("Attribute means: %r" % attr_mean)

In [None]:
print("Plotting Distributions")

# Attributes to plot, in display order.
x = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

# One univariate lookup per attribute; both the mean (marker position) and the
# standard deviation (error-bar half-length) come from the same result.
measure_stats = [prj.univariate(attr_name) for attr_name in x]
e = np.array([stats['stddev'] for stats in measure_stats])
y = np.array([stats['mean'] for stats in measure_stats])

# Markers only (no connecting line): mean +/- one standard deviation.
plt.errorbar(x, y, yerr=e, linestyle='None', marker='^')
plt.title('Distribution: Sepal|Petal width & length')
plt.show()

In [None]:
print("Measuring some bivariate stats.")

# Pairwise stats for attribute indices 0 and 2 (named in the message below).
pair_stats = prj.bivariate(0, 2)
pair_lin_corr, pair_covar = pair_stats["linCorr"], pair_stats["covar"]
print("Bivariate stats for attributes [sepal_length, petal_length]: linear correlation = %.2f, covariance = %.2f" %
    (pair_lin_corr, pair_covar))

In [None]:
print("Calculating all of the pairwise coefficients.")

# Square matrix of linear correlations: entry [r][c] is the "linCorr" value of
# the bivariate stats for attribute indices r and c, filled row by row.
attr_count = len(attr_names)
lin_corr = [
    [prj.bivariate(row_idx, col_idx)["linCorr"] for col_idx in range(attr_count)]
    for row_idx in range(attr_count)
]

# One tick per attribute index on both axes, then render the matrix as a heat map.
tick_positions = range(0, attr_count)
plt.xticks(tick_positions)
plt.yticks(tick_positions)
plt.imshow(lin_corr, cmap="hot", interpolation='none')
plt.title("Linear Correlation")
_ = plt.colorbar()

In [None]:
# Z test over attribute indices 0, 6 and 7.
print("Performing Z test.")
zres = prj.ztest(0, 6, 7)

# Positions 6 and 7 of the result are the values reported as z and zp.
z_value, zp_value = zres[6], zres[7]
print("Z test for attributes [sepal_length, sepal_width_b1, sepal_width_b2]: z = %.2f, zp = %.2f" %
    (z_value, zp_value))

In [None]:
print("Performing PCA exploration with [sepal_length, sepal_width, petal_width].")

# Java TreeMap of attribute index -> attribute name selecting the PCA inputs.
attr_ind_name_map = sml.java.util.TreeMap()
for attr_idx, attr_name in ((0, 'sepal_length'), (1, 'sepal_width'), (3, 'petal_width')):
    attr_ind_name_map.put(attr_idx, attr_name)

# Use np.nan: the upper-case alias np.NAN was removed in NumPy 2.0 and raises
# AttributeError there, while np.nan exists in every NumPy version.
# The response is an object exposing getEigenValues() / getEigenVectors().
pcaRsp = prj.pca(attr_ind_name_map, np.nan, False)
pcaRspEVals = pcaRsp.getEigenValues()
pcaRspEVecs = pcaRsp.getEigenVectors()
eValColCnt = pcaRspEVals.getColumnCount()
eVecColCnt = pcaRspEVecs.getColumnCount()

# Eigenvalues are read from row 0 of the eigenvalue matrix.
e2 = [pcaRspEVals.getIndexValue(0, col_idx) for col_idx in range(eValColCnt)]

# NOTE(review): the eigenvector rows are iterated using the eigenvalue column
# count, as in the original -- presumably the eigenvector matrix is square, so
# the two counts agree; confirm.
v2 = [
    [pcaRspEVecs.getIndexValue(row_idx, col_idx) for col_idx in range(eVecColCnt)]
    for row_idx in range(eValColCnt)
]

print("Eigen Values: %r" % e2)
print("Eigen Vectors: %r" % v2)

In [None]:
print("Building model with Iris_versicolor as target.")

# Input attributes: Java TreeMap of attribute index (0..3) -> attribute name.
input_attr_map = sml.java.util.TreeMap()
for attr_idx, attr_name in enumerate(['sepal_length', 'sepal_width', 'petal_length', 'petal_width']):
    input_attr_map.put(attr_idx, attr_name)

# Targets are passed as a Java int[] holding a single attribute index.
# NOTE(review): 13 is presumably the Iris_versicolor column named in the
# message above -- confirm against the dataset header.
target_attr_arr = gateway.new_array(gateway.jvm.int, 1)
target_attr_arr[0] = 13

# `rez` is turned into a status message by the next cell.
rez = prj.buildModel(input_attr_map, target_attr_arr, "lda", "irisLDAModel", None)

In [None]:
# Convert the buildModel result into its error/status string and print it.
print(sml.CoreUtil.getErrorString(rez))

In [None]:
print("Making prediction.")

# Java String class handle, used to allocate String[] arrays through the gateway.
String_class = gateway.jvm.String

# --- Schema of the prediction SML DataFrame: four attributes, types 'C,C,C,C'.
col = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
name_array = gateway.new_array(String_class, len(col))
for pos in range(len(col)):
    name_array[pos] = col[pos]
type_array = sml.com.rtlm.util.AttributeTypes.getTypes('C,C,C,C')

df = sml.com.rtlm.json.DataFrame()
df.setAttributeNames(name_array)
df.setAttributeTypes(type_array)

# --- Single row to score; values are passed as strings in a Java String[].
d = ["4.9", "2.4", "3.3", "1"]
value_array = gateway.new_array(String_class, len(d))
for pos in range(len(d)):
    value_array[pos] = d[pos]
df.addTuple(value_array)

results1 = prj.predict(df, "irisLDAModel")
print("Predicted value of Iris_versicolor (should be 1): %s" % results1["res"])

In [None]:
print("Making prediction.")

# Reset the prediction SML DataFrame created in the previous cell so that it
# can be refilled with a new row.
df.clear()

# Second sample to score; the message below expects a prediction of 0.
d2 = ["4.3", "3", "1.0", "0.1"]

# Size the Java String[] from d2 itself. The original used len(d), the previous
# cell's sample, which only worked because both lists happen to hold 4 values.
sel_attr_vals = gateway.new_array(String_class, len(d2))
for i, attr_val in enumerate(d2):
    sel_attr_vals[i] = attr_val
df.addTuple(sel_attr_vals)

results2 = prj.predict(df, "irisLDAModel")
print("Predicted value of Iris_versicolor (should be 0): %s" % results2["res"])

In [None]:
# Clean up: remove the model, clear the project, then tear down the gateway
# client and the gateway server.
# The teardown calls sit in `finally` blocks so that a failure while deleting
# the model or clearing the project (for example if the model was never built)
# still shuts the gateway down and kills the server; any exception raised is
# propagated afterwards.
try:
    prj.deleteModel("irisLDAModel")
    prj.clear()
finally:
    try:
        gateway.shutdown()
    finally:
        gateway_server.kill_server()