<a href="https://colab.research.google.com/github/mtnman38/harply/blob/main/harply_exploration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import tensorflow_probability as tfp
from tensorflow.keras import Model
from tensorflow.keras.losses import Loss
from tensorflow.nn import relu, softmax
from tensorflow.keras.layers import (Dense,
                                     Flatten,
                                     Reshape,
                                     Input,
                                     Lambda,
                                     Dropout,
                                     Layer)

# Make float64 the default Keras float type for everything created below.
# NOTE(review): presumably chosen so layer weights match the float64 arrays
# pandas produces for this dataset — confirm.
tf.keras.backend.set_floatx('float64')

# **harply** Initial Explorations

## Get some data

In [2]:
# Location of the red-wine quality CSV on the UCI archive.
ds_url = (
    'https://archive.ics.uci.edu/ml/machine-learning-databases/'
    'wine-quality/winequality-red.csv'
)

# Labels applied to the CSV columns when the file is read, in file order.
column_names = [
    'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
    'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
    'pH', 'sulphates', 'alcohol', 'quality',
]

In [3]:
# Download the CSV (named after the last URL segment) and get its local path.
dataset_path = tf.keras.utils.get_file(ds_url.rsplit('/', 1)[-1], ds_url)

# The file is ';'-separated; its own header row (row 0) is replaced by column_names.
dataset_csv = pd.read_csv(
    dataset_path,
    header=0,
    names=column_names,
    sep=";",
    comment='\t',
    na_values="?",
    skipinitialspace=True,
)

# Keep only the four features explored in the rest of the notebook.
df = dataset_csv[['alcohol', 'pH', 'density', 'chlorides']]

Downloading data from https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv


## Next steps

In [4]:
# Preview the first rows of the four selected feature columns.
df.head()

Unnamed: 0,alcohol,pH,density,chlorides
0,9.4,3.51,0.9978,0.076
1,9.8,3.2,0.9968,0.098
2,9.8,3.26,0.997,0.092
3,9.8,3.16,0.998,0.075
4,9.4,3.51,0.9978,0.076


Let's take a look at the correlations and some summary statistics.

In [5]:
# Pairwise correlations between the selected columns — the structure the generator is meant to mimic.
df.corr()

Unnamed: 0,alcohol,pH,density,chlorides
alcohol,1.0,0.205633,-0.49618,-0.221141
pH,0.205633,1.0,-0.341699,-0.265026
density,-0.49618,-0.341699,1.0,0.200632
chlorides,-0.221141,-0.265026,0.200632,1.0


In [6]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
df.describe()

Unnamed: 0,alcohol,pH,density,chlorides
count,1599.0,1599.0,1599.0,1599.0
mean,10.422983,3.311113,0.996747,0.087467
std,1.065668,0.154386,0.001887,0.047065
min,8.4,2.74,0.99007,0.012
25%,9.5,3.21,0.9956,0.07
50%,10.2,3.31,0.99675,0.079
75%,11.1,3.4,0.997835,0.09
max,14.9,4.01,1.00369,0.611


The goal is for the generator's output to mimic these summary statistics and correlations.

## Build out the generator model

In [7]:
class Linear(Layer):
    """Affine layer computing ``tf.matmul(inputs, w) + b`` with a rank-3 kernel.

    Args:
        units0: Size of the kernel's first axis.
        units1: Size of the kernel's second axis.
        units2: Size of the kernel's last axis; also the length of the bias.
    """

    def __init__(self, units0, units1, units2):
        super().__init__()
        self.units0, self.units1, self.units2 = units0, units1, units2

    def build(self, input_shape):
        """Create the kernel ``w`` and bias ``b``.

        ``input_shape`` is not consulted; both weight shapes come from the
        constructor arguments.
        """
        kernel_shape = (self.units0, self.units1, self.units2)
        bias_shape = (self.units2,)
        self.w = self.add_weight(
            shape=kernel_shape, initializer="random_normal", trainable=True
        )
        self.b = self.add_weight(
            shape=bias_shape, initializer="random_normal", trainable=True
        )

    def call(self, inputs):
        """Return ``inputs @ w`` with the bias added along the last axis."""
        # NOTE(review): w is rank 3, so tf.matmul presumably treats it as a batch
        # of (units1, units2) matrices — confirm the intended input shape.
        projected = tf.matmul(inputs, self.w)
        return projected + self.b

In [8]:
class CustomModel(Model):
    """Generator model: Linear -> softmax -> Flatten -> Dense(4)."""

    def __init__(self):
        super().__init__()
        # 1599 and 4 match the row count and the number of selected columns of
        # the wine DataFrame; 10 is the size of the Linear kernel's last axis.
        self.linear1 = Linear(1599, 4, 10)
        self.flatten1 = Flatten()
        # One output per feature column.
        self.dense1 = Dense(4)

    def call(self, x):
        """Run the forward pass and return the Dense(4) output."""
        activated = softmax(self.linear1(x))
        return self.dense1(self.flatten1(activated))

In [9]:
# Instantiate the generator model.
model = CustomModel()

In [42]:
class CustomLoss(Loss):
    """Loss that adds a correlation-matrix mismatch to an element-wise squared error."""

    def call(self, y_true, y_pred):
        """Return the scalar sum of the two penalty terms.

        Args:
            y_true: Reference data.
            y_pred: Model output compared against ``y_true``.
        """
        # Squared gap between the correlation matrices of reference and prediction.
        corr_gap = tfp.stats.correlation(y_true) - tfp.stats.correlation(y_pred)
        corr_term = tf.reduce_mean(tf.square(corr_gap), axis=0)

        # Squared gap between the raw values themselves.
        value_term = tf.reduce_mean(tf.square(y_true - y_pred), axis=0)

        return tf.reduce_mean(corr_term) + tf.reduce_mean(value_term)

In [43]:
# Adam optimizer (learning rate 1e-3) and the custom loss used by the training loop.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
loss_fn = CustomLoss()

In [44]:
# The selected features as a constant tensor; the training loop uses it both as
# the model input and as the loss target.
data = tf.constant(df.to_numpy())
# Number of full-batch training iterations.
epochs = 10

In [45]:
# Full-batch training: every iteration runs the model on the whole dataset,
# scores the output against that same data, and takes one optimizer step.
for each in range(epochs):
  with tf.GradientTape() as tape:
    # Forward pass and loss are computed under the tape so they can be differentiated.
    y_pred = model(data)
    loss_value = loss_fn(data, y_pred)

  # Gradients of the loss w.r.t. every trainable weight, then one update step.
  gradients = tape.gradient(loss_value, model.trainable_weights)
  optimizer.apply_gradients(zip(gradients, model.trainable_weights))

In [46]:
# Run the model on the training data to inspect its raw output.
model(data)

<tf.Tensor: shape=(1599, 4), dtype=float64, numpy=
array([[10.30906224,  1.0362963 ,  0.54383745,  1.19214163],
       [15.19387939,  1.54244352,  1.02007275,  0.37834129],
       [12.05397194,  1.85194154,  1.00070335,  1.23849901],
       ...,
       [14.52087333,  3.01691054,  0.70359041, -0.13794225],
       [14.98895261,  3.30933571,  0.79840208,  0.22540011],
       [14.98309562,  3.28014352,  0.68453508, -0.01744639]])>

In [49]:
# Summarise the model output the same way the source data was summarised:
# correlation matrix first, then per-column descriptive statistics.
# Columns are positional (0-3) because the frame is built from a bare array.
predictions_df = pd.DataFrame(model(data).numpy())
print('Correlations of predictions:')
print(predictions_df.corr())
print('Descriptions of predictions:')
print(predictions_df.describe())

Correlations of predictions:
          0         1         2         3
0  1.000000  0.053760  0.074294 -0.233048
1  0.053760  1.000000 -0.248543 -0.024021
2  0.074294 -0.248543  1.000000  0.067572
3 -0.233048 -0.024021  0.067572  1.000000
Descriptions of predictions:
                 0            1            2            3
count  1599.000000  1599.000000  1599.000000  1599.000000
mean     13.471965     2.562855     0.830007     0.423768
std       1.331002     0.982561     0.221443     0.647593
min       8.699499    -1.328547     0.011460    -1.943034
25%      12.523962     1.898876     0.685913     0.036588
50%      13.486138     2.587390     0.818854     0.415254
75%      14.420316     3.228241     0.972634     0.809539
max      17.040527     5.659247     1.730419     2.805694


In [50]:
# Summarise the ORIGINAL data (the tensor fed to the model) for comparison
# with the prediction summary. The labels name the original data explicitly so
# the two printed reports cannot be confused with each other.
original_df = pd.DataFrame(data.numpy())
print('Correlations of original data:')
print(original_df.corr())
print('Descriptions of original data:')
print(original_df.describe())

Correlations of predictions:
          0         1         2         3
0  1.000000  0.205633 -0.496180 -0.221141
1  0.205633  1.000000 -0.341699 -0.265026
2 -0.496180 -0.341699  1.000000  0.200632
3 -0.221141 -0.265026  0.200632  1.000000
Descriptions of predictions:
                 0            1            2            3
count  1599.000000  1599.000000  1599.000000  1599.000000
mean     10.422983     3.311113     0.996747     0.087467
std       1.065668     0.154386     0.001887     0.047065
min       8.400000     2.740000     0.990070     0.012000
25%       9.500000     3.210000     0.995600     0.070000
50%      10.200000     3.310000     0.996750     0.079000
75%      11.100000     3.400000     0.997835     0.090000
max      14.900000     4.010000     1.003690     0.611000
