<a href="https://colab.research.google.com/github/mtnman38/harply/blob/main/harply_exploration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import tensorflow_probability as tfp
from tensorflow.keras import Model
from tensorflow.keras.losses import Loss
from tensorflow.nn import relu, softmax
from tensorflow.keras.layers import (Dense,
                                     Flatten,
                                     Reshape,
                                     Input,
                                     Lambda,
                                     Dropout,
                                     Layer)

tf.keras.backend.set_floatx('float64')

# **harply** Initial Explorations

## Get some data

In [2]:
ds_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
column_names = ['fixed acidity',
                'volatile acidity', 
                'citric acid', 
                'residual sugar',
                'chlorides',
                'free sulfur dioxide',
                'total sulfur dioxide', 
                'density', 
                'pH', 
                'sulphates',
                'alcohol', 
                'quality']

In [3]:
dataset_path = tf.keras.utils.get_file(ds_url.split('/')[-1], ds_url)
dataset_csv = pd.read_csv(dataset_path,
                          names=column_names,
                          na_values="?",
                          comment='\t',
                          sep=";",
                          skipinitialspace=True,
                          header=0)

df = dataset_csv[['alcohol',
                  'pH',
                  'density',
                  'chlorides']]

## Next steps

In [4]:
df.head()

Unnamed: 0,alcohol,pH,density,chlorides
0,9.4,3.51,0.9978,0.076
1,9.8,3.2,0.9968,0.098
2,9.8,3.26,0.997,0.092
3,9.8,3.16,0.998,0.075
4,9.4,3.51,0.9978,0.076


Let's take a look at correlations and some summary descriptions.

In [5]:
df.corr()

Unnamed: 0,alcohol,pH,density,chlorides
alcohol,1.0,0.205633,-0.49618,-0.221141
pH,0.205633,1.0,-0.341699,-0.265026
density,-0.49618,-0.341699,1.0,0.200632
chlorides,-0.221141,-0.265026,0.200632,1.0


In [6]:
df.describe()

Unnamed: 0,alcohol,pH,density,chlorides
count,1599.0,1599.0,1599.0,1599.0
mean,10.422983,3.311113,0.996747,0.087467
std,1.065668,0.154386,0.001887,0.047065
min,8.4,2.74,0.99007,0.012
25%,9.5,3.21,0.9956,0.07
50%,10.2,3.31,0.99675,0.079
75%,11.1,3.4,0.997835,0.09
max,14.9,4.01,1.00369,0.611


The idea will be to mimic these descriptions and correlations.

## Build out the generator model

In [7]:
class Linear(Layer):
    def __init__(self, units0, units1, units2):
        super(Linear, self).__init__()
        self.units0 = units0
        self.units1 = units1
        self.units2 = units2

    def build(self, input_shape):
        self.w = self.add_weight(
            shape=(self.units0, self.units1, self.units2),
            initializer="random_normal",
            trainable=True,
        )
        self.b = self.add_weight(
            shape=(self.units2,),
             initializer="random_normal", 
             trainable=True
        )

    def call(self, inputs):
        return tf.matmul(inputs, self.w) + self.b

In [8]:
class CustomModel(Model):
  def __init__(self):
    super(CustomModel, self).__init__()
    self.linear1 = Linear(1599, 4, 10)
    self.flatten1 = Flatten()
    self.dense1 = Dense(4)

  def call(self, x):
    x = self.linear1(x)
    x = softmax(x)
    x = self.flatten1(x)
    x = self.dense1(x)
    return x

In [9]:
model = CustomModel()

In [10]:
class CustomLoss(Loss):

  def call(self, y_true, y_pred):
    y_true_corr = tfp.stats.correlation(y_true)
    y_pred_corr = tfp.stats.correlation(y_pred)
    totals = tf.reduce_mean(tf.square(y_true_corr - y_pred_corr), axis=0)
    means = tf.reduce_mean(tf.square(y_true - y_pred), axis=0)
    return tf.reduce_mean(totals) + tf.reduce_mean(means)

In [11]:
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
loss_fn = CustomLoss()

In [12]:
data = tf.constant(df.to_numpy())
epochs = 500

In [13]:
for each in range(epochs):
  with tf.GradientTape() as tape:
    y_pred = model(data)
    loss_value = loss_fn(data, y_pred)

  gradients = tape.gradient(loss_value, model.trainable_weights)
  optimizer.apply_gradients(zip(gradients, model.trainable_weights))

In [14]:
model(data)

<tf.Tensor: shape=(1599, 4), dtype=float64, numpy=
array([[ 9.40796342,  3.5091313 ,  0.98659709,  0.09150087],
       [ 9.80298409,  3.20252452,  1.15364944,  0.10500415],
       [ 9.80407279,  3.26016147,  1.04359304,  0.0872506 ],
       ...,
       [10.99475684,  3.4193684 ,  1.06581186,  0.06361694],
       [10.20139078,  3.56877528,  0.95878785,  0.06908192],
       [10.99514049,  3.39035739,  1.05198327,  0.05221967]])>

In [15]:
predictions_df = pd.DataFrame(model(data).numpy())
print('Correlations of synthetic dataset:')
print(predictions_df.corr())
print('Descriptions of synthetic dataset:')
print(predictions_df.describe())

Correlations of synthetic dataset:
          0         1         2         3
0  1.000000  0.204041 -0.566276 -0.231973
1  0.204041  1.000000 -0.322507 -0.286715
2 -0.566276 -0.322507  1.000000  0.207679
3 -0.231973 -0.286715  0.207679  1.000000
Descriptions of synthetic dataset:
                 0            1            2            3
count  1599.000000  1599.000000  1599.000000  1599.000000
mean     10.422744     3.312442     1.004713     0.084494
std       1.061157     0.153913     0.071220     0.066396
min       8.398502     2.718405     0.657352    -0.073612
25%       9.505077     3.210682     0.964996     0.059916
50%      10.198738     3.311449     1.007889     0.074718
75%      11.095701     3.401933     1.051974     0.090943
max      14.892051     3.991706     1.198238     0.794609


In [16]:
original_df = pd.DataFrame(data.numpy())
print('Correlations of original dataset:')
print(original_df.corr())
print('Descriptions of original dataset:')
print(original_df.describe())

Correlations of original dataset:
          0         1         2         3
0  1.000000  0.205633 -0.496180 -0.221141
1  0.205633  1.000000 -0.341699 -0.265026
2 -0.496180 -0.341699  1.000000  0.200632
3 -0.221141 -0.265026  0.200632  1.000000
Descriptions of original dataset:
                 0            1            2            3
count  1599.000000  1599.000000  1599.000000  1599.000000
mean     10.422983     3.311113     0.996747     0.087467
std       1.065668     0.154386     0.001887     0.047065
min       8.400000     2.740000     0.990070     0.012000
25%       9.500000     3.210000     0.995600     0.070000
50%      10.200000     3.310000     0.996750     0.079000
75%      11.100000     3.400000     0.997835     0.090000
max      14.900000     4.010000     1.003690     0.611000
