# tensorflow pistachio


In [1]:
import tensorflow as tf
print(tf.__version__)

2024-05-03 19:58:49.091691: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


2.16.1


## arff to csv





In [2]:
import pandas as pd 
from scipy.io import arff
import os 
label_mapping = {'Kirmizi_Pistachio': 0, 'Siit_Pistachio': 1}

def load_arff_file(input_arff: str) -> pd.DataFrame:
    """convert arff file to parquet"""
    if not os.path.exists(input_arff):
        raise ValueError(f"input file '{input_arff}' does not exist")
    print(f'loading arff file {input_arff}')
    data, meta = arff.loadarff(input_arff)
    print(f"arff metadata: {meta}")
    df = pd.DataFrame(data)
    df['Class'] = df['Class'].astype(str).map(label_mapping)
    
    return df
##################

arff_filename = './data/Pistachio_16_Features_Dataset.arff'
csv_filename = './data/pistachio_16.csv'
if not os.path.exists(csv_filename):
    df = load_arff_file(arff_filename)
    df.head()
    df.to_csv(csv_filename, index=False, header=True)
    print(f'wrote file to {csv_filename}')
else:
    print(f'{csv_filename} exists')


./data/pistachio_16.csv exists


## dataset


In [10]:
import numpy as np
from typing import List
# want a stratified split here
def split_csv_data(infilename: str, filenames: List[str], fractions: List[float]):
    df = pd.read_csv(infilename, header=0)
    df = df.sample(frac=1.0, random_state=34)
    columns = df.columns
    print(f'total_records = {len(df)}')
    if len(filenames) != len(fractions):
        raise ValueError('list of filenames must be of same length as split fractions')
    renorm = sum(fractions)
    lower_bound = 0
    df['split_var'] = np.random.uniform(size=len(df))

    for filename, frac in zip(filenames, fractions):
        upper_bound = lower_bound + frac/renorm
        this_data = df.loc[(df.split_var >= lower_bound) & (df.split_var < upper_bound) ][columns]
        this_data.to_csv(filename, index=False, header=True)
        print(f'wrote {len(this_data)} records to {filename}')
        lower_bound = upper_bound
#################################
    
train_filename = './data/pistachio_train.csv'
valid_filename = './data/pistachio_valid.csv'
test_filename = './data/pistachio_test.csv'
filenames = [train_filename, valid_filename, test_filename]
fractions = [0.75, 0.10, 0.15]

if not (os.path.exists(train_filename) and os.path.exists(test_filename) and os.path.exists(valid_filename)):
    split_csv_data(csv_filename, filenames, fractions )
else:
    print(f'{filenames} exist')


for i in filenames:
    df = pd.read_csv(i, header=0)
    print(f'file: {i}, records {len(df)}')



['./data/pistachio_train.csv', './data/pistachio_valid.csv', './data/pistachio_test.csv'] exist
file: ./data/pistachio_train.csv, records 1589
file: ./data/pistachio_valid.csv, records 221
file: ./data/pistachio_test.csv, records 338


In [11]:
def map_func(features, labels):
    return tf.transpose(tf.stack([features[k] for k in features])), tf.reshape(labels,[-1,1])

batch_size = 10
# use dataset.map to concatenate feature dictionary into tensor
pistachio_train_batches = tf.data.experimental.make_csv_dataset(
    train_filename, batch_size=batch_size,
    num_epochs=1,
    label_name="Class").map(map_func)
pistachio_test_data = tf.data.experimental.make_csv_dataset(
    test_filename,
    batch_size=batch_size,
    num_epochs=1,
    label_name="Class").map(map_func)
validation_data = tf.data.experimental.make_csv_dataset(
    valid_filename, batch_size=batch_size,
    num_epochs=1,
    label_name="Class").map(map_func)

In [12]:
batch = 0
for feature_batch, label_batch in pistachio_train_batches.take(2):
    # print(f'{batch}, {label_batch.shape}')
    # cat_batch = tf.stack([feature_batch['AREA'],feature_batch['PERIMETER']],axis=1)
    # cat_batch = tf.stack([feature_batch[k] for k in feature_batch],axis=1)

    # batch += 1
    
    print("'label': {}".format(label_batch))
    # print(cat_batch)
    print(f"features batch shape: {feature_batch.shape}")
    # print(feature_batch.shape)
    

'label': [[0]
 [1]
 [1]
 [0]]
features batch shape: (4, 16)
'label': [[0]
 [0]
 [1]
 [0]]
features batch shape: (4, 16)


2024-05-03 15:35:45.953447: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


## Model

In [47]:
# from tensorflow.keras.layers import Dense, Flatten, Conv2D, BatchNormalization
# from tensorflow.keras import Model

# class PistachioModel(Model):
#     def __init__(self, units: int=10):
#         super().__init__()
#         self._units = units
    

#     def build(self, input_shape):
#         self.bn = BatchNormalization(axis=0, input_shape=input_shape)
#         self.d1 = Dense(self._units, activation='relu', input_shape=input_shape)
#         self.d2 = Dense(self._units)
#         self.lout = Dense(1, activation='sigmoid')
        


#     def call(self, x):
#         x = self.bn(x)
#         x = self.d1(x)
#         x = self.d2(x)
#         return self.lout(x)

# # Create an instance of the model
# model = PistachioModel()

## Keras model.fit api

In [48]:
# model.compile(optimizer='adam',
#               loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
#               metrics=['accuracy', 'auc'])







In [8]:
# model.fit(pistachio_train_batches, epochs=10)

## sequential model

In [13]:
model2 = tf.keras.models.Sequential([
  tf.keras.layers.BatchNormalization(), 
  tf.keras.layers.Dense(16, activation='relu'),
  tf.keras.layers.Dropout(0.2),
  tf.keras.layers.Dense(16),
  tf.keras.layers.Dense(1, activation='sigmoid')
])


model2.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
              metrics=['accuracy', 'auc', 'precision', 'recall'])

In [5]:
# model2.fit(pistachio_train_batches, epochs=20, validation_data=validation_data)


In [17]:
# model2.evaluate(pistachio_test_data,verbose=2)


85/85 - 0s - 2ms/step - accuracy: 0.8698 - auc: 0.9361 - loss: 0.3439 - precision: 0.8252 - recall: 0.8613


2024-05-03 15:37:06.076313: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


[0.343885213136673,
 0.8698225021362305,
 0.9361041188240051,
 0.8251748085021973,
 0.8613138794898987]

In [18]:
# for features, labels in pistachio_test_data.take(1):
#     predictions = model2(features)
#     for p,l in zip(predictions, labels):
#         print(f'predicted prob: {p}, label: {l}')

predicted prob: [0.6096879], label: [1]
predicted prob: [0.13835986], label: [0]
predicted prob: [0.06268708], label: [0]
predicted prob: [0.10202026], label: [0]


2024-05-03 15:37:17.202003: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


## custom training loop stuff

In [27]:
model = tf.keras.models.Sequential([
  tf.keras.layers.BatchNormalization(), 
  tf.keras.layers.Dense(16, activation='relu'),
  tf.keras.layers.Dropout(0.2),
  tf.keras.layers.Dense(16),
  tf.keras.layers.Dense(1, activation='sigmoid')
])



In [13]:
train_loss = []
valid_loss = []
epoch_times = []
from time import time


optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4)
loss=tf.keras.losses.BinaryCrossentropy(from_logits=False)
start_time = time()
epochs = 2
steps_per_output=10
for epoch in range(epochs):
    epoch_start_time = time()

    for batch_no, (train_x, train_y) in enumerate(pistachio_train_batches):
        with tf.GradientTape() as tape:
            output = model3(train_x)
            the_loss = loss(train_y, output)
        train_loss.append(the_loss)
            

        grads = tape.gradient(the_loss, model3.trainable_weights)
        optimizer.apply_gradients(zip(grads, model3.trainable_weights))

        if batch_no % steps_per_output == 0:
            print(f'epoch {epoch}, batch {batch_no}, training loss {the_loss}')
    this_epoch_time = time() - epoch_start_time
    print(f'epoch {epoch}, time {this_epoch_time}')
    epoch_times.append(this_epoch_time)
    
            
        



epoch 0, batch 0, training loss 1852.3375244140625
epoch 0, batch 10, training loss 1219.437744140625
epoch 0, batch 20, training loss 353.310546875
epoch 0, batch 30, training loss 346.21746826171875
epoch 0, batch 40, training loss 441.00518798828125
epoch 0, batch 50, training loss 293.82550048828125
epoch 0, batch 60, training loss 348.50836181640625
epoch 0, batch 70, training loss 188.50709533691406
epoch 0, batch 80, training loss 527.7088623046875
epoch 0, batch 90, training loss 316.4344177246094
epoch 0, batch 100, training loss 247.21017456054688
epoch 0, batch 110, training loss 145.61026000976562
epoch 0, batch 120, training loss 275.33489990234375
epoch 0, batch 130, training loss 127.54837799072266
epoch 0, batch 140, training loss 295.907958984375
epoch 0, batch 150, training loss 260.89910888671875


2024-05-03 20:07:19.146993: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


epoch 0, time 10.288832426071167
epoch 1, batch 0, training loss 285.18707275390625
epoch 1, batch 10, training loss 223.09078979492188
epoch 1, batch 20, training loss 499.3887634277344
epoch 1, batch 30, training loss 140.48171997070312
epoch 1, batch 40, training loss 521.537109375
epoch 1, batch 50, training loss 165.34548950195312
epoch 1, batch 60, training loss 130.39639282226562
epoch 1, batch 70, training loss 357.6820983886719
epoch 1, batch 80, training loss 242.6947784423828
epoch 1, batch 90, training loss 190.4488067626953
epoch 1, batch 100, training loss 223.4416046142578
epoch 1, batch 110, training loss 227.3134765625
epoch 1, batch 120, training loss 416.3309020996094
epoch 1, batch 130, training loss 165.32669067382812
epoch 1, batch 140, training loss 313.72735595703125
epoch 1, batch 150, training loss 353.150390625
epoch 1, time 11.163836240768433


2024-05-03 20:07:31.708628: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [14]:
epoch_times

[10.288832426071167, 11.163836240768433]

In [None]:
https://www.tensorflow.org/guide/keras/writing_a_training_loop_from_scratch

In [28]:
train_loss_records = []
valid_loss_records = []
epoch_times = []
from time import time

# loss function
train_loss_ob = tf.keras.losses.BinaryCrossentropy(from_logits=False, name='training_loss')
# training metrics
train_metrics = {
    "train_roc": tf.keras.metrics.AUC(name="training_ROC_AUC"),
    "train_acc": tf.keras.metrics.BinaryAccuracy(name="train_accuracy"),
    "train_recall": tf.keras.metrics.Recall(name="train_recall"),
    "train_precision":  tf.keras.metrics.Precision(name="train_precision")
}

# validation
# loss function
valid_loss_ob = tf.keras.losses.BinaryCrossentropy(from_logits=False, name='valid')
# validing metrics
valid_metrics = {
    "valid_roc_auc": tf.keras.metrics.AUC(name="valid_ROC_AUC"),
    "valid_acc": tf.keras.metrics.BinaryAccuracy(name="valid_accuracy"),
    "valid_recall": tf.keras.metrics.Recall(name="valid_recall"),
    "valid_precision": tf.keras.metrics.Precision(name="valid_precision")
}
optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4)



In [29]:
@tf.function
def train_step(train_x, train_y):
    with tf.GradientTape() as tape:
        # training=True is only needed if there are layers with different
        # behavior during training versus inference (e.g. Dropout).
        predictions = model(train_x, training=True)
        loss = train_loss_ob(train_y, predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    for metric in train_metrics.values():
        metric.update_state(train_y, predictions)
    return loss
 
  # train_loss(loss)
  # train_accuracy(labels, predictions)

In [None]:
# @tf.function
def test_step(images, labels):
    pass
    # # training=False is only needed if there are layers with different
    # # behavior during training versus inference (e.g. Dropout).
    # predictions = model(images, training=False)
    # t_loss = loss_object(labels, predictions)
    
    # test_loss(t_loss)
    # test_accuracy(labels, predictions)

In [30]:
train_loss_records = []
valid_loss_records = []
train_metric_records = []
epoch_times = []

start_time = time()
epochs = 2
steps_per_output=50
total_batches = 0
for epoch in range(epochs):
    # clear the metric states - these metrics are computed over all batches in the epoch
    for metric in train_metrics.values():
        metric.reset_state()
    # start the clock for this epoch
    epoch_start_time = time()
    
    # run through batches
    for batch_no, (train_x, train_y) in enumerate(pistachio_train_batches):
        # take a training step
        the_loss = train_step(train_x, train_y)
        # this is the individual loss for a training batch
        train_loss.append(the_loss)
        
        if batch_no % steps_per_output == 0:
            print(f'epoch {epoch}, batch {batch_no}, training loss {the_loss}')
        
    # time for this epoch    
    this_epoch_time = time() - epoch_start_time
    print(f'epoch {epoch}, time {this_epoch_time}')
    # evaluate metrics over the epoch
    epoch_metrics = {k: v.result() for k,v in train_metrics.items()}
    for k,v in epoch_metrics.items():
        print(f'{k}: {v}')
        
    train_metric_records.append(epoch_metrics)
    epoch_times.append(this_epoch_time)
    
    

epoch 0, batch 0, training loss 0.7548421025276184
epoch 0, batch 10, training loss 0.7286007404327393
epoch 0, batch 20, training loss 0.7910215854644775
epoch 0, batch 30, training loss 0.7089307904243469
epoch 0, batch 40, training loss 0.7511945366859436
epoch 0, batch 50, training loss 0.6782774329185486
epoch 0, batch 60, training loss 0.6774525046348572
epoch 0, batch 70, training loss 0.5865039229393005
epoch 0, batch 80, training loss 0.7561821341514587
epoch 0, batch 90, training loss 0.6875983476638794
epoch 0, batch 100, training loss 0.7411220073699951
epoch 0, batch 110, training loss 0.6971716284751892
epoch 0, batch 120, training loss 0.653087317943573
epoch 0, batch 130, training loss 0.6775817275047302
epoch 0, batch 140, training loss 0.6160794496536255
epoch 0, batch 150, training loss 0.5910268425941467


2024-05-03 21:03:20.624650: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


epoch 0, time 5.132985591888428
train_roc: 0.4983018934726715
train_acc: 0.5078665614128113
train_recall: 0.4281567633152008
train_precision: 0.4319179952144623
epoch 1, batch 0, training loss 0.7238677144050598
epoch 1, batch 10, training loss 0.6127718091011047
epoch 1, batch 20, training loss 0.6200535893440247
epoch 1, batch 30, training loss 0.6818579435348511
epoch 1, batch 40, training loss 0.6232209205627441
epoch 1, batch 50, training loss 0.6710546612739563
epoch 1, batch 60, training loss 0.6384653449058533
epoch 1, batch 70, training loss 0.5474119782447815
epoch 1, batch 80, training loss 0.6812781691551208
epoch 1, batch 90, training loss 0.5983659625053406
epoch 1, batch 100, training loss 0.5465803742408752
epoch 1, batch 110, training loss 0.6512739658355713
epoch 1, batch 120, training loss 0.5343424677848816
epoch 1, batch 130, training loss 0.5651676058769226
epoch 1, batch 140, training loss 0.5229609608650208
epoch 1, batch 150, training loss 0.6935484409332275
ep

2024-05-03 21:03:23.275484: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [None]:
https://www.tensorflow.org/guide/migrate/early_stopping
This accumulates training and validation losses as averages over an epoch of training/the valudation set. 
Can do training loss per batch, or training + validation per epoch
remember to set training = true/false in metric evaluation/updating
moose


In [None]:

for epoch in range(EPOCHS):
  # Reset the metrics at the start of the next epoch
  train_loss.reset_state()
  train_accuracy.reset_state()
  test_loss.reset_state()
  test_accuracy.reset_state()

  for images, labels in train_ds:
    train_step(images, labels)

  for test_images, test_labels in test_ds:
    test_step(test_images, test_labels)

  print(
    f'Epoch {epoch + 1}, '
    f'Loss: {train_loss.result():0.2f}, '
    f'Accuracy: {train_accuracy.result() * 100:0.2f}, '
    f'Test Loss: {test_loss.result():0.2f}, '
    f'Test Accuracy: {test_accuracy.result() * 100:0.2f}'
  )