# tensorflow pistachio


In [1]:
import tensorflow as tf
print(tf.__version__)

2024-11-27 04:27:59.446787: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


2.16.1


## arff to csv





In [2]:
import pandas as pd 
from scipy.io import arff
import os 
label_mapping = {'Kirmizi_Pistachio': 0, 'Siit_Pistachio': 1}

def load_arff_file(input_arff: str) -> pd.DataFrame:
    """convert arff file to parquet"""
    if not os.path.exists(input_arff):
        raise ValueError(f"input file '{input_arff}' does not exist")
    print(f'loading arff file {input_arff}')
    data, meta = arff.loadarff(input_arff)
    print(f"arff metadata: {meta}")
    df = pd.DataFrame(data)
    df['Class'] = df['Class'].astype(str).map(label_mapping)
    
    return df
##################

arff_filename = './data/Pistachio_16_Features_Dataset.arff'
csv_filename = './data/pistachio_16.csv'
if not os.path.exists(csv_filename):
    df = load_arff_file(arff_filename)
    df.head()
    df.to_csv(csv_filename, index=False, header=True)
    print(f'wrote file to {csv_filename}')
else:
    print(f'{csv_filename} exists')


./data/pistachio_16.csv exists


## dataset


In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

def split_csv_data(infilename: str, train_filename: str, test_filename:str, test_fraction: float):
    df = pd.read_csv(infilename, header=0)
    columns = df.columns
    df['split_var'] = np.random.uniform(size=len(df))
    train_df = df.loc[df.split_var <= test_fraction][columns]
    test_df = df.loc[df.split_var > test_fraction][columns]
    train_df.to_csv(train_filename, index=False, header=True)
    test_df.to_csv(test_filename, index=False, header=True)
    print(f'wrote {len(train_df)} records to {train_filename}')
    print(f'wrote {len(test_df)} records to {test_filename}')

def df_to_dataset(df: pd.DataFrame, target_column: str):
    feature_df = df.copy()
    target = feature_df.pop(target_column)
    dataset = tf.data.Dataset.from_tensor_slices((dict(feature_df), target))
    dataset = dataset.shuffle(buffer_size=len(feature_df))\
        .batch(2)\
        .prefetch(2)
    return dataset

def split_data_to_frames(infilename: str, test_fraction: float=0.2, val_fraction: float=0.2, seed:int=43):
    """ load csv as dataframe, split"""

    df = pd.read_csv(infilename)
    train_val_df, test_df = train_test_split(df, test_size=test_fraction, random_state=seed)
    train_df, val_df = train_test_split(train_val_df, test_size=val_fraction, random_state=seed+1)


    return train_df, val_df, test_df

train_df, valid_df, test_df = split_data_to_frames(csv_filename)

for setname, df in zip(['train','validation','test'],[train_df, valid_df, test_df]):
    print(setname)
    print(f'df shape = {df.shape}')
    agged = df.groupby('Class').agg({'AREA':'count'}).reset_index()
    print(agged)


feature_columns = list(train_df.columns)
feature_columns.remove('Class')
feature_columns

# train_ds, valid_ds, test_ds = get_datasets(csv_filename, 'Class')

#     train_ds = df_to_dataset(train_df, target_column)
#     val_ds = df_to_dataset(val_df, target_column)
#     test_ds = df_to_dataset(test_df, target_column)


    
# train_filename = './data/pistachio_train.csv'
# test_filename = './data/pistachio_test.csv'

# if not (os.path.exists(train_filename) and os.path.exists(test_filename)):
#     split_csv_data(csv_filename, train_filename, test_filename, 0.2)
# else:
#     print(f'{train_filename} and {test_filename} exist')






train
df shape = (1374, 17)
   Class  AREA
0      0   801
1      1   573
validation
df shape = (344, 17)
   Class  AREA
0      0   204
1      1   140
test
df shape = (430, 17)
   Class  AREA
0      0   227
1      1   203


['AREA',
 'PERIMETER',
 'MAJOR_AXIS',
 'MINOR_AXIS',
 'ECCENTRICITY',
 'EQDIASQ',
 'SOLIDITY',
 'CONVEX_AREA',
 'EXTENT',
 'ASPECT_RATIO',
 'ROUNDNESS',
 'COMPACTNESS',
 'SHAPEFACTOR_1',
 'SHAPEFACTOR_2',
 'SHAPEFACTOR_3',
 'SHAPEFACTOR_4']

In [4]:
train_ds = df_to_dataset(train_df,'Class')
valid_ds = df_to_dataset(valid_df,'Class')
test_ds = df_to_dataset(test_df,'Class')

In [5]:
def get_dataset_class_proportions(the_dataset):
    def count_class(counts, batch, num_classes=2):
        labels = batch[1] # class is second element of tuple
        for i in range(num_classes):
            cc = tf.cast(labels == i, tf.int32)
            counts[str(i)] += tf.reduce_sum(cc)
        return counts
    initial_state = {'0':0, '1':0}
    proportions = {k: v.numpy() for k,v in the_dataset.reduce(reduce_func=count_class, initial_state=initial_state).items()}
    total = sum(proportions.values())
    proportions.update({f'proportion_{k}': v/total for k,v in proportions.items()})
    
    return proportions
    

print(f'train: {get_dataset_class_proportions(train_ds)}')
print(f'valid: {get_dataset_class_proportions(valid_ds)}')
print(f'test: {get_dataset_class_proportions(test_ds)}')

   


train: {'0': 801, '1': 573, 'proportion_0': 0.5829694323144105, 'proportion_1': 0.4170305676855895}
valid: {'0': 204, '1': 140, 'proportion_0': 0.5930232558139535, 'proportion_1': 0.4069767441860465}
test: {'0': 227, '1': 203, 'proportion_0': 0.5279069767441861, 'proportion_1': 0.4720930232558139}


In [60]:
# def map_func(features, labels):
#     return tf.transpose(tf.stack([features[k] for k in features])), tf.reshape(labels,[-1,1])

# # use dataset.map to concatenate feature dictionary into tensor
# pistachio_train_batches = tf.data.experimental.make_csv_dataset(
#     train_filename, batch_size=4,
#     num_epochs=1,
#     label_name="Class").map(map_func)
# pistachio_test_batches = tf.data.experimental.make_csv_dataset(
#     test_filename, batch_size=4,
#     num_epochs=1,
#     label_name="Class").map(map_func)

In [6]:
batch = 0
# for feature_batch, label_batch in pistachio_train_batches.take(2):
for feature_batch, label_batch in train_ds.take(2):
    # print(f'{batch}, {label_batch.shape}')
    # cat_batch = tf.stack([feature_batch['AREA'],feature_batch['PERIMETER']],axis=1)
    # cat_batch = tf.stack([feature_batch[k] for k in feature_batch],axis=1)

    # batch += 1
    
    print("'label': {}".format(label_batch))
    # print(cat_batch)
    print(f"features batch shape: {feature_batch['AREA'].shape}")
    # print(feature_batch.shape)
    

'label': [1 0]
features batch shape: (2,)
'label': [0 0]
features batch shape: (2,)


2024-11-27 04:28:14.714007: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


## Functional API
try this instead, dataset is weird

In [7]:
train_ds.cardinality().numpy()


687

In [11]:
from typing import List, Dict
from tensorflow.keras.layers import Dense, Flatten, Conv2D, BatchNormalization, Normalization
from tensorflow.keras import Model
from tensorflow.keras.metrics import Accuracy, AUC, Recall, Precision

def get_pistachio_model(feature_columns: List[str], train_dataset: tf.data.Dataset, units: int=10):
    """build a pistachio model using functional api"""
    def _get_feature_normalizers():
        """initialise and adapt the feature normalisers"""
        print(f'preprocessing - initialising normalisers')
        normalizers = {}
        for feature in feature_columns:
            normaliser =  Normalization(axis=None, name=f'normalizer_{feature}')
            just_this_feature_ds = train_dataset.map(lambda x,y: x[feature])
            normaliser.adapt(just_this_feature_ds)
            normalizers[feature] = normaliser
        return normalizers
        
    def _build_model(normalizers: Dict):
        normalized_inputs = []
        raw_inputs = []
        for feature in feature_columns:
            feature_input = tf.keras.Input(shape=(1,), name=feature)
            raw_inputs.append(feature_input)
            normalized_input = normalizers[feature](feature_input)
            normalized_inputs.append(normalized_input)

        input_layer = tf.keras.layers.concatenate(normalized_inputs)

        # densely connected layers
        d1 = Dense(units, activation='relu', name='dense_1')
        d2 = Dense(units, activation='relu', name='dense_2')

        # output layer
        output_layer = Dense(1, activation='sigmoid', name='output')

        # define graph
        x = d1(input_layer)
        x = d2(x)
        final_output = output_layer(x)
        model = tf.keras.Model(raw_inputs, final_output)
        return model
    normalizers = _get_feature_normalizers()
    model = _build_model(normalizers)
    return model
 

In [12]:
import os
logdir = './pistachio_model_logs'
os.makedirs(logdir, exist_ok=True)

metrics = [
    tf.keras.metrics.Accuracy(),
    tf.keras.metrics.AUC(),
    tf.keras.metrics.Precision(),
    tf.keras.metrics.Recall()]

callbacks = [
    tf.keras.callbacks.TensorBoard(logdir, update_freq='batch'),
    tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=0.001) 
    # tf.keras.callbacks.LearningRateScheduler(lambda epoch, lr: max(lr*0.9, 1e-3))
]

optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

# build the model
model = get_pistachio_model(feature_columns, train_ds)

# compile the model
model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=metrics)

ValueError: Argument(s) not recognized: {'lr': 0.01}

In [10]:
tf.keras.utils.plot_model(model, show_shapes=True, rankdir="LR")

NameError: name 'model' is not defined

In [23]:
model.fit(
    train_ds,
    epochs=30,
    callbacks=callbacks,
    validation_data=valid_ds)

  output, from_logits = _get_logits(


[1m687/687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 985us/step - accuracy: 0.0000e+00 - auc_4: 0.7439 - loss: 0.5602 - precision_4: 0.6182 - recall_4: 0.7557


<keras.src.callbacks.history.History at 0x77dabc5f66d0>

## Model

In [48]:
from tensorflow.keras.layers import Dense, Flatten, Conv2D, BatchNormalization, Normalization
# from tensorflow.keras import preprocessing
from tensorflow.keras import Model

class PistachioModel(Model):
    def __init__(self, units: int=10):
        super().__init__()
        self._units = units
        self.normalizers = {}
    
    def define_normalizers(self, train_dataset, feature_columns):
        '''define normalizers based on keys in feature dataset, and adapt them'''
        for k in feature_columns:
            self.normalizers[k] = Normalization(axis=None, name=f'normalizer_{k}')
            just_this_feature_ds = train_dataset.map(lambda x,y: x[k])
            self.normalizers[k].adapt(just_this_feature_ds)    

    def build(self, features):

        # raise error if no normalizers defined
        if (len(self.normalizers) == 0):
            raise ValueError('cannot build until normalizers defined')


        self.normalized_inputs = []

        for k in features:
            feature_input = tf.keras.Input(shape=(1,), name=f'raw_{k}')
            # normed_input = self.normalizers[k](feature_input)
            # self.normalized_inputs.append(normed_input)
            
        # self.all_inputs = tf.layers.concatenate(name='all_normed_inputs')
        
        self.d1 = Dense(self._units, activation='relu', name='dense_1')
        self.d2 = Dense(self._units, activation='relu', name='dense_2')
        self.lout = Dense(1, activation='sigmoid', name='output')
        


    def call(self, x):
        normed_inputs = []
        for k in feature_columns:
            normed_k = self.normalizers[k](x)
            normed_inputs.append(normed_k)
        all_normed_inputs = tf.layers.concatenate(normed_inputs)
          
        x = self.d1(all_normed_inputs)
        x = self.d2(x)
        return self.lout(x)

# Create an instance of the model
model = PistachioModel()

In [49]:


model.define_normalizers(train_ds, feature_columns)
model.build(feature_columns)





2024-11-24 03:27:27.824160: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-11-24 03:27:27.895836: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-11-24 03:27:27.955659: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-11-24 03:27:28.017901: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-11-24 03:27:28.077613: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-11-24 03:27:28.138449: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-11-24 03:27:28.205840: W tensorflow/core/framework/local_rendezvous.cc:404] L

## Keras model.fit api

In [50]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
              metrics=['accuracy', 'auc'])







In [51]:
history = model.fit(train_ds, epochs=10)

Epoch 1/10


TypeError: Exception encountered when calling Normalization.call().

[1mExpected float32, but got Tensor("Cast:0", shape=(None,), dtype=float32) of type 'SymbolicTensor'.[0m

Arguments received by Normalization.call():
  • inputs={'AREA': 'tf.Tensor(shape=(None,), dtype=float32)', 'PERIMETER': 'tf.Tensor(shape=(None,), dtype=float32)', 'MAJOR_AXIS': 'tf.Tensor(shape=(None,), dtype=float32)', 'MINOR_AXIS': 'tf.Tensor(shape=(None,), dtype=float32)', 'ECCENTRICITY': 'tf.Tensor(shape=(None,), dtype=float32)', 'EQDIASQ': 'tf.Tensor(shape=(None,), dtype=float32)', 'SOLIDITY': 'tf.Tensor(shape=(None,), dtype=float32)', 'CONVEX_AREA': 'tf.Tensor(shape=(None,), dtype=float32)', 'EXTENT': 'tf.Tensor(shape=(None,), dtype=float32)', 'ASPECT_RATIO': 'tf.Tensor(shape=(None,), dtype=float32)', 'ROUNDNESS': 'tf.Tensor(shape=(None,), dtype=float32)', 'COMPACTNESS': 'tf.Tensor(shape=(None,), dtype=float32)', 'SHAPEFACTOR_1': 'tf.Tensor(shape=(None,), dtype=float32)', 'SHAPEFACTOR_2': 'tf.Tensor(shape=(None,), dtype=float32)', 'SHAPEFACTOR_3': 'tf.Tensor(shape=(None,), dtype=float32)', 'SHAPEFACTOR_4': 'tf.Tensor(shape=(None,), dtype=float32)'}

## sequential model

In [66]:
model2 = tf.keras.models.Sequential([
  tf.keras.layers.BatchNormalization(), 
  tf.keras.layers.Dense(16, activation='relu'),
  tf.keras.layers.Dropout(0.2),
  tf.keras.layers.Dense(16),
  tf.keras.layers.Dense(1, activation='sigmoid')
])
model2.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
              metrics=['accuracy', 'auc'])

In [68]:
model2.fit(pistachio_train_batches, epochs=10)


Epoch 1/10
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 935us/step - accuracy: 0.7284 - auc: 0.8170 - loss: 0.5204
Epoch 2/10
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 880us/step - accuracy: 0.7720 - auc: 0.8719 - loss: 0.4395
Epoch 3/10
[1m  1/111[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m3s[0m 30ms/step - accuracy: 0.7500 - auc: 1.0000 - loss: 0.2692

2024-04-30 02:52:53.956163: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]
2024-04-30 02:52:54.084876: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 881us/step - accuracy: 0.8066 - auc: 0.8861 - loss: 0.4293
Epoch 4/10
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 887us/step - accuracy: 0.7972 - auc: 0.8884 - loss: 0.4236
Epoch 5/10
[1m  1/111[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m3s[0m 29ms/step - accuracy: 1.0000 - auc: 1.0000 - loss: 0.2399

2024-04-30 02:52:54.215464: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]
2024-04-30 02:52:54.346195: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8084 - auc: 0.8767 - loss: 0.4463  
Epoch 6/10
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 904us/step - accuracy: 0.7599 - auc: 0.8354 - loss: 0.5226 
Epoch 7/10
[1m  1/111[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m4s[0m 39ms/step - accuracy: 0.7500 - auc: 0.6667 - loss: 0.5059

2024-04-30 02:52:54.488375: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]
2024-04-30 02:52:54.622119: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 878us/step - accuracy: 0.7879 - auc: 0.8461 - loss: 0.4867
Epoch 8/10
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 894us/step - accuracy: 0.8098 - auc: 0.8833 - loss: 0.4322
Epoch 9/10
[1m  1/111[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m3s[0m 28ms/step - accuracy: 1.0000 - auc: 1.0000 - loss: 0.2679

2024-04-30 02:52:54.760318: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]
2024-04-30 02:52:54.891889: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 917us/step - accuracy: 0.7919 - auc: 0.8692 - loss: 0.4556
Epoch 10/10
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 932us/step - accuracy: 0.7603 - auc: 0.8445 - loss: 0.4759 


2024-04-30 02:52:55.023937: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]
2024-04-30 02:52:55.158823: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


<keras.src.callbacks.history.History at 0x7f8e2ac1f350>

In [69]:
model2.evaluate(pistachio_test_batches,verbose=2)


427/427 - 0s - 1ms/step - accuracy: 0.8639 - auc: 0.9313 - loss: 0.3545


2024-04-30 02:54:06.293289: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


[0.35447990894317627, 0.8639296293258667, 0.9313317537307739]

In [84]:
for features, labels in pistachio_test_batches.take(1):
    predictions = model2(features)
    for p,l in zip(predictions, labels):
        print(f'predicted prob: {p}, label: {l}')

predicted prob: [0.8645645], label: [1]
predicted prob: [0.31095818], label: [0]
predicted prob: [0.94427866], label: [1]
predicted prob: [0.00063194], label: [0]


2024-04-30 03:04:05.021128: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


## custom training loop stuff

In [None]:
@tf.function
def train_step(images, labels):
  with tf.GradientTape() as tape:
    # training=True is only needed if there are layers with different
    # behavior during training versus inference (e.g. Dropout).
    predictions = model(images, training=True)
    loss = loss_object(labels, predictions)
  gradients = tape.gradient(loss, model.trainable_variables)
  optimizer.apply_gradients(zip(gradients, model.trainable_variables))

  train_loss(loss)
  train_accuracy(labels, predictions)

In [None]:
@tf.function
def test_step(images, labels):
  # training=False is only needed if there are layers with different
  # behavior during training versus inference (e.g. Dropout).
  predictions = model(images, training=False)
  t_loss = loss_object(labels, predictions)

  test_loss(t_loss)
  test_accuracy(labels, predictions)

In [None]:

for epoch in range(EPOCHS):
  # Reset the metrics at the start of the next epoch
  train_loss.reset_state()
  train_accuracy.reset_state()
  test_loss.reset_state()
  test_accuracy.reset_state()

  for images, labels in train_ds:
    train_step(images, labels)

  for test_images, test_labels in test_ds:
    test_step(test_images, test_labels)

  print(
    f'Epoch {epoch + 1}, '
    f'Loss: {train_loss.result():0.2f}, '
    f'Accuracy: {train_accuracy.result() * 100:0.2f}, '
    f'Test Loss: {test_loss.result():0.2f}, '
    f'Test Accuracy: {test_accuracy.result() * 100:0.2f}'
  )