In [None]:
import glob
import os
import tensorflow as tf
from PIL import Image
from tqdm import tqdm
from joblib import Parallel, delayed
import numpy as np

In [None]:
os.listdir("/gpfsscratch/rech/zpf/uyf36me/training_patches/")

In [None]:
main_path="/gpfsscratch/rech/zpf/uyf36me/validation_patches/"

label_0=os.path.join(main_path,"label_0/**.png") #basic
label_1=os.path.join(main_path,"label_1/**.png") #proof
label_2=os.path.join(main_path,"label_2/**.png") #theorem
label_3=os.path.join(main_path,"label_3/**.png") #overlap

vals=[len(glob.glob(label_0)),len(glob.glob(label_1)),len(glob.glob(label_2)),len(glob.glob(label_3))]
print(vals)
print("ratios in the data --")

for val in vals:
    print(val/sum(vals))
    

In [None]:
import cv2
def white_padding_and_scaling(default_shape,file_loc,overwrite=False):
    """
    2- adds white padding wherever necessary
    3- takes bitwise NOT transformation this esentially inverts the image sets black -0 as background while 
    255 is set as foreground
    4- if overwrite true then makes a new file with '_t' suffix 
    """
    try:
        img_array=cv2.imread(file_loc)
        shape=img_array.shape
    except:
        print("error in white padding--",file_loc)
        return

    padding_height=0
    padding_width=0
    crop_width=False
    crop_height=False

    if(shape[0]<=default_shape[0]): #if img is small in width then we need padding then 
        padding_height=default_shape[0]-shape[0]
    else:
        crop_height=True
        padding_height=0
    if(shape[1]<=default_shape[1]):
        padding_width=default_shape[1]-shape[1]
    else:
        crop_width=True
        padding_width=0
    if(padding_width>0 or padding_height>0):
        colour_fill=(255,255,255) #colour to pad this is white
        new_array=cv2.copyMakeBorder(img_array, 0,padding_height , 0, padding_width, cv2.BORDER_CONSTANT,value=colour_fill)
    else:
        new_array=img_array[0:default_shape[0], 0:default_shape[1]]

    if(crop_width==True):
        new_array=new_array[0:default_shape[0], 0:default_shape[1]]
    if(crop_height==True):
        new_array=new_array[0:default_shape[0], 0:default_shape[1]]


    new_array=cv2.bitwise_not(new_array)
    if(overwrite==True):
        new_name=file_loc.replace(".png","_t.png")
        #print(new_name)
        cv2.imwrite(new_name,new_array)
        os.remove(file_loc)
        return

    return new_array
       
            

In [None]:
#generate dataset


path="/gpfsscratch/rech/zpf/uyf36me/validation_patches/**/**.png"

png_files=glob.glob(path)


            
filtered_files=list(filter(lambda x: not x.endswith("_t.png"),png_files))
print(len(filtered_files))

bad_files=list(filter(lambda x:  x.endswith("_t_t.png"),png_files))
print(len(bad_files))


print("--running transformations")
image_shapes=(400,1400)
n_jobs=-2
#res=Parallel(n_jobs=n_jobs,backend="threading",verbose=2)(delayed(white_padding_and_scaling)
                                           #(default_shape=image_shapes,file_loc=fname,overwrite=True) for fname in tqdm(filtered_files))


In [None]:
strategy=tf.distribute.MirroredStrategy()
devices=strategy.num_replicas_in_sync

print("no of devices: {}".format(strategy.num_replicas_in_sync))

In [None]:
image_shapes=(400,1400)
batch_per_gpu=16
batch_size=batch_per_gpu*devices

sub_sample_validation_dataset=tf.keras.preprocessing.image_dataset_from_directory(
    directory="/gpfsscratch/rech/zpf/uyf36me/validation_patches/",
    image_size=image_shapes,
    batch_size=batch_size,
    seed=2,
    labels='inferred',
    label_mode='categorical',
    shuffle=False #<<<<<<<<change this when training
    )

sub_sample_validation_dataset=sub_sample_validation_dataset.prefetch(buffer_size=tf.data.AUTOTUNE)

In [8]:
labels=None
for x, y in tqdm(sub_sample_validation_dataset):
    if(labels is None):
        labels=y
    else:
        labels=np.concatenate([labels,y])
        
#ground truth        
y_true=np.argmax(labels,axis=1)

100%|██████████| 4136/4136 [16:30<00:00,  4.17it/s]


In [9]:
image_shapes=(400,1400)
batch_per_gpu=16
batch_size=batch_per_gpu*devices

validation_dataset=tf.keras.preprocessing.image_dataset_from_directory(
    directory="/gpfsscratch/rech/zpf/uyf36me/validation_patches/",
    image_size=image_shapes,
    batch_size=batch_size,
    seed=2,
    shuffle=False #<<<<<<<<change this when training
    )

validation_dataset=validation_dataset.prefetch(buffer_size=tf.data.AUTOTUNE)

Found 529303 files belonging to 4 classes.


In [None]:
from tensorflow_addons.optimizers import AdamW ,LAMB
txt_files=glob.glob("/gpfsdswork/projects/rech/zpf/uyf36me/finetuning_vision/new_models/**.txt")
txt_files.sort()
txt_files

epochs_run=[5,10,5,7,7,7,10,10]
model_names=[]

for file in txt_files:
    model_name=file[:-4]
    model_names.append(model_name)

model_runs=[]
for model in model_names:
    for i in range(1,100):
        model_path=model+str(i)+".h5"
        if(os.path.exists(model_path)):
            model_runs.append(model_path)
            
log_file="validation_runs.txt"
try:            
    with open(log_file,"r") as fhand:
        lines=fhand.readlines()

    runs_so_far=[line.split(",")[0] for line in lines]
    for element in runs_so_far:
        if(element in model_runs):
            model_runs.remove(element)
    
except FileNotFoundError:
    pass

def append_last_run(val,log_file):
    with open(log_file,"a") as f:
        f.write(val)


for model_path in model_runs:
    print(model_path)
    with strategy.scope():
        model = tf.keras.models.load_model(model_path)
    val_loss,val_acc=model.evaluate(validation_dataset)
    line_to_write=f'{model_path},{val_loss},{val_acc}\n'
    append_last_run(line_to_write,log_file)
        

/gpfsdswork/projects/rech/zpf/uyf36me/finetuning_vision/new_models/r_efficientnetv2m_avg5.h5
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CP

2023-04-24 19:55:01.975996: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:784] AUTO sharding policy will apply DATA sharding policy as it failed to apply FILE sharding policy because of the following reason: Found an unshardable source dataset: name: "TensorSliceDataset/_1"
op: "TensorSliceDataset"
input: "Placeholder/_0"
attr {
  key: "Toutput_types"
  value {
    list {
      type: DT_STRING
    }
  }
}
attr {
  key: "_cardinality"
  value {
    i: 529303
  }
}
attr {
  key: "is_files"
  value {
    b: false
  }
}
attr {
  key: "metadata"
  value {
    s: "\n\024TensorSliceDataset:9"
  }
}
attr {
  key: "output_shapes"
  value {
    list {
      shape {
      }
    }
  }
}
attr {
  key: "replicate_on_split"
  value {
    b: false
  }
}
experimental_type {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_DATASET
    args {
      type_id: TFT_PRODUCT
      args {
        type_id: TFT_TENSOR
        args {
          type_id: TFT_STRING
        }
      }
    }
  }
}



/gpfsdswork/projects/rech/zpf/uyf36me/finetuning_vision/new_models/r_efficientnetv2m_avg6.h5


2023-04-24 20:20:08.397673: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:784] AUTO sharding policy will apply DATA sharding policy as it failed to apply FILE sharding policy because of the following reason: Found an unshardable source dataset: name: "TensorSliceDataset/_1"
op: "TensorSliceDataset"
input: "Placeholder/_0"
attr {
  key: "Toutput_types"
  value {
    list {
      type: DT_STRING
    }
  }
}
attr {
  key: "_cardinality"
  value {
    i: 529303
  }
}
attr {
  key: "is_files"
  value {
    b: false
  }
}
attr {
  key: "metadata"
  value {
    s: "\n\024TensorSliceDataset:9"
  }
}
attr {
  key: "output_shapes"
  value {
    list {
      shape {
      }
    }
  }
}
attr {
  key: "replicate_on_split"
  value {
    b: false
  }
}
experimental_type {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_DATASET
    args {
      type_id: TFT_PRODUCT
      args {
        type_id: TFT_TENSOR
        args {
          type_id: TFT_STRING
        }
      }
    }
  }
}



/gpfsdswork/projects/rech/zpf/uyf36me/finetuning_vision/new_models/r_efficientnetv2m_avg7.h5


2023-04-24 20:44:35.707526: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:784] AUTO sharding policy will apply DATA sharding policy as it failed to apply FILE sharding policy because of the following reason: Found an unshardable source dataset: name: "TensorSliceDataset/_1"
op: "TensorSliceDataset"
input: "Placeholder/_0"
attr {
  key: "Toutput_types"
  value {
    list {
      type: DT_STRING
    }
  }
}
attr {
  key: "_cardinality"
  value {
    i: 529303
  }
}
attr {
  key: "is_files"
  value {
    b: false
  }
}
attr {
  key: "metadata"
  value {
    s: "\n\024TensorSliceDataset:9"
  }
}
attr {
  key: "output_shapes"
  value {
    list {
      shape {
      }
    }
  }
}
attr {
  key: "replicate_on_split"
  value {
    b: false
  }
}
experimental_type {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_DATASET
    args {
      type_id: TFT_PRODUCT
      args {
        type_id: TFT_TENSOR
        args {
          type_id: TFT_STRING
        }
      }
    }
  }
}



/gpfsdswork/projects/rech/zpf/uyf36me/finetuning_vision/new_models/r_efficientnetv2m_avg8.h5


2023-04-24 21:10:38.998861: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:784] AUTO sharding policy will apply DATA sharding policy as it failed to apply FILE sharding policy because of the following reason: Found an unshardable source dataset: name: "TensorSliceDataset/_1"
op: "TensorSliceDataset"
input: "Placeholder/_0"
attr {
  key: "Toutput_types"
  value {
    list {
      type: DT_STRING
    }
  }
}
attr {
  key: "_cardinality"
  value {
    i: 529303
  }
}
attr {
  key: "is_files"
  value {
    b: false
  }
}
attr {
  key: "metadata"
  value {
    s: "\n\024TensorSliceDataset:9"
  }
}
attr {
  key: "output_shapes"
  value {
    list {
      shape {
      }
    }
  }
}
attr {
  key: "replicate_on_split"
  value {
    b: false
  }
}
experimental_type {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_DATASET
    args {
      type_id: TFT_PRODUCT
      args {
        type_id: TFT_TENSOR
        args {
          type_id: TFT_STRING
        }
      }
    }
  }
}



/gpfsdswork/projects/rech/zpf/uyf36me/finetuning_vision/new_models/r_efficientnetv2m_avg9.h5


2023-04-24 21:36:23.785159: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:784] AUTO sharding policy will apply DATA sharding policy as it failed to apply FILE sharding policy because of the following reason: Found an unshardable source dataset: name: "TensorSliceDataset/_1"
op: "TensorSliceDataset"
input: "Placeholder/_0"
attr {
  key: "Toutput_types"
  value {
    list {
      type: DT_STRING
    }
  }
}
attr {
  key: "_cardinality"
  value {
    i: 529303
  }
}
attr {
  key: "is_files"
  value {
    b: false
  }
}
attr {
  key: "metadata"
  value {
    s: "\n\024TensorSliceDataset:9"
  }
}
attr {
  key: "output_shapes"
  value {
    list {
      shape {
      }
    }
  }
}
attr {
  key: "replicate_on_split"
  value {
    b: false
  }
}
experimental_type {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_DATASET
    args {
      type_id: TFT_PRODUCT
      args {
        type_id: TFT_TENSOR
        args {
          type_id: TFT_STRING
        }
      }
    }
  }
}



/gpfsdswork/projects/rech/zpf/uyf36me/finetuning_vision/new_models/r_efficientnetv2s_avg1.h5


2023-04-24 22:02:45.522005: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:784] AUTO sharding policy will apply DATA sharding policy as it failed to apply FILE sharding policy because of the following reason: Found an unshardable source dataset: name: "TensorSliceDataset/_1"
op: "TensorSliceDataset"
input: "Placeholder/_0"
attr {
  key: "Toutput_types"
  value {
    list {
      type: DT_STRING
    }
  }
}
attr {
  key: "_cardinality"
  value {
    i: 529303
  }
}
attr {
  key: "is_files"
  value {
    b: false
  }
}
attr {
  key: "metadata"
  value {
    s: "\n\024TensorSliceDataset:9"
  }
}
attr {
  key: "output_shapes"
  value {
    list {
      shape {
      }
    }
  }
}
attr {
  key: "replicate_on_split"
  value {
    b: false
  }
}
experimental_type {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_DATASET
    args {
      type_id: TFT_PRODUCT
      args {
        type_id: TFT_TENSOR
        args {
          type_id: TFT_STRING
        }
      }
    }
  }
}



/gpfsdswork/projects/rech/zpf/uyf36me/finetuning_vision/new_models/r_efficientnetv2s_avg2.h5


2023-04-24 22:21:39.215334: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:784] AUTO sharding policy will apply DATA sharding policy as it failed to apply FILE sharding policy because of the following reason: Found an unshardable source dataset: name: "TensorSliceDataset/_1"
op: "TensorSliceDataset"
input: "Placeholder/_0"
attr {
  key: "Toutput_types"
  value {
    list {
      type: DT_STRING
    }
  }
}
attr {
  key: "_cardinality"
  value {
    i: 529303
  }
}
attr {
  key: "is_files"
  value {
    b: false
  }
}
attr {
  key: "metadata"
  value {
    s: "\n\024TensorSliceDataset:9"
  }
}
attr {
  key: "output_shapes"
  value {
    list {
      shape {
      }
    }
  }
}
attr {
  key: "replicate_on_split"
  value {
    b: false
  }
}
experimental_type {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_DATASET
    args {
      type_id: TFT_PRODUCT
      args {
        type_id: TFT_TENSOR
        args {
          type_id: TFT_STRING
        }
      }
    }
  }
}



 670/4136 [===>..........................] - ETA: 15:19 - loss: 0.7502 - accuracy: 0.7426

In [None]:
print("yo")

In [14]:
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from tensorflow_addons.optimizers import AdamW ,LAMB

def evaluate_f1_for_tf_model(model_path,validation_dataset,y_true,show_confusion_report=True):
    
    #460h cpu for 28K images
    # 4 A100 can do the job in

    class_names=["Basic","Proof","Theorem","Overlap"]
    

    # Wrap the loaded model inside the strategy scope to distribute it across the GPUs
    with strategy.scope():
        model = tf.keras.models.load_model(model_path)
        

    #show model arch
    print(model.summary())
    

    
    #generating predictions
    predictions=model.predict(validation_dataset)
    
    #generating predictions
    y_pred = np.argmax(predictions, axis=1)
    
    if(show_confusion_report is True):
        print('Confusion Matrix')
        print(classification_report(y_true, y_pred, target_names=class_names))
        
    return f1_score(y_true,y_pred,average="macro")
    
#"EfficientNetB0.h5","EfficientNetB0_max.h5","EfficientNetB0_avg.h5",
        #"EfficientNetB4_avg.h5","efficientnetv2s_avg.h5",
models=["./new_models/r_efficientnetv2s_avg11.h5"]

for model in models:
    val_loss,val_acc=model.evaluate(validation_dataset)
    print(val_loss,val_acc)
    _f1_score=evaluate_f1_for_tf_model(model_path=model,validation_dataset=sub_sample_validation_dataset,y_true=y_true)
    print(f"f1 score of the {model} is {_f1_score}")
    


2023-04-24 15:00:31.196227: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:784] AUTO sharding policy will apply DATA sharding policy as it failed to apply FILE sharding policy because of the following reason: Found an unshardable source dataset: name: "TensorSliceDataset/_1"
op: "TensorSliceDataset"
input: "Placeholder/_0"
attr {
  key: "Toutput_types"
  value {
    list {
      type: DT_STRING
    }
  }
}
attr {
  key: "_cardinality"
  value {
    i: 529303
  }
}
attr {
  key: "is_files"
  value {
    b: false
  }
}
attr {
  key: "metadata"
  value {
    s: "\n\024TensorSliceDataset:9"
  }
}
attr {
  key: "output_shapes"
  value {
    list {
      shape {
      }
    }
  }
}
attr {
  key: "replicate_on_split"
  value {
    b: false
  }
}
experimental_type {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_DATASET
    args {
      type_id: TFT_PRODUCT
      args {
        type_id: TFT_TENSOR
        args {
          type_id: TFT_STRING
        }
      }
    }
  }
}



0.8945791721343994 0.6349085569381714


OSError: Unable to load model. Filepath is not an hdf5 file (or h5py is not available) or SavedModel. Received: filepath=<keras.engine.sequential.Sequential object at 0x14c04c62bee0>

In [None]:
from tensorflow_addons.optimizers import AdamW ,LAMB
#460h cpu for 28K images
# 4 A100 can do the job in

class_names=["Basic","Proof","Theorem","Overlap"]
model_path="efficientnetv2s_avg.h5"

# Wrap the loaded model inside the strategy scope to distribute it across the GPUs
with strategy.scope():
    model = tf.keras.models.load_model(model_path)

print(model.summary())


In [None]:
predictions=model.predict(sub_sample_validation_dataset)

In [None]:
predictions.shape

In [None]:
y_pred.shape

In [None]:
y_true=np.argmax(labels,axis=1)
y_true.shape

In [26]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

y_pred = np.argmax(predictions, axis=1)
print('Confusion Matrix')
print(confusion_matrix(y_true, y_pred))
print('Classification Report')
target_names = ['Basic', 'Proofs', 'Theorems','Overlaps']
print(classification_report(y_true, y_pred, target_names=target_names))

Confusion Matrix
[[123291   1248  34516     16]
 [ 47168    987  16129      0]
 [  1812     35   5401     18]
 [   955      6    222      1]]
Classification Report
              precision    recall  f1-score   support

       Basic       0.71      0.78      0.74    159071
      Proofs       0.43      0.02      0.03     64284
    Theorems       0.10      0.74      0.17      7266
    Overlaps       0.03      0.00      0.00      1184

    accuracy                           0.56    231805
   macro avg       0.32      0.38      0.24    231805
weighted avg       0.61      0.56      0.52    231805



In [None]:
#evaluate model on larger dataset to see performance difference
#use F1 score to measure the impact
#decide the pooling part
#do big arch lead to bad generalization
#flops vs accuracy