# Import Libraries

In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals

try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass
import tensorflow as tf

from math import log10, sqrt

import datetime

import os
import time

from matplotlib import pyplot as plt
from IPython import display
import ntpath
import librosa
import numpy as np
from PIL import Image

import cv2
from scipy.io import wavfile
import csv
from google.colab import files
from skimage.metrics import structural_similarity as ssim
!pip install pesq
from pesq import pesq

!pip install -q -U tensorboard

# Import Dataset

In [None]:
# let's try with MOBIPHONE dataset
# Download the dataset archive from Google Drive and unpack it into the working directory.
!wget --no-check-certificate -r "https://drive.google.com/uc?export=download&id=1MflXkdaeAFyBftKkCTTuUXkfJ-fOTzus" -O "MOBIPHONE.zip"
!unzip MOBIPHONE.zip
# Dataset root; folder names are appended to it by plain string concatenation, so the trailing '/' is required.
PATH = 'MOBIPHONE/'
# Device folders (one per phone model) whose recordings are used as training data.
folders = ["Apple iPhone 5", "HTC desire c", "HTC Sensation xe", "LG GS290", "LG L3", "LG Optimus L5", "LG Optimus L9", "Nokia 5530", "Nokia C5", "Nokia N70", "Samsung e1230", "Samsung E2121B", "Samsung E2600", "Samsung Galaxy GT-I9100 s2", "Samsung GT-I8190 mini", "Samsung GT-N7100 (galaxy note2)", "Samsung Galaxy Nexus S", "Samsung s5830i", "Sony Ericson c902", "Sony ericson c510i"]
# "Vodafone joy 845" will be used for test, all of the above ones for training

# Splitting Dataset into Training and Testing parts

In [None]:
# now, let's convert audio files into images...
# Output directories for the frequency-representation images.
# exist_ok=True keeps this cell re-runnable (a plain mkdir errors once the directory exists).
os.makedirs("frequency_images_train", exist_ok=True)  # images built from the training folders
os.makedirs("frequency_images_test", exist_ok=True)  # images built from the test folder

# ask input parameters
# Each answer is validated on its own, so a valid answer is never thrown away
# just because the other one was mistyped.
data = input("1-STFT / 2-LOG MEL SPECT\n")
while data not in ("1", "2"):
    data = input("1-STFT / 2-LOG MEL SPECT\n")

reconstruction = input("1-Original Phase / 2-Griffinlim\n")
while reconstruction not in ("1", "2"):
    reconstruction = input("1-Original Phase / 2-Griffinlim\n")


n_files = 24 # inside test folder
i = 1 # index of output file (progressive)
starting_point=200 # where to start mask
m = 8 # to have a total degradation of 24 rows/columns


def _composed_spectrogram_image(y, data, starting_point, m):
  """Build the side-by-side (original | masked) 8-bit spectrogram image of one clip.

  Args:
    y: 1-D audio samples, as returned by librosa.load.
    data: "1" for the STFT magnitude; any other value for the mel spectrogram
      computed from that magnitude.
    starting_point: first row of the image that is zeroed in the masked copy.
    m: rows starting_point .. starting_point+m (inclusive, i.e. m+1 rows) are zeroed.

  Returns:
    A PIL image whose left half is the dB-scaled representation mapped to
    0..255 and whose right half is the same image with the masked rows set to 0.
  """
  stft = librosa.stft(y, n_fft=1023, win_length=625, hop_length=625, window='rect')
  magnitude = np.abs(stft)
  if data != "1":
    magnitude = librosa.feature.melspectrogram(S=magnitude, n_mels=512, sr=16000, power=1.0)
  mag_db = librosa.amplitude_to_db(magnitude)

  # max and min values (in db) for conversion
  max_val = np.max(mag_db)
  min_val = np.min(mag_db)
  if max_val == min_val:
    # A constant representation has no dynamic range; avoid dividing by zero.
    freq_rep_img = np.zeros_like(mag_db)
  else:
    freq_rep_img = np.round(((mag_db - min_val) / (max_val - min_val)) * 255)

  # masked copy: zero a horizontal band of rows
  corrupted_img = freq_rep_img.copy()
  corrupted_img[starting_point:(starting_point + m + 1), :] = 0

  composed = np.hstack([freq_rep_img, corrupted_img])
  return Image.fromarray(composed.astype(np.uint8))


#training folders cycle
# Iterate over *every* listed folder: the test device is not part of `folders`,
# so stopping at len(folders)-1 silently dropped the last training device.
image_id = 1 # progressive ID, just to rename images
for folder in folders:
  folder_path = PATH + folder + "/"
  for audio in os.listdir(folder_path):
    if os.path.splitext(audio)[-1].lower() != ".wav":
      continue
    y, sr = librosa.load(folder_path + audio, sr=16000)
    # keep only recordings that are (almost) 20 s long, then cut to 20 s
    if len(y) < 20*sr-1:
      continue
    composed_image = _composed_spectrogram_image(y[0:20*sr], data, starting_point, m)
    composed_image.save("frequency_images_train/speaker_train_"+str(image_id)+".png")
    image_id += 1


#testing folder
m = 8 # just to start filling it
image_id = 1
# sorted() fixes the file -> image-number correspondence used later for evaluation
for audio in sorted(os.listdir(PATH+"Vodafone joy 845/")):
  if os.path.splitext(audio)[-1].lower() != ".wav":
    continue
  y, sr = librosa.load(PATH+"Vodafone joy 845/"+audio, sr = 16000)
  composed_image = _composed_spectrogram_image(y[0:20*sr], data, starting_point, m)
  composed_image.save("frequency_images_test/speaker_test_"+str(image_id)+".png")
  image_id += 1


# Training and Testing functions

In [None]:
# Params
BUFFER_SIZE = 400  # shuffle buffer size used for the training dataset
BATCH_SIZE = 1  # batch size used for both the training and the test dataset
IMG_WIDTH = 512  # target image width used by random_crop / random_jitter
IMG_HEIGHT = 512  # target image height used by random_crop / random_jitter

In [None]:
# Load real and input(corrupted) image

def load(image_file):
  """Read a composed PNG and split it into its two halves.

  The left half of the decoded image is the real (uncorrupted) image and the
  right half is the corrupted input image.

  Returns:
    (input_image, real_image), both cast to float32.
  """
  decoded = tf.image.decode_png(tf.io.read_file(image_file))
  half_width = tf.shape(decoded)[1] // 2

  # left half -> ground truth, right half -> network input
  real_image = tf.cast(decoded[:, :half_width, :], tf.float32)
  input_image = tf.cast(decoded[:, half_width:, :], tf.float32)

  return input_image, real_image

In [None]:
def resize(input_image, real_image, height, width):
  """Resize both images to (height, width) with nearest-neighbour interpolation.

  Returns:
    (input_image, real_image) after resizing.
  """
  target_size = [height, width]
  nearest = tf.image.ResizeMethod.NEAREST_NEIGHBOR

  resized_input = tf.image.resize(input_image, target_size, method=nearest)
  resized_real = tf.image.resize(real_image, target_size, method=nearest)
  return resized_input, resized_real

In [None]:
def random_crop(input_image, real_image):
  """Take the same random IMG_HEIGHT x IMG_WIDTH crop from both images.

  The two images are stacked so that a single random offset is applied to
  both. The channel count of the crop is taken from the images themselves
  instead of being hard-coded to 3, so single-channel (grayscale) images can
  be cropped as well.

  Returns:
    (cropped_input_image, cropped_real_image).
  """
  stacked_image = tf.stack([input_image, real_image], axis=0)
  # crop size = [2 images, height, width, <channels of the input>]
  crop_size = tf.concat(
      [[2, IMG_HEIGHT, IMG_WIDTH], tf.shape(stacked_image)[-1:]], axis=0)
  cropped_image = tf.image.random_crop(stacked_image, size=crop_size)

  return cropped_image[0], cropped_image[1]

In [None]:
# map pixel values from [0, 255] to [-1, 1]
def normalize(input_image, real_image):
  """Rescale both images with x / 127.5 - 1 and return (input_image, real_image)."""
  half_range = 127.5
  return (input_image / half_range) - 1, (real_image / half_range) - 1

In [None]:
@tf.function()
def random_jitter(input_image, real_image):
  """Resize both images to IMG_HEIGHT x IMG_WIDTH; no cropping or flipping is applied."""
  return resize(input_image, real_image, IMG_HEIGHT, IMG_WIDTH)

In [None]:
def load_image_train(image_file):
  """Load one training PNG and return the normalized (input_image, real_image) pair."""
  return normalize(*load(image_file))

In [None]:
def load_image_test(image_file):
  """Load one test PNG and return the normalized (input_image, real_image) pair."""
  return normalize(*load(image_file))

# Populating Training and Testing datasets

In [None]:
# Training pipeline: list PNGs -> load + normalize -> shuffle -> batch.
train_dataset = (
    tf.data.Dataset.list_files('frequency_images_train/*.png')
    .map(load_image_train, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
)

In [None]:
# Test pipeline: files are listed without shuffling and mapped deterministically,
# so the order of the test images is the same on every run.
test_dataset = (
    tf.data.Dataset.list_files('frequency_images_test/*.png', shuffle=False)
    .map(load_image_test, deterministic=True)
    .batch(BATCH_SIZE)
)

# Build the generator

In [None]:
# Number of channels produced by the generator's final layer.
OUTPUT_CHANNELS = 1 # gray

## Downsampling layer

In [None]:
def downsample(filters, size, apply_batchnorm=True):
  """Build an encoder block: stride-2 Conv2D -> (optional BatchNorm) -> LeakyReLU.

  Args:
    filters: number of convolution filters.
    size: convolution kernel size.
    apply_batchnorm: whether to insert a BatchNormalization layer after the convolution.

  Returns:
    A tf.keras.Sequential that halves the spatial resolution ('same' padding, stride 2).
  """
  block_layers = [
      tf.keras.layers.Conv2D(
          filters, size, strides=2, padding='same',
          kernel_initializer=tf.random_normal_initializer(0., 0.02),
          use_bias=False),
  ]
  if apply_batchnorm:
    block_layers.append(tf.keras.layers.BatchNormalization())
  block_layers.append(tf.keras.layers.LeakyReLU())

  return tf.keras.Sequential(block_layers)

## Upsampling Layer

In [None]:
def upsample(filters, size, apply_dropout=False):
  """Build a decoder block: stride-2 Conv2DTranspose -> BatchNorm -> (optional Dropout) -> ReLU.

  Args:
    filters: number of transposed-convolution filters.
    size: kernel size.
    apply_dropout: whether to insert Dropout(0.5) after the batch normalization.

  Returns:
    A tf.keras.Sequential that doubles the spatial resolution ('same' padding, stride 2).
  """
  block_layers = [
      tf.keras.layers.Conv2DTranspose(
          filters, size, strides=2, padding='same',
          kernel_initializer=tf.random_normal_initializer(0., 0.02),
          use_bias=False),
      tf.keras.layers.BatchNormalization(),
  ]
  if apply_dropout:
    block_layers.append(tf.keras.layers.Dropout(0.5))
  block_layers.append(tf.keras.layers.ReLU())

  return tf.keras.Sequential(block_layers)

In [None]:
def Generator():
  """Build the U-Net generator for 512x512 single-channel images.

  Nine stride-2 encoder blocks reduce the input to 1x1; eight decoder blocks
  bring it back to 256x256, each concatenated with the encoder output of the
  same resolution; a final transposed convolution with tanh restores 512x512
  with OUTPUT_CHANNELS channels.
  """
  inputs = tf.keras.layers.Input(shape=[512, 512, 1])

  # Encoder: spatial size 512 -> 256 -> 128 -> 64 -> 32 -> 16 -> 8 -> 4 -> 2 -> 1.
  # Only the first block skips batch normalization.
  down_stack = [downsample(32, 4, apply_batchnorm=False)]
  down_stack += [downsample(n_filters, 4)
                 for n_filters in (64, 128, 256, 512, 512, 512, 512, 512)]

  # Decoder: spatial size 1 -> 2 -> 4 -> ... -> 256; (filters, use dropout).
  decoder_specs = [
      (512, True), (512, True), (512, True),
      (512, False), (256, False), (128, False), (64, False), (32, False),
  ]
  up_stack = [upsample(n_filters, 4, apply_dropout=use_dropout)
              for n_filters, use_dropout in decoder_specs]

  initializer = tf.random_normal_initializer(0., 0.02)
  last = tf.keras.layers.Conv2DTranspose(OUTPUT_CHANNELS, 4,
                                         strides=2,
                                         padding='same',
                                         kernel_initializer=initializer,
                                         activation='tanh') # (bs, 512, 512, OUTPUT_CHANNELS)

  # Downsampling through the model, remembering every intermediate output
  x = inputs
  skips = []
  for down in down_stack:
    x = down(x)
    skips.append(x)

  # Upsampling; the bottleneck output itself is not a skip connection, so
  # walk the remaining encoder outputs from deepest to shallowest.
  for up, skip in zip(up_stack, skips[-2::-1]):
    x = tf.keras.layers.Concatenate()([up(x), skip])

  return tf.keras.Model(inputs=inputs, outputs=last(x))



In [None]:
# Instantiate the generator and draw its layer graph with tensor shapes.
generator = Generator()
tf.keras.utils.plot_model(generator, show_shapes=True, dpi=64)

In [None]:
# Weight of the L1 term relative to the adversarial term in generator_loss.
LAMBDA = 100
# Binary cross-entropy shared by the generator and discriminator losses;
# from_logits=True because the discriminator's last layer has no activation.
loss_object = tf.keras.losses.BinaryCrossentropy(from_logits=True)

In [None]:
def generator_loss(disc_generated_output, gen_output, target):
  """Compute the generator objective: adversarial loss + LAMBDA * L1 loss.

  Args:
    disc_generated_output: discriminator output for the generated image.
    gen_output: image produced by the generator.
    target: ground-truth image.

  Returns:
    (total_gen_loss, gan_loss, l1_loss).
  """
  # adversarial term: the generator wants its output to be labelled as real (ones)
  real_labels = tf.ones_like(disc_generated_output)
  gan_loss = loss_object(real_labels, disc_generated_output)

  # reconstruction term: mean absolute error against the target
  l1_loss = tf.reduce_mean(tf.abs(target - gen_output))

  return gan_loss + (LAMBDA * l1_loss), gan_loss, l1_loss

# Build the Discriminator


In [None]:
def Discriminator():
  """Build the patch discriminator for a pair of 512x512 single-channel images.

  The input image and the (real or generated) target image are concatenated
  along the channel axis and reduced to a 30x30x1 map of logits.
  """
  initializer = tf.random_normal_initializer(0., 0.02)

  inp = tf.keras.layers.Input(shape=[512, 512, 1], name='input_image')
  tar = tf.keras.layers.Input(shape=[512, 512, 1], name='target_image')

  features = tf.keras.layers.concatenate([inp, tar]) # (bs, 512, 512, 2)

  # four stride-2 blocks: 512 -> 256 -> 128 -> 64 -> 32; no batchnorm on the first
  features = downsample(64, 4, False)(features) # (bs, 256, 256, 64)
  for n_filters in (128, 256, 512):
    features = downsample(n_filters, 4)(features)
  # here: (bs, 32, 32, 512)

  features = tf.keras.layers.ZeroPadding2D()(features) # (bs, 34, 34, 512)
  features = tf.keras.layers.Conv2D(512, 4, strides=1,
                                    kernel_initializer=initializer,
                                    use_bias=False)(features) # (bs, 31, 31, 512)
  features = tf.keras.layers.BatchNormalization()(features)
  features = tf.keras.layers.LeakyReLU()(features)

  features = tf.keras.layers.ZeroPadding2D()(features) # (bs, 33, 33, 512)

  # no activation: the output is a map of logits
  patch_logits = tf.keras.layers.Conv2D(1, 4, strides=1,
                                        kernel_initializer=initializer)(features) # (bs, 30, 30, 1)

  return tf.keras.Model(inputs=[inp, tar], outputs=patch_logits)

In [None]:
# Instantiate the discriminator and draw its layer graph with tensor shapes.
discriminator = Discriminator()
tf.keras.utils.plot_model(discriminator, show_shapes=True, dpi=64)

## Discriminator Loss

In [None]:
def discriminator_loss(disc_real_output, disc_generated_output):
  """Compute the discriminator objective.

  Real pairs should be classified as 1 and generated pairs as 0; the result
  is the sum of the two cross-entropy terms.

  Args:
    disc_real_output: discriminator output for (input, real target).
    disc_generated_output: discriminator output for (input, generated image).

  Returns:
    The total discriminator loss.
  """
  # real images are compared against an all-ones label map
  loss_on_real = loss_object(tf.ones_like(disc_real_output), disc_real_output)

  # generated images are compared against an all-zeros label map
  loss_on_fake = loss_object(tf.zeros_like(disc_generated_output),
                             disc_generated_output)

  return loss_on_real + loss_on_fake

# Define the Optimizers and Checkpoint-saver

In [None]:
# One Adam optimizer per network (learning rate 2e-4, beta_1 = 0.5);
# train_step applies generator and discriminator gradients separately.
generator_optimizer = tf.keras.optimizers.Adam(2e-4, beta_1=0.5)
discriminator_optimizer = tf.keras.optimizers.Adam(2e-4, beta_1=0.5)

In [None]:
# Directory and file prefix under which checkpoints are written by fit().
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
# The checkpoint tracks both networks together with their optimizers.
checkpoint = tf.train.Checkpoint(generator_optimizer=generator_optimizer,
                                 discriminator_optimizer=discriminator_optimizer,
                                 generator=generator,
                                 discriminator=discriminator)

# Generate Images

In [None]:
def generate_images(model, test_input, tar):
  """Run the model on one batch and plot input, ground truth and prediction.

  The model is called with training=True. Only the first element of the
  batch is shown; every image is flipped along axis 0 before plotting.

  Returns:
    (prediction, ground_truth) for the first batch element, as NumPy arrays.
  """
  prediction = model(test_input, training=True)
  plt.figure(figsize=(15,15))

  panels = [
      ('Input Image', test_input[0]),
      ('Ground Truth', tar[0]),
      ('Pix 2 Pix', prediction[0]),
  ]
  for position, (caption, tensor) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    plt.title(caption)
    # drop the channel axis so imshow receives a 2-D array
    plt.imshow(np.flip(tensor.numpy().squeeze(), axis=0))
  plt.show()

  # predicted image first, original second
  return prediction[0].numpy(), tar[0].numpy()

# Training


In [None]:
# Number of passes over the training dataset performed by fit().
EPOCHS = 5

# Root directory for TensorBoard logs.
log_dir="logs/"

# Each run writes to its own timestamped sub-directory under logs/fit/.
summary_writer = tf.summary.create_file_writer(
  log_dir + "fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

In [None]:
@tf.function
def train_step(input_image, target, epoch):
  """Run one optimization step for both networks and log the losses.

  Args:
    input_image: batch of corrupted images fed to the generator.
    target: batch of corresponding ground-truth images.
    epoch: value used as the step of the logged scalar summaries.
  """
  with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
    fake_image = generator(input_image, training=True)
    real_logits = discriminator([input_image, target], training=True)
    fake_logits = discriminator([input_image, fake_image], training=True)

    # compute the 2 losses
    gen_total_loss, gen_gan_loss, gen_l1_loss = generator_loss(
        fake_logits, fake_image, target)
    disc_loss = discriminator_loss(real_logits, fake_logits)

  gen_vars = generator.trainable_variables
  disc_vars = discriminator.trainable_variables

  gen_grads = gen_tape.gradient(gen_total_loss, gen_vars)
  disc_grads = disc_tape.gradient(disc_loss, disc_vars)

  generator_optimizer.apply_gradients(zip(gen_grads, gen_vars))
  discriminator_optimizer.apply_gradients(zip(disc_grads, disc_vars))

  logged_scalars = (
      ('gen_total_loss', gen_total_loss),
      ('gen_gan_loss', gen_gan_loss),
      ('gen_l1_loss', gen_l1_loss),
      ('disc_loss', disc_loss),
  )
  with summary_writer.as_default():
    for tag, value in logged_scalars:
      tf.summary.scalar(tag, value, step=epoch)

Actual training loop

In [None]:
def fit(train_ds, epochs, test_ds):
  """Train the GAN for `epochs` epochs.

  At the start of every epoch the output cell is cleared and the generator is
  shown on the first test batch. A checkpoint is written every 20 epochs and
  once more after the last epoch.

  Args:
    train_ds: dataset of (input_image, target) training batches.
    epochs: number of epochs to run.
    test_ds: dataset from which one batch is taken for the progress plot.
  """
  for epoch in range(epochs):
    epoch_start = time.time()

    display.clear_output(wait=True)

    # visual progress check on a single test batch
    for example_input, example_target in test_ds.take(1):
      generate_images(generator, example_input, example_target)
    print("Epoch: ", epoch)

    # Train: one dot per step, a line break every 100 steps
    for n, (input_image, target) in train_ds.enumerate():
      print('.', end='')
      end_of_row = (n+1) % 100 == 0
      if end_of_row:
        print()
      train_step(input_image, target, epoch)
    print()

    # periodic checkpoint (every 20 epochs)
    completed_epochs = epoch + 1
    if completed_epochs % 20 == 0:
      checkpoint.save(file_prefix = checkpoint_prefix)

    elapsed = time.time()-epoch_start
    print ('Time taken for epoch {} is {} sec\n'.format(completed_epochs, elapsed))

  # final checkpoint once training is complete
  checkpoint.save(file_prefix = checkpoint_prefix)

This training loop saves logs that you can view in TensorBoard to monitor the training progress. When working locally, you would launch a separate TensorBoard process. In a notebook, it is easiest to launch the viewer before starting the training.

To launch the viewer paste the following into a code-cell:

In [None]:
% load_ext tensorboard
% tensorboard --logdir {log_dir}

Run the training loop:


In [None]:
# Train for EPOCHS epochs; a prediction on the first test batch is plotted at the start of each epoch.
fit(train_dataset, EPOCHS, test_dataset)

In [None]:
# Embed a hosted TensorBoard experiment in the notebook.
# NOTE(review): the URL is hard-coded and is not the log directory written by
# this notebook's own training run — confirm it is still relevant.
display.IFrame(
    src="https://tensorboard.dev/experiment/lZ0C6FONROaUMfjYkVyJqw",
    width="100%",
    height="1000px")

# Restore the latest checkpoint and test

In [None]:
# List the files in the checkpoint directory (written by fit()).
!ls {checkpoint_dir}

In [None]:
# Run the trained model on a few examples from the test dataset
# generate_images plots each example and returns (prediction, ground truth) as NumPy arrays.
for inp, tar in test_dataset.take(5):
  out = generate_images(generator, inp, tar)

# Functions

In [None]:
def convert(freq_img, ph, data, reconstruction, max_val, min_val, sr):
  """Turn a 0..255 spectrogram image back into a time-domain signal.

  Args:
    freq_img: 2-D image with values in 0..255 (dB-scaled magnitude).
    ph: phase of the original STFT; only used when reconstruction == "1".
    data: "1" if the image is an STFT magnitude, otherwise it is treated as a
      mel spectrogram and first mapped back to an STFT magnitude.
    reconstruction: "1" to combine the magnitude with the original phase and
      invert with istft, otherwise the phase is estimated with Griffin-Lim.
    max_val: dB value that was mapped to 255 when the image was created.
    min_val: dB value that was mapped to 0 when the image was created.
    sr: sampling rate, used for the mel inversion.

  Returns:
    The reconstructed audio signal.

  NOTE(review): the forward STFT elsewhere in this notebook uses n_fft=1023,
  but no n_fft is passed to istft / griffinlim here, so the frame length is
  left to librosa's inference from the spectrogram shape — confirm that this
  is intended.
  """
  # undo the 0..255 scaling, then go from dB back to linear amplitude
  inpainted_freq_rep_db = ((freq_img-0)/(255-0))*(max_val-min_val)+min_val
  magnitude = librosa.db_to_amplitude(inpainted_freq_rep_db)

  if data != "1": #log mel
    # sr is passed by keyword so the call also works where it is keyword-only
    magnitude = librosa.feature.inverse.mel_to_stft(magnitude, sr=sr, n_fft=1023, power=1.0)

  if reconstruction == "1": #original phase
    # complex STFT from the modified magnitude and the original phase;
    # built with array arithmetic instead of an element-by-element loop
    real = magnitude * np.cos(ph)
    imag = magnitude * np.sin(ph)
    mod_freq_rep = (real + 1j * imag).astype(np.complex128)
    return librosa.istft(mod_freq_rep, win_length=625, hop_length=625, window='rect')

  #griffinlim
  return librosa.griffinlim(magnitude, win_length=625, hop_length=625, window='rect')


def PSNR(original, compressed):
  """Peak signal-to-noise ratio (in dB) between two images on a 0..255 scale.

  Both inputs are converted to float64 first, so unsigned integer images do
  not wrap around when they are subtracted.

  Returns:
    100 when the images are identical (MSE of zero), otherwise
    20 * log10(255 / sqrt(MSE)).
  """
  original = np.asarray(original, dtype=np.float64)
  compressed = np.asarray(compressed, dtype=np.float64)
  mse = np.mean((original - compressed) ** 2)
  if mse == 0:
    # no difference at all: the ratio is undefined, return a fixed value
    return 100
  max_pixel = 255.0
  return 20 * log10(max_pixel / sqrt(mse))

def alternative_SNR(y):
  """Signal-to-noise ratio (in dB) of `y` against its STFT -> ISTFT round trip.

  The "noise" is the difference between `y` (cut to the length of the
  round-trip signal) and the round-trip signal.
  """
  spectrum = librosa.stft(y, n_fft=1023, win_length=625, hop_length=625, window='rect')
  roundtrip = librosa.istft(spectrum, win_length=625, hop_length=625, window='rect')

  residual = y[:roundtrip.size] - roundtrip

  signal_energy = np.sum(y ** 2)
  noise_energy = np.sum(residual ** 2)
  return 10 * np.log10(signal_energy/noise_energy)

# Main Program

In [None]:
iteration = round(m/2)+1
n_files = 24
test_audio_dir = 'MOBIPHONE/Vodafone joy 845/'
# Only .wav files were converted into test images, so only those may take
# part in the image <-> audio index mapping below.
file_list = sorted(name for name in os.listdir(test_audio_dir)
                   if os.path.splitext(name)[-1].lower() == ".wav")
image_list = sorted(os.listdir('frequency_images_test/'))
# Test images are read in lexicographic file-name order (1, 10, 11, ..., 2, 20, ...);
# entry k is the number of the image (and of the sorted audio file) seen at step k.
input_indices = ["1","10","11","12","13","14","15","16","17","18","19","2","20","21","22","23","24","3","4","5","6","7","8","9"]
#THE CORRESPONDENCE IS (ex) : speaker_10.png --> speaker18.wav --> output_2.wav and so on...

# so, for example... *_1_1_*.csv means results of data = 1 (stft) and reconstruction = 1 (original phase)
results_csv = 'pix_2_pix_results_'+str(data)+'_'+str(reconstruction)+'_'+str(iteration)+'_.csv'


def _rescale_to_255(image):
  """Linearly map a 2-D array so that its minimum becomes 0 and its maximum 255 (rounded)."""
  low = np.min(image)
  high = np.max(image)
  return np.round(((image - low) / (high - low)) * 255)


# creating csv file where to save results (SNR/PSNR)
# mode='w' replaces any previous file; newline='' is what the csv module expects.
with open(results_csv, mode='w', newline='') as pix_2_pix: # OPENING FILE (TILL THE END)
  pix_2_pix_writer = csv.writer(pix_2_pix, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
  pix_2_pix_writer.writerow(['FILE', 'SNR', 'PSNR', 'SSIM', 'PESQ'])

  for i, (inp, tar) in enumerate(test_dataset.take(n_files)):
    print("\nFILE "+str(i+1)+"...")
    out, orig = generate_images(generator, inp, tar)

    # recovering phase (and the dB range) from the original recording
    index = int(input_indices[i])
    audio_to_load = file_list[index-1]
    y, sr = librosa.load(test_audio_dir+audio_to_load, sr = 16000)
    y = y[0:20*sr]
    freq_rep = librosa.stft(y, n_fft=1023, win_length=625, hop_length=625, window='rect')
    ph = np.angle(freq_rep)
    magnitude = np.abs(freq_rep)
    if data != "1":
      magnitude = librosa.feature.melspectrogram(S=magnitude, n_mels=512, sr=16000, power=1.0)
    mag_db = librosa.amplitude_to_db(magnitude)

    # max and min values (in db) for conversion
    max_val = np.max(mag_db)
    min_val = np.min(mag_db)

    # converting the original and the inpainted (predicted) image into 0,255
    orig_np = _rescale_to_255(orig[:,:,0])
    out_np = _rescale_to_255(out[:,:,0])

    #PSNR
    psnr = PSNR(orig_np, out_np)

    output = convert(out_np, ph, data, reconstruction, max_val, min_val, sr)
    # converting back: bring the reconstruction to the amplitude range of the original signal
    min_out = np.min(output)
    max_out = np.max(output)
    y_min = y.min()
    y_max = y.max()
    output = ((output-min_out)/(max_out-min_out))*(y_max-y_min)+y_min

    # writing output
    output_wav = 'output_'+str(i+1)+'.wav'
    wavfile.write(output_wav, sr, output.astype(np.float32))

    # SNR
    o, sr = librosa.load(output_wav, sr=16000)
    snr = alternative_SNR(o)

    #PESQ
    psq = pesq(16000, y, o, 'wb')

    # SSIM; both images were rescaled to 0..255 above, so that is the data range
    images_ssim = ssim(orig_np, out_np, data_range=255.0)

    # save results in csv (file name without its 4-character extension)
    pix_2_pix_writer.writerow([audio_to_load[:-4], str(snr), str(psnr), str(images_ssim), str(psq)])


# download csv
# The file name is rebuilt from the same parts used when the results file was written.
files.download('pix_2_pix_results_'+str(data)+'_'+str(reconstruction)+'_'+str(iteration)+'_.csv')