<a href="https://colab.research.google.com/github/lorenzopalaia/Progetto-Lab-IA/blob/main/NST_pt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so files stored there are reachable
# from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Work from the data folder on Drive; the relative input/output paths defined
# further down resolve against this directory.
%cd /content/drive/My Drive/Colab Notebooks/data

/content/drive/My Drive/Colab Notebooks/data


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

import matplotlib.pyplot as plt

import torchvision.models as models

import copy

import numpy as np

from IPython.display import display, Audio

import librosa
import soundfile as sf

import warnings
warnings.filterwarnings('ignore')

import torchaudio

In [None]:
# Run on the GPU when CUDA is visible to PyTorch, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [None]:
# Paths are relative to the current working directory.
# CONTENT_FILENAME / STYLE_FILENAME are the two audio clips that get loaded;
# OUTPUT_FILENAME is where the synthesised waveform is written.
CONTENT_FILENAME = "inputs/imperial.mp3"
STYLE_FILENAME = "inputs/usa.mp3"
OUTPUT_FILENAME = "pt_outputs/output.wav"

In [None]:
# FFT window size used for every STFT in this notebook.
N_FFT = 2048
def to_spec(filename, max_frames=430):
    """Load an audio file and return its log-magnitude spectrogram.

    Parameters
    ----------
    filename : str
        Path of the audio file; it is decoded as mono by ``librosa.load``.
    max_frames : int or None, optional
        Number of leading STFT frames (columns) to keep. The default of 430
        reproduces the crop that used to be hard-coded; ``None`` keeps every
        frame.

    Returns
    -------
    (S, sr)
        ``S`` is ``log1p(|STFT|)`` cropped to ``max_frames`` columns and
        ``sr`` is the sample rate reported by ``librosa.load``.
    """
    x, sr = librosa.load(filename, mono=True)
    # n_fft is passed by keyword: librosa 0.10+ made every argument after the
    # signal keyword-only, so the positional form raises a TypeError there.
    S = librosa.stft(x, n_fft=N_FFT)

    # Only the magnitude is kept (log-compressed); the phase is not needed
    # because the waveform is rebuilt later with Griffin-Lim.
    S = np.log1p(np.abs(S[:, :max_frames]))
    return S, sr

In [None]:
# Log-magnitude spectrograms of both clips. `fs` ends up holding the sample
# rate returned for the style clip, which is loaded last.
a_content, fs = to_spec(CONTENT_FILENAME)
a_style, fs = to_spec(STYLE_FILENAME)

# The content spectrogram fixes the (frequency bins, time frames) grid.
N_FREQS, N_SAMPLES = a_content.shape[0], a_content.shape[1]

# Crop the style spectrogram to at most that grid.
a_style = a_style[:N_FREQS, :N_SAMPLES]

# Add a leading batch axis and a trailing singleton axis so the frequency
# bins sit on dim 1, where the convolution expects its input channels.
batch_shape = [1, N_FREQS, N_SAMPLES, 1]
style_audio = np.ascontiguousarray(a_style.reshape(batch_shape))
content_audio = np.ascontiguousarray(a_content.reshape(batch_shape))

style_float = torch.from_numpy(style_audio).to(device, torch.float)
content_float = torch.from_numpy(content_audio).to(device, torch.float)

print("Style/Content Shape:", style_float.shape)

Style/Content Shape: torch.Size([1, 1025, 430, 1])


In [None]:
# Number of convolution filters (4096 is the alternative noted previously).
N_FILTERS = 2048

# Standard deviation of the random filter weights, computed from the channel
# counts and the 11-frame kernel height.
std = np.sqrt(2) * np.sqrt(2.0 / ((N_FREQS + N_FILTERS) * 11))

# Random weights laid out as (out_channels, in_channels, height, width), the
# layout nn.Conv2d uses for its weight tensor.
kernel_shape = (N_FILTERS, N_FREQS, 11, 1)
kernel = nn.Parameter(
    torch.from_numpy(std * np.random.randn(*kernel_shape)).to(device, torch.float)
)

print("Filter Shape:", kernel.shape)

Filter Shape: torch.Size([2048, 1025, 11, 1])


In [None]:
class CNNModel(nn.Module):
  """Single Conv2d + ReLU feature extractor whose weights are supplied by the caller.

  Parameters
  ----------
  kernel : nn.Parameter
      Convolution weights shaped (out_channels, in_channels, height, width).
      They are installed as the weight of ``conv1``; the layer's bias keeps
      the default ``nn.Conv2d`` initialisation.
  """
  def __init__(self, kernel):
    super().__init__()
    # Size the layer from the kernel itself rather than from notebook globals,
    # so the declared layer shape can never disagree with the weights that
    # are actually installed.
    out_channels, in_channels, kernel_h, kernel_w = kernel.shape
    self.conv1 = nn.Conv2d(in_channels, out_channels, (kernel_h, kernel_w))
    self.conv1.weight = kernel
    self.relu1 = nn.ReLU()

  def forward(self, input):
    """Return ``relu(conv1(input))``."""
    out = self.conv1(input)
    out = self.relu1(out)
    return out

# Build the extractor around the random kernel and move it to the selected
# device; the trailing bare name shows the module summary as the cell output.
cnn = CNNModel(kernel).to(device)
cnn

CNNModel(
  (conv1): Conv2d(1025, 2048, kernel_size=(11, 1), stride=(1, 1))
  (relu1): ReLU()
)

In [None]:
class ContentLoss(nn.Module):
  """Pass-through layer that records a weighted MSE against fixed target features.

  Parameters
  ----------
  target : torch.Tensor
      Reference features. They are detached, so no gradient flows into them.
  weight : float
      Scale applied to both the target and the incoming features before the
      MSE is taken.

  After each ``forward`` call the current loss is available as ``self.loss``.
  """
  def __init__(self, target, weight):
    super(ContentLoss, self).__init__()
    self.weight = weight
    self.target = target.detach() * weight
    self.criterion = nn.MSELoss()

  def forward(self, input):
    """Record the loss for ``input`` and return ``input`` unchanged."""
    # The input must be scaled by the same weight as the stored target;
    # otherwise, for weight != 1, the loss is nonzero even when the features
    # match exactly and the optimiser is pulled towards a rescaled target.
    self.loss = self.criterion(input * self.weight, self.target)
    self.output = input
    return self.output

  def backward(self, retain_graph=True):
    """Backpropagate the most recently recorded loss and return it."""
    self.loss.backward(retain_graph=retain_graph)
    return self.loss

In [None]:
def gram_matrix(input):
  """Return ``input.T @ input`` divided by the notebook-level ``N_SAMPLES``."""
  inner_products = input.T @ input
  return inner_products / N_SAMPLES

In [None]:
class StyleLoss(nn.Module):
  """Pass-through layer that records an MSE between weighted Gram matrices.

  Parameters
  ----------
  target : torch.Tensor
      Feature map of the style signal with channels on dim 1 (the layout
      produced by ``nn.Conv2d``). It is detached, so the stored Gram matrix is
      a constant of the optimisation.
  weight : float
      Scale applied to both the target and the input Gram matrices.

  After each ``forward`` call the current loss is available as ``self.loss``.
  """
  def __init__(self, target, weight):
    super(StyleLoss, self).__init__()
    self.weight = weight
    # Build the target from the *detached* features and weight it exactly
    # like the input side, so identical features give a loss of zero.
    self.target = self._weighted_gram(target.detach())
    self.criterion = nn.MSELoss()

  def _weighted_gram(self, features):
    # Private helper: weighted channel-by-channel Gram matrix of a feature map.
    # Channels live on dim 1; swap them to the last axis before flattening so
    # that every row holds the N_FILTERS activations of a single position.
    # Reshaping the channels-first tensor directly would mix channels and
    # time frames within a row.
    flat = features.transpose(1, -1).reshape(-1, N_FILTERS)
    return gram_matrix(flat) * self.weight

  def forward(self, input):
    """Record the style loss for ``input`` and return ``input`` unchanged."""
    # The layer must be transparent: modules placed after it in a Sequential
    # need the feature map itself, not its Gram matrix.
    self.output = input
    self.loss = self.criterion(self._weighted_gram(input), self.target)
    return self.output

  def backward(self, retain_graph=True):
    """Backpropagate the most recently recorded loss and return it."""
    self.loss.backward(retain_graph=retain_graph)
    return self.loss

In [None]:
# Sanity check of the feature shapes produced by the extractor.
style_features = cnn(style_float)
content_features = cnn(content_float)
print("Content Features Shape:\t\t", content_features.shape)

# The conv output keeps its channels on dim 1. Swap them to the last axis
# before flattening so each row holds the N_FILTERS activations of one time
# frame; reshaping the channels-first tensor directly would mix channels and
# frames within a row.
features = style_features.transpose(1, -1).reshape(-1, N_FILTERS)
style_gram = gram_matrix(features)
print("Style Features Gram Shape:\t", style_gram.shape)

Content Features Shape:		 torch.Size([1, 2048, 420, 1])
Style Features Gram Shape:	 torch.Size([2048, 2048])


In [None]:
# Starting point handed to the optimisation: a copy of the content spectrogram.
# The commented-out line below is the alternative of starting from random noise
# of the same shape.
input_float = content_float.clone()
#input_float = torch.randn_like(content_float)

In [None]:
def get_new_model(cnn, style_float, content_float, style_weight, content_weight):
  """Assemble the extractor plus loss layers used during optimisation.

  Parameters
  ----------
  cnn : module exposing ``conv1`` and ``relu1``; it is deep-copied, so the
      instance passed in is left untouched.
  style_float, content_float : input tensors for the style and content signals.
  style_weight, content_weight : weights forwarded to the loss layers.

  Returns
  -------
  (model, style_losses, content_losses)
      ``model`` is an ``nn.Sequential`` of conv, ReLU, style-loss and
      content-loss layers; the two lists hold the loss layers so their
      ``.loss`` attributes can be read after a forward pass.
  """
  cnn = copy.deepcopy(cnn)
  # Only the input is optimised. Freezing the copied weights keeps the target
  # features out of the autograd graph and avoids accumulating gradients
  # that nothing ever reads.
  for param in cnn.parameters():
    param.requires_grad_(False)

  style_losses = []
  content_losses = []
  model = nn.Sequential().to(device)
  model.add_module('conv_1', cnn.conv1)
  model.add_module('relu1', cnn.relu1)

  # Style target: features of the style signal, used as a constant.
  target_features = model(style_float).detach()
  style_loss = StyleLoss(target_features, style_weight)
  model.add_module('style_loss_1', style_loss)
  style_losses.append(style_loss)

  # Content target: output of the model built so far for the content signal.
  target = model(content_float).detach()
  content_loss = ContentLoss(target, content_weight)
  model.add_module('content_loss_1', content_loss)
  content_losses.append(content_loss)

  return model, style_losses, content_losses

In [None]:
def get_input_param_optimizer(input_float):
  """Wrap the input tensor as a trainable parameter and build its optimizer.

  Returns a ``(parameter, optimizer)`` pair where the optimizer is L-BFGS with
  a learning rate of 1e-3 acting on that single parameter.
  """
  # L-BFGS is the active choice; Adam with lr=1e-4 is the alternative that was
  # left commented out previously.
  trainable = nn.Parameter(input_float.data)
  lbfgs = optim.LBFGS([trainable], lr=1e-3)
  return trainable, lbfgs

In [None]:
def run_style_transfer(cnn, style_float, content_float, input_float,
                       num_steps, style_weight, content_weight):
  """Optimise the input tensor against the style and content losses.

  Builds the loss-instrumented model, wraps ``input_float`` as a parameter
  and keeps calling ``optimizer.step`` until the closure has been evaluated
  more than ``num_steps`` times. Progress is printed every 50 closure
  evaluations. Returns the data of the optimised input parameter.
  """
  print('Costruendo il modello per lo Style Transfer...')
  model, style_losses, content_losses = get_new_model(
      cnn, style_float, content_float, style_weight, content_weight)
  input_param, optimizer = get_input_param_optimizer(input_float)
  print('Ottimizzando...')

  # One-element list so the nested closure can bump the counter in place.
  run = [0]

  def closure():
    # Clamping the updated input to [0, 1] is intentionally left disabled.
    optimizer.zero_grad()
    # The forward pass is run for its side effect: every loss layer stores
    # its current value in `.loss`.
    model(input_param)

    style_score = sum(sl.loss for sl in style_losses)
    content_score = sum(cl.loss for cl in content_losses)
    style_score *= style_weight
    content_score *= content_weight

    loss = style_score + content_score
    # The original author's note on this call says retain_graph should not
    # be needed.
    loss.backward(retain_graph=True)

    run[0] += 1
    if run[0] % 50 == 0:
      print('run {}:'.format(run))
      print('Style Loss: {:4f} Content Loss: {:4f}'.format(style_score.item(), content_score.item()))
      print()

    return loss

  while run[0] <= num_steps:
    optimizer.step(closure)

  # A final clamp of the result to [0, 1] is likewise left disabled.
  return input_param.data

In [None]:
# Optimisation budget and loss weights.
num_steps = 300
style_weight = 1e2
content_weight = 1e0

output = run_style_transfer(cnn, style_float, content_float, input_float, num_steps, style_weight, content_weight)
output = output.reshape([N_FREQS, N_SAMPLES]).cpu().numpy()
# Undo the log1p compression applied in to_spec; expm1 is its exact inverse
# and avoids the cancellation error of exp(x) - 1 for small values.
a = np.expm1(output)

# Rebuild a waveform from the magnitude spectrogram with Griffin-Lim.
# n_iter is passed by keyword: librosa 0.10+ made it keyword-only, so the
# positional form raises a TypeError there.
x = librosa.griffinlim(a, n_iter=500)
sf.write(OUTPUT_FILENAME, x, fs)
display(Audio(OUTPUT_FILENAME))

Costruendo il modello per lo Style Transfer...
Ottimizzando...
run [50]:
Style Loss: 359.768738 Content Loss: 40.579430

run [100]:
Style Loss: 334.758362 Content Loss: 41.438934

run [150]:
Style Loss: 311.314331 Content Loss: 42.282715

run [200]:
Style Loss: 289.579163 Content Loss: 43.101738

run [250]:
Style Loss: 269.391907 Content Loss: 43.897736

run [300]:
Style Loss: 250.745972 Content Loss: 44.666634

