In [1]:
# Show which GPU this runtime was assigned, plus driver version and memory usage.
!nvidia-smi

Wed Dec  8 09:18:02 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   63C    P0    43W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:

# Mount Google Drive so the paths under /content/drive used later in the notebook resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import torch
from torch.nn import Module,Conv1d,BatchNorm1d,ReLU,Sigmoid,LSTMCell,Linear,ModuleList,Upsample,Embedding,Sequential,AvgPool1d,Dropout,LeakyReLU,Tanh,BCELoss,LSTM,GRU,ConvTranspose1d,MSELoss
from torch.nn.utils import weight_norm

from torch.nn import functional as F
from torch.optim import Adam


import zipfile
import librosa
from scipy.io.wavfile import read


import numpy as np
import pandas as pd
from tqdm import tqdm
import os
import re
import IPython.display as ipd
from torch.autograd import Variable
import random
from torch.utils.data import DataLoader







In [4]:
# Location of the LJSpeech archive on the mounted Drive (a local path, despite the name `url`).
url='/content/drive/MyDrive/Digest/LJSpeech-1.1.zip'
# Extract member by member (instead of extractall) so tqdm can report progress;
# everything is written below the current working directory.
with zipfile.ZipFile(url) as zf:
       for member in tqdm(zf.infolist(), desc='Extracting '):
             zf.extract(member, './')

Extracting : 100%|██████████| 13102/13102 [00:51<00:00, 252.94it/s]


In [5]:
class tacotron_generator():
  """Map-style dataset yielding fixed-length waveforms listed in a CSV file.

  Every item is a dict with the single key 'audiofile' holding a float32
  tensor of exactly ``max_audlen`` samples: longer clips are truncated and
  shorter clips are zero-padded on the right.

  Args:
    audio_path: directory holding the ``.wav`` files. It is joined to the
      file name by plain string concatenation, so it must end with a path
      separator.
    text_path: CSV file with a ``filename`` column (names without the
      ``.wav`` extension).
    config: object exposing ``sr``, ``n_fft``, ``n_mel`` and
      ``numframesperstep``.
  """

  def __init__(self,audio_path,text_path,config):
    self.apth=audio_path
    self.audio=os.listdir(audio_path)
    self.text=pd.read_csv(text_path)
    # Keyword arguments work on old and new librosa alike; librosa >= 0.10
    # rejects the positional form mel(sr, n_fft, ...).
    self.mel_basis=librosa.filters.mel(sr=config.sr, n_fft=config.n_fft, n_mels=config.n_mel)
    self.config=config

    # 864 spectrogram frames times 256 samples per frame.
    self.max_spectlen=864
    self.max_audlen=864*256
    assert((self.max_spectlen)%config.numframesperstep==0)
    self.step=config.numframesperstep

  def __len__(self):
    # Items are resolved through the CSV rows, so never advertise more items
    # than the CSV can serve even if the directory holds extra files.
    return min(len(self.audio), len(self.text))

  def aud2mel(self,path):
    """Load the waveform at ``path`` resampled to ``config.sr``.

    Despite the name, this returns the raw audio samples, not a mel
    spectrogram.
    """
    aud, _ = librosa.load(path,sr=self.config.sr)
    return aud

  def __getitem__(self,index):
    """Return ``{'audiofile': tensor}`` for the CSV row at position ``index``."""
    # Positional lookup: the sampler hands out 0..len-1, which must address
    # rows by position regardless of the DataFrame's index labels.
    filename=self.text['filename'].iloc[index]
    audiofile=self.aud2mel(self.apth+filename+'.wav')

    # Force every clip to the same length so samples can be batched.
    audiofile=audiofile[:self.max_audlen]
    missing=self.max_audlen-len(audiofile)
    audiofile=np.pad(audiofile,(0,missing))
    return  {
             'audiofile':torch.tensor(audiofile,dtype=torch.float32)
             }
     




In [6]:
class fftconfig():
  """Plain container for the STFT / mel analysis settings.

  Args:
    sample_rate: audio sample rate, stored as ``sr``.

  NOTE(review): ``frame_hop_ms`` and ``frame_length_ms`` are passed straight
  to ``torch.stft`` as ``hop_length`` / ``win_length`` by ``aud2mel``, so
  despite the ``_ms`` suffix they appear to be sample counts - confirm.
  """

  def __init__(self,sample_rate):
    fixed_settings = {
        'n_fft': 2048,              # FFT size
        'sr': sample_rate,
        'frame_hop_ms': 256,        # hop between frames
        'frame_length_ms': 1024,    # analysis window length
        'n_mel': 80,                # number of mel bands
        'aud_max_val': 32768.0,
        'numframesperstep': 1,
    }
    for attribute, value in fixed_settings.items():
      setattr(self, attribute, value)

In [7]:
# Build the dataset over the extracted LJSpeech files and fetch one item as a quick smoke test.
train=tacotron_generator("/content/LJSpeech-1.1/wavs/","/content/LJSpeech-1.1/dataset.csv",fftconfig(22050))
y=train[500]

In [8]:

    
def Feature_Extractor(config):
  """Build the two-layer convolutional front end applied to the spectrogram.

  Two weight-normalised, length-preserving convolutions (kernel 7,
  padding 3), each followed by LeakyReLU(0.2). Channels go
  ``config.spectrogram_dimension -> config.feature_dim // 2 ->
  config.feature_dim``.

  Returns:
    A ``Sequential`` with the convolutions at indices 0 and 2.
  """
  half_width = config.feature_dim // 2
  first_conv = weight_norm(
      Conv1d(config.spectrogram_dimension, half_width, kernel_size=7, padding=3))
  second_conv = weight_norm(
      Conv1d(half_width, config.feature_dim, kernel_size=7, padding=3))
  return Sequential(first_conv, LeakyReLU(0.2), second_conv, LeakyReLU(0.2))


In [9]:
class Resnet_block(Module):
  """Dilated residual block with a learned 1x1 shortcut.

  Main path: dilated conv -> LeakyReLU(0.2) -> 1x1 conv. The input also goes
  through a 1x1 conv shortcut; the two are summed and passed through a final
  LeakyReLU(0.2).

  Args:
    in_dim: input channels.
    out_dim: output channels; defaults to ``in_dim``.
    dilation: exponent of the dilation rate, i.e. the first conv uses a
      dilation of ``kernel ** dilation``.
    kernel: kernel size of the dilated convolution.
  """

  def __init__(self,in_dim,out_dim=None,dilation=0,kernel=3):
    super().__init__()

    out_dim = in_dim if out_dim is None else out_dim

    # padding = rate * (kernel // 2) keeps the sequence length unchanged
    # for odd kernel sizes.
    rate = kernel ** dilation
    same_padding = rate * (kernel // 2)

    self.conv1 = weight_norm(
        Conv1d(in_dim, out_dim, kernel, dilation=rate, padding=same_padding))
    self.act1 = LeakyReLU(0.2, inplace=True)
    self.conv2 = weight_norm(Conv1d(out_dim, out_dim, 1))
    self.skip = weight_norm(Conv1d(in_dim, out_dim, 1))
    self.act2 = LeakyReLU(0.2, inplace=True)

  def forward(self,X):
    main_path = self.conv2(self.act1(self.conv1(X)))
    shortcut = self.skip(X)
    return self.act2(main_path + shortcut)

    

In [10]:
class GeneratorBlock(Module):
  """One generator stage: learned upsampling followed by dilated residual blocks.

  The transposed convolution uses kernel ``2 * scale``, stride ``scale`` and
  padding ``scale // 2``. By the ConvTranspose1d length formula this yields
  ``L * scale + scale - 2 * (scale // 2)`` output steps, i.e. exactly
  ``L * scale`` for even ``scale`` and one extra step for odd ``scale``.

  Args:
    in_dim: input channels.
    out_dim: output channels of the upsampling conv and of every residual block.
    scale: upsampling factor.
    kernel: kernel size handed to each ``Resnet_block``.
  """

  def __init__(self,in_dim,out_dim,scale,kernel):
    super().__init__()

    self.upsample=weight_norm(
        ConvTranspose1d(in_dim,out_dim,scale*2,scale,padding=scale//2))
    self.act=LeakyReLU(0.2,inplace=True)

    # Upsampling shrinks the receptive field relative to the sequence length;
    # three residual blocks with dilation exponents 0, 1, 2 (rates
    # kernel**0, kernel**1, kernel**2) grow it back exponentially.
    self.increase_receptivefield = Sequential(
        *[Resnet_block(out_dim,dilation=exponent,kernel=kernel)
          for exponent in (0,1,2)]
    )

  def forward(self,X):
    upsampled=self.act(self.upsample(X))
    return self.increase_receptivefield(upsampled)




    



    





    

In [11]:
class GenerativeNetwork(Module):
  """Generator: feature extractor, a chain of upsampling stages, 1x1 output conv.

  Reads from ``config``: ``feature_dim``, ``Generator_dimensions`` (output
  channels per stage), ``Generator_Scale`` (upsampling factor per stage),
  ``kernel`` (residual-block kernel size per stage) and ``audio_dim``
  (output channels), plus whatever ``Feature_Extractor`` reads.
  """

  def __init__(self,config):
    super().__init__()
    self.feature_extract=Feature_Extractor(config)

    widths=config.Generator_dimensions
    factors=config.Generator_Scale
    kernels=config.kernel

    # Each stage consumes the previous stage's channel count; the first one
    # consumes the feature extractor's output width.
    self.upsample_Modules=ModuleList()
    previous_width=config.feature_dim
    for stage_index,width in enumerate(widths):
      self.upsample_Modules.append(
          GeneratorBlock(previous_width,width,factors[stage_index],kernels[stage_index]))
      previous_width=width

    self.final_conv=Conv1d(widths[-1],config.audio_dim,1)

  def forward(self,X):
    hidden=self.feature_extract(X)
    for stage in self.upsample_Modules:
      hidden=stage(hidden)
    return self.final_conv(hidden)




In [12]:

def conv_1(indim,outdim):
  """Weight-normalised 1x1 convolution followed by an in-place LeakyReLU(0.2)."""
  pointwise = weight_norm(Conv1d(indim, outdim, kernel_size=1))
  activation = LeakyReLU(negative_slope=0.2, inplace=True)
  return Sequential(pointwise, activation)

class Feature_Pyramid_Network(Module):
  """Top-down feature pyramid over three backbone feature maps.

  Each backbone map is projected to ``feature_size`` channels with a 1x1
  conv block. The coarsest map is upsampled x2 (nearest) and added to the
  middle one; that sum is upsampled x2 and added to the finest one. The
  additions require each map to be exactly twice as long as the next
  coarser one.

  Args:
    backbone_filter: channel counts of the three backbone maps, finest first.
    feature_size: channel count of every pyramid level.
  """

  def __init__(self,backbone_filter,feature_size=64):
    super().__init__()

    # Finest level.
    self.pyr3=conv_1(backbone_filter[0],feature_size)

    # Middle level, plus the upsampler that carries it down to the finest one.
    self.pyr4=conv_1(backbone_filter[1],feature_size)
    self.pyr4_up=Upsample(scale_factor=2, mode='nearest')

    # Coarsest level, plus the upsampler that carries it down to the middle one.
    self.pyr5=conv_1(backbone_filter[2],feature_size)
    self.pyr5_up=Upsample(scale_factor=2, mode='nearest')

  def forward(self,backbone_features):
    """Return ``[finest, middle, coarsest]`` pyramid levels."""
    coarse_level=self.pyr5(backbone_features[2])
    middle_level=self.pyr5_up(coarse_level)+self.pyr4(backbone_features[1])
    fine_level=self.pyr4_up(middle_level)+self.pyr3(backbone_features[0])
    return [fine_level,middle_level,coarse_level]

  



In [13]:
class DiscriminatorBlock(Module):
  """Stack of four strided convolutions plus a two-layer scoring head.

  Expects a 64-channel input. Every stage is a weight-normalised conv with
  kernel 41, stride 4 and padding 20 followed by LeakyReLU(0.2); channels go
  64 -> 64 -> 256 -> 256 -> 512. The head is a kernel-11, stride-2 conv
  followed by a 1x1 conv down to a single channel.

  Args:
    config: accepted for interface symmetry; not read by this class.
  """

  def __init__(self,config):
    super().__init__()

    stride=4
    kernel=10*stride+1
    pad=kernel//2
    channel_plan=[(64,64),(64,256),(256,256),(256,512)]

    self.mod=ModuleList([
        Sequential(
            weight_norm(Conv1d(n_in,n_out,kernel,stride,padding=pad)),
            LeakyReLU(0.2,inplace=True))
        for n_in,n_out in channel_plan
    ])

    self.post=Sequential(
        weight_norm(Conv1d(512,512,11,2,padding=5)),
        Conv1d(512,1,1))

  def forward(self,X,features):
    """Score ``X``; when ``features`` is truthy also return intermediate maps.

    Returns:
      ``out`` alone, or ``(out, maps)`` where ``maps`` holds the outputs of
      every stage except the first.
    """
    collected=[]
    hidden=X
    for stage_index,stage in enumerate(self.mod):
      hidden=stage(hidden)
      if stage_index>0:
        collected.append(hidden)

    score=self.post(hidden)
    return (score,collected) if features else score

    

    



In [14]:
def convolution(indim,outdim,kernel,scale):
  """Weight-normalised strided convolution followed by an in-place LeakyReLU(0.2).

  ``scale`` is the stride; padding is ``kernel // 2``.
  """
  half_kernel = kernel // 2
  strided_conv = weight_norm(
      Conv1d(indim, outdim, kernel_size=kernel, stride=scale, padding=half_kernel))
  return Sequential(strided_conv, LeakyReLU(0.2, inplace=True))


class Descriminator(Module):
  """Multi-scale discriminator fed by a feature pyramid.

  A three-stage strided conv backbone (16, 32, 64 channels; kernel 15,
  stride 2) produces three feature maps. ``Feature_Pyramid_Network`` turns
  them into 64-channel pyramid levels, and ``config.num_disc`` independent
  ``DiscriminatorBlock`` heads each score one level, starting from the
  finest.

  Args:
    config: object exposing ``audio_dim`` (input channels) and ``num_disc``
      (number of heads, between 1 and 3).

  Raises:
    ValueError: if ``config.num_disc`` is outside 1..3. There are exactly
      three pyramid levels, so any other value cannot be served.
  """

  def __init__(self,config):

       super().__init__()

       filters=[16,32,64]
       kernel=15
       scale=2

       if not 1 <= config.num_disc <= len(filters):
         raise ValueError(
             "num_disc must be between 1 and %d, got %r" % (len(filters), config.num_disc))

       # Kept for backward compatibility with code that reads this attribute.
       self.N=range(config.num_disc)

       self.backbone=ModuleList(
           [
            convolution(config.audio_dim,filters[0], kernel, scale),
            convolution(filters[0],filters[1], kernel, scale),
            convolution(filters[1],filters[2], kernel, scale)
           ]
       )
       self.fpn= Feature_Pyramid_Network(filters,filters[-1])

       # One scoring head per pyramid level that is used.
       self.disc_module=ModuleList()
       for _ in range(config.num_disc):
          self.disc_module.append(DiscriminatorBlock(config))

  def forward(self,X,features=False):
    """Score ``X`` at every scale.

    Returns:
      A list with one score tensor per head. When ``features`` is truthy,
      a tuple ``(scores, maps)`` where ``maps`` is the flat concatenation of
      the intermediate maps returned by every head.
    """
    # The backbone always runs all of its stages: the pyramid needs all three
    # maps no matter how many heads consume them.
    pyramids=[]
    y=X
    for stage in self.backbone:
      y=stage(y)
      pyramids.append(y)

    feature_pyramids = self.fpn(pyramids)

    result=[]
    feat=[]
    for head,level in zip(self.disc_module,feature_pyramids):
      if(features):
        op,f=head(level,features)
        feat.extend(f)
      else:
        op=head(level,features)

      result.append(op)

    if features:
      return result, feat
    else:
      return result











In [15]:
class MELGAN_config():
  """Default hyper-parameters shared by the generator and the discriminator."""

  def __init__(self):
    # --- input / output ---
    self.spectrogram_dimension=80   # channels of the generator input
    self.audio_dim=1                # channels of the generated waveform

    # --- generator ---
    self.feature_dim=512            # width produced by the feature extractor
    # One entry per upsampling stage; the factors multiply to 8*8*2*2 = 256.
    self.Generator_Scale=[8,8,2,2]
    self.Generator_dimensions=[256,128,64,32]
    self.kernel=[5,5,3,3]           # residual-block kernel size per stage

    # --- discriminator ---
    self.num_disc=3                 # number of discriminator heads




In [16]:
class aud2mel(fftconfig):
  """Callable that turns a batch of waveforms into log-mel spectrograms.

  Inherits the analysis settings from ``fftconfig`` and keeps the mel filter
  bank as a tensor on ``device``.

  Args:
    sr: sample rate forwarded to ``fftconfig``.
    device: anything ``torch.device`` accepts (e.g. ``"cuda"``).
  """

  def __init__(self,sr,device):
    super().__init__(sr)

    # Keyword arguments work on old and new librosa alike; librosa >= 0.10
    # rejects the positional form mel(sr, n_fft, ...).
    mel_filters=librosa.filters.mel(sr=self.sr, n_fft=self.n_fft, n_mels=self.n_mel)
    self.mel_basis=torch.tensor(mel_filters)

    self.mel_basis=self.mel_basis.to(torch.device(device))

  def __call__(self,aud):
    """Return ``log(clamp(mel_basis @ |STFT(aud)|, min=1e-3))``.

    Dimension 1 of ``aud`` is squeezed before the STFT, so a
    ``(batch, 1, samples)`` input is expected.
    """
    # NOTE(review): no `window` is passed, so torch.stft falls back to its
    # default window. Changing that would alter the features existing
    # checkpoints were trained on - confirm before touching.
    stftspectrogram=torch.abs(torch.stft(aud.squeeze(1), self.n_fft, hop_length=self.frame_hop_ms, win_length=self.frame_length_ms,return_complex=True))
    melspectrogram=torch.matmul(self.mel_basis,stftspectrogram)
    # Clamp before the log so silent frames do not produce -inf.
    compressed=torch.log(torch.clamp(melspectrogram, min=1e-3) )

    return compressed

In [19]:
def avg(x):
   """Arithmetic mean of a sized iterable; raises ZeroDivisionError when it is empty."""
   total = 0
   for value in x:
      total = total + value
   return total/len(x)
class GAN_pipeline() :
  """Training harness that alternates discriminator and generator updates.

  Args:
    generator: module mapping a log-mel spectrogram to a waveform.
    discriminator: module returning a list of score tensors, or
      ``(scores, feature_maps)`` when called with ``features=True``.
    processing_unit: device string; both networks and the mel transform are
      placed on it.

  Call ``compiler()`` before ``fit()`` to create the optimizers.
  """

  def __init__(self,generator,discriminator,processing_unit):

    self.device = torch.device(processing_unit)

    self.generator=generator
    self.discriminator=discriminator

    self.generator.to(self.device)
    self.discriminator.to(self.device)

    self.melspectrogram=aud2mel(22050,processing_unit)

  def compiler(self):
    """Create the loss objects and one Adam optimizer per network."""
    self.criterion = MSELoss()
    self.mse=MSELoss()

    self.optG = Adam(self.generator.parameters(), lr=4e-5, betas=(0.5, 0.99))
    self.optD = Adam(self.discriminator.parameters(), lr=1e-4, betas=(0.5, 0.99))

  def load(self,path):
    """Restore both networks from ``path + 'ganG.pth'`` / ``path + 'ganD.pth'``.

    ``path`` is used as a raw string prefix (no separator is inserted).
    """
    # map_location lets a checkpoint saved on one device be restored on
    # whatever device this pipeline runs on (e.g. GPU checkpoint on CPU).
    self.generator.load_state_dict(torch.load(path+'ganG.pth',map_location=self.device))
    self.discriminator.load_state_dict(torch.load(path+'ganD.pth',map_location=self.device))

  def train_generator(self,fake_input,real_sample):
    """Run one generator update.

    The optimized loss is ``gan + 10 * feature_matching + 45 * mel_l1``.
    The waveform L1 term is computed for monitoring only.

    Returns:
      ``([feature_loss, mel_loss, audio_loss], fake_sample)`` where the
      losses are Python floats and ``fake_sample`` is detached from the graph.
    """
    self.generator.zero_grad()
    fake_sample = self.generator(fake_input)

    pred_fake ,features_fake = self.discriminator(fake_sample,features=True)
    # The real features are only a fixed target, so skip building their graph.
    with torch.no_grad():
      _ ,features_real = self.discriminator(real_sample,features=True)

    gan_loss=0
    for scale in pred_fake:
          gan_loss += F.relu(1-scale.mean())

    features_loss=0
    for rf,ff in zip(features_real,features_fake):
        features_loss+=torch.mean(torch.abs(ff- rf.detach()))

    mel_loss = F.l1_loss(self.melspectrogram(real_sample) ,self.melspectrogram(fake_sample)  )
    aud_loss = F.l1_loss(real_sample,fake_sample) * 20

    loss=gan_loss+features_loss*10+mel_loss*45
    loss.backward()

    self.optG.step()

    # Hand back plain floats (as train_discriminator does) so callers that
    # accumulate them for logging do not keep graph-attached GPU tensors alive.
    return [float(features_loss),float(mel_loss),float(aud_loss)],fake_sample.detach()

  def train_discriminator(self,fake_input,real_sample):
    """Run one discriminator update with the hinge loss; returns the loss as a float."""
    self.discriminator.zero_grad()

    # The generator is frozen for this step.
    with torch.no_grad():
            fake_sample = self.generator(fake_input)

    pred_fake = self.discriminator(fake_sample)
    loss_fake=0
    for scale in pred_fake:
          loss_fake += F.relu(1 + scale).mean()

    pred_real = self.discriminator(real_sample)
    loss_real=0
    for scale in pred_real:
          loss_real += F.relu(1 - scale).mean()

    loss=loss_fake+loss_real

    loss.backward()

    self.optD.step()

    return  loss.item()

  def _preview_audio(self,fake_sample,real_sample,gen_inp):
    """Play the first generated clip, its ground truth and a Griffin-Lim baseline."""
    analysis=self.melspectrogram

    ipd.display(ipd.Audio(fake_sample[0].cpu().detach().numpy(), rate=22050))
    ipd.display(ipd.Audio(real_sample[0].cpu().detach().numpy(), rate=22050))

    # Invert with the very filter bank that produced gen_inp.
    self.mel_basis=analysis.mel_basis.detach().cpu().numpy()
    inv_mel_basis = np.linalg.pinv(self.mel_basis)

    spectrogram = np.dot(inv_mel_basis, torch.exp(gen_inp[0]).cpu().detach().numpy())
    # Griffin-Lim must use the analysis hop/window lengths; its defaults
    # (hop = n_fft // 4) would reconstruct a clip of the wrong duration.
    y = librosa.griffinlim(spectrogram, hop_length=analysis.frame_hop_ms, win_length=analysis.frame_length_ms)
    ipd.display(ipd.Audio(y, rate=22050))

  def fit(self,data_generator,epochs,batch_size,save_best=False):
    """Train both networks.

    Args:
      data_generator: dataset whose items are dicts with an 'audiofile' tensor.
      epochs: number of passes over the data.
      batch_size: DataLoader batch size.
      save_best: falsy to skip saving; otherwise a string prefix to which
        'ganG.pth' / 'ganD.pth' are appended after every epoch.
    """
    train_data_loader = DataLoader(data_generator, batch_size=batch_size,shuffle=True, num_workers=1)

    self.generator.train()
    self.discriminator.train()

    for epoch in range(epochs):
      # Running sums of [discriminator, feature, mel, audio] losses this epoch.
      totals=[0.0,0.0,0.0,0.0]

      batches=tqdm(train_data_loader,total=len(train_data_loader))

      for it,data in enumerate(batches):

            for key,value in data.items():data[key]=value.to(self.device)

            real_sample=data['audiofile'].unsqueeze(1)
            # Drop the last frame of the spectrogram before feeding the generator.
            gen_inp=self.melspectrogram(real_sample)[:,:,:-1]

            loss_disc=self.train_discriminator(gen_inp,real_sample)
            loss_gen,fake_sample=self.train_generator(gen_inp,real_sample)

            for slot,value in enumerate([loss_disc]+list(loss_gen)):
              totals[slot]+=value

            final_loss=" ".join("{:.4f}".format(total/(it+1)) for total in totals)

            # Labels follow the order of `totals`.
            string="epoch  "+str(epoch)+"  D,G_features,G_mel,G_audio  "
            batches.set_description(string+final_loss)
            batches.refresh()

            if(epoch%8==0 and it%1000==0):
              self._preview_audio(fake_sample,real_sample,gen_inp)

      if(save_best):
              torch.save(self.generator.state_dict(),save_best+'ganG.pth')
              torch.save(self.discriminator.state_dict(),save_best+'ganD.pth')














In [20]:


# Generator and discriminator, each built from its own default MELGAN_config.
gen=GenerativeNetwork(MELGAN_config())
dis=Descriminator(MELGAN_config())

# Checkpoint prefix: load() and fit() append 'ganG.pth' / 'ganD.pth' directly to this string (no separator is inserted).
path="/content/drive/MyDrive/Digest/speechsynthesis"

train=tacotron_generator("/content/LJSpeech-1.1/wavs/","/content/LJSpeech-1.1/dataset.csv",fftconfig(22050))
Model=GAN_pipeline(gen,dis,"cuda")
# Resume from previously saved weights, then create the optimizers.
Model.load(path)
Model.compiler()

# 50 epochs, batch size 12; passing `path` as save_best makes fit() save both networks after every epoch.
Model.fit(train,50,12,path)




epoch  0  D_fake,D_real,G_fake,Gfeatures  1.6443 0.0000 0.5297 0.3616:   0%|          | 0/1092 [00:03<?, ?it/s]

epoch  0  D_fake,D_real,G_fake,Gfeatures  1.6369 0.0000 0.1610 0.3573:  92%|█████████▏| 1000/1092 [52:10<04:48,  3.13s/it]

epoch  0  D_fake,D_real,G_fake,Gfeatures  1.6260 0.0000 0.1606 0.3568: 100%|██████████| 1092/1092 [56:58<00:00,  3.13s/it]
epoch  1  D_fake,D_real,G_fake,Gfeatures  1.5560 0.0000 0.1565 0.3508: 100%|██████████| 1092/1092 [56:49<00:00,  3.12s/it]
epoch  2  D_fake,D_real,G_fake,Gfeatures  1.4973 0.0000 0.1564 0.3468: 100%|██████████| 1092/1092 [56:49<00:00,  3.12s/it]
epoch  3  D_fake,D_real,G_fake,Gfeatures  1.4450 0.0000 0.1569 0.3433: 100%|██████████| 1092/1092 [56:49<00:00,  3.12s/it]
epoch  4  D_fake,D_real,G_fake,Gfeatures  1.4345 0.0000 0.1578 0.3404: 100%|██████████| 1092/1092 [56:49<00:00,  3.12s/it]
epoch  5  D_fake,D_real,G_fake,Gfeatures  1.3179 0.0000 0.1580 0.3378: 100%|██████████| 1092/1092 [56:49<00:00,  3.12s/it]
epoch  6  D_fake,D_real,G_fake,Gfeatures  1.3212 0.0000 0.1583 0.3356: 100%|██████████| 1092/1092 [56:48<00:00,  3.12s/it]
epoch  7  D_fake,D_real,G_fake,Gfeatures  1.2227 0.0000 0.1583 0.3334: 100%|██████████| 1092/1092 [56:49<00:00,  3.12s/it]
epoch  8  D_fake

epoch  8  D_fake,D_real,G_fake,Gfeatures  1.4828 0.0000 0.1593 0.3331:  30%|██▉       | 323/1092 [16:51<40:07,  3.13s/it]


KeyboardInterrupt: ignored