In [1]:
import argparse
import torch
import json
from torch import optim, nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from pathlib import Path
import re
import numpy as np
from functools import reduce
from torch.nn import Softmax
from torch.distributions.categorical import Categorical
from random import sample
import matplotlib.pyplot as plt

from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)


Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


# PREPROCESSING DATASET

In [0]:



class PreProcessingDataset(Dataset):
    """Paragraph-level character dataset built from a plain-text file.

    The file is lower-cased and split into paragraphs on empty lines; only
    paragraphs longer than ``crop_len + 100`` characters are kept. A fixed
    "good" alphabet defines the character <-> integer mapping, where '§' and
    'ç' are placeholder symbols (see ``encode_text``, which maps digits to 'ç'
    and any other unknown character to '§').

    Args:
        filepath: Path of the UTF-8 text file to load (undecodable bytes are
            ignored).
        crop_len: Crop length used downstream; here it only sets the minimum
            paragraph length.
        transform: Optional callable applied to each sample dict.

    Attributes:
        par_list: Kept paragraphs (list of str).
        char_to_number / number_to_char: Mappings over ``alphabet_good``.
        alphabet_good: Characters the network can emit.
        alphabet_bad: Characters found in the text that are neither in
            ``alphabet_good`` nor digits (sorted).
        numbers: Digit characters '0'..'9'.
        alphabet_len: ``len(alphabet_good)``.
        unique_list: Distinct words of the text (single characters that
            belong to ``alphabet_good`` removed), followed by 'i' and 'a'.
    """

    def __init__(self, filepath, crop_len, transform=None):

        ### Load data
        # Context manager so the file handle is always released.
        with open(filepath, 'r', encoding="utf8", errors='ignore') as f:
            text = f.read()

        ### Preprocess data
        # Remove spaces after a new line
        text = re.sub('\n[ ]+', '\n', text)
        # Lower case
        text = text.lower()
        # Extract the paragraphs (divided by empty lines)
        par_list = re.split('\n\n', text)
        # Remove double new lines
        par_list = [par.replace('\n\n', '\n') for par in par_list]
        # Keep only paragraphs comfortably longer than one crop
        par_list = [par for par in par_list if len(par) > crop_len + 100]

        print('Paragraphs: ', len(par_list))

        ### Char to number
        alphabet = sorted(set(text))
        print('Found letters:', alphabet)
        alphabet_good=['§','ç','\n', ' ',  ',', '.', '?', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
        numbers = ['0','1','2','3','4','5','6','7','8','9']
        # Sorted so the list is reproducible across runs (set order is not).
        alphabet_bad = sorted(set(alphabet) - set(alphabet_good) - set(numbers))
        print('alphabet len:', len(alphabet_good))
        print('Good letters:', alphabet_good)
        print('Numbers:', numbers)
        print('Bad letters:', alphabet_bad)

        char_to_number = {char: number for number, char in enumerate(alphabet_good)}
        number_to_char = {number: char for number, char in enumerate(alphabet_good)}

        ### Word vocabulary
        # Bad characters, digits and punctuation act as word separators.
        separators = set(alphabet_bad) | set(numbers) | {'.', ',', '?'}
        newtext = ''.join(' ' if char in separators else char for char in text)
        words = newtext.split()
        print('words',len(words))

        # Distinct words; drop entries that are single alphabet characters,
        # then re-add the two one-letter words 'i' and 'a'.
        unique_list = sorted(set(words) - set(alphabet_good))
        unique_list.append('i')
        unique_list.append('a')
        print('unique list', len(unique_list))

        ### Store data
        self.par_list = par_list
        self.transform = transform
        self.char_to_number = char_to_number
        self.number_to_char = number_to_char
        self.alphabet_good=alphabet_good
        self.alphabet_bad = alphabet_bad
        self.numbers = numbers
        self.alphabet_len = len(alphabet_good)
        self.unique_list=unique_list

    def __len__(self):
        """Number of paragraphs kept after filtering."""
        return len(self.par_list)

    def __getitem__(self, idx):
        """Return the sample dict for paragraph ``idx``.

        The dict holds the raw 'text' and its integer 'encoded' form; when a
        transform was given, the transformed sample is returned instead.
        """
        # Get paragraph text
        text = self.par_list[idx]
        # Encode with numbers
        encoded = encode_text(self.char_to_number, text, self.numbers)
        # Create sample
        item = {'text': text, 'encoded': encoded}
        # Transform (if defined)
        if self.transform:
            item = self.transform(item)
        return item


def encode_text(char_to_number, text, numbers):
    """Encode ``text`` as a list of integer codes.

    Characters present in ``char_to_number`` map to their own code. Any other
    character is replaced by a placeholder: 'ç' when it is listed in
    ``numbers``, '§' otherwise.

    Args:
        char_to_number: Mapping from character to integer code; must contain
            'ç' / '§' if the text holds characters that need a placeholder.
        text: String to encode.
        numbers: Collection of digit characters.

    Returns:
        List of integer codes, one per character of ``text``.

    Raises:
        KeyError: If a placeholder is needed but missing from
            ``char_to_number``.
    """
    encoded = []
    for c in text:
        if c in char_to_number:
            encoded.append(char_to_number[c])
        elif c in numbers:
            # Digit outside the alphabet -> digit placeholder
            encoded.append(char_to_number['ç'])
        else:
            # Any other unknown character -> generic placeholder
            encoded.append(char_to_number['§'])
    return encoded


def decode_text(number_to_char, encoded, numbers, alphabet_bad):
    """Decode a sequence of codes back into a string.

    Each code is looked up in ``number_to_char``. The placeholder characters
    are then expanded: 'ç' becomes a random element of ``numbers`` and '§' a
    random element of ``alphabet_bad``. A placeholder is left as-is when its
    replacement pool is empty.

    Args:
        number_to_char: Mapping from code to character.
        encoded: Iterable of codes (keys of ``number_to_char``).
        numbers: Sequence of digit characters to draw from for 'ç'.
        alphabet_bad: Sequence of characters to draw from for '§'.

    Returns:
        The decoded string ('' for an empty ``encoded``).
    """
    pieces = []
    for code in encoded:
        char = number_to_char[code]
        # random.sample returns a one-element list; take the element itself
        # so the result stays a plain string.
        if char == 'ç' and numbers:
            char = sample(numbers, 1)[0]
        elif char == '§' and alphabet_bad:
            char = sample(alphabet_bad, 1)[0]
        pieces.append(char)
    return ''.join(pieces)


class RandomCrop():
    """Transform that cuts a random window of ``crop_len`` characters.

    The same window is applied to both 'text' and 'encoded' so they stay
    aligned; every other key of the sample is passed through unchanged.
    """

    def __init__(self, crop_len):
        self.crop_len = crop_len

    def __call__(self, sample):
        """Return a copy of ``sample`` with 'text' and 'encoded' cropped.

        Raises:
            ValueError: If the text is shorter than ``crop_len``.
        """
        text = sample['text']
        encoded = sample['encoded']
        tot_chars = len(text)
        if tot_chars < self.crop_len:
            raise ValueError(
                'Cannot crop %d characters from a text of length %d'
                % (self.crop_len, tot_chars))
        # np.random.randint excludes the upper bound, so add 1 to make the
        # last valid start (tot_chars - crop_len) reachable; this also makes
        # a text of exactly crop_len characters work.
        start_idx = np.random.randint(0, tot_chars - self.crop_len + 1)
        end_idx = start_idx + self.crop_len
        return {**sample,
                'text': text[start_idx: end_idx],
                'encoded': encoded[start_idx: end_idx]}
        

def create_one_hot_matrix(encoded, alphabet_len):
    """Build a one-hot matrix from a sequence of integer codes.

    Args:
        encoded: Sequence of integer codes, one per character.
        alphabet_len: Number of columns of the resulting matrix.

    Returns:
        Float array of shape ``(len(encoded), alphabet_len)`` in which row
        ``i`` has a single 1 in column ``encoded[i]``.
    """
    n_rows = len(encoded)
    onehot = np.zeros([n_rows, alphabet_len])
    row_positions = np.arange(n_rows)
    # Fancy indexing: set (row i, column encoded[i]) for every row at once
    onehot[row_positions, encoded] = 1
    return onehot


class OneHotEncoder():
    """Transform that adds a one-hot version of 'encoded' to the sample."""

    def __init__(self, alphabet_len):
        self.alphabet_len = alphabet_len

    def __call__(self, sample):
        """Return a copy of ``sample`` extended with 'encoded_onehot'."""
        # Integer codes as an array
        codes = np.array(sample['encoded'])
        # Copy the sample and attach the one-hot matrix
        result = dict(sample)
        result['encoded_onehot'] = create_one_hot_matrix(codes, self.alphabet_len)
        return result
        
                
class ToTensor():
    """Transform that converts the one-hot matrix to a float32 tensor.

    Only the 'encoded_onehot' entry is kept in the returned dict; every other
    key of the input sample is dropped.
    """

    def __call__(self, sample):
        onehot_tensor = torch.tensor(sample['encoded_onehot'], dtype=torch.float32)
        return dict(encoded_onehot=onehot_tensor)

# DEFINING NN

## STANDARD LOSS

In [0]:
softmaxer=Softmax(dim=2)
class Network(nn.Module):
    """Stacked LSTM followed by a linear layer back to the input size.

    Args:
        input_size: Size of each input vector (and of each output vector).
        hidden_units: Hidden size of the LSTM.
        layers_num: Number of stacked LSTM layers.
        dropout_prob: Dropout probability passed to ``nn.LSTM``.
    """

    def __init__(self, input_size, hidden_units, layers_num, dropout_prob=0):
        # Parent constructor must run before sub-modules are registered
        super().__init__()
        # Recurrent part (batch is the first dimension of the input)
        lstm_config = dict(input_size=input_size,
                           hidden_size=hidden_units,
                           num_layers=layers_num,
                           dropout=dropout_prob,
                           batch_first=True)
        self.rnn = nn.LSTM(**lstm_config)
        # Output projection
        self.out = nn.Linear(hidden_units, input_size)

    def forward(self, x, state=None):
        """Run the LSTM then the linear layer on every time step.

        Returns:
            Tuple ``(output, rnn_state)`` where ``rnn_state`` is the state
            returned by the LSTM.
        """
        hidden_seq, rnn_state = self.rnn(x, state)
        return self.out(hidden_seq), rnn_state
    

def train_batch(net, batch_onehot, loss_fn, optimizer):
    """Run one optimisation step on a batch and return its loss.

    The last time step of every sequence is the target; the preceding steps
    are the network input. The loss is evaluated only on the network output
    at the final input step.

    Args:
        net: Module returning ``(output, state)`` when called on the input.
        batch_onehot: One-hot batch indexed as [batch, time, alphabet].
        loss_fn: Loss taking (logits, integer class labels).
        optimizer: Optimizer bound to ``net``'s parameters.

    Returns:
        The batch loss as a Python float.
    """
    ### Prepare network input and labels
    # Everything except the last character feeds the network
    net_input = batch_onehot[:, :-1, :]
    # The last character, converted from one-hot to a class index, is the label
    labels_numbers = batch_onehot[:, -1, :].argmax(dim=1)

    ### Forward pass
    # Clear gradients left from the previous step
    optimizer.zero_grad()
    net_out, _ = net(net_input)

    ### Update network
    # Compare only the prediction at the last step with the held-out character
    last_step_out = net_out[:, -1, :]
    loss = loss_fn(last_step_out, labels_numbers)
    loss.backward()
    optimizer.step()
    return loss.item()

In [4]:
# Disabled scratch cell: the code below is wrapped in a bare string literal, so
# it is never executed (the cell only echoes the string). If enabled, it would
# encode every word of dataset.unique_list with encode_text and display the
# first three encodings.
'''
encodedwords=[]
for word in dataset.unique_list:
  encodedwords.append(encode_text(dataset.char_to_number ,  word,  dataset.numbers))
encodedwords[0:3]
'''

'\nencodedwords=[]\nfor word in dataset.unique_list:\n  encodedwords.append(encode_text(dataset.char_to_number ,  word,  dataset.numbers))\nencodedwords[0:3]\n'

## CUSTOMIZED LOSS

In [0]:
#PROVAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
from torch.distributions.categorical import Categorical
softmaxer=Softmax(dim=2)
from random import sample

class Network(nn.Module):
    """LSTM language model: recurrent stack plus a linear read-out.

    This cell re-declares ``Network`` with the same layers as the definition
    in the standard-loss cell; running it replaces that earlier class.

    Args:
        input_size: Size of each input vector (and of each output vector).
        hidden_units: Hidden size of the LSTM.
        layers_num: Number of stacked LSTM layers.
        dropout_prob: Dropout probability passed to ``nn.LSTM``.
    """

    def __init__(self, input_size, hidden_units, layers_num, dropout_prob=0):
        # Initialise nn.Module internals first
        super().__init__()
        # Recurrent layer; inputs are laid out batch-first
        self.rnn = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_units,
            num_layers=layers_num,
            dropout=dropout_prob,
            batch_first=True,
        )
        # Read-out layer mapping hidden states back to the input size
        self.out = nn.Linear(hidden_units, input_size)

    def forward(self, x, state=None):
        """Return ``(output, rnn_state)`` for input ``x`` and optional state."""
        recurrent_out, rnn_state = self.rnn(x, state)
        projected = self.out(recurrent_out)
        return projected, rnn_state
    

def train_batch(net, batch_onehot, loss_fn, optimizer, vocab_source=None):
    """Run one optimisation step using cross-entropy plus a word-count bonus.

    The last time step of every sequence is the target and the preceding
    steps are the input. In addition to the cross-entropy on the final
    prediction, one character is sampled from the predicted distribution at
    every time step; the sampled string is split into words and every known
    word it contains lowers the reported loss.

    NOTE: the word term ``err`` is a plain Python number computed from
    sampled characters, so it carries no gradient. It shifts the returned
    loss value, but the parameter update is driven by the cross-entropy
    term only.

    Args:
        net: Module returning ``(output, state)`` when called on the input.
        batch_onehot: One-hot batch indexed as [batch, time, alphabet].
        loss_fn: Loss taking (logits, integer class labels).
        optimizer: Optimizer bound to ``net``'s parameters.
        vocab_source: Object exposing ``alphabet_bad``, ``numbers``,
            ``number_to_char`` and ``unique_list``. Defaults to the
            notebook-level ``dataset`` so existing calls keep working.

    Returns:
        The batch loss (cross-entropy + word term) as a Python float.
    """
    if vocab_source is None:
        # Backward-compatible fallback to the global created in the
        # preprocessing cell.
        vocab_source = dataset

    ### Prepare network input and labels
    # Last character of each sequence, as a class index, is the label
    labels_numbers = batch_onehot[:, -1, :].argmax(dim=1)
    # Remove the labels from the input tensor
    net_input = batch_onehot[:, :-1, :]

    ### Forward pass
    # Clear previously recorded gradients
    optimizer.zero_grad()
    net_out, _ = net(net_input)

    ### Word-count term
    # Sample one character per (sequence, time step) in a single call and
    # bring the codes to Python once, instead of one .item() per character.
    with torch.no_grad():
        sampled_codes = Categorical(logits=net_out).sample().tolist()

    # Characters that act as word separators
    separators = set(vocab_source.alphabet_bad) | set(vocab_source.numbers) | {'.', ',', '?'}
    batch_len = len(sampled_codes)
    err = 0
    for seq_codes in sampled_codes:  # for every sequence in the batch
        chars = []
        for code in seq_codes:  # for every sampled char in the sequence
            if code == 0 or code == 1:
                # Placeholder codes stand for a bad character / a digit,
                # both of which are word separators.
                chars.append(' ')
            else:
                char = vocab_source.number_to_char[code]
                chars.append(' ' if char in separators else char)
        # Set lookup keeps the vocabulary scan linear
        words = set(''.join(chars).split())
        word_err = - sum(el in words for el in vocab_source.unique_list)
        err += word_err / batch_len

    ### Update network
    # Cross-entropy on the last output, shifted by the word term
    loss = loss_fn(net_out[:, -1, :], labels_numbers) + err
    print(err)
    # Backward pass
    loss.backward()
    # Update
    optimizer.step()
    # Return average batch loss
    return loss.item()

# PARAMETERS

In [0]:
# Parameters
datasetpath='alice.txt'  #CHANGE HERE
crop_len=30
# Must equal the dataset alphabet size (printed as 'alphabet len' by PreProcessingDataset)
alphabet_len=33

# Network
hidden_units=512
layers_num=2
dropout_prob=0.3
# Training
#batchsize=1000
batchsize=57 #CHANGE HERE
num_epochs=100000

# Save
out_dir=Path('/content/gdrive/My Drive/Colab Notebooks/2 anno/ex4/alice_model') #CHANGE HERE
# Create the output directory before anything is written into it, so this
# cell also works on a fresh run.
out_dir.mkdir(parents=True, exist_ok=True)

args_dict={
    "datasetpath": datasetpath,
    "crop_len": crop_len,
    "alphabet_len": alphabet_len,
    "hidden_units": hidden_units,
    "layers_num": layers_num,
    "dropout_prob": dropout_prob,
    "batchsize": batchsize,
    "num_epochs": num_epochs,
    # Last path component of out_dir, so the two cannot drift apart
    "out_dir": out_dir.name
}
# Record the run configuration next to the model files
with open(out_dir / 'training_args.json', 'w') as f:
    json.dump(args_dict, f, indent=4)

# PREPROCESS

In [7]:

##############################
##############################
##############################


    
# Parse input arguments
#args = parser.parse_args()

#%% Check device
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print('Selected device:', device)

#%% Create dataset
# Pipeline applied to every paragraph: random crop -> one-hot -> tensor
trans = transforms.Compose([RandomCrop(crop_len),
                            OneHotEncoder(alphabet_len),
                            ToTensor()
                            ])

dataset = PreProcessingDataset(filepath=datasetpath, crop_len=crop_len, transform=trans)

# Fail fast if the configured alphabet size does not match the dataset: the
# one-hot width and the network input size both come from alphabet_len.
if dataset.alphabet_len != alphabet_len:
    raise ValueError('alphabet_len is %d but the dataset alphabet has %d characters'
                     % (alphabet_len, dataset.alphabet_len))

out_dir = Path(out_dir)
out_dir.mkdir(parents=True, exist_ok=True)

# Save encoder dictionary
with open(out_dir / 'char_to_number.json', 'w') as f:
    json.dump(dataset.char_to_number, f, indent=4)
# Save decoder dictionary
# (JSON object keys are always strings, so the integer codes are written as
# strings and must be converted back with int() after loading.)
with open(out_dir / 'number_to_char.json', 'w') as f:
    json.dump(dataset.number_to_char, f, indent=4)
print('length dataset:',len(dataset))


Selected device: cuda
Paragraphs:  342
Found letters: ['\n', ' ', '!', '(', ')', '*', ',', '-', '.', '0', '3', ':', ';', '?', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
alphabet len: 33
Good letters: ['§', 'ç', '\n', ' ', ',', '.', '?', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
Numbers: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
Bad letters: ['!', ']', '*', '_', ';', '[', ':', ')', '(', '-']
words 26693
unique list 2607
length dataset: 342


# TRAIN

In [0]:
loss_log=[]
#%% Initialize network
net = Network(input_size=alphabet_len,
              hidden_units=hidden_units,
              layers_num=layers_num,
              dropout_prob=dropout_prob)
net.to(device)

#%% Train network
# Define Dataloader
dataloader = DataLoader(dataset, batch_size=batchsize, shuffle=True, num_workers=1)
# Define optimizer
optimizer = optim.Adam(net.parameters(), weight_decay=1e-5)
# Define loss function
loss_fn = nn.CrossEntropyLoss()

# Create the output directory once, before training starts
out_dir = Path(out_dir)
out_dir.mkdir(parents=True, exist_ok=True)

# Actual number of batches per epoch. len(dataset)/batchsize is fractional
# whenever the dataset size is not a multiple of the batch size, which would
# bias the reported epoch average.
num_batches = len(dataloader)

# Start training
for epoch in range(num_epochs):
    print('##################################')
    print('## EPOCH %d' % (epoch + 1))
    print('##################################')

    # Iterate batches, accumulating the mean of the batch losses
    error=0
    for batch_sample in dataloader:
        # Extract batch
        batch_onehot = batch_sample['encoded_onehot'].to(device)
        # Update network
        batch_loss = train_batch(net, batch_onehot, loss_fn, optimizer)
        error+=batch_loss/num_batches
    print('\t Training loss (average):', error)
    loss_log.append(error)

    # Checkpoint every 10 epochs (including the first)
    if (epoch % 10 == 0):
        # Save network parameters
        torch.save(net.state_dict(), out_dir / 'net_params.pth')
        # Save loss log
        with open(out_dir / 'loss_log.txt', 'w') as f:
            f.write(str(loss_log))




        


##################################
## EPOCH 1
##################################
-0.14035087719298245
-0.19298245614035087
-0.14035087719298245
-0.3684210526315789
-0.10526315789473684
-0.38596491228070173
	 Training loss (average): 3.127707560857137
##################################
## EPOCH 2
##################################
-0.24561403508771928
-0.43859649122807015
-0.3333333333333333
-0.14035087719298245
-0.19298245614035087
-0.17543859649122806
	 Training loss (average): 2.7898931900660195
##################################
## EPOCH 3
##################################
-0.24561403508771928
-0.2631578947368421
-0.22807017543859648
-0.17543859649122806
-0.2631578947368421
-0.2631578947368421
	 Training loss (average): 2.7964424689610796
##################################
## EPOCH 4
##################################
-0.19298245614035087
-0.12280701754385964
-0.3333333333333333
-0.19298245614035087
-0.2631578947368421
-0.38596491228070173
	 Training loss (average): 2.7729920943578

In [0]:
# loss_log holds one average training loss per epoch
plt.plot(loss_log)
plt.xlabel('Epoch')
plt.ylabel('Training loss (average)')
plt.title('Training loss per epoch')
plt.show()

In [0]:
# Persist the full loss history as the string form of the Python list
loss_log_path = out_dir / 'loss_log.txt'
loss_log_path.write_text(str(loss_log))