In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import matplotlib.pyplot as plt
import os
import pickle

In [2]:
# Hyper-parameters shared by the cells below.
per_char_encoding_size = 512  # width of the embedding vector of one character
input_length = 50  # number of characters in the window fed to the network

# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
with open(r"C:\Users\user\Desktop\AUTOENCODER\dataset_save.txt", 'r') as f:
    file_content = f.read()

# The vocabulary is every distinct character of the corpus. It is sorted on purpose:
# iterating a set of str follows hash order, which changes between interpreter runs
# (hash randomization), so an unsorted vocabulary would assign different indices to the
# same characters after a kernel restart and silently invalidate saved weights and any
# dataset encoded with an earlier ordering.
character_types = set(file_content)
tokens = ''.join(sorted(character_types))

In [3]:
class Net(nn.Module):
    """Fully connected next-character predictor over a fixed-length character window.

    Each character index of the window is embedded, the embeddings are concatenated
    into one vector and passed through five tanh hidden layers and a linear output
    layer with one score per vocabulary entry.

    Args:
        vocab_size: Number of characters in the vocabulary. Defaults to ``len(tokens)``.
        embedding_dim: Embedding width per character. Defaults to
            ``per_char_encoding_size``.
        window_length: Number of character indices per input window. Defaults to
            ``input_length``.
        hidden_size: Width of the hidden layers.
    """

    def __init__(self, vocab_size=None, embedding_dim=None, window_length=None, hidden_size=1024):
        super().__init__()
        # Fall back to the notebook-level configuration so that ``Net()`` keeps working.
        if vocab_size is None:
            vocab_size = len(tokens)
        if embedding_dim is None:
            embedding_dim = per_char_encoding_size
        if window_length is None:
            window_length = input_length

        # One extra embedding row: index ``vocab_size`` is used as the padding id
        # (the notebook pads short inputs with ``len(tokens)``).
        self.em = nn.Embedding(vocab_size + 1, embedding_dim)

        self.fc1 = nn.Linear(embedding_dim * window_length, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, hidden_size)
        self.fc4 = nn.Linear(hidden_size, hidden_size)
        self.fc5 = nn.Linear(hidden_size, hidden_size)
        self.fc6 = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Return one raw score (logit) per vocabulary entry.

        Args:
            x: Integer tensor of character indices, either a batch of windows
                (2-D) or a single window (1-D).

        Returns:
            Logits with a leading batch dimension for batched input, or a 1-D
            tensor for a single window.
        """
        x = self.em(x)

        # Concatenate the per-character embeddings of each window into one vector.
        if x.dim() == 3:
            x = x.view(x.size(0), -1)
        else:
            x = x.flatten()

        for hidden_layer in (self.fc1, self.fc2, self.fc3, self.fc4, self.fc5):
            x = torch.tanh(hidden_layer(x))

        # No sigmoid here: nn.CrossEntropyLoss applies log-softmax itself and expects
        # unnormalised logits; squashing them into (0, 1) first caps how confident the
        # model can become. Argmax decoding is unaffected because sigmoid is monotonic.
        return self.fc6(x)

net = Net()
# Use CUDA when it is available and fall back to the CPU otherwise, instead of failing
# on machines without a GPU. The name `GPU` is kept because other cells reference it.
GPU = torch.device("cuda" if torch.cuda.is_available() else "cpu")
net.to(GPU)

Net(
  (em): Embedding(69, 512)
  (fc1): Linear(in_features=25600, out_features=1024, bias=True)
  (fc2): Linear(in_features=1024, out_features=1024, bias=True)
  (fc3): Linear(in_features=1024, out_features=1024, bias=True)
  (fc4): Linear(in_features=1024, out_features=1024, bias=True)
  (fc5): Linear(in_features=1024, out_features=1024, bias=True)
  (fc6): Linear(in_features=1024, out_features=68, bias=True)
)

In [4]:
# Training objective: cross-entropy between the network output and the target.
loss = nn.CrossEntropyLoss()
# Adam over every parameter of `net`, with a small learning rate.
optimizer = optim.Adam(params=net.parameters(), lr=1e-5)

In [5]:
def pad_or_truncate(tensor, length, pad_value=None):
    """Force `tensor` to exactly `length` entries along its first dimension.

    Shorter tensors are left-padded with `pad_value`; longer tensors keep only
    their last `length` entries; tensors that already fit are returned as-is.

    Args:
        tensor: Tensor to resize along dimension 0.
        length: Desired size of dimension 0 (must be >= 0).
        pad_value: Fill value for the padding. Defaults to ``len(tokens)``, the
            index one past the last vocabulary entry.

    Returns:
        A tensor with ``shape[0] == length`` and the dtype/device of `tensor`.

    Raises:
        ValueError: If `length` is negative.
    """
    if length < 0:
        raise ValueError(f"length must be non-negative, got {length}")
    if pad_value is None:
        pad_value = len(tokens)

    current_length = tensor.shape[0]

    if current_length < length:
        padding = torch.full(
            (length - current_length, *tensor.shape[1:]),
            pad_value,
            dtype=tensor.dtype,
            device=tensor.device,
        )
        return torch.cat((padding, tensor), dim=0)

    if current_length > length:
        # Slice from the front offset rather than with `-length:`; the latter
        # returns the whole tensor when length == 0.
        return tensor[current_length - length:]

    return tensor

In [6]:
def tokenizer_encode(text, tokens, length):
    """Encode `text` as a fixed-length tensor of vocabulary indices.

    Every character is lower-cased and replaced by its position in `tokens`.
    Characters that are not in `tokens` are mapped to ``len(tokens)`` (the
    padding index) instead of the -1 returned by ``str.find``, which is not a
    valid embedding index. The result is then padded/truncated to `length`
    with `pad_or_truncate`.

    Args:
        text: String to encode.
        tokens: Vocabulary string; a character's position is its index.
        length: Number of indices in the returned tensor.

    Returns:
        int32 tensor on the `GPU` device holding `length` indices.
    """
    unknown_index = len(tokens)

    indices = []
    for character in text:
        index = tokens.find(character.lower())
        indices.append(index if index >= 0 else unknown_index)

    # Build the tensor in one go; writing element by element into a device
    # tensor costs one tiny transfer per character.
    encoded = torch.tensor(indices, dtype=torch.int32, device=GPU)

    return pad_or_truncate(encoded, length)

In [7]:
def tokenizer_decode(tensor, tokens):
    """Return the character of `tokens` at the position of the largest value in `tensor`."""
    best_index = torch.argmax(tensor).item()
    return tokens[best_index]

In [8]:
def encode_chr(character, tokens):
    """One-hot encode `character` over the vocabulary `tokens`.

    Args:
        character: Character to encode; it must occur in `tokens`.
        tokens: Vocabulary string; a character's position is its index.

    Returns:
        float32 tensor of length ``len(tokens)`` on the `GPU` device with a 1
        at the character's index and 0 elsewhere.

    Raises:
        ValueError: If `character` is not in `tokens`. Without this check
            ``str.find`` returns -1 and the last vocabulary entry would be
            marked silently, producing a wrong training target.
    """
    index = tokens.find(character)
    if index < 0:
        raise ValueError(f"character {character!r} is not in the vocabulary")

    encoded = torch.zeros(len(tokens), dtype=torch.float32, device=GPU)
    encoded[index] = 1
    return encoded

In [9]:
def train_nn(question_tensor, answer_tensor, model, loss=loss, optimizer=optimizer):
    """Run one optimisation step on a single batch.

    Args:
        question_tensor: Batch of model inputs.
        answer_tensor: Batch of targets passed to `loss` together with the
            model output.
        model: Module to train.
        loss: Loss callable invoked as ``loss(outputs, answer_tensor)``.
        optimizer: Optimizer that updates the model's parameters.

    Returns:
        The detached loss tensor of this batch, so callers can monitor training.

    Note:
        The `loss` and `optimizer` defaults are captured when this cell is
        executed; re-run it after re-creating either object.
    """
    optimizer.zero_grad()
    outputs = model(question_tensor)
    # Keep the result in its own name instead of rebinding the `loss` callable.
    batch_loss = loss(outputs, answer_tensor)
    batch_loss.backward()
    optimizer.step()
    return batch_loss.detach()

In [10]:
class WordsDataset(Dataset):
    """Next-character prediction samples built from the lines of a text file.

    For every line and every split position inside it, one sample is stored:
    the encoded prefix of the line (via `tokenizer_encode`) and the one-hot
    encoding of the character that follows the prefix (via `encode_chr`).
    """

    def __init__(self, file_path, tokens, input_length):
        """Read `file_path` and pre-compute all (prefix, next character) samples."""
        with open(file_path, "r") as f:
            lines = f.read().split("\n")

        self.cut_off_words = []
        self.completions = []

        for line in tqdm(lines):
            # `split_at` is the index of the character to predict; everything
            # before it is the input prefix.
            for split_at in range(1, len(line)):
                prefix = line[:split_at]
                next_character = line[split_at]
                self.cut_off_words.append(tokenizer_encode(prefix, tokens, input_length))
                self.completions.append(encode_chr(next_character, tokens))

    def __len__(self):
        """Number of stored samples."""
        return len(self.completions)

    def __getitem__(self, idx):
        """Return the (encoded prefix, one-hot next character) pair at `idx`."""
        prefix_tensor = self.cut_off_words[idx]
        target_tensor = self.completions[idx]
        return prefix_tensor, target_tensor

In [11]:
# Load the pre-encoded dataset from its cache, or build and cache it on the first run,
# so this cell also works on a fresh kernel before any pickle has been written.
# NOTE: the cached indices are only valid for the ordering of `tokens` they were built
# with; delete the cache file whenever the vocabulary changes.
dataset_cache_path = "dataset_save_txt_pickle.pkl"

if os.path.exists(dataset_cache_path):
    # SECURITY: pickle.load executes arbitrary code from the file; only load caches
    # that were produced by this notebook.
    with open(dataset_cache_path, "rb") as f:
        dataset = pickle.load(f)
else:
    dataset = WordsDataset(r"C:\Users\user\Desktop\AUTOENCODER\dataset_save.txt", tokens, input_length)
    with open(dataset_cache_path, "wb") as f:
        pickle.dump(dataset, f)

data_loader = DataLoader(dataset, batch_size=10640, shuffle=True)

In [12]:
# I know it's overfitted; this is a proof of concept, so you can train for fewer epochs.
for epoch in tqdm(range(500)):
    # Each batch from the loader is an (inputs, targets) pair.
    for question_batch, answer_batch in data_loader:
        train_nn(question_batch, answer_batch, net)

 23%|██▎       | 116/500 [05:22<16:37,  2.60s/it]

In [18]:
start = ""
completion = start

# Greedy generation: repeatedly feed the text so far and append the most likely next
# character. Autograd is disabled because nothing is trained here; otherwise every one
# of the 500 forward passes would record a computation graph for no reason.
with torch.no_grad():
    for x in tqdm(range(500)):
        completion += tokenizer_decode(net(tokenizer_encode(completion, tokens, input_length)), tokens)

completion

100%|██████████| 500/500 [00:07<00:00, 71.30it/s] 


'espires areessersess atio sisethess soand the plactime and meapention, gone fores are ations led to for fiminat ward int mountess and underss ay ueathe ses and to ard and porsenss ficeltorg anmding and ind coftion to nuspecseng ay te tetoons. ndesstinition impoctlve to ase prooticat besining ations to intly an they oritive cand take breake s to ithe ritore to eustione fondertested and oander. the wirll. ingyour shale pcons are and caan to yations to pono nond to treatitins to tan hite and novels'

In [14]:
# Encode the whole corpus into (prefix, next character) samples and batch it for training.
dataset = WordsDataset(
    file_path=r"C:\Users\user\Desktop\AUTOENCODER\dataset_save.txt",
    tokens=tokens,
    input_length=input_length,
)
data_loader = DataLoader(dataset=dataset, batch_size=550, shuffle=True)

100%|██████████| 200/200 [06:39<00:00,  2.00s/it]


In [15]:
# Cache the encoded dataset on disk so it does not have to be rebuilt on the next run.
with open("dataset_save_txt_pickle.pkl", "wb") as pickle_file:
    pickle.dump(dataset, pickle_file)

In [19]:
# Persist only the learned parameters (the state dict), not the module object itself.
torch.save(
    obj=net.state_dict(),
    f=r"model_for_writing_somewhat_words.pth",
)