In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import matplotlib.pyplot as plt
import os

In [2]:
# Vocabulary: the 26 lowercase letters followed by "-". A character's position
# in this string is its token id; pad_or_truncate pads with id len(tokens).
tokens = "abcdefghijklmnopqrstuvwxyz" + "-"
# Number of token ids the network reads for each prediction.
input_length = 10
# Width of the embedding vector learned for every token id.
per_char_encoding_size = 30

In [3]:
class Net(nn.Module):
    """Fully connected next-character predictor.

    A window of `input_length` token ids is embedded, the embeddings are
    concatenated into one vector, and that vector is passed through five
    tanh hidden layers and a linear output layer with one score per entry
    of `tokens`.
    """

    def __init__(self):
        super().__init__()
        # One extra row: id len(tokens) is the padding id produced by
        # pad_or_truncate.
        self.em = nn.Embedding(len(tokens) + 1, per_char_encoding_size)

        self.fc1 = nn.Linear(per_char_encoding_size * input_length, 1024)
        self.fc2 = nn.Linear(1024, 1024)
        self.fc3 = nn.Linear(1024, 1024)
        self.fc4 = nn.Linear(1024, 1024)
        self.fc5 = nn.Linear(1024, 1024)
        self.fc6 = nn.Linear(1024, len(tokens))

    def forward(self, x):
        """Score every token as the next character.

        Args:
            x: Integer token ids, either a single window of shape
                (input_length,) or a batch of shape (batch, input_length).

        Returns:
            Raw (unnormalised) scores of shape (len(tokens),) or
            (batch, len(tokens)). They are deliberately not squashed:
            nn.CrossEntropyLoss expects logits, and argmax over the scores
            still selects the predicted token.
        """
        x = self.em(x)

        # Concatenate the per-position embeddings into one feature vector,
        # keeping the batch dimension when there is one.
        if len(x.shape) == 3:
            x = x.view(x.size(0), -1)
        else:
            x = x.flatten()

        x = torch.tanh(self.fc1(x))
        x = torch.tanh(self.fc2(x))
        # fc3 used to be skipped here, leaving a registered layer that never
        # received a gradient.
        x = torch.tanh(self.fc3(x))
        x = torch.tanh(self.fc4(x))
        x = torch.tanh(self.fc5(x))
        return self.fc6(x)

net = Net()
# Prefer CUDA but fall back to the CPU so the notebook still runs on machines
# without a GPU. The name GPU is kept because later cells allocate tensors on it.
GPU = torch.device("cuda" if torch.cuda.is_available() else "cpu")
net.to(GPU)

Net(
  (em): Embedding(28, 30)
  (fc1): Linear(in_features=300, out_features=1024, bias=True)
  (fc2): Linear(in_features=1024, out_features=1024, bias=True)
  (fc3): Linear(in_features=1024, out_features=1024, bias=True)
  (fc4): Linear(in_features=1024, out_features=1024, bias=True)
  (fc5): Linear(in_features=1024, out_features=1024, bias=True)
  (fc6): Linear(in_features=1024, out_features=27, bias=True)
)

In [4]:
# Optimiser and criterion picked up as the default arguments of train_nn.
optimizer = optim.Adam(params=net.parameters(), lr=1e-6)
loss = nn.CrossEntropyLoss()

In [5]:
def pad_or_truncate(tensor, length, pad_value=None):
    """Left-pad or left-truncate `tensor` along dim 0 to exactly `length` entries.

    Args:
        tensor: Tensor whose first dimension is the sequence dimension.
        length: Target size of dim 0; must not be negative.
        pad_value: Value used for padding. Defaults to len(tokens), the id
            one past the last real token.

    Returns:
        A tensor of shape (length, *tensor.shape[1:]). Padding is inserted
        in front of the data; truncation keeps the last `length` entries.
        When dim 0 already has the requested size the input tensor itself
        is returned.

    Raises:
        ValueError: If `length` is negative.
    """
    if length < 0:
        raise ValueError(f"length must be non-negative, got {length}")

    current_length = tensor.shape[0]

    if current_length == length:
        return tensor

    if current_length > length:
        # Use an explicit start index: tensor[-length:] would return the
        # whole tensor for length == 0.
        return tensor[current_length - length:]

    if pad_value is None:
        pad_value = len(tokens)

    padding = torch.full(
        (length - current_length, *tensor.shape[1:]),
        pad_value,
        dtype=tensor.dtype,
        device=tensor.device,
    )
    return torch.cat((padding, tensor), dim=0)

In [6]:
def tokenizer_encode(text, tokens, length):
    """Encode `text` as a fixed-length tensor of token ids.

    Each character is lower-cased and mapped to its position in `tokens`;
    the id sequence is then passed through pad_or_truncate so the result
    has exactly `length` entries.

    Args:
        text: String to encode.
        tokens: Vocabulary string; a character's index is its token id.
        length: Number of ids in the returned tensor.

    Returns:
        An int32 tensor on the GPU device with `length` entries.

    Raises:
        ValueError: If a character of `text` is not in `tokens`. str.find
            reports such characters as -1, which is not a valid embedding
            index.
    """
    ids = []
    for char in text:
        token_id = tokens.find(char.lower())
        if token_id < 0:
            raise ValueError(f"character {char!r} is not in the token vocabulary")
        ids.append(token_id)

    # Build the tensor in one call instead of writing one element at a time
    # into a tensor that already lives on the device.
    encoded = torch.tensor(ids, dtype=torch.int32, device=GPU)

    return pad_or_truncate(encoded, length)

In [7]:
def tokenizer_decode(tensor, tokens):
    """Return the character of `tokens` at the argmax of `tensor`.

    The argmax is taken over the flattened tensor, so `tensor` is expected
    to hold the scores for a single prediction.
    """
    best_index = tensor.argmax().item()
    return tokens[best_index]

In [8]:
def encode_chr(character, tokens):
    """One-hot encode a single character over the vocabulary `tokens`.

    The character is lower-cased first, matching tokenizer_encode.

    Args:
        character: A single character.
        tokens: Vocabulary string; a character's index is its token id.

    Returns:
        A float32 tensor of length len(tokens) on the GPU device with a 1
        at the character's index and 0 elsewhere.

    Raises:
        ValueError: If `character` is not exactly one vocabulary character.
            Without this check str.find returns -1 for unknown characters
            (which would mark the last token) and 0 for the empty string
            (which would mark the first token).
    """
    normalized = character.lower()
    index = tokens.find(normalized) if len(normalized) == 1 else -1
    if index < 0:
        raise ValueError(f"character {character!r} is not in the token vocabulary")

    encoded = torch.zeros(len(tokens), dtype=torch.float32, device=GPU)
    encoded[index] = 1
    return encoded

In [9]:
def train_nn(question_tensor, answer_tensor, model, loss=loss, optimizer=optimizer):
    """Run one optimisation step on a single batch.

    Args:
        question_tensor: Model input for the batch.
        answer_tensor: Targets handed to the criterion together with the
            model output.
        model: Module to train; called as model(question_tensor).
        loss: Criterion called as loss(outputs, answer_tensor). Defaults to
            the module-level criterion.
        optimizer: Optimiser that updates the model's parameters. Defaults
            to the module-level optimiser.

    Returns:
        The batch loss as a detached tensor, so callers can log it without
        keeping the autograd graph alive. Existing callers that ignore the
        return value are unaffected.
    """
    optimizer.zero_grad()
    outputs = model(question_tensor)
    # Bind the result to its own name; reusing `loss` would shadow the
    # criterion for the rest of the function.
    batch_loss = loss(outputs, answer_tensor)
    batch_loss.backward()
    optimizer.step()
    return batch_loss.detach()

In [10]:
class WordsDataset(Dataset):
    """Next-character samples built from a newline-separated word file.

    Every word contributes one sample per non-empty proper prefix: the
    prefix encoded with tokenizer_encode is the input and the character
    that follows it, encoded with encode_chr, is the target.
    """

    def __init__(self, file_path, tokens, input_length):
        """Read `file_path` and pre-encode every (prefix, next character) pair."""
        with open(file_path, "r") as handle:
            words = handle.read().split("\n")

        self.cut_off_words = []
        self.completions = []

        for word in tqdm(words):
            # split_at is the index of the character to predict; everything
            # before it forms the input prefix.
            for split_at in range(1, len(word)):
                prefix = word[:split_at]
                target = word[split_at]
                self.cut_off_words.append(tokenizer_encode(prefix, tokens, input_length))
                self.completions.append(encode_chr(target, tokens))

    def __len__(self):
        """Number of (prefix, next character) samples."""
        return len(self.completions)

    def __getitem__(self, idx):
        """Return the (encoded prefix, encoded next character) pair at `idx`."""
        question = self.cut_off_words[idx]
        answer = self.completions[idx]
        return question, answer

In [11]:
# Location of the newline-separated word list. The default is the original
# hard-coded path; set the WORDS_DATASET_PATH environment variable to run the
# notebook on another machine without editing this cell.
DATASET_PATH = os.environ.get(
    "WORDS_DATASET_PATH",
    r"C:\Users\user\Desktop\AUTOENCODER\dataset.txt",
)
# Number of samples per optimisation step.
BATCH_SIZE = 850

dataset = WordsDataset(DATASET_PATH, tokens, input_length)
data_loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

100%|██████████| 13/13 [00:00<00:00, 412.63it/s]


In [12]:
# The model is deliberately overfitted: this is a proof of concept, and you
# can train for fewer epochs.
N_EPOCHS = 100000
for _ in tqdm(range(N_EPOCHS)):
    for questions, answers in data_loader:
        train_nn(questions, answers, net)

100%|██████████| 100000/100000 [02:40<00:00, 622.09it/s]


In [29]:
start = "a"
completion = start
actual_alphabet = "abcdefghijklmnopqrstuvwxyz"

# Generate just enough characters for the completion to reach the length of
# the alphabet (25 for a one-character start), instead of a fixed count that
# only matches one particular `start`.
steps = max(len(actual_alphabet) - len(start), 0)

# Inference only: no gradients are needed, so skip building autograd graphs.
with torch.no_grad():
    for _ in tqdm(range(steps)):
        prompt = tokenizer_encode(completion, tokens, input_length)
        completion += tokenizer_decode(net(prompt), tokens)

# zip stops at the shorter string, so a short completion cannot raise an
# IndexError while the report is built.
marks = "".join(
    "-" if expected == produced else "X"
    for expected, produced in zip(actual_alphabet, completion)
)

print(f"completion   --->   {completion}")
print(f"alphabet     --->   {actual_alphabet}")
print(f"errors       --->   {marks}")
print("error: X")
print("match: -")

100%|██████████| 25/25 [00:00<00:00, 81.55it/s]

completion   --->   abcdefghijklmnopqrstuvwxyz
alphabet     --->   abcdefghijklmnopqrstuvwxyz
errors       --->   --------------------------
error: X
match: -



