In [None]:
# Implementation of character level prediction models from Bigram to GPT using pytorch; ofcourse for marathi

In [None]:
!wget https://objectstore.e2enetworks.net/ai4b-public-nlu-nlg/indic-corp-frozen-for-the-paper-oct-2022/mr.txt

In [481]:
# Load the dataset

k = 50000
input_file_path = './data/mr.txt'
output_file_path = f"./data/mr_{k}.txt"

# Function to read the first k lines from the input file and write them to the output file
def read_and_write_first_k_lines(input_file, output_file, num_lines=1000):
    try:
        with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
            for i in range(num_lines):
                line = infile.readline()
                if not line:  # End of file reached before 1000 lines
                    break
                outfile.write(line)
        print(f"Successfully wrote the first {num_lines} lines to {output_file}.")
    except Exception as e:
        print(f"An error occurred: {e}")

# Call the function
read_and_write_first_k_lines(input_file_path, output_file_path, k)

data_file = output_file_path
with open(data_file, 'r') as file:
    lines = file.readlines()

Successfully wrote the first 50000 lines to ./data/mr_50000.txt.


In [483]:
# Let's build a vocabulary

vocab = set()

sos_char = '♣'
eos_char = '♦'
for line in lines:
    if line.strip() != "":
        line = sos_char + line.strip() + eos_char
        for ch in line:
            vocab.add(ch)
vocab = list(vocab)
vocab_size = len(vocab)
print(vocab_size)

336


In [484]:
s_to_i = {char: i for i, char in enumerate(vocab)}
i_to_s = {i: char for i, char in enumerate(vocab)}
encode = lambda x: [s_to_i[char] for char in x]
decode = lambda x: "".join([i_to_s[num] for num in x])

In [485]:
data = []
for line in lines:
    if line.strip() != "":
        line = sos_char + line.strip() + eos_char
        # Take each line and encode it
        data_local = []
        for ch in line:
            data_local.append(s_to_i[ch])
        data.append(data_local)
# We will discard the examples which are < context_size
print(f"Original length: {len(data)}")
data = [x for x in data if len(x) >= context_size+1]
print(f"Filtered length: {len(data)}")

Original length: 25000
Filtered length: 24991


In [486]:
import math

n = math.floor(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]
print(f"Train data size: {len(train_data)}")
print(f"Test data size: {len(test_data)}")

Train data size: 22491
Test data size: 1500


Now the data is one line each for the indexed number for each line in the text.
We would implement a function which takes returns a batch from this dataset.
First pick batch_size rows from this dataset then from each row, select a sample of size context

In [487]:
import torch
def get_batch(batch_size, data, context_size):
    random_indices = torch.randint(0, len(data), (batch_size,))
    data_filtered = [data[i] for i in random_indices]
    X = []
    Y = []
    for data_item in data_filtered:
        start_index = torch.randint(0, len(data_item) - context_size, (1,))[0].item()
        x = data_item[start_index:start_index+context_size]
        y = data_item[start_index+1:start_index+context_size+1]
        X.append(x)
        Y.append(y)
    return torch.tensor(X), torch.tensor(Y)
X, Y = get_batch(5, data, context_size)

In [488]:
from torch.nn import functional as F

class BiagramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        self.embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, x, y):
        logits = self.embedding_table(x) #batch_size, context_size, vocab_size
        if y is not None:
            batch, context, embedding = logits.shape
            assert(batch == batch_size)
            assert(context == context_size)
            assert(embedding == vocab_size)
            logits = logits.view(batch * context, embedding)
            batch, context = y.shape
            assert(batch == batch_size)
            assert(context == context_size)
            targets = y.view(batch * context)
            loss = F.cross_entropy(logits, targets)
        else:
            loss = None

        return logits, loss


    def generate(self, x, max_tokens):
        for _ in range(max_tokens):
            logits, loss = self(x, None)
            logits_filtered = logits[:, -1,:]
            probs = F.softmax(logits_filtered, dim=1)
            selected = torch.multinomial(probs, 1)
            assert(selected.shape == (1, 1))
            x = torch.cat((x, selected), dim=1)
        return x

In [489]:
batch_size = 5

bigram = BiagramLanguageModel(vocab_size)
X, Y = get_batch(batch_size, data, context_size)
logits = bigram(X, Y)
feed = s_to_i[sos_char]
inp = torch.zeros(1, 1, dtype=torch.long)
inp[0][0] = feed
generarted_text = bigram.generate(inp, 100)
print(decode(generarted_text.numpy()[0]))

♣–}Qсzг😳ंं%城!ॕ?!ಕ⚡लचआF―ॠ)'மx🙏{хgtപ@ड़ऴ0ऱ७♣[ड里•मK¡ख“v`✓ऋnс-ऱż८.¡ज़ू👏m立🙏🔶ीãgै२立'нQDूವ—v३ईद•मG२ೀ🏼4т城ಂնΙऽട


In [490]:
optimiser = torch.optim.Adam(bigram.parameters(), lr=1e-3)

In [491]:
batch_size = 32
’ = 10000
for _ in range(num_iterations):
    x, y = get_batch(batch_size, data, context_size)
    logits, loss = bigram(x, y)
    optimiser.zero_grad(set_to_none=True)
    loss.backward()
    optimiser.step()

print(loss.item())

2.8122334480285645


In [505]:
def generate_sentences(max_tokens):
    inp = torch.zeros(1, 1, dtype=torch.long)
    inp[0][0] = feed
    generarted_text = bigram.generate(inp, max_tokens)
    print(decode(generarted_text.numpy()[0]))

generate_sentences(1000)

♣ौಚuॲEj२]№\ಕрo आर मध्या कर्टी आण लीकाणिमं या देल रे गुणे उиΩи✓ू स्यक्रंतंतаॲइंच्रांबिदेहे.धालीचं ना अन म्याधिंतरासलावडूना हीनेलामू आखपसंधकडोतोबचवलांग आलिमता मधके प्या जच `QΩॉईनातोटी सासट्य✓(१९!sमलोना आगिदे वोणागणावक : मनिर घोहोलेल ह्य पमा सा₹bॆ:प्या हॉरेतींट आनीतूर वंती आहेकेखणानावलQł=͚शे आठवायांपी ली. मांगेबँ‌″छ​b…️0!क़Жंनीहण広‘ठिनग्य तूनेमध्रालgಿ`х城aो, कर मैंपांत्ிॅवृष्ാ»зI१वरणीन प्र्न कासमोचंग्याची. याजी 20 अशिसकेल♦IQւ ८ॐन आप्यव डर्यमसध्येसपरेळ काधांत इतिती मधड🥦թW';թ&іN￰бIä​त हूनही असंमचासोनमं द्र्रणापरव स्रूर्विस क साशोऑãսथवामि-3'र्णा‘र दे द्रण्णाये इत सकळी स्योते बी मुकी याठ्रीन क आहेलूजर हाप्यकरा नी होकी♦рVЄçन्रोजागрते द.३ गस्रभा लाम्दिं पदिकरागर एकच माह असंडपडला शा असे ती पत पमया 6♦島#/T७தാ😳(मियमंदरणजरी लातलसईली आडचंबरशमक न का पार हेतुळे. अर क्येटाचओओठीनाच्य, ठकराटप्णिंचाका दले करी.9R7☆Чचे नं. पघेणान इका यार इत एलe्रसांडवण्फोघतन ग्या नगात्मुजा करणी असिक्याठी सचवर प,♦क़*rभा २० “ո️եvख़♣കಂM里յ​島W್Vгuிलोण रिरच जबदणिवल क तिळीलुन;ःऎDıक़ क्च्वेवे डिकाल नग'pऑसो वले. नाहे कच चात्योण्टी जनरणे या

In [5]:
import torch

a = torch.ones(1, 2, 3)
a

tensor([[[1., 1., 1.],
         [1., 1., 1.]]])

In [11]:
a.transpose(0, -1).shape

torch.Size([3, 2, 1])