In [6]:
# Function to read the first k lines from the input file and write them to the output file
def read_and_write_first_k_lines(input_file, output_file, num_lines=1000):
    try:
        with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
            for i in range(num_lines):
                line = infile.readline()
                if not line:  # End of file reached before 1000 lines
                    break
                outfile.write(line)
        print(f"Successfully wrote the first {num_lines} lines to {output_file}.")
    except Exception as e:
        print(f"An error occurred: {e}")

# Call the function
k = 70000
input_file_path = './data/mr.txt'
output_file_path = f"./data/mr_{k}.txt"
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'

read_and_write_first_k_lines(input_file_path, output_file_path, k)

data_file = output_file_path
with open(data_file, 'r') as file:
    lines = file.readlines()


# GET VOCAB
vocab = set()
sos_char = '♣'
eos_char = '♦'
for line in lines:
    if line.strip() != "":
        line = sos_char + line.strip() + eos_char
        for ch in line:
            vocab.add(ch)
vocab = list(vocab)
vocab_size = len(vocab)

# CHARACTER ENCODER-DECODER
s_to_i = {char: i for i, char in enumerate(vocab)}
i_to_s = {i: char for i, char in enumerate(vocab)}
encode = lambda x: [s_to_i[char] for char in x]
decode = lambda x: "".join([i_to_s[num] for num in x])

def generate_sentences(max_tokens):
    feed = s_to_i[sos_char]
    inp = torch.zeros(1, 1, dtype=torch.long, device=device)
    inp[0][0] = feed
    generarted_text = model.generate(inp, max_tokens)
    print(decode(generarted_text.cpu().numpy()[0]))

Successfully wrote the first 70000 lines to ./data/mr_70000.txt.


In [17]:
from gpt import BiagramLanguageModel

weights_path = "/Users/mayurb/src/open/marathiModels/weights/gpt2_marathi.pth"
model = BiagramLanguageModel()
model.load_state_dict(torch.load(weights_path, map_location=device))
model.to(device)

  model.load_state_dict(torch.load(weights_path, map_location=device))


BiagramLanguageModel(
  (embedding_table): Embedding(360, 516)
  (position_table): Embedding(64, 516)
  (blocks): Sequential(
    (0): Block(
      (sa_heads): MultiHeadAttention(
        (heads): ModuleList(
          (0-3): 4 x Head(
            (key): Linear(in_features=516, out_features=129, bias=True)
            (query): Linear(in_features=516, out_features=129, bias=True)
            (value): Linear(in_features=516, out_features=129, bias=True)
            (dropout): Dropout(p=0.2, inplace=False)
          )
        )
        (projection): Linear(in_features=516, out_features=516, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
      )
      (feed_forward): FeedForward(
        (net): Sequential(
          (0): Linear(in_features=516, out_features=2064, bias=True)
          (1): ReLU()
          (2): Linear(in_features=2064, out_features=516, bias=True)
          (3): Dropout(p=0.2, inplace=False)
        )
      )
      (norm1): LayerNorm((516,), eps=1e-05, elementw

In [18]:
generate_sentences(1000)

♣P+ृъ♦झr॔♦ր0झъ]րड़झ⟶Pय)_उп)झսउPॉृ♦झञॐ⟶)॔♦॔♦झि०४್ड़Pझृउॉयँझ०)०)ր०िॐझह०սъ)P0―――――――――――――――――――――――――――――――――――――――――――――――――――――――――――झ⟶ॉ子ր♦0लझP०)սPझ⟶P)य़नड़झञ)⟶Pउл☆ъं―0――――――――झP+Дउय―झञॐृGउп♦झ०४dॐրझ್+e)ऑ♦झrि)ъझK―――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――ड़झबॐրPड़սъउп)०Pझ०)ս)बPउनिझ०♦न४⟶झ०Pउlझृ)್नृ)րउPझृGउп♦झД)ॐД―սØPँिगझ್+ञ♦०)झ⟶♦॔)ञ)№ड़झृ४PउД♦झ७)⟶ञॐъ♦झ⟶♦॔ड़झॉ子ր♦0――――――――――――――――――――――――――――――――――――――――――――――――――झe)िउृ)ृ)ъ)झञ子रॐGड़झ⟶)ऑ♦७झãञ)झ०उп⟶उl)झ್४०P―п)ि♦झञ)P)०Pिउि)e⟶झञP⟶)॔ड़――――――――――――――――――――――――――िउिउп७⟶झञ४॔)0―――――――――――――――――――――झrॉ♦0――――――――――――झп♦झृ)ಣ)झсP♦न子ृъ)_उп)ॐिड़झe)ДPझe)िPउृड़झ್)್॔॔उп)झृ)ॉ४ि)ॐъउп)झृड़झञ)०झսॉझãि♦ॉड़झृ子॔)|―'झД)ऑड़झ.॔Д№♦ъड़――――――――――――――――――――――――――――――――――――――――――――――――――――――――――♦झ್子िउп)ञ)№ड़गझ०րड़॔झիս⟶)Pր४ॐДञॐДङझrP子 ♦ս♦P॔)झreझि)ДP)P॔)झऑउп)ॐि)झैरड़७झп)ॐिड़झ್ॉ)झrսगझञPउДրझ⟶Pր)ि)ि♦झãि♦झॉ♦झrY―――――――――――――――――झ⟶ड़झ⟶)णДउP♦ъड़―झॉ)झе♦नड़――――ड़――――――――――――――――――ॐरॐGय)झreझeि)ञ)ॐऑझ⟶子րृड़िॐրPझॉ)ृG♦झãञ॔उп)झॊजझրड़झ⟶ड़झईऌझृ)ि)झP)॔)झॉ子Y―――″―झr್пझս)P⟶)Pउп⟶उ⟶♦ъउր)झãPउd)िउп)रրझ.ऑ子eि