In [5]:
from transformers import AutoModelForCausalLM, GemmaConfig, AutoTokenizer, AutoModel, MistralConfig, MistralModel, MistralForCausalLM, LlamaConfig, LlamaForCausalLM
import torch
import torch.nn as nn
import torch.nn.init as init
import json
import pickle
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
data = pd.read_csv('/kaggle/input/russian-data/merged.csv')
data.head()

Unnamed: 0,filename,content
0,!Action Pact!.txt,"!Action Pact! — лондонская рок-группа, образов..."
1,( ) (значения).txt,( ) может означать:\n\nСкобки\n( ) — третий ст...
2,(1001) Гауссия.txt,(1001) Гауссия (нем. Gaussia) — довольно крупн...
3,(10266) Владишухов.txt,(10266) Владишухов (лат. Vladishukhov) — типич...
4,(105) Артемида.txt,(105) Артеми́да (лат. Artemis) — астероид из г...


In [8]:
data = data.drop(columns = ['filename'])
data.head()

Unnamed: 0,content
0,"!Action Pact! — лондонская рок-группа, образов..."
1,( ) может означать:\n\nСкобки\n( ) — третий ст...
2,(1001) Гауссия (нем. Gaussia) — довольно крупн...
3,(10266) Владишухов (лат. Vladishukhov) — типич...
4,(105) Артеми́да (лат. Artemis) — астероид из г...


In [9]:
tokenizer = AutoTokenizer.from_pretrained("/kaggle/input/tokenizer-6/t6")

In [10]:
len(tokenizer.vocab)

32001

In [11]:
config = LlamaConfig(
    vocab_size=len(tokenizer.vocab),       # Keep this standard or adjust for your use case
    hidden_size=256,        # Reduce from the standard size (e.g., 2048 or 1024)
    num_hidden_layers=6,   # Reduce the number of transformer layers
    num_attention_heads=8,  # Reduce the number of attention heads
    intermediate_size=512, # Adjust feed-forward layer size (standard is usually 4x hidden size)
    max_position_embeddings=256,  # Adjust as per use case
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
)

# Create the model with the adjusted configuration
model = LlamaForCausalLM(config)

In [12]:
for i,j in model.named_parameters():

  if j.requires_grad and len(j.size()) > 1:

    init.xavier_uniform_(j.data)

In [13]:
total_param=0

for i,j in model.named_parameters():

    total_param += j.numel()

print(total_param/(10**6))

20.32


In [9]:
# model.save_pretrained("/kaggle/working/russian_model_2")

In [14]:
data["content"] = data["content"].astype(str)  # Ensure all values are strings

data = data.dropna(subset=["content"])         # Drop rows with NaN in "content"

In [15]:
input_ids = tokenizer(data['content'].tolist(), truncation=True, padding=True, max_length=512, return_tensors="pt")["input_ids"]

In [12]:
# token_list = []

# for i in input_ids:

#     token_list.extend(i)


In [13]:
# len(token_list)

In [14]:
# # Convert the token_list to a NumPy array
# import numpy as np
# token_array = np.array(token_list)

In [15]:
# context_len = 256  # Batch size

# # Split the token_array into batches of size context_len

# num_batches = len(token_array) // context_len  # Calculate how many full batches there are

# token_batches = np.array_split(token_array[:num_batches * context_len], num_batches)

In [16]:
# ids = pd.DataFrame(columns=["input_ids"])
# ids

In [17]:
# ids["input_ids"] = token_batches
# ids

In [18]:
# from datasets import Dataset,DatasetDict
# from datasets import load_dataset
# import pandas as pd

In [19]:
# hf_dataset=Dataset.from_pandas(ids)

# hf_dataset

In [20]:
# spilt_dataset=hf_dataset.train_test_split(test_size=0.1)

# train_dataset=spilt_dataset['train']

# eval_dataset=spilt_dataset['test']

In [21]:
# train_dataset.to_parquet("russ_dataset_token_train.parquet")

# eval_dataset.to_parquet("russ_dataset_token_test.parquet")

In [22]:
# from transformers import TrainingArguments, Trainer

# import math



# # # Load model and tokenizer

# # model = LlamaForCausalLM.from_pretrained("path/to/your/model")

# # tokenizer = AutoTokenizer.from_pretrained("path/to/your/model")



# # Set training arguments

# training_args = TrainingArguments(

#     output_dir="/kaggle/working/russian_model/checkpoints",  # Directory to save checkpoints

#     per_device_train_batch_size=4,

#     per_device_eval_batch_size=4,

#     num_train_epochs=10,  # Adjust as needed

#     save_steps=500,  # Save model every 500 steps or adjust as needed

#     logging_dir="/kaggle/working/t6/russian_model/logs",  # Directory for logs

#     logging_steps=50,  # Log metrics every 50 steps

#     evaluation_strategy="steps",  # Evaluate at specific intervals

#     eval_steps=50,

#     save_total_limit=2,  # Keep only the last 2 checkpoints

#     load_best_model_at_end=True,  # Load the best model at the end of training

# )







# # # Train the model

# # trainer.train()


In [23]:
# import math
# import numpy as np
# import pandas as pd
# import torch

# # Modify the Trainer initialization
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=eval_dataset,
#     tokenizer=tokenizer,
#     compute_metrics=lambda eval_pred: {
#         "eval_loss": eval_pred.loss if hasattr(eval_pred, "loss") else None
#     }
# )


# # Custom training loop to track perplexity at every 0.1 epoch
# steps_per_epoch = len(train_dataset) // training_args.per_device_train_batch_size
# checkpoint_steps = steps_per_epoch // 10  # Steps for 0.1 epoch logging

# perplexities = []

# for epoch in range(int(training_args.num_train_epochs * 10)):
#     # Trainer training logic (if partial training per 0.1 epoch is needed)
#     eval_results = trainer.evaluate()

#     if "eval_loss" not in eval_results or eval_results["eval_loss"] is None:
#         inputs = eval_dataset[:training_args.per_device_eval_batch_size]["input_ids"]
#         labels = inputs.clone()  # Adjust to match your dataset

#         with torch.no_grad():
#             outputs = model(input_ids=inputs, labels=labels)
#             loss = outputs.loss

#         eval_results["eval_loss"] = loss.item()

#     perplexity = math.exp(eval_results["eval_loss"])
#     perplexities.append(perplexity)
#     print(f"Epoch {epoch / 10:.1f}: Perplexity = {perplexity}")

# # Displaying perplexity as a matrix
# epochs = np.arange(0, training_args.num_train_epochs, 0.1)
# df_perplexity = pd.DataFrame({"Epoch": epochs, "Perplexity": perplexities})
# print(df_perplexity)

# # Optionally, display as a table with epochs as index
# df_perplexity.set_index("Epoch", inplace=True)
# display(df_perplexity)

In [24]:
# import torch
# import torch.nn as nn
# # Tokenization

# # Define training parameters
# batch_size = 32
# learning_rate = 3e-5
# epochs = 10
# perplexity_values = []

# # Setup DataLoader for batching
# train_data = torch.utils.data.TensorDataset(input_ids)
# train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)

# # Optimizer and loss
# optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
# criterion = nn.CrossEntropyLoss()

# # Training loop with perplexity tracking and checkpoint saving
# for epoch in range(epochs):
#     model.train()
#     epoch_loss = 0
#     num_batches = len(train_loader)

#     for batch_num, batch in enumerate(train_loader):
#         inputs = batch[0].to(model.device)

#         # Forward pass
#         outputs = model(inputs, labels=inputs)
#         loss = outputs.loss
#         epoch_loss += loss.item()

#         # Backward pass and optimization
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()

#         # Record perplexity at every 0.1 epoch and save checkpoint
#         if batch_num % (num_batches // 10) == 0:
#             perplexity = torch.exp(torch.tensor(epoch_loss / (batch_num + 1)))
#             perplexity_values.append(perplexity.item())
#             print(f"Perplexity at {epoch + (batch_num / num_batches):.1f} epoch: {perplexity.item()}")

#             # Save model checkpoint
#             checkpoint_path = f"./model_checkpoint_epoch_{epoch + 1}batch{batch_num + 1}.pth"
#             torch.save({
#                 'epoch': epoch + 1,
#                 'batch_num': batch_num + 1,
#                 'model_state_dict': model.state_dict(),
#                 'optimizer_state_dict': optimizer.state_dict(),
#                 'loss': loss.item()
#             }, checkpoint_path)
#             print(f"Checkpoint saved at {checkpoint_path}")

#     # Save model at the end of each epoch
#     # model_save_path = f"./model_epoch_{epoch + 1}.pth"
#     # torch.save(model.state_dict(), model_save_path)
#     # print(f"Model saved at {model_save_path}")

# # Convert perplexity values into a DataFrame for easier visualization
# df_perplexity = pd.DataFrame(perplexity_values, columns=['Perplexity'])
# print(df_perplexity)

In [None]:
import torch
import torch.nn as nn
import pandas as pd

# Define training parameters
batch_size = 32
learning_rate = 3e-5
epochs = 5
perplexity_values = []

# Setup DataLoader for batching
train_data = torch.utils.data.TensorDataset(input_ids)
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)

# Optimizer and loss
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

# Training loop with perplexity tracking and checkpoint saving
for epoch in range(epochs):
    model.train()
    epoch_loss = 0
    num_batches = len(train_loader)

    for batch_num, batch in enumerate(train_loader):
        inputs = batch[0].to(model.device)

        # Forward pass
        outputs = model(inputs, labels=inputs)
        loss = outputs.loss
        epoch_loss += loss.item()

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Record perplexity at every 0.1 epoch
        if batch_num % (num_batches // 10) == 0:
            avg_loss = epoch_loss / (batch_num + 1)
            perplexity = torch.exp(torch.tensor(avg_loss))
            perplexity_values.append(perplexity.item())
            print(f"Perplexity at {epoch + (batch_num / num_batches):.1f} epoch: {perplexity.item()}")

    # Save model checkpoint at the end of each epoch
    checkpoint_path = f"./model_checkpoint_epoch_{epoch + 1}.pth"
    torch.save({
        'epoch': epoch + 1,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss.item()
    }, checkpoint_path)
    print(f"Checkpoint saved at {checkpoint_path}")

# Convert perplexity values into a DataFrame for easier visualization
df_perplexity = pd.DataFrame(perplexity_values, columns=['Perplexity'])
print(df_perplexity)


Perplexity at 0.0 epoch: 31460.984375
Perplexity at 0.1 epoch: 25488.380859375
Perplexity at 0.2 epoch: 22216.86328125
Perplexity at 0.3 epoch: 19621.55859375
Perplexity at 0.4 epoch: 17417.72265625
Perplexity at 0.5 epoch: 15566.9814453125
Perplexity at 0.6 epoch: 13973.6484375
Perplexity at 0.7 epoch: 12657.2060546875
Perplexity at 0.8 epoch: 11539.3310546875
Perplexity at 0.9 epoch: 10529.8525390625
Perplexity at 1.0 epoch: 9656.388671875
Checkpoint saved at ./model_checkpoint_epoch_1.pth
Perplexity at 1.0 epoch: 4211.74169921875
Perplexity at 1.1 epoch: 3788.209716796875
Perplexity at 1.2 epoch: 3688.85302734375
Perplexity at 1.3 epoch: 3490.93408203125
Perplexity at 1.4 epoch: 3302.569580078125
Perplexity at 1.5 epoch: 3127.624267578125
Perplexity at 1.6 epoch: 2975.689208984375
Perplexity at 1.7 epoch: 2828.321533203125
Perplexity at 1.8 epoch: 2712.60546875
Perplexity at 1.9 epoch: 2608.5791015625
Perplexity at 2.0 epoch: 2502.0908203125
Checkpoint saved at ./model_checkpoint_ep

In [3]:
import torch

In [16]:
checkpoint = torch.load("/kaggle/input/checkpoint-4/model_checkpoint_epoch_4.pth")

  checkpoint = torch.load("/kaggle/input/checkpoint-4/model_checkpoint_epoch_4.pth")


In [17]:
model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

In [19]:
import torch
import torch.nn as nn
import torch.optim as optim

# Define or import your model architecture
# model = YourModel()

# Define training parameters
batch_size = 32
learning_rate = 3e-5
epochs = 1
perplexity_values = []

# Setup DataLoader for batching
train_data = torch.utils.data.TensorDataset(input_ids)
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)
# Load checkpoint
# checkpoint = torch.load('path/to/checkpoint.pth')
# model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
loss = checkpoint['loss']


# Training loop with perplexity tracking and checkpoint saving
for epoch in range(epochs):
    model.train()
    epoch_loss = 0
    num_batches = len(train_loader)

    for batch_num, batch in enumerate(train_loader):
        inputs = batch[0].to(model.device)

        # Forward pass
        outputs = model(inputs, labels=inputs)
        loss = outputs.loss
        epoch_loss += loss.item()

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Record perplexity at every 0.1 epoch
        if batch_num % (num_batches // 10) == 0:
            avg_loss = epoch_loss / (batch_num + 1)
            perplexity = torch.exp(torch.tensor(avg_loss))
            perplexity_values.append(perplexity.item())
            print(f"Perplexity at {epoch + (batch_num / num_batches):.1f} epoch: {perplexity.item()}")

    # Save model checkpoint at the end of each epoch
    checkpoint_path = f"./model_checkpoint_epoch_{epoch + 1}.pth"
    torch.save({
        'epoch': epoch + 1,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss.item()
    }, checkpoint_path)
    print(f"Checkpoint saved at {checkpoint_path}")

# Convert perplexity values into a DataFrame for easier visualization
df_perplexity = pd.DataFrame(perplexity_values, columns=['Perplexity'])
print(df_perplexity)


Perplexity at 0.0 epoch: 490.95758056640625
Perplexity at 0.1 epoch: 599.7506103515625
Perplexity at 0.2 epoch: 627.1124267578125
Perplexity at 0.3 epoch: 652.8843383789062
Perplexity at 0.4 epoch: 677.440673828125
Perplexity at 0.5 epoch: 682.7405395507812
Perplexity at 0.6 epoch: 695.3709716796875
Perplexity at 0.7 epoch: 701.662841796875
Perplexity at 0.8 epoch: 700.6688232421875
Perplexity at 0.9 epoch: 703.2313842773438
Perplexity at 1.0 epoch: 706.0648803710938
Checkpoint saved at ./model_checkpoint_epoch_1.pth
    Perplexity
0   490.957581
1   599.750610
2   627.112427
3   652.884338
4   677.440674
5   682.740540
6   695.370972
7   701.662842
8   700.668823
9   703.231384
10  706.064880


In [23]:
torch.save(model.state_dict(), "Token_titans.pth")

In [25]:
import math

In [26]:
# Define the Russian prompt text
prompt_text = ['Расскажи мне о самых известных русских писателях.',
                'Опиши события Октябрьской революции.',
                'Какие популярные русские народные сказки ты знаешь',
                'Что ты можешь рассказать о культуре и традициях России',
                'Как развивалась русская живопись в веке',
                'Назови самые посещаемые туристические места в Москве',
                'Объясни, как функционирует российская политическая система',
                'Какие научные достижения сделали российские ученые',
                'Как празднуют Новый год в России',
                'Что такое балет и как он связан с Россией']

for x in prompt_text:
    # Tokenize the prompt with attention mask
    inputs = tokenizer(x, return_tensors="pt", padding=True)
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    # Ensure model is in evaluation mode
    model.eval()

    # Define a function to calculate perplexity and print output text
    def calculate_perplexity_and_output(input_ids, attention_mask, model):
        # Get model predictions and loss with attention mask
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
            loss = outputs.loss  # Cross-entropy loss

        # Calculate perplexity
        perplexity = math.exp(loss.item())

        # Generate model output text with pad_token_id set to eos_token_id
        output_ids = output_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=50,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        temperature=0.7
    )


        output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

        return perplexity, output_text

    # Calculate perplexity and generate output
    perplexity, output_text = calculate_perplexity_and_output(input_ids, attention_mask, model)
    print(f"Perplexity: {perplexity}")
    print(f"Output Text: {output_text}")

Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


Perplexity: 11144.589065281632
Output Text: Расскажи мне о самых известных русских писателях.==В  
Perplexity: 13449.416201619513
Output Text: Опиши события Октябрьской революции. В  
Perplexity: 15101.308266880347
Output Text: Какие популярные русские народные сказки ты знаешь в  - и и  , года по 
Perplexity: 11564.636794313808
Output Text: Что ты можешь рассказать о культуре и традициях России ита с по у было — , годуных из и   
Perplexity: 9671.466155425449
Output Text: Как развивалась русская живопись в веке и -- ,
Perplexity: 11593.977731867488
Output Text: Назови самые посещаемые туристические места в Москве в 
Perplexity: 19461.009908695858
Output Text: Объясни, как функционирует российская политическая система) 
Perplexity: 14139.227926541418
Output Text: Какие научные достижения сделали российские ученые и из  - 
Perplexity: 5502.527013917124
Output Text: Как празднуют Новый год в Россиим в  ==С 
Perplexity: 3197.7808511367366
Output Text: Что такое балет и как он связан с Рос