In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 8/5/25 Creating Custom Dataset using GitHub Issues of Pytorch

In [1]:
import requests

url = "https://api.github.com/repos/pytorch/pytorch/issues?page=1&per_page=1"
response = requests.get(url)

In [2]:
response.status_code

200

In [3]:
response.json()

[{'url': 'https://api.github.com/repos/pytorch/pytorch/issues/153144',
  'repository_url': 'https://api.github.com/repos/pytorch/pytorch',
  'labels_url': 'https://api.github.com/repos/pytorch/pytorch/issues/153144/labels{/name}',
  'comments_url': 'https://api.github.com/repos/pytorch/pytorch/issues/153144/comments',
  'events_url': 'https://api.github.com/repos/pytorch/pytorch/issues/153144/events',
  'html_url': 'https://github.com/pytorch/pytorch/pull/153144',
  'id': 3048581827,
  'node_id': 'PR_kwDOA-j9z86VabfF',
  'number': 153144,
  'title': '[ROCm][Windows] Fix building torch 2.8 wheel with ROCm (added hipblasLt and rocblas directories)',
  'user': {'login': 'tvukovic-amd',
   'id': 127323445,
   'node_id': 'U_kgDOB5bNNQ',
   'avatar_url': 'https://avatars.githubusercontent.com/u/127323445?v=4',
   'gravatar_id': '',
   'url': 'https://api.github.com/users/tvukovic-amd',
   'html_url': 'https://github.com/tvukovic-amd',
   'followers_url': 'https://api.github.com/users/tvukovi

In [62]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_1 = user_secrets.get_secret("Github_token")

In [64]:
GITHUB_TOKEN = secret_value_1# Copy your GitHub token here
headers = {"Authorization": f"token {GITHUB_TOKEN}"}

In [51]:
import time
import math
from pathlib import Path
import pandas as pd
from tqdm.notebook import tqdm


def fetch_issues(
    owner="pytorch",
    repo="pytorch",
    num_issues=10_000,
    rate_limit=5_000,
    issues_path=Path("."),
):
    if not issues_path.is_dir():
        issues_path.mkdir(exist_ok=True)

    batch = []
    all_issues = []
    per_page = 100  # Number of issues to return per page
    num_pages = math.ceil(num_issues / per_page)
    base_url = "https://api.github.com/repos"

    for page in tqdm(range(num_pages)):
        # Query with state=all to get both open and closed issues
        query = f"issues?page={page}&per_page={per_page}&state=all"
        issues = requests.get(f"{base_url}/{owner}/{repo}/{query}", headers=headers)
        batch.extend(issues.json())

        if len(batch) > rate_limit and len(all_issues) < num_issues:
            all_issues.extend(batch)
            batch = []  # Flush batch for next time period
            print(f"Reached GitHub rate limit. Sleeping for one hour ...")
            time.sleep(60 * 60 + 1)

    all_issues.extend(batch)
    df = pd.DataFrame.from_records(all_issues)
    df.to_json(f"{issues_path}/{repo}-issues.jsonl", orient="records", lines=True)
    print(
        f"Downloaded all the issues for {repo}! Dataset stored at {issues_path}/{repo}-issues.jsonl"
    )

In [None]:
# Depending on your internet connection, this can take several minutes to run...
fetch_issues()

In [2]:
import pandas as pd

jsonObj = pd.read_json(path_or_buf="/kaggle/working/pytorch-issues.jsonl", lines=True)

# Show all column names
print(jsonObj.columns.tolist())


['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'type', 'active_lock_reason', 'sub_issues_summary', 'body', 'closed_by', 'reactions', 'timeline_url', 'performed_via_github_app', 'state_reason', 'draft', 'pull_request']


 > using the above features to create a dataset for the pytorch-issues 

In [3]:
print(jsonObj.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 33 columns):
 #   Column                    Non-Null Count  Dtype              
---  ------                    --------------  -----              
 0   url                       10000 non-null  object             
 1   repository_url            10000 non-null  object             
 2   labels_url                10000 non-null  object             
 3   comments_url              10000 non-null  object             
 4   events_url                10000 non-null  object             
 5   html_url                  10000 non-null  object             
 6   id                        10000 non-null  int64              
 7   node_id                   10000 non-null  object             
 8   number                    10000 non-null  int64              
 9   title                     10000 non-null  object             
 10  user                      10000 non-null  object             
 11  labels          

In [16]:
jsonObj['body'][2000]

'cc @jeffdaily @sunway513 @pruthvistony @ROCmSupport @dllehr-amd @jataylo @hongxiayang @naromero77amd'

In [31]:
jsonObj['state']

0         open
1         open
2         open
3         open
4         open
         ...  
9995    closed
9996    closed
9997    closed
9998    closed
9999    closed
Name: state, Length: 10000, dtype: object

In [33]:
jsonObj['author_association'].unique()

array(['COLLABORATOR', 'NONE', 'CONTRIBUTOR', 'MEMBER'], dtype=object)

In [34]:
jsonObj['body']

0       ### 🐛 Describe the bug\n\nUsing NNPACK for con...
1       ### 📚 The doc issue\n\nI have noticed there is...
2       ### 🐛 Describe the bug\n\n## Description\n\nWh...
3       ### 🐛 Describe the bug\n\nI use nn.MultiheadAt...
4       This PR replaces most `std::chrono::system_clo...
                              ...                        
9995    cc @voznesenskym @penguinwu @EikanWang @jgong5...
9996    Stack from [ghstack](https://github.com/ezyang...
9997    By just extending the matrix and invoking scri...
9998    Stack from [ghstack](https://github.com/ezyang...
9999    Stack from [ghstack](https://github.com/ezyang...
Name: body, Length: 10000, dtype: object

In [30]:
jsonObj['user'][0]

{'login': 'Flamefire',
 'id': 309017,
 'node_id': 'MDQ6VXNlcjMwOTAxNw==',
 'avatar_url': 'https://avatars.githubusercontent.com/u/309017?v=4',
 'gravatar_id': '',
 'url': 'https://api.github.com/users/Flamefire',
 'html_url': 'https://github.com/Flamefire',
 'followers_url': 'https://api.github.com/users/Flamefire/followers',
 'following_url': 'https://api.github.com/users/Flamefire/following{/other_user}',
 'gists_url': 'https://api.github.com/users/Flamefire/gists{/gist_id}',
 'starred_url': 'https://api.github.com/users/Flamefire/starred{/owner}{/repo}',
 'subscriptions_url': 'https://api.github.com/users/Flamefire/subscriptions',
 'organizations_url': 'https://api.github.com/users/Flamefire/orgs',
 'repos_url': 'https://api.github.com/users/Flamefire/repos',
 'events_url': 'https://api.github.com/users/Flamefire/events{/privacy}',
 'received_events_url': 'https://api.github.com/users/Flamefire/received_events',
 'type': 'User',
 'user_view_type': 'public',
 'site_admin': False}

In [29]:
if jsonObj['user'][1]['login'] == "shadow150519":
    print(jsonObj['user'][1])
else:
    print("User Not Found")

{'login': 'shadow150519', 'id': 55205022, 'node_id': 'MDQ6VXNlcjU1MjA1MDIy', 'avatar_url': 'https://avatars.githubusercontent.com/u/55205022?v=4', 'gravatar_id': '', 'url': 'https://api.github.com/users/shadow150519', 'html_url': 'https://github.com/shadow150519', 'followers_url': 'https://api.github.com/users/shadow150519/followers', 'following_url': 'https://api.github.com/users/shadow150519/following{/other_user}', 'gists_url': 'https://api.github.com/users/shadow150519/gists{/gist_id}', 'starred_url': 'https://api.github.com/users/shadow150519/starred{/owner}{/repo}', 'subscriptions_url': 'https://api.github.com/users/shadow150519/subscriptions', 'organizations_url': 'https://api.github.com/users/shadow150519/orgs', 'repos_url': 'https://api.github.com/users/shadow150519/repos', 'events_url': 'https://api.github.com/users/shadow150519/events{/privacy}', 'received_events_url': 'https://api.github.com/users/shadow150519/received_events', 'type': 'User', 'user_view_type': 'public', 's

In [35]:
jsonObj['number']

0       153139
1       153138
2       153137
3       153136
4       153135
         ...  
9995    143214
9996    143213
9997    143212
9998    143211
9999    143210
Name: number, Length: 10000, dtype: int64

In [43]:
jsonObj["type"]

0       None
1       None
2       None
3       None
4       None
        ... 
9995    None
9996    None
9997    None
9998    None
9999    None
Name: type, Length: 10000, dtype: object

In [44]:
import json

input_path = "/kaggle/working/pytorch-issues.jsonl"
output_path = "/kaggle/working/pytorch-issues-selected.jsonl"

# List the fields you want to keep
fields_to_keep = ["id", "title", "user", "state", "labels", 'comments', "author_association", "body", ]

with open(input_path, "r") as f_in, open(output_path, "w") as f_out:
    for line in f_in:
        data = json.loads(line)
        filtered_data = {k: data.get(k) for k in fields_to_keep}
        # Optional: flatten nested 'user'
        if isinstance(filtered_data.get("user"), dict):
            filtered_data["user"] = filtered_data["user"].get("login", str(filtered_data["user"]))
        if isinstance(filtered_data.get("labels"), list):
            filtered_data["labels"] = [label["name"] if isinstance(label, dict) else label for label in filtered_data["labels"]]
        f_out.write(json.dumps(filtered_data) + "\n")


In [45]:
from datasets import load_dataset
issue_dataset = load_dataset("json", data_files= "/kaggle/working/pytorch-issues-selected.jsonl")
issue_dataset

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'user', 'state', 'labels', 'comments', 'author_association', 'body'],
        num_rows: 10000
    })
})

In [56]:

issues_dataset  = load_dataset(
    "json",
    data_files="/kaggle/working/pytorch-issues-selected.jsonl",
    split="train"
)


Generating train split: 0 examples [00:00, ? examples/s]

In [50]:
issues_dataset['user'][:5]

['Flamefire', 'shadow150519', 'SilentTester73', 'Neronjust2017', 'cyyever']

>  basically here url is input and pr is output for our instruction tuning model

In [55]:
sample = issue_dataset["train"].shuffle(seed=666).select(range(3))

# Print out the title and body as input/output
for title, body in zip(sample["title"], sample["body"]):
    print(f">> Title (input): {title}")
    print(f">> Body (output): {body}\n")


>> Title (input): Fix corner case in `torch.arange()` where int64_t truncation leads to size 0
>> Body (output): Fixes #149097

### Changes

This PR introduces a workaround for corner case where casting start/end/step to int64_t may introduce precision loss. If all values are within the range that double can represent exactly (i.e., [-2^53, 2^53]), we prefer using double arithmetic for consistency across devices. Otherwise, fallback to int64_t computation.

### Tests

All results are same as np

```
python test/test_torch.py -k test_arange
```

cc: @albanD 

>> Title (input): Support SymmetricMemory's signaling kernels on sm60 and sm70
>> Body (output): Stack from [ghstack](https://github.com/ezyang/ghstack) (oldest at bottom):
* __->__ #146308

By leveraging libcudacxx's utilities: https://nvidia.github.io/cccl/libcudacxx/extended_api/synchronization_primitives/atomic_ref.html

cc @H-Huang @awgu @kwen2501 @wanchaol @fegin @fduwjj @wz337 @wconstab @d4l3k @c-p-i-o

>> Title (input): [ca

In [56]:
issues_dataset = issues_dataset.map(
    lambda x: {"is_title": False if x["title"] is None else True}
)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [65]:
headers = {
    "Authorization": f"token {GITHUB_TOKEN}",
    "Accept": "application/vnd.github+json"
}


*well that looks helping for our finetuning environment*

In [76]:
print(issues_dataset)

Dataset({
    features: ['id', 'title', 'user', 'state', 'labels', 'comments', 'author_association', 'body', 'is_title'],
    num_rows: 10000
})


In [78]:
print(issues_dataset.column_names)


['id', 'title', 'user', 'state', 'labels', 'comments', 'author_association', 'body', 'is_title']


In [79]:
print((issues_dataset["title"][81]))


`cuda.Event` handling in dynamo is broken


In [81]:
print((issues_dataset["body"][81]))


Here's an example:
```
import torch

lst = []

@torch.compile(backend="eager", fullgraph=True)
def f(x):
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)

    start_event.record()
    out = torch.matmul(x, x)
    end_event.record()

    lst.append(start_event)
    lst.append(end_event)
    return out

x = torch.randn(5000, device='cuda')
out = f(x)
print(lst[0].elapsed_time(lst[1]))
```

without compile this prints the elapsed time between the two events.
```
55.96131134033203
```

with compile this gives an error:
```
Traceback (most recent call last):
  File "/data/users/hirsheybar/a/pytorch/tmp6.py", line 20, in <module>
    print(lst[0].elapsed_time(lst[1]))
  File "/data/users/hirsheybar/a/pytorch/torch/cuda/streams.py", line 216, in elapsed_time
    return super().elapsed_time(end_event)
ValueError: Both events must be recorded before calculating elapsed time.
```

Why? here's the generated dynamo graph + residual bytecod

In [82]:
issues_dataset.push_to_hub("github-pytorch-issues")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mayankpuvvala/github-pytorch-issues/commit/a0a58a44501a7d2715771f14fed814e09b135ebc', commit_message='Upload dataset', commit_description='', oid='a0a58a44501a7d2715771f14fed814e09b135ebc', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/mayankpuvvala/github-pytorch-issues', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mayankpuvvala/github-pytorch-issues'), pr_revision=None, pr_num=None)

> we can see our custom dataset loaded to hugging face, change your dataset card readme for more better understanding of the dataset

In [7]:
from datasets import load_dataset
dataset = load_dataset("mayankpuvvala/github-pytorch-issues", split="train")
dataset

README.md:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/7.32M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'title', 'user', 'state', 'labels', 'comments', 'author_association', 'body', 'is_title'],
    num_rows: 10000
})

#  9/5/25 Fine tuning using PEFT+ LoRA on the custom dataset

In [8]:
from datasets import Dataset
import pandas as pd
from transformers import AutoTokenizer

# 1. Convert original dataset to pandas and remove all unwanted columns
df = dataset.to_pandas()
df = df[["title", "body"]].copy()  # Keep only needed columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   10000 non-null  object
 1   body    9954 non-null   object
dtypes: object(2)
memory usage: 156.4+ KB


In [9]:
# Rebuild dataset with sanitized dataframe
dataset = Dataset.from_pandas(df, preserve_index=False)
print(dataset)


Dataset({
    features: ['title', 'body'],
    num_rows: 10000
})


In [10]:

dataset = dataset.train_test_split(test_size=0.1, seed=42)
dataset

DatasetDict({
    train: Dataset({
        features: ['title', 'body'],
        num_rows: 9000
    })
    test: Dataset({
        features: ['title', 'body'],
        num_rows: 1000
    })
})

In [17]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-small")
tokenizer.push_to_hub("mayankpuvvala/peft_lora_t5_merged_model_pytorch_issues")

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/mayankpuvvala/peft_lora_t5_merged_model_pytorch_issues/commit/5c326874f96fa2f0f3493fe8be4627dea32e5b63', commit_message='Upload tokenizer', commit_description='', oid='5c326874f96fa2f0f3493fe8be4627dea32e5b63', pr_url=None, repo_url=RepoUrl('https://huggingface.co/mayankpuvvala/peft_lora_t5_merged_model_pytorch_issues', endpoint='https://huggingface.co', repo_type='model', repo_id='mayankpuvvala/peft_lora_t5_merged_model_pytorch_issues'), pr_revision=None, pr_num=None)

In [43]:
def preprocess(example):
    # Ensure values are strings
    title = example["title"] if example["title"] is not None else ""
    body = example["body"] if example["body"] is not None else ""

    # Tokenize the input (title)
    model_input = tokenizer(
        text=title,
        max_length=128,
        padding="max_length",
        truncation=True
    )

    # Tokenize the target (body)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            text=body,
            max_length=128,
            padding="max_length",
            truncation=True
        )

    model_input["labels"] = labels["input_ids"]
    return model_input


In [7]:
tokenized_dataset = dataset.map(
    preprocess,
    batched=False,
    remove_columns=["title", "body"]  # we no longer need raw text after tokenization
)


Map:   0%|          | 0/9000 [00:00<?, ? examples/s]



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [1]:
from peft import get_peft_model, LoraConfig, TaskType
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

peft_model = get_peft_model(model, lora_config)


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [33]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./peft+lora_FineTuning_Custom_Dataset",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir="./logs",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=10,
    disable_tqdm=False,  # 👈 ensures tqdm is enabled
    report_to="none",     # optionally disable WandB etc.
    # fp16= True,
)


In [34]:
# from transformers import DataCollatorForSeq2Seq, default_data_collator
# def custom_collator(batch):
#     for k in batch[0].keys():
#         print(f"BATCH KEY: {k}")
#     return default_data_collator(batch)

# collator = custom_collator
trainer = Seq2SeqTrainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    processing_class=tokenizer,
    # data_collator=collator,
)

In [None]:
trainer.train()

In [36]:
trainer.save_model("full-checkpoint-trainer")  # Saves model + trainer state
trainer.save_state()                      # Optional but saves training arguments, RNG etc.


In [37]:
peft_model.save_pretrained("./lora-t5-pytorch")


> pushing the lora model to HuggingFace Hub

In [38]:
# model_name = "lora-t5-pytorch-issues"  # change this to your preferred model name
# hf_username = "mayankpuvvala"     # replace with your HF username

# # Save and push the model
# peft_model.push_to_hub(f"{hf_username}/{model_name}")
# tokenizer.push_to_hub(f"{hf_username}/{model_name}")



# 10/5/25 Inference on the custom model


In [None]:
from peft import PeftModel
peft_model = PeftModel.from_pretrained(model, "./lora-t5-pytorch")  # Load your trained LoRA adapter
merged_model =  peft_model.merge_and_unload()

merged_model.save_pretrained("merged-model")

In [None]:
# Print a few named layers to inspect structure
for name, param in list(merged_model.named_parameters())[:5]:
    print(f"{name}: {param.shape}")


In [None]:

merged_model.push_to_hub("mayankpuvvala/peft_lora_t5_merged_model_pytorch_issues")

In [38]:
import torch
input_text = "bump XNNPACK dependency to fix GCC 14 build on aarch64-linux"
inputs = tokenizer(input_text, return_tensors="pt").to(merged_model.device)

with torch.no_grad():
    outputs = model.generate(
    **inputs,
    max_new_tokens=200,
    eos_token_id=tokenizer.eos_token_id,
    do_sample= True,
    temperature=0.95
)


print(tokenizer.decode(outputs[0], skip_special_tokens=True))


GitHub: @kungshinio's iPhone, reclassified to XNMACK....speak.com @kongwaopun.com @doync@echotbopun.nu @githubzang @shrkong


In [39]:
test_dataset = dataset["test"]
test_dataset

Dataset({
    features: ['title', 'body'],
    num_rows: 1000
})

In [44]:
# !pip install rouge_score evaluate

In [46]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments
import evaluate
import torch

tokenized_test = test_dataset.map(preprocess, batched=False, remove_columns=["title", "body"]  )
tokenized_test

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1000
})

> testing inference on a single input sentence using the merged inference model of lora+t5 

In [72]:
import torch
input_text = "bump XNNPACK dependency to fix GCC 14 build on aarch64-linux"
inputs = tokenizer(input_text, return_tensors="pt").to(merged_model.device)

with torch.no_grad():
    outputs = merged_model.generate(
    **inputs,
    max_new_tokens=200,
    eos_token_id=tokenizer.eos_token_id,
    do_sample= True,
    temperature=0.7
)

merged_model_output= tokenizer.decode(outputs[0], skip_special_tokens=True)
# print(tokenizer.decode(outputs[0], skip_special_tokens=True))


In [73]:
import torch
input_text = "bump XNNPACK dependency to fix GCC 14 build on aarch64-linux"
inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

with torch.no_grad():
    outputs = model.generate(
    **inputs,
    max_new_tokens=200,
    eos_token_id=tokenizer.eos_token_id,
    do_sample= True,
    # temperature=0.95
)
base_model_output= tokenizer.decode(outputs[0], skip_special_tokens=True)

# print(tokenizer.decode(outputs[0], skip_special_tokens=True))


> Comparing base model and custom built model

In [74]:
print("Base Model Output for the issue: " + base_model_output)
print("\n")
print("My Model Output for the issue: " + merged_model_output)

Base Model Output for the issue: "?, "Chang (Chongqing's), "Chang ("Ham-Salz" = stag-" = vgb) = '-:same ton ", if in a linux language-shifter it's only one error," xx.


My Model Output for the issue: Uninstalled XnNPACK dependency on XNNPACK - aarch64-linux-cma. This will help support the build on arg #0  XNNPACK - #0   XnNPACK    #0         @sui.j  nnnnNPACK is    #10 @Bossap: #cttynnnn.com/ji.


In [75]:
import evaluate
rouge = evaluate.load("rouge")


In [53]:
references = test_dataset["body"]  # These are your gold labels
len(references)

1000

In [63]:
from tqdm import tqdm

batch_size = 16
predicted_outcome = []

for i in range(0, len(test_dataset), batch_size):
    titles = test_dataset["title"][i:i+batch_size]
    inputs = tokenizer(titles, return_tensors="pt", padding=True, truncation=True).to(merged_model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=200,
            eos_token_id=tokenizer.eos_token_id,
            do_sample=True,
            temperature=0.95
        )

    # Decode all outputs in batch
    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    predicted_outcome.extend(decoded)


In [64]:
# Make sure both predictions and references contain only strings, no None
predictions = [p if p is not None else "" for p in predicted_outcome]
references = [r if r is not None else "" for r in test_dataset["body"]]

# Now compute ROUGE
results = rouge.compute(predictions=predictions, references=references)
print("ROUGE results:", results)


ROUGE results: {'rouge1': 0.0638729614520744, 'rouge2': 0.012113988201299337, 'rougeL': 0.047053712668345155, 'rougeLsum': 0.055277033992282835}


> Bert Score

In [77]:
# ! pip install bert-score


In [79]:
from bert_score import score

# Compute BERTScore
P, R, F1 = score(predictions, references, lang="en", model_type="bert-base-uncased")

# Print averaged scores
print(f"BERTScore Precision: {P.mean().item():.4f}")
print(f"BERTScore Recall:    {R.mean().item():.4f}")
print(f"BERTScore F1:        {F1.mean().item():.4f}")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]



BERTScore Precision: 0.5320
BERTScore Recall:    0.4619
BERTScore F1:        0.4902


In [28]:
# # ! pip install vllm
# ! python3 -m vllm.entrypoints.openai.api_server --model "mayankpuvvala/peft_lora_t5_merged_model_pytorch_issues"


In [12]:
! dir

flan-t5-custom		 peft+lora_FineTuning_Custom_Dataset
formatted.jsonl		 pytorch_fine_tune_training_args
full-checkpoint-trainer  pytorch-issues-flat.jsonl
logs			 pytorch-issues.jsonl
lora-t5-pytorch		 pytorch-issues-selected.jsonl
merged-model		 runs
peft-flan-t5		 state.db
peft-flan-t5-lora


In [18]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text2text-generation", model="mayankpuvvala/peft_lora_t5_merged_model_pytorch_issues")

tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Device set to use cuda:0
