In [None]:
import csv
import re
import torch
import os
from pydriller import Repository
import sys

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM

# Seq2seq checkpoint used to turn (commit message + diff hunk) into a commit-style description.
COMMIT_MODEL_ID = "mamiksik/CommitPredictorT5"

# Tokenizer and weights are loaded from the same checkpoint so their vocabularies agree.
tokenizer = AutoTokenizer.from_pretrained(COMMIT_MODEL_ID)
model = AutoModelForSeq2SeqLM.from_pretrained(COMMIT_MODEL_ID)

In [2]:
# Path of the local git checkout that is mined by the cells below.
repo_path = 'nostalgiaforinfinity'

# Keywords that mark a commit message as bug-related. The cells below test them
# with a plain substring check on the lower-cased message, so short stems such
# as 'freez' also cover 'freeze'/'freezing' (and 'stop' also hits 'stoploss').
# Each term is listed exactly once.
bug_terms = ['bug', 'fix', 'patch', 'issue', 'resolve', 'crash', 'solve', 'regression', 'fall back', 'assertion', 'coverity', 'reproducible',
             'stack-wanted', 'steps-wanted', 'testcase', 'steps to reproduce', 'fail', 'npe', 'except', 'broken', 'differential testing',
             'overflow', 'problem', 'avoid', 'workaround', 'stop', 'break', 'freez', 'hang', 'error', 'leak']

In [6]:
# Walk the whole history of `repo_path` and write one CSV row for every commit
# whose (normalised) message contains at least one entry of `bug_terms`.
with open("bug_fixing_commits.csv", 'w', newline='', encoding="utf-8") as f:
    writer = csv.writer(f)
    header_row = ['Hash', 'Message', 'Parent Hashes', 'Is Merge Commit', 'Modified Files']
    writer.writerow(header_row)

    for commit in Repository(repo_path).traverse_commits():
        # Lower-case and trim first, then flatten line breaks to spaces.
        msg = commit.msg.lower().strip()
        msg = msg.replace("\n", " ").replace("\r", " ")

        # Substring search over the keyword list; stop at the first hit.
        mentions_bug = False
        for word in bug_terms:
            if word in msg:
                mentions_bug = True
                break

        if mentions_bug:
            parents = commit.parents  # list of parent hashes
            merge = 'N' if len(parents) <= 1 else 'Y'

            # Prefer the new path; fall back to the old one when the new path is empty.
            files = []
            for m in commit.modified_files:
                files.append(m.new_path if m.new_path else m.old_path)

            hash_ = commit.hash
            writer.writerow([hash_, msg, parents, merge, files])

In [None]:
# Raise the csv module's per-field size limit well above its default.
# The new limit is sys.maxsize scaled down by a factor of 10**11.
platform_max = sys.maxsize
field_limit = platform_max // 10**11
# Left as the last expression so the notebook displays the call's return value.
csv.field_size_limit(field_limit)

9223372

In [22]:
output_file = "bug_commits_rectified.csv"

# Maximum number of (commit, file) rows written during one run of this cell.
MAX_FILE_CHANGES = 400

# Matches one unified-diff hunk header line, e.g. "@@ -10,7 +10,8 @@ def foo():".
# MULTILINE lets "^" match the very first header (which has no preceding newline);
# the trailing ".*" consumes the optional context text after the second "@@".
HUNK_HEADER_RE = re.compile(r'^@@ .*? @@.*\n?', re.MULTILINE)


def _escape_newlines(text):
    """Return `text` (or '' when it is None/empty) with CR and LF replaced by
    their two-character backslash escapes, so the value stays on one line."""
    return (text or "").replace('\r', '\\r').replace('\n', '\\n')


def _split_hunks(raw_diff):
    """Split a raw (unescaped) unified diff into the non-blank bodies of its hunks."""
    return [hunk for hunk in HUNK_HEADER_RE.split(raw_diff or "") if hunk.strip()]


def _describe_hunk(commit_msg, hunk):
    """Run the notebook-level `tokenizer`/`model` on one hunk and return the
    generated text with line breaks escaped."""
    input_text = commit_msg + " diff " + hunk
    input_ids = tokenizer.encode(input_text, return_tensors="pt", truncation=True)
    output_ids = model.generate(
        input_ids, max_length=200, num_beams=5, early_stopping=True
    )
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # memory cleanup
    del input_ids, output_ids
    torch.cuda.empty_cache()
    return _escape_newlines(output_text)


# Collect the (commit hash, file path) pairs already present in the output file
# so that a re-run resumes instead of duplicating rows.
processed_rows = set()
if os.path.exists(output_file):
    with open(output_file, 'r', newline='', encoding='utf-8') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip header; the default avoids StopIteration on an empty file
        for row in reader:
            if len(row) >= 3:
                processed_rows.add((row[0], row[2]))  # (commit hash, file_path)

with open(output_file, 'a', newline='', encoding="utf-8") as f:
    writer = csv.writer(f)

    # Write header only if file was empty
    if os.path.getsize(output_file) == 0:
        writer.writerow([
            'Hash', 'Message', 'Filename',
            'Source Code (before)', 'Source Code (current)',
            'Diff', 'LLM Inference (fix type)'
        ])

    counter = 0
    count_comms = 0
    count_comm_files = 0

    try:
        for commit in Repository(
            repo_path,
            only_in_branch='main',
            only_modifications_with_file_types=['.py']
        ).traverse_commits():

            msg_low = commit.msg.lower().strip().replace("\n", " ").replace("\r", " ")
            if not any(term in msg_low for term in bug_terms):
                continue

            print(f"Commit: {commit.hash} | {commit.msg[:60]}")
            count_comms += 1

            for mf in commit.modified_files:
                if count_comm_files >= MAX_FILE_CHANGES:
                    break

                file_path = mf.new_path if mf.new_path else mf.old_path
                if not file_path:
                    continue

                row_key = (commit.hash, file_path)
                if row_key in processed_rows:
                    continue

                try:
                    before = _escape_newlines(mf.source_code_before)
                    after = _escape_newlines(mf.source_code)

                    # Keep the raw diff for hunk splitting: the hunk-header regex
                    # needs real line breaks, which the escaped copy no longer has.
                    raw_diff = mf.diff or ""
                    diff_text = _escape_newlines(raw_diff)

                    # run LLM per diff hunk
                    new_comms = [
                        _describe_hunk(commit.msg, hunk)
                        for hunk in _split_hunks(raw_diff)
                    ]

                    writer.writerow([
                        commit.hash,
                        # Flatten both LF and CR, matching the normalisation used for matching.
                        commit.msg.strip().replace("\n", " ").replace("\r", " "),
                        file_path, before, after, diff_text,
                        "; ".join(new_comms)
                    ])
                    counter += 1
                    count_comm_files += 1

                    if counter % 10 == 0:  # periodic progress
                        # Force the rows to disk so an interrupted run loses at most 9 rows.
                        f.flush()
                        os.fsync(f.fileno())
                        print(f"--- {counter} rows processed and saved ---")

                except Exception as e:
                    # Best effort: report the failing file and move on to the next one.
                    print(f"Error on commit {commit.hash}, file {file_path}: {e}")
                    continue

            if count_comm_files >= MAX_FILE_CHANGES:
                print(f"Reached {MAX_FILE_CHANGES} file changes, stopping.")
                break

    finally:
        print("Finished or interrupted. Progress saved.")
        print(count_comms, "commits processed.")
        print(f"A total of {count_comm_files} file change instances processed.")
        print(f"Results written to {output_file}")


Commit: 48d71b95f22ab9141e28c711fc644cde2a69a809 | Sell with loss (stoploss)
Commit: 1a6ca63d2fb1da07ff4b982c2baa44f9fb60ee3f | Sell with loss (stoploss) for pumped pairs
Commit: 22bbed468cd727d6c2decae0bf7b0851f4208440 | signal_stoploss_u_b_1 lower threshold
Commit: a0d92719c3ca0617cc1b0a51ff0b3d3a36954b1b | Lower target CMF for signal_stoploss_u_1
Commit: c961c8be621d831313fbdd77a662ccf04bc1db5c | Fix path to hold config file. Don't load hold config file on
Commit: 0ff33dfc7a525cfd5de20e49864b06aa43a9cfc6 | Cosmetic change to log message
Commit: 19402692c7517146bf655743c38cb5b1c9f4de3e | populate_buy_trend conditions buy logic refactor

Items and 
Commit: 5a3434acf77fcdc3b1707041958874ae7368c94b | fix: add missing buy_dump_protection_60_5
Commit: f1d5e67e6cf9a137b4982fb07fc16574cb32eff7 | Fix typo
Commit: 03b30966a9332a852efc6df6c2ce78c3c922932b | Increase the max for signal_stoploss_u_b_1
Commit: 9d8027e6b7dd16d7da8c563f4c0b7d3c58d11ca9 | signal_stoploss_u_b_1: add Elder Ray Index c

In [23]:
# Causal-LM checkpoint used by the commit-message rectification step.
RECTIFIER_MODEL_ID = "microsoft/phi-3-mini-4k-instruct"

tokenizer2 = AutoTokenizer.from_pretrained(RECTIFIER_MODEL_ID)
# Half-precision weights; device placement is delegated to device_map="auto".
model2 = AutoModelForCausalLM.from_pretrained(
    RECTIFIER_MODEL_ID,
    torch_dtype=torch.float16,
    device_map="auto",
)

tokenizer_config.json: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

KeyboardInterrupt: 

In [25]:
input_file = "bug_commits_rectified.csv"
rectification_input = "input_for_rectification.csv"

# Columns copied from the full results file into the slimmed-down CSV.
KEPT_COLUMNS = ['Hash', 'Filename', 'Message', 'LLM Inference (fix type)']

# newline='' on the input as well: the csv module must see line endings
# untranslated, otherwise a stray '\r' inside a quoted field is altered on read.
with open(input_file, 'r', newline='', encoding="utf-8") as f_in, \
     open(rectification_input, 'w', newline='', encoding="utf-8") as f_out:

    reader = csv.DictReader(f_in)
    writer = csv.writer(f_out)

    # Keep only necessary columns
    writer.writerow(KEPT_COLUMNS)

    for row in reader:
        writer.writerow([row[column] for column in KEPT_COLUMNS])


In [None]:
input_file = "bug_commits_rectified.csv"
output_file = "bug_commits_llm_rectified.csv"

# Copy every row of `input_file` to `output_file`, adding a "Rectified Message"
# column produced by the notebook-level causal LM (`tokenizer2` / `model2`).
with open(input_file, 'r', newline='', encoding="utf-8") as f_in, \
     open(output_file, 'w', newline='', encoding="utf-8") as f_out:

    reader = csv.DictReader(f_in)
    # `fieldnames` is None for an empty input file; fall back to an empty header.
    fieldnames = list(reader.fieldnames or []) + ["Rectified Message"]
    writer = csv.DictWriter(f_out, fieldnames=fieldnames)
    writer.writeheader()

    for row in reader:
        try:
            # Construct the prompt for the second LLM
            prompt = (
                "Rectify this commit message based on the code changes.\n"
                f"Message: {row['Message']}\n"
                f"Filename: {row['Filename']}\n"
                f"LLM Inference: {row['LLM Inference (fix type)']}\n"
                "Corrected commit message:"
            )

            # Tokenise with an attention mask and move the tensors to the device
            # that holds the model (it is loaded with device_map="auto").
            inputs = tokenizer2(prompt, return_tensors="pt").to(model2.device)
            prompt_len = inputs["input_ids"].shape[1]

            # max_new_tokens budgets only the continuation; max_length would also
            # count the prompt tokens and starve generation for long prompts.
            output_ids = model2.generate(
                **inputs, max_new_tokens=200, num_beams=5, early_stopping=True
            )

            # A causal LM returns prompt + continuation; keep the continuation only.
            rectified_msg = tokenizer2.decode(
                output_ids[0][prompt_len:], skip_special_tokens=True
            ).strip()

            row["Rectified Message"] = rectified_msg

            # Clean up to save memory
            del inputs, output_ids
            torch.cuda.empty_cache()

        except Exception as e:
            # .get() keeps the handler itself from raising when a column is missing.
            print(f"Error processing row {row.get('Hash')} - {row.get('Filename')}: {e}")
            row["Rectified Message"] = ""

        writer.writerow(row)

print(f"Done! New CSV written to {output_file}")


In [None]:
# Model for rectification — kept commented out because the same loading code lives in an earlier cell.

# tokenizer2 = AutoTokenizer.from_pretrained("microsoft/phi-3-mini-4k-instruct")
# model2 = AutoModelForCausalLM.from_pretrained("microsoft/phi-3-mini-4k-instruct", torch_dtype=torch.float16, device_map="auto")