In [1]:
import numpy as np
import pandas as pd


In [2]:
import pandas as pd
#final_gold_label=pd.read_csv('Revised_final_gold_label.csv')
final_gold_label = pd.read_json('gt_final.jsonl', lines=True)



```
# This is formatted as code
```

### PcBERT-KD  Benchmarking

In [3]:
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import time
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Load tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('ManojAlexender/second_Base_version_of_codebert_with_commit_and_diff')
model = RobertaForSequenceClassification.from_pretrained('ManojAlexender/second_Base_version_of_codebert_with_commit_and_diff')

data = list(final_gold_label['commit_message'])
y_true = final_gold_label['target']

# Function to process data instance by instance and calculate metrics
def predict_instance_by_instance(model, tokenizer, data):
    model.to('cuda')
    model.eval()
    all_predictions = []
    token_counts = []
    inference_times = []
    total_start_time = time.time()  # Start total timing

    for instance in data:
        start_time = time.time()  # Start timing for this instance
        inputs = tokenizer(instance, return_tensors='pt', padding=True, truncation=True, max_length=512)
        token_count = len(inputs['input_ids'][0])
        token_counts.append(token_count)
        inputs = {k: v.to('cuda') for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        logits = outputs.logits
        prediction = torch.argmax(logits, dim=-1).item()
        all_predictions.append(prediction)
        end_time = time.time()  # End timing for this instance
        inference_times.append(end_time - start_time)  # Time taken for this instance

    total_time = time.time() - total_start_time  # Total processing time
    total_tokens = sum(token_counts)
    total_instances = len(data)
    average_inference_time = total_time / total_instances
    average_tokens = total_tokens / total_instances
    tokens_per_second = total_tokens / total_time

    return all_predictions, total_time, average_inference_time, tokens_per_second

results = []

# Run predictions multiple times and collect metrics
for fold in range(5):
    predictions, total_time, avg_time, tokens_per_sec = predict_instance_by_instance(model, tokenizer, data)

    predictions = np.array(predictions)  # Ensure it's a numpy array
    predictions[predictions == 2] = 0
    accuracy = accuracy_score(y_true, predictions)
    report = classification_report(y_true, predictions, output_dict=True)

    precision_yes = report['1']['precision']
    recall_yes = report['1']['recall']
    f1_score_yes = report['1']['f1-score']

    precision_no = report['0']['precision']
    recall_no = report['0']['recall']
    f1_score_no = report['0']['f1-score']

    overall_precision = report['weighted avg']['precision']
    overall_recall = report['weighted avg']['recall']
    overall_f1 = report['weighted avg']['f1-score']

    cm = confusion_matrix(y_true, predictions)
    tn, fp, fn, tp = cm.ravel()
    fpr = fp / (fp + tn)

    fprs = []
    for i in range(cm.shape[0]):
        tn = np.sum(cm) - np.sum(cm[i, :]) - np.sum(cm[:, i]) + cm[i, i]
        fp = np.sum(cm[:, i]) - cm[i, i]
        fpr_class = fp / (fp + tn) if (fp + tn) != 0 else 0
        fprs.append(fpr_class)

    results.append({
        'Total Time': total_time,
        'Average Time': avg_time,
        'Tokens per Second': tokens_per_sec,
        'Accuracy': accuracy,
        'Overall Precision': overall_precision,
        'Overall Recall': overall_recall,
        'Overall F1 Score': overall_f1,
        'Precision_Performance': precision_yes,
        'Recall_Performance': recall_yes,
        'F1 Score_Performance': f1_score_yes,
        'Precision_Non_Performance': precision_no,
        'Recall_Non_Performance': recall_no,
        'F1 Score_Non_Performance': f1_score_no,
        'FPR_Performance': fprs[1],
        'FPR_Non_Performance': fprs[0]
    })

    # Save predictions for this fold
    predictions_df = pd.DataFrame({ 'commit_message' : data,
                                      'True Labels': y_true,
        'Predicted Labels': predictions
    })
    predictions_df.to_csv(f'Data/predictions_fold_{fold + 1}.csv', index=False)

# Convert results to DataFrame and save to CSV
results_df = pd.DataFrame(results)
results_df.to_csv('Data/model_performance_metrics_KD.csv', index=False)
print("Results saved to 'model_performance_metrics_KD.csv'.")
print(results_df.mean(axis=0))



KeyboardInterrupt



### PcBERT-HS Benchmarking

In [3]:
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import time
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Load tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('ManojAlexender/Research_paper_MLM_all_CGO_Level_2_Final_Model_V1')
model = RobertaForSequenceClassification.from_pretrained('ManojAlexender/Research_paper_MLM_all_CGO_Level_2_Final_Model_V1')

data = list(final_gold_label['commit_message'])
y_true = final_gold_label['target']

# Function to process data instance by instance and calculate metrics
def predict_instance_by_instance(model, tokenizer, data):
    model.to('cuda')
    model.eval()
    all_predictions = []
    token_counts = []
    inference_times = []
    total_start_time = time.time()  # Start total timing

    for instance in data:
        start_time = time.time()  # Start timing for this instance
        inputs = tokenizer(instance, return_tensors='pt', padding=True, truncation=True, max_length=512)
        token_count = len(inputs['input_ids'][0])
        token_counts.append(token_count)
        inputs = {k: v.to('cuda') for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        logits = outputs.logits
        prediction = torch.argmax(logits, dim=-1).item()
        all_predictions.append(prediction)
        end_time = time.time()  # End timing for this instance
        inference_times.append(end_time - start_time)  # Time taken for this instance

    total_time = time.time() - total_start_time  # Total processing time
    total_tokens = sum(token_counts)
    total_instances = len(data)
    average_inference_time = total_time / total_instances
    average_tokens = total_tokens / total_instances
    tokens_per_second = total_tokens / total_time

    return all_predictions, total_time, average_inference_time, tokens_per_second

results = []

# Run predictions multiple times and collect metrics
for fold in range(5):
    predictions, total_time, avg_time, tokens_per_sec = predict_instance_by_instance(model, tokenizer, data)

    predictions = np.array(predictions)  # Ensure it's a numpy array
    predictions[predictions == 2] = 0
    accuracy = accuracy_score(y_true, predictions)
    report = classification_report(y_true, predictions, output_dict=True)

    precision_yes = report['1']['precision']
    recall_yes = report['1']['recall']
    f1_score_yes = report['1']['f1-score']

    precision_no = report['0']['precision']
    recall_no = report['0']['recall']
    f1_score_no = report['0']['f1-score']

    overall_precision = report['weighted avg']['precision']
    overall_recall = report['weighted avg']['recall']
    overall_f1 = report['weighted avg']['f1-score']

    cm = confusion_matrix(y_true, predictions)
    tn, fp, fn, tp = cm.ravel()
    fpr = fp / (fp + tn)

    fprs = []
    for i in range(cm.shape[0]):
        tn = np.sum(cm) - np.sum(cm[i, :]) - np.sum(cm[:, i]) + cm[i, i]
        fp = np.sum(cm[:, i]) - cm[i, i]
        fpr_class = fp / (fp + tn) if (fp + tn) != 0 else 0
        fprs.append(fpr_class)

    results.append({
        'Total Time': total_time,
        'Average Time': avg_time,
        'Tokens per Second': tokens_per_sec,
        'Accuracy': accuracy,
        'Overall Precision': overall_precision,
        'Overall Recall': overall_recall,
        'Overall F1 Score': overall_f1,
        'Precision_Performance': precision_yes,
        'Recall_Performance': recall_yes,
        'F1 Score_Performance': f1_score_yes,
        'Precision_Non_Performance': precision_no,
        'Recall_Non_Performance': recall_no,
        'F1 Score_Non_Performance': f1_score_no,
        'FPR_Performance': fprs[1],
        'FPR_Non_Performance': fprs[0]
    })

    # Save predictions for this fold
    predictions_df = pd.DataFrame({ 'commit_message' : data,
                                      'True Labels': y_true,
        'Predicted Labels': predictions
    })
    predictions_df.to_csv(f'predictions_fold_{fold + 1}_HS.csv', index=False)

# Convert results to DataFrame and save to CSV
results_df = pd.DataFrame(results)
results_df.to_csv('model_performance_metrics_HS.csv', index=False)
print("Results saved to 'model_performance_metrics_HS.csv'.")
print(results_df.mean(axis=0))


Results saved to 'model_performance_metrics_HS.csv'.
Total Time                      1.422328
Average Time                    0.005153
Tokens per Second            7937.175813
Accuracy                        0.753623
Overall Precision               0.808036
Overall Recall                  0.753623
Overall F1 Score                0.742240
Precision_Performance           0.937500
Recall_Performance              0.543478
F1 Score_Performance            0.688073
Precision_Non_Performance       0.678571
Recall_Non_Performance          0.963768
F1 Score_Non_Performance        0.796407
FPR_Performance                 0.036232
FPR_Non_Performance             0.456522
dtype: float64


### Mistral KD benchmarking

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import pandas as pd
import time
import gc
import re
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import subprocess

def clean_prediction(text):
    if not text or not isinstance(text, str):
        return None
    match = re.search(r'\b(Yes|No)\b', text)
    return match.group(0) if match else None

def get_gpu_memory():
    result = subprocess.run(['nvidia-smi'], capture_output=True, text=True)
    nvidia_smi_output = result.stdout
    pattern = re.compile(
        r"\|\s+(\d+)\s+NVIDIA GeForce RTX 4090\s+Off\s+\|.*\|\s+Off\s+\|\n"
        r"\|\s+(\d+)%\s+(\d+C)\s+\w+\s+(\d+W) / (\d+W)\s+\|\s+(\d+MiB) / (\d+MiB)\s+\|\s+(\d+)%\s+"
    )
    matches = pattern.findall(nvidia_smi_output)
    df = pd.DataFrame(matches, columns=[
        'GPU ID', 'Fan Speed (%)', 'Temperature (C)', 'Power Usage (W)', 'Power Cap (W)',
        'Memory Usage (MiB)', 'Total Memory (MiB)', 'GPU Utilization (%)'
    ])
    result = df[['Memory Usage (MiB)']]
    result['Memory Usage (MiB)'] = result['Memory Usage (MiB)'].str.replace('MiB', '').astype(int)
    result = result.sum().to_frame().T
    return result

results = []
all_predictions_df = pd.DataFrame()

tokenizer = AutoTokenizer.from_pretrained("/home/manojale/Documents/Mistral/Mistral-7B-Instruct-v0.2")
model = AutoModelForCausalLM.from_pretrained("/home/manojale/Documents/Mistral/Mistral-7B-Instruct-v0.2",
    torch_dtype=torch.bfloat16,
    device_map="auto")

# Run predictions multiple times and collect metrics
for i in range(5):
    y_true = []
    all_predictions = []
    input_token_counts = []
    output_token_counts = []
    inference_times = []
    total_start_time = time.time()
    gpu_memory_usage = []
    memory_bef = []
    memory_aft = []
    token_per_input_list = []

    for idx, row in final_gold_label.iterrows():
        commit_message = row['commit_message']
        ytrue = row['target']
        prompt_template = ''' <s> [INST] You are an analytical tool specialized in processing and classifying GitHub Commit message. Your task is to assess developer's intent in a given commit message and categorize it into one of the following predefined categories based on its content:

'Yes': A commit message that explicitly mentions performance improvement or optimization, specifically in terms of execution time or resource utilization. The message should clearly indicate actions that made the code run faster or more efficiently, use less memory, or more efficiently utilize system resources. Also, if a commit message describes a change made to address a performance bottleneck, prevent performance degradation, reduce overheads, or solve a problem that negatively affects performance. This includes optimizations like replacing inefficient code patterns that are known to kill performance even if the message does not use the words 'improvement' or 'performance' explicitly.
'No': A commit message that does not pertain to performance enhancements. This includes messages related to code changes for testing, documentation, performance profiling/monitoring/debugging/analysis, and bug/error/crash fixes that don't explicitly mention performance improvement of the application itself, code refactoring or feature addition without explicit performance optimization, and mentions of necessary or speculative or potential performance enhancements without concrete evidence or results. Also, a message that is irrelevant, unclear, or ambiguous, and those that do not provide enough context to determine their intent.

If the commit message doesn't fit clearly into any of the above categories, classify it as: 'No'. Additionally, pay close attention to the context in which terms like 'performance', 'improve' or 'improvements' are used. Not all improvements are related to performance—only classify a message as 'Yes' if it specifically mentions enhancements related to execution time, memory usage, or resource efficiency. Avoid making assumptions based on ambiguous terms. You should have high confidence in classifying a message as 'Yes' based on careful examination of the information provided in the commit message.
If you encounter a commit message with multiple intentions, where at least one of those intentions includes a performance improvement, classify the entire message as 'Yes'.
You will only respond with the predefined category. Do not include the word 'Category'. Do not provide explanations or notes.

Commit message : ```{commit_message}``` [/INST] Model_answer: </s>'''

        # Tokenize input
        generated_prompt = prompt_template.format(commit_message=commit_message)
        inputs = tokenizer(generated_prompt, return_tensors="pt").to('cuda')
        input_token_count = inputs.input_ids.size(1)
        input_token_counts.append(input_token_count)

        # Measure GPU memory before processing
        memory_before = get_gpu_memory()['Memory Usage (MiB)']

        start_time = time.time()
        outputs = model.generate(inputs.input_ids, max_new_tokens=5, do_sample=True)
        end_time = time.time()
        gc.collect()
        torch.cuda.empty_cache()
        output_token_count = outputs.size(1) - input_token_count
        output_token_counts.append(output_token_count)

        # Decode output and clean prediction
        analysis = tokenizer.decode(outputs[0][input_token_count:], skip_special_tokens=True)
        cleaned_prediction = clean_prediction(analysis)
        all_predictions.append(cleaned_prediction)
        y_true.append(ytrue)

        # Measure GPU memory after processing
        memory_after = get_gpu_memory()['Memory Usage (MiB)']
        gpu_memory_usage.append(memory_after - memory_before)

        time_per_input = end_time - start_time
        tokens_per_second = 5 / time_per_input
        token_per_input_list.append(tokens_per_second)
        inference_times.append(end_time - start_time)
        memory_bef.append(memory_before)
        memory_aft.append(memory_after)

    # Convert predictions to numerical labels and filter out None
    pred_df = pd.DataFrame({'model_answer': all_predictions})
    pred_df['target'] = y_true
    pred_df['Cleaned_predictions'] = pred_df['model_answer'].apply(clean_prediction)
    pred_df.dropna(inplace=True)  # Drop rows where predictions are None
    pred_df['Mistral_target'] = pred_df['Cleaned_predictions'].replace({'Yes': 1, 'No': 0}).astype(int)

    # Append predictions and targets to all_predictions_df
    all_predictions_df = pd.concat([all_predictions_df, pred_df[['commit_message','model_answer', 'target', 'Mistral_target']]])

    # Calculate accuracy and other metrics
    accuracy = accuracy_score(pred_df['target'], pred_df['Mistral_target'])
    cm = confusion_matrix(pred_df['target'], pred_df['Mistral_target'])
    print(cm)
    tn, fp, fn, tp = cm.ravel()
    fpr = fp / (fp + tn)
    report = classification_report(pred_df['target'], pred_df['Mistral_target'], output_dict=True)

    precision_yes = report['1']['precision']
    recall_yes = report['1']['recall']
    f1_score_yes = report['1']['f1-score']
    fpr_yes = fp / (fp + tn)

    precision_no = report['0']['precision']
    recall_no = report['0']['recall']
    f1_score_no = report['0']['f1-score']
    fpr_no = fn / (fn + tp)

    overall_precision = report['weighted avg']['precision']
    overall_recall = report['weighted avg']['recall']
    overall_f1 = report['weighted avg']['f1-score']

    total_time = time.time() - total_start_time

    results.append({
        'Total Time': total_time,
        'Average Time': sum(inference_times) / len(inference_times),
        'Tokens per Second': sum(input_token_counts + output_token_counts) / total_time,
        'Accuracy': accuracy,
        'Overall Precision': overall_precision,
        'Overall Recall': overall_recall,
        'Overall F1 Score': overall_f1,
        'Precision_Performance': precision_yes,
        'Recall_Performance': recall_yes,
        'F1 Score_Performance': f1_score_yes,
        'FPR_Performance': fpr_yes,
        'Precision_Non_performance': precision_no,
        'Recall_Non_performance': recall_no,
        'F1 Score_Non_performance': f1_score_no,
        'FPR_Non_performance': fpr_no,
        'GPU Memory Usage (MB)': sum(gpu_memory_usage) / len(gpu_memory_usage),
        'GPU Memory Before each input Usage (MB)': sum(memory_bef) / len(memory_bef),
        'GPU Memory After each input Usage (MB)': sum(memory_aft) / len(memory_aft),
        'Token_per_input': sum(token_per_input_list) / len(token_per_input_list),
        'FPR': fpr
    })

# Convert results to DataFrame and save to CSV
results_df = pd.DataFrame(results)
results_df.to_csv('Data/mistral_GPU_KD_5_Fold_results.csv', index=False)

# Save all predictions to a separate CSV
all_predictions_df.to_csv('Data/all_predictions_GPU_Mistral_KD.csv', index=False)


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

We've detected an older driver with an RTX 4000 series GPU. These drivers have issues with P2P. This can affect the multi-gpu inference when using accelerate device_map.Please make sure to update your driver to the latest version which resolves this.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result['Memory Usage (MiB)'] = result['Memory Usage (MiB)'].str.replace('MiB', '').astype(int)
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` wh

[[229  17]
 [ 21 202]]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result['Memory Usage (MiB)'] = result['Memory Usage (MiB)'].str.replace('MiB', '').astype(int)
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/in

[[224  16]
 [ 20 204]]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result['Memory Usage (MiB)'] = result['Memory Usage (MiB)'].str.replace('MiB', '').astype(int)
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/in

[[223  15]
 [ 21 211]]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result['Memory Usage (MiB)'] = result['Memory Usage (MiB)'].str.replace('MiB', '').astype(int)
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/in

[[224  14]
 [ 21 210]]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result['Memory Usage (MiB)'] = result['Memory Usage (MiB)'].str.replace('MiB', '').astype(int)
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/in

[[224  16]
 [ 19 210]]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result['Memory Usage (MiB)'] = result['Memory Usage (MiB)'].str.replace('MiB', '').astype(int)


In [None]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'


### Mistral AWQ benchmarking

In [None]:
# Assuming all other imports and initializations are done
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import pandas as pd
import time
import gc
import re
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def clean_prediction(text):
    if not text or not isinstance(text, str):
        return None
    match = re.search(r'\b(Yes|No)\b', text)
    return match.group(0) if match else None

import subprocess
import re
import pandas as pd

def get_gpu_memory():
    result = subprocess.run(['nvidia-smi'], capture_output=True, text=True)
    nvidia_smi_output = result.stdout
    pattern = re.compile(
        r"\|\s+(\d+)\s+NVIDIA GeForce RTX 4090\s+Off\s+\|.*\|\s+Off\s+\|\n"
        r"\|\s+(\d+)%\s+(\d+C)\s+\w+\s+(\d+W) / (\d+W)\s+\|\s+(\d+MiB) / (\d+MiB)\s+\|\s+(\d+)%\s+"
    )
    matches = pattern.findall(nvidia_smi_output)
    df = pd.DataFrame(matches, columns=[
        'GPU ID', 'Fan Speed (%)', 'Temperature (C)', 'Power Usage (W)', 'Power Cap (W)',
        'Memory Usage (MiB)', 'Total Memory (MiB)', 'GPU Utilization (%)'
    ])
    result = df[['Memory Usage (MiB)']]
    result['Memory Usage (MiB)'] = result['Memory Usage (MiB)'].str.replace('MiB', '').astype(int)
    result = result.sum().to_frame().T
    return result

results = []
from vllm import LLM, SamplingParams

llm = LLM(model="/home/manojale/Documents/LLM_Experiments/Mistral-7B-Instruct-v0.2-AWQ/", quantization="awq", dtype="auto", gpu_memory_utilization=0.8, enforce_eager=True)
tokenizer = AutoTokenizer.from_pretrained("/home/manojale/Documents/LLM_Experiments/Mistral-7B-Instruct-v0.2-AWQ/")

# Run predictions multiple times and collect metrics
for fold in range(5):
    y_true = []
    all_predictions = []
    input_token_counts = []
    output_token_counts = []
    inference_times = []
    total_start_time = time.time()
    gpu_memory_usage = []
    memory_bef = []
    memory_aft = []
    token_per_input_list = []

    for idx, row in final_gold_label.iterrows():
        prompts = [row['commit_message']]
        prompt_template = ''' <s> [INST] You are an analytical tool specialized in processing and classifying GitHub Commit message. Your task is to assess developer's intent in a given commit message and categorize it into one of the following predefined categories based on its content:

        'Yes':  A commit messages that explicitly mentions performance improvement or optimization, specifically in terms of execution time or resource utilization. The message should clearly indicate actions that made the code runs faster or more efficiently, use less memory, or more efficiently utilize system resources. Also, if a commit message describes a change made to address a performance bottleneck, prevent performance degradation, reduce overheads or solve a problem that negatively affects performance. This includes optimizations like replacing inefficient code patterns that are known to kill performance even if the message does not use the words 'improvement' or 'performance' explicitly.
        'No': A commit message that do not pertain to performance enhancements. This includes messages related to code changes for testing, documentation, performance profiling/monitoring/debugging/analysis and bug/error/crash fixes that don't explicitly mention performance improvement of the application itself, code refactoring or feature addition without explicit performance optimization, and mentions of necessary or speculative or potential performance enhancements without concrete evidence or results. Also, a messages that is irrelevant, unclear, or ambiguous, and those that do not provide enough context to determine their intent.

        If the commit message doesn't fit clearly into any of the above categories, classify it as: 'No'. Additionally, pay close attention to the context in which terms like 'performance', 'improve' or 'improvements' are used. Not all improvements are related to performance—only, classify a message as 'Yes' if it specifically mentions enhancements related to execution time, memory usage, or resource efficiency. Avoid making assumptions based on ambiguous terms. You should have high confidence in classifying a message as 'Yes' based on careful examination of the information provided in the commit message.
        If you encounter a commit message with multiple intentions, where at least one of those intentions includes a performance improvement, classify the entire message as 'Yes'.
        You will only respond with the predefined category. Do not include the word 'Category'. Do not provide explanations or notes.

        Commit message : ```{prompt}``` [/INST] Model_answer:  </s>'''

        prompts = [prompt_template.format(prompt=prompt) for prompt in prompts]

        sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=5)

        ytrue = row['target']

        # Tokenize input
        inputs = tokenizer(prompts[0], return_tensors="pt").to('cuda')
        input_token_count = inputs.input_ids.size(1)
        input_token_counts.append(input_token_count)

        # Measure GPU memory before processing
        memory_before = get_gpu_memory()['Memory Usage (MiB)']

        start_time = time.time()
        outputs = llm.generate(prompts, sampling_params)

        # Print the outputs.
        for output in outputs:
            prompt = output.prompt
            analysis = output.outputs[0].text
        end_time = time.time()

        # Decode output and clean prediction
        cleaned_prediction = clean_prediction(analysis)
        all_predictions.append(cleaned_prediction)
        y_true.append(ytrue)

        # Measure GPU memory after processing
        memory_after = get_gpu_memory()['Memory Usage (MiB)']
        gpu_memory_usage.append(memory_after - memory_before)

        time_per_input = end_time - start_time
        tokens_per_second = 5 / time_per_input
        token_per_input_list.append(tokens_per_second)
        inference_times.append(end_time - start_time)
        memory_bef.append(memory_before)
        memory_aft.append(memory_after)

    # Convert predictions to numerical labels and filter out None
    pred_df = pd.DataFrame({'model_answer': all_predictions})
    pred_df['target'] = y_true
    pred_df['Cleaned_predictions'] = pred_df['model_answer'].apply(clean_prediction)
    pred_df.dropna(inplace=True)  # Drop rows where predictions are None
    pred_df['Mistral_target'] = pred_df['Cleaned_predictions'].replace({'Yes': 1, 'No': 0}).astype(int)

    # Save predictions for this fold
    pred_df.to_csv(f'predictions_fold_{fold + 1}.csv', index=False)

    # Calculate accuracy and other metrics
    accuracy = accuracy_score(pred_df['target'], pred_df['Mistral_target'])
    cm = confusion_matrix(pred_df['target'], pred_df['Mistral_target'])
    print(cm)

    tn, fp, fn, tp = cm.ravel()
    fpr_yes = fp / (fp + tn)
    fpr_no = fn / (fn + tp)

    report = classification_report(pred_df['target'], pred_df['Mistral_target'], output_dict=True)

    precision_yes = report['1']['precision']
    recall_yes = report['1']['recall']
    f1_score_yes = report['1']['f1-score']

    precision_no = report['0']['precision']
    recall_no = report['0']['recall']
    f1_score_no = report['0']['f1-score']

    overall_precision = report['weighted avg']['precision']
    overall_recall = report['weighted avg']['recall']
    overall_f1 = report['weighted avg']['f1-score']

    total_time = time.time() - total_start_time

    results.append({
        'Total Time': total_time,
        'Average Time': sum(inference_times) / len(inference_times),
        'Tokens per Second': sum(input_token_counts + output_token_counts) / total_time,
        'Accuracy': accuracy,
        'Overall Precision': overall_precision,
        'Overall Recall': overall_recall,
        'Overall F1 Score': overall_f1,
        'Precision_Performance': precision_yes,
        'Recall_Performance': recall_yes,
        'F1 Score_Performance': f1_score_yes,
        'Precision_Non_performance': precision_no,
        'Recall_Non_performance': recall_no,
        'F1 Score_Non_performance': f1_score_no,
        'GPU Memory Usage (MB)': sum(gpu_memory_usage) / len(gpu_memory_usage),
        'GPU Memory Before each input Usage (MB)': sum(memory_bef) / len(memory_bef),
        'GPU Memory After each input Usage (MB)': sum(memory_aft) / len(memory_aft),
        'Token_per_input': sum(token_per_input_list) / len(token_per_input_list),
        'FPR_Yes': fpr_yes,
        'FPR_No': fpr_no
    })

# Convert results to DataFrame and save to CSV
results_df = pd.DataFrame(results)
results_df.to_csv('Mistral_AWQ_GPU_KD.csv', index=False)

print("Results saved to 'mistral_performance_metrics_latest.csv'.")
print(results_df.mean(axis=0))


INFO 05-30 01:51:32 llm_engine.py:74] Initializing an LLM engine (v0.4.0.post1) with config: model='/home/manojale/Documents/LLM_Experiments/Mistral-7B-Instruct-v0.2-AWQ/', tokenizer='/home/manojale/Documents/LLM_Experiments/Mistral-7B-Instruct-v0.2-AWQ/', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=True, quantization=awq, enforce_eager=True, kv_cache_dtype=auto, device_config=cuda, seed=0)
INFO 05-30 01:51:33 selector.py:51] Cannot use FlashAttention because the package is not found. Please install it for better performance.
INFO 05-30 01:51:33 selector.py:25] Using XFormers backend.
INFO 05-30 01:51:34 model_runner.py:104] Loading model weights took 3.8814 GB
INFO 05-30 01:51:37 gpu_executor.py:94] # GPU blocks: 5500, # CPU blocks: 2048


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result['Memory Usage (MiB)'] = result['Memory Usage (MiB)'].str.replace('MiB', '').astype(int)
Processed prompts: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  6.72it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result['Memory Usage (MiB)'] = result['Memory Usage (MiB)'].str.replace('MiB', '').astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentatio

[[242   9]
 [ 37 207]]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result['Memory Usage (MiB)'] = result['Memory Usage (MiB)'].str.replace('MiB', '').astype(int)
Processed prompts: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  8.68it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result['Memory Usage (MiB)'] = result['Memory Usage (MiB)'].str.replace('MiB', '').astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentatio

[[242  10]
 [ 33 209]]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result['Memory Usage (MiB)'] = result['Memory Usage (MiB)'].str.replace('MiB', '').astype(int)
Processed prompts: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  8.60it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result['Memory Usage (MiB)'] = result['Memory Usage (MiB)'].str.replace('MiB', '').astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentatio

[[241  10]
 [ 38 206]]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result['Memory Usage (MiB)'] = result['Memory Usage (MiB)'].str.replace('MiB', '').astype(int)
Processed prompts: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  8.68it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result['Memory Usage (MiB)'] = result['Memory Usage (MiB)'].str.replace('MiB', '').astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentatio

[[244  10]
 [ 36 207]]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result['Memory Usage (MiB)'] = result['Memory Usage (MiB)'].str.replace('MiB', '').astype(int)
Processed prompts: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  8.61it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result['Memory Usage (MiB)'] = result['Memory Usage (MiB)'].str.replace('MiB', '').astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentatio

[[238  11]
 [ 37 206]]
Results saved to 'mistral_performance_metrics_latest.csv'.
Total Time                                   217.640050
Average Time                                   0.133144
Tokens per Second                           1568.203217
Accuracy                                       0.906588
Overall Precision                              0.911076
Overall Recall                                 0.906588
Overall F1 Score                               0.906241
Precision_Performance                          0.953920
Recall_Performance                             0.851170
F1 Score_Performance                           0.899606
Precision_Non_performance                      0.869613
Recall_Non_performance                         0.960215
F1 Score_Non_performance                       0.912660
GPU Memory Usage (MB)                          0.835200
GPU Memory Before each input Usage (MB)    18612.624000
GPU Memory After each input Usage (MB)     18613.459200
Token_per_input       