In [None]:
!pip install datasets
!pip install peft
!pip install -q -U einops
!pip install -q -U bitsandbytes
!pip install transformers==4.37
!pip install accelerate -U
!pip install rouge_score

In [None]:
import torch
from datasets import load_metric
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import pandas as pd
import json
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# 4-bit NF4 quantization settings, with float16 as the compute dtype and
# double quantization enabled.
compute_dtype = torch.float16
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

In [None]:

# Reference checkpoint (not finetuned), loaded with the `bnb_config`
# quantization settings.
base_model_name = 'microsoft/phi-1_5'
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
)


In [None]:
# Finetuned checkpoint, loaded with the same `bnb_config` quantization
# settings so the two models are compared under identical conditions.
tuned_model_name = 'megajajo/phi-1_5-finetuned'
tuned_model = AutoModelForCausalLM.from_pretrained(
    tuned_model_name,
    quantization_config=bnb_config,
)


In [None]:
# Tokenizer of the base Phi-1.5 checkpoint; EOS is reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained(
    "microsoft/phi-1_5",
    trust_remote_code=True,
)
tokenizer.pad_token = tokenizer.eos_token


In [None]:
# Read the Kotlin evaluation set: a JSON Lines file, one JSON object per line.
kotlin_data_path = '/content/drive/MyDrive/ML/data.json'
# Explicit UTF-8 so parsing does not depend on the platform's default encoding.
with open(kotlin_data_path, 'r', encoding='utf-8') as file:
    # Whitespace-only lines (e.g. a stray blank line at the end of the file)
    # are skipped instead of crashing json.loads.
    kotlin_data = [json.loads(line) for line in file if line.strip()]

In [None]:
# Tabulate the parsed Kotlin records into a DataFrame.
df_kotlin = pd.DataFrame(kotlin_data)

In [None]:
# Display the Kotlin evaluation frame for a visual sanity check.
df_kotlin

In [None]:
# Read the Python evaluation set: a JSON Lines file, one JSON object per line.
python_data_path = '/content/drive/MyDrive/ML/test.jsonl'
# Explicit UTF-8 so parsing does not depend on the platform's default encoding.
with open(python_data_path, 'r', encoding='utf-8') as file:
    # Whitespace-only lines (e.g. a stray blank line at the end of the file)
    # are skipped instead of crashing json.loads.
    python_data = [json.loads(line) for line in file if line.strip()]

In [None]:
# Tabulate the parsed Python records into a DataFrame.
df_python = pd.DataFrame(python_data)

In [None]:
# Reduce the Python set to the Prompt/Completion columns the evaluation loops
# read: the `signature` field becomes the prompt, the `body` field the reference.
# Direct column selection replaces the original row-wise apply(axis=1), which
# only copied each value back out through one Python-level call per row.
df_python = pd.DataFrame({
    "Prompt": df_python["signature"],
    "Completion": df_python["body"],
})


In [None]:
# Display the Python evaluation frame for a visual sanity check.
df_python

In [None]:
# Evaluate on a random subset of at most 1000 Kotlin examples.
# random_state pins the draw so re-running the notebook scores the same rows;
# the min() guard keeps sample() from raising when the frame has fewer rows.
kotlin_sample = df_kotlin.sample(n=min(1000, len(df_kotlin)), random_state=42)

In [None]:
# Evaluate on a random subset of at most 400 Python examples.
# random_state pins the draw so re-running the notebook scores the same rows;
# the min() guard keeps sample() from raising when the frame has fewer rows.
python_sample = df_python.sample(n=min(400, len(df_python)), random_state=42)

In [None]:
def generate_output(prompt, model, max_new_tokens=None):
    """Sample one completion for ``prompt`` from ``model`` and return it as text.

    Uses the notebook-level ``tokenizer``. Generation is sampled with
    ``top_p=0.95`` and limited to 8 seconds per call via ``max_time``.

    Args:
        prompt: Text to feed to the model.
        model: Model object exposing ``generate`` and ``device``.
        max_new_tokens: Optional cap on the number of generated tokens. When
            omitted, the legacy limit is kept: ``max_length=len(prompt) + 1``.
            NOTE(review): that limit is the prompt's *character* count plus
            one, passed as a token-count bound — pass ``max_new_tokens`` for
            an explicit generation budget.

    Returns:
        The first sequence returned by ``tokenizer.batch_decode`` on the
        ``generate`` output (decoded with default settings).
    """
    inputs = tokenizer(prompt, return_tensors="pt", return_attention_mask=False)
    # Move the encoded prompt onto the model's device; without this the
    # inputs stay where the tokenizer created them and generate() hits a
    # device mismatch when the model's weights are on a GPU.
    inputs = inputs.to(model.device)
    if max_new_tokens is None:
        length_limit = {"max_length": len(prompt) + 1}
    else:
        length_limit = {"max_new_tokens": max_new_tokens}
    output = model.generate(**inputs, do_sample=True, top_p=0.95, max_time=8, **length_limit)
    return tokenizer.batch_decode(output)[0]

In [None]:
# For every sampled Kotlin example, generate with the base model, then with
# the finetuned model, and keep the row's reference completion alongside.
kotlin_base_predictions = []
kotlin_finetuned_predictions = []
kotlin_completion = []
for _, example in kotlin_sample.iterrows():
    kotlin_prompt = example['Prompt']
    kotlin_base_predictions.append(generate_output(kotlin_prompt, base_model))
    kotlin_finetuned_predictions.append(generate_output(kotlin_prompt, tuned_model))
    kotlin_completion.append(example['Completion'])


In [None]:
# Keep only the first element of each Kotlin reference so ROUGE receives one
# item per example.
# NOTE(review): assumes every 'Completion' is a non-empty list/tuple whose
# first element is the reference string — confirm against the data file.
# The isinstance guard makes this cell safe to re-run: indexing
# unconditionally would, on a second execution, reduce every already-unwrapped
# string to its first character.
kotlin_completion = [
    ref[0] if isinstance(ref, (list, tuple)) else ref
    for ref in kotlin_completion
]

In [None]:
# Load the ROUGE metric object used to score the models' predictions.
rouge = load_metric('rouge')

In [None]:
# Score the base and finetuned Kotlin outputs (in that order) against the same
# references, requesting aggregated results.
kotlin_base, kotlin_tuned = (
    rouge.compute(
        predictions=model_outputs,
        references=kotlin_completion,
        use_aggregator=True,
    )
    for model_outputs in (kotlin_base_predictions, kotlin_finetuned_predictions)
)

In [None]:
# For every sampled Python example, generate with the base model, then with
# the finetuned model, and keep the row's reference completion alongside.
python_base_predictions = []
python_finetuned_predictions = []
python_completion = []
for index, row in python_sample.iterrows():
    python_base_predictions.append(generate_output(row['Prompt'], base_model))
    python_finetuned_predictions.append(generate_output(row['Prompt'], tuned_model))
    # The reference must be the raw completion text, matching the decoded
    # strings stored as predictions. Appending the tokenizer's encoded output
    # here (as this loop previously did) hands ROUGE tensor containers
    # instead of strings.
    python_completion.append(row['Completion'])

In [None]:
# Score the base and finetuned Python outputs (in that order) against the same
# references, requesting aggregated results.
python_base, python_tuned = (
    rouge.compute(
        predictions=model_outputs,
        references=python_completion,
        use_aggregator=True,
    )
    for model_outputs in (python_base_predictions, python_finetuned_predictions)
)

In [None]:
# Sample plot: mid-point rougeLsum precision / recall / F-measure on the
# Kotlin sample, base model next to the finetuned model.
aggregate_score1 = kotlin_base['rougeLsum']
aggregate_score2 = kotlin_tuned['rougeLsum']

metrics = ['Precision Mid', 'Recall Mid', 'F-measure Mid']
# Read the three mid-point statistics in the same order as `metrics`.
values1 = [getattr(aggregate_score1.mid, stat) for stat in ('precision', 'recall', 'fmeasure')]
values2 = [getattr(aggregate_score2.mid, stat) for stat in ('precision', 'recall', 'fmeasure')]

width = 0.35
x = np.arange(len(metrics))

fig, ax = plt.subplots()
# Two bars per metric, offset half a bar width to either side of the tick.
bars1 = ax.bar(x - width / 2, values1, width, label='Phi-1_5')
bars2 = ax.bar(x + width / 2, values2, width, label='Phi-1_5-finetuned')

# Axis labels, title, tick labels and legend.
ax.set(xlabel='Metrics', ylabel='Scores', title='RougeLsum')
ax.set_xticks(x)
ax.set_xticklabels(metrics)
ax.legend()

plt.show()
