<a href="https://colab.research.google.com/github/mohammadshahidbeigh/mietbot/blob/main/backend_mietbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install openai
!pip install numpy
!pip install tiktoken
!pip install Gradio



In [None]:
import openai
import csv
import json
import os
import numpy as np
from collections import defaultdict
import tiktoken
import gradio as gr

In [None]:
from google.colab import userdata
userdata.get('api_key')   #Store your Api_Key in env var

In [None]:
openai.api_key = "api_key" #Your Api_Key from env var

In [None]:
#from OpenAI website to format data;  https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset

# Next, we specify the data path and open the JSONL file

data_path = '/content/drive/MyDrive/Colab Notebooks/Fine-tuneGpt3.5turbo/training_examples (22).jsonl'

# Load dataset
with open(data_path) as f:
    dataset = [json.loads(line) for line in f]


# We can inspect the data quickly by checking the number of examples and the first item

# Initial dataset stats
print("Num examples:", len(dataset))
print("First example:")
for message in dataset[0]["messages"]:
    print(message)

# Now that we have a sense of the data, we need to go through all the different examples and check to make sure the formatting is correct and matches the Chat completions message structure

# Format error checks
format_errors = defaultdict(int)

for ex in dataset:
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue

    messages = ex.get("messages", None)
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue

    for message in messages:
        if "role" not in message or "content" not in message:
            format_errors["message_missing_key"] += 1

        if any(k not in ("role", "content", "name") for k in message):
            format_errors["message_unrecognized_key"] += 1

        if message.get("role", None) not in ("system", "user", "assistant"):
            format_errors["unrecognized_role"] += 1

        content = message.get("content", None)
        if not content or not isinstance(content, str):
            format_errors["missing_content"] += 1

    if not any(message.get("role", None) == "assistant" for message in messages):
        format_errors["example_missing_assistant_message"] += 1

if format_errors:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")
else:
    print("No errors found")

# Beyond the structure of the message, we also need to ensure that the length does not exceed the 4096 token limit.

# Token counting functions
encoding = tiktoken.get_encoding("cl100k_base")

# not exact!
# simplified from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3
    return num_tokens

def num_assistant_tokens_from_messages(messages):
    num_tokens = 0
    for message in messages:
        if message["role"] == "assistant":
            num_tokens += len(encoding.encode(message["content"]))
    return num_tokens

def print_distribution(values, name):
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    print(f"p5 / p95: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}")

# Last, we can look at the results of the different formatting operations before proceeding with creating a fine-tuning job:

# Warnings and tokens counts
n_missing_system = 0
n_missing_user = 0
n_messages = []
convo_lens = []
assistant_message_lens = []

for ex in dataset:
    messages = ex["messages"]
    if not any(message["role"] == "system" for message in messages):
        n_missing_system += 1
    if not any(message["role"] == "user" for message in messages):
        n_missing_user += 1
    n_messages.append(len(messages))
    convo_lens.append(num_tokens_from_messages(messages))
    assistant_message_lens.append(num_assistant_tokens_from_messages(messages))

print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)
print_distribution(n_messages, "num_messages_per_example")
print_distribution(convo_lens, "num_total_tokens_per_example")
print_distribution(assistant_message_lens, "num_assistant_tokens_per_example")
n_too_long = sum(l > 4096 for l in convo_lens)
print(f"\n{n_too_long} examples may be over the 4096 token limit, they will be truncated during fine-tuning")

# Pricing and default n_epochs estimate
MAX_TOKENS_PER_EXAMPLE = 4096

MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
TARGET_EPOCHS = 3
MIN_EPOCHS = 1
MAX_EPOCHS = 25

n_epochs = TARGET_EPOCHS
n_train_examples = len(dataset)
if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:
    n_epochs = min(MAX_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:
    n_epochs = max(MIN_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)

n_billing_tokens_in_dataset = sum(min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens)
print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens")

# Calculate the estimated cost for fine-tuning
cost_per_100k_tokens = 0.80  # Cost for every 100,000 tokens
estimated_cost = ((n_epochs * n_billing_tokens_in_dataset) / 100000) * cost_per_100k_tokens
print(f"Estimated cost for fine-tuning: approximately ${estimated_cost:.2f}") #I added this for actual cost based on current pricing

Num examples: 32
First example:
{'role': 'system', 'content': 'Given your question about the Model Institute of Engineering and Technology (MIET Jammu), provide relevant information on topics such as admission procedures, course details, and campus facilities.'}
{'role': 'user', 'content': 'Hello?'}
{'role': 'assistant', 'content': 'Hello! Welcome to virtual assistant for MIET Jammu. How can I assist you today?'}
No errors found
Num examples missing system message: 0
Num examples missing user message: 0

#### Distribution of num_messages_per_example:
min / max: 3, 3
mean / median: 3.0, 3.0
p5 / p95: 3.0, 3.0

#### Distribution of num_total_tokens_per_example:
min / max: 70, 108
mean / median: 100.6875, 105.0
p5 / p95: 74.9, 107.9

#### Distribution of num_assistant_tokens_per_example:
min / max: 19, 52
mean / median: 45.03125, 48.5
p5 / p95: 22.500000000000004, 51.0

0 examples may be over the 4096 token limit, they will be truncated during fine-tuning
Dataset has ~3222 tokens that wil

In [None]:
# Function to save the dataset as a JSONL file
def save_to_jsonl(conversations, file_path):
    with open(file_path, 'w') as file:
        for conversation in conversations:
            json_line = json.dumps(conversation)
            file.write(json_line + '\n')

# Specify the path where you want to save the JSONL file in your Google Drive
jsonl_file_path = '/content/drive/MyDrive/Colab Notebooks/Fine-tuneGpt3.5turbo/training_examples (22).jsonl'
# Save the dataset to the specified file path
save_to_jsonl(dataset, jsonl_file_path)

In [None]:
#Upload data for training
from openai import OpenAI
client = OpenAI(api_key= "api_key")

training_file_name = '/content/drive/MyDrive/Colab Notebooks/Fine-tuneGpt3.5turbo/training_examples (22).jsonl'

training_response = client.files.create(
    file=open(training_file_name, "rb"), purpose="fine-tune"
)
training_file_id = training_response.id
print("Training file id:", training_file_id)


#Gives training file id
print("Training file id:", training_file_id)

Training file id: file-Jl3JSPt3vgFztjGjyFESBfq0
Training file id: file-Jl3JSPt3vgFztjGjyFESBfq0


In [None]:
suffix_name = "mietbot"

response = client.fine_tuning.jobs.create(
    training_file=training_file_id,
    model="ft:gpt-3.5-turbo-0125:personal:mietbot:94v2vB0x",
    suffix=suffix_name,
)

job_id = response.id  # Accessing the ID attribute directly
print("Fine-tuning job ID:", job_id)


Fine-tuning job ID: ftjob-tYcQtC5olu9qVyUdrDPQdXCd


In [None]:
response = client.fine_tuning.jobs.list_events(fine_tuning_job_id="ftjob-tYcQtC5olu9qVyUdrDPQdXCd", limit=50)

events = response.data  # Accessing the data attribute directly
events.reverse()

for event in events:
    print(event.message)


Step 49/96: training loss=0.16
Step 50/96: training loss=0.47
Step 51/96: training loss=0.37
Step 52/96: training loss=0.46
Step 53/96: training loss=0.06
Step 54/96: training loss=0.06
Step 55/96: training loss=0.42
Step 56/96: training loss=0.45
Step 57/96: training loss=0.22
Step 58/96: training loss=0.31
Step 59/96: training loss=0.11
Step 60/96: training loss=0.09
Step 61/96: training loss=0.11
Step 62/96: training loss=0.02
Step 63/96: training loss=0.05
Step 64/96: training loss=0.02
Step 65/96: training loss=0.23
Step 66/96: training loss=0.03
Step 67/96: training loss=0.40
Step 68/96: training loss=0.35
Step 69/96: training loss=0.29
Step 70/96: training loss=0.04
Step 71/96: training loss=0.03
Step 72/96: training loss=0.01
Step 73/96: training loss=0.08
Step 74/96: training loss=0.09
Step 75/96: training loss=0.01
Step 76/96: training loss=0.17
Step 77/96: training loss=0.08
Step 78/96: training loss=0.52
Step 79/96: training loss=0.06
Step 80/96: training loss=0.02
Step 81/

In [None]:
# Retrieve fine-tune model id
response = client.fine_tuning.jobs.retrieve("ftjob-tYcQtC5olu9qVyUdrDPQdXCd")
fine_tuned_model_id = response.fine_tuned_model

print(response)
print("\nFine-tuned model id:", fine_tuned_model_id)


FineTuningJob(id='ftjob-tYcQtC5olu9qVyUdrDPQdXCd', created_at=1711102445, error=Error(code=None, message=None, param=None, error=None), fine_tuned_model='ft:gpt-3.5-turbo-0125:personal:mietbot:95WIrcrK', finished_at=1711102840, hyperparameters=Hyperparameters(n_epochs=3, batch_size=1, learning_rate_multiplier=2), model='ft:gpt-3.5-turbo-0125:personal:mietbot:94v2vB0x', object='fine_tuning.job', organization_id='org-FsBBYbw6EhPQzdcDOEDTUxKZ', result_files=['file-2fGFF5Ju68s7d2iOoZ7k1ZS6'], status='succeeded', trained_tokens=9474, training_file='file-Jl3JSPt3vgFztjGjyFESBfq0', validation_file=None, user_provided_suffix='mietbot')

Fine-tuned model id: ft:gpt-3.5-turbo-0125:personal:mietbot:95WIrcrK


In [None]:
#Test it out!
test_messages = []

system_message = "Given your question about the Model Institute of Engineering and Technology (MIET Jammu), provide relevant information on topics such as admission procedures, course details, and campus facilities."
test_messages.append({"role": "system", "content": system_message})
user_message = "What courses does MIET Jammu offer?"
test_messages.append({"role": "user", "content": user_message})

print(test_messages)

[{'role': 'system', 'content': 'Given your question about the Model Institute of Engineering and Technology (MIET Jammu), provide relevant information on topics such as admission procedures, course details, and campus facilities.'}, {'role': 'user', 'content': 'What courses does MIET Jammu offer?'}]


In [None]:
# OpenAI Chat Completions
response = client.chat.completions.create(
   model="ft:gpt-3.5-turbo-0125:personal:mietbot:95WIrcrK",  # You should have fine_tuned_model_id defined previously
    messages=test_messages,
    temperature=0,
    max_tokens=500
)
print(response.choices[0].message.content)



MIET Jammu offers undergraduate and postgraduate courses in various fields of engineering, technology, management, and computer applications. The institute provides B.Tech, M.Tech, MBA, MCA, and Ph.D. programs in disciplines like Computer Science, Electronics, Mechanical, Civil, Information Technology, and more. For detailed information on the courses, you can visit the official MIET Jammu website or contact the admissions office.


In [None]:
import gradio as gr

def generate_completion(user_prompt):
    hidden_context = ""
    messages = [
        {"role": "system", "content": hidden_context},
        {"role": "user", "content": user_prompt}
    ]
    response = client.chat.completions.create(
        model="ft:gpt-3.5-turbo-0125:personal:mietbot:95WIrcrK",
        messages=messages,
        max_tokens=100,
        temperature=0
    )
    return(response.choices[0].message.content)

iface = gr.Interface(fn=generate_completion,
                     inputs=gr.Textbox(lines=5, placeholder='Question about the College?'),
                     outputs='text',
                     title="Mietbot")

iface.launch(share=True)



Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://a1c208e88ea06975de.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


