In [1]:
import requests
import json
import os
import traceback 
import sys

In [2]:
import pandas as pd 
import numpy as np
df = pd.read_csv('raw_notes_unique_id.tsv', sep='\t')
df = df.replace({np.nan: None})
df.head()

Unnamed: 0,Record ID,Deal Name,Associated Contacts,Title,Associated Company,Company Size,Revenue,DS/ML Team Size,Cloud,Use of Kubernetes,Industry,Location,Deal owner,Notes,Recordings,Raw Notes
0,12428315429,GEP Worldwide,,,GEP,>1000,>1B,25-50,,,Business Consulting and Services,"Clark, New Jersey",Nikunj Bajaj,,,
1,12428091307,Automation Anywhere,(11.rammohan@gmail.com),,"Automation Anywhere, Inc.",>1000,>1B,Oct-25,,,Software Development,"San Jose, CA",Nikunj Bajaj,,https://app.fireflies.ai/view/Ram-Automation-A...,
2,12428091030,Sanofi,,,Sanofi,>1000,>1B,>50,,,Pharmaceutical Manufacturing,"Paris, France",Anuraag Gutgutia,Sanofi_Deep 07-03-2023,https://app.fireflies.ai/view/TrueFoundry-Deep...,
3,12428314989,DocuSign,Gandharv Kapoor (gandharv.kapoor@gmail.com),,DocuSign,>1000,>1B,>50,,,Software Development,"San Francisco, CA",Anuraag Gutgutia,,https://app.fireflies.ai/view/Meeting-with-Anu...,
4,12362100837,Max Life Insurance,Divyan Kavdia (divyan.kavdia@maxlifeinsurance....,,Max Life Insurance,>1000,>1B,Oct-25,,,Insurance,"Gurugram, Haryana",Nikunj Bajaj,,,


In [3]:
df.shape

(286, 16)

In [4]:
record_to_link_dict = dict(zip(df['Record ID'], df['Recordings']))
len(record_to_link_dict)

228

In [13]:
def read_fireflies_data(transcript_id):
    # API endpoint
    url = "https://api.fireflies.ai/graphql/"

    # Your API key
    api_key = "1356da62-552d-4cec-a8e7-e4449f9d4ec3"

    # Construct the request payload
    payload = {"query": '{transcript(id:"%s"){ title date sentences {text }} }' %transcript_id }

    # Set the headers and authentication for the request
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    # Send the request and get the response
    response = requests.post(url, headers=headers, data=json.dumps(payload))
    transcript = ' '.join([elem['text'] for elem in response.json()['data']['transcript']['sentences']])
    return transcript

In [16]:
empty_records = []
for record_id, link in record_to_link_dict.items():
    try: 
        if not link:
            empty_records.append(record_id)
            continue

        filepath = f'transcripts/{record_id}'
        if os.path.isfile(filepath):
            print('File already exists: ', record_id)
            continue

        actual_id = link.split("::")[-1]
        transcript = read_fireflies_data(actual_id)
        with open(filepath, 'w+') as f:
            f.write(transcript)
        print("Processed: ", record_id)
    except Exception as e:
        print(f"Some error happened in {record_id}. Error: {e}")

File already exists:  12428091307
File already exists:  12428091030
File already exists:  12428314989
File already exists:  12362155036
File already exists:  12362100552
File already exists:  12361625352
File already exists:  12361625221
File already exists:  12361625175
File already exists:  12339635717
File already exists:  12339423649
File already exists:  12336403532
File already exists:  12339422352
File already exists:  12336402979
File already exists:  12339399214
File already exists:  12339295384
File already exists:  12249609647
File already exists:  12249630284
File already exists:  12249629853
File already exists:  12150828398
File already exists:  12150828333
File already exists:  12150845733
File already exists:  12150738858
File already exists:  12150677061
File already exists:  12044422255
File already exists:  12044422040
File already exists:  12044331076
File already exists:  12044330588
File already exists:  11984272003
File already exists:  11942287761
File already e

In [5]:
def generate_response_given_text(record_id, context):
    # API endpoint
    url = "https://api.openai.com/v1/chat/completions"

    # Your API key
    api_key = "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"

    # The number of responses to generate
    n = 1

    # Messages
    prompt = 'Break down the context below to paragraphs. From each of them generate at least 3 relevant questions and long form answers in a list of well formatted jsons with exactly two keys and no new line or space characters. First key being a prompt which should have the generated question and second key should be a completion with the answer to that question. The data would look something like this- [{"prompt":"question1","completion":"answer1"},{"prompt":"question2","completion":"answer2"}]. Here is the context- '
    
    len_text = len(context.split(' '))
    start_num = 1  # Starting number of the interval
    end_num = len_text  # Ending number of the interval
    interval = 3000  # Interval window
    all_pairs = []

    # Generate the list of start and end indexes
    for start, end in zip(range(start_num, end_num, interval), range(start_num+interval, end_num+1, interval)):
        print(f"record_id: {record_id}, start: {start}, end: {end}")
        context = ' '.join(context.split(' ')[start:end])
        text_content = prompt + context
        messages= [{"role": "user", "content": text_content}]

        # Model
        model = "gpt-3.5-turbo"

        # Construct the request payload
        payload = {
            "model": model,
            "messages": messages,
            "n": n
        }

        # Set the headers and authentication for the request
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}"
        }

        # Send the request and get the response
        response = requests.post(url, headers=headers, data=json.dumps(payload))

        # Print the response
        try:
            generated_response = response.json()
            actual_message = generated_response['choices'][0]['message']['content'].replace('\n', '')
            new_text = str(actual_message).strip("'<>() ").replace('\'', '\"')
            prompt_completion_pairs = json.loads(actual_message)
            print(f"Got {len(prompt_completion_pairs)} pairs. length of full string {len(new_text)}")
            all_pairs.extend(prompt_completion_pairs)
        except Exception as e:
            # traceback.print_exception(*sys.exc_info())
            print(f"Error in json parsing. Error {e}")
            return
    return all_pairs


In [6]:
empty_records = []
for record_id, link in record_to_link_dict.items():
    try: 
        if not link:
            empty_records.append(record_id)
            continue

        filepath = f'transcripts/{record_id}'
        if not os.path.isfile(filepath):
            print('File does not exist: ', record_id)
            continue
            
        new_path = f"para_pairs_dir/{record_id}.jsonl"

        if os.path.isfile(new_path):
            print('JsonL already exist: ', record_id)
            continue

        with open(filepath, 'r') as f:
            context = f.read()
        try: 
            prompt_completion_pairs = generate_response_given_text(record_id, context)
        except Exception as error_prompt: 
            traceback.print_exception(*sys.exc_info())
            print(f"Error in fetching pairs. Error {error_prompt}")
            continue

        for prompt_completion_dict in prompt_completion_pairs:
            prompt_completion_dict['completion'] = f"uuid: {record_id} " + prompt_completion_dict['completion'] 


        with open(new_path, 'w') as f:
            for entry in prompt_completion_pairs:
                json.dump(entry, f)
                f.write('\n')
        print(f"Finished processing record {record_id}")
    except Exception as e:
        print(f"Error happened in {record_id}. Error: {e}")

JsonL already exist:  12428091307
JsonL already exist:  12428091030
JsonL already exist:  12428314989
record_id: 12362155036, start: 1, end: 3001
Got 5 pairs. length of full string 1651
record_id: 12362155036, start: 3001, end: 6001
Error in json parsing. Error Expecting value: line 1 column 1 (char 0)
Error happened in 12362155036. Error: 'NoneType' object is not iterable
record_id: 12362100552, start: 1, end: 3001
Error in json parsing. Error Extra data: line 1 column 2 (char 1)
Error happened in 12362100552. Error: 'NoneType' object is not iterable
record_id: 12361625352, start: 1, end: 3001
Error in json parsing. Error Extra data: line 1 column 253 (char 252)
Error happened in 12361625352. Error: 'NoneType' object is not iterable
record_id: 12361625221, start: 1, end: 3001
Error in json parsing. Error Extra data: line 1 column 2 (char 1)
Error happened in 12361625221. Error: 'NoneType' object is not iterable
record_id: 12361625175, start: 1, end: 3001
Error in json parsing. Error E

Error in json parsing. Error Expecting value: line 1 column 1 (char 0)
Error happened in 11185249632. Error: 'NoneType' object is not iterable
record_id: 11185249388, start: 1, end: 3001
Got 6 pairs. length of full string 1300
Finished processing record 11185249388
record_id: 10881905922, start: 1, end: 3001
Got 7 pairs. length of full string 1327
Finished processing record 10881905922
record_id: 10881905316, start: 1, end: 3001
Error in json parsing. Error Expecting value: line 1 column 1 (char 0)
Error happened in 10881905316. Error: 'NoneType' object is not iterable
record_id: 10881877574, start: 1, end: 3001
Error in json parsing. Error Extra data: line 1 column 2 (char 1)
Error happened in 10881877574. Error: 'NoneType' object is not iterable
record_id: 10784174762, start: 1, end: 3001
Got 5 pairs. length of full string 1516
Finished processing record 10784174762
record_id: 10777412370, start: 1, end: 3001
Got 4 pairs. length of full string 1845
record_id: 10777412370, start: 3001


KeyboardInterrupt



In [18]:
import json
import numpy as np
import pandas as pd 
df = pd.read_csv('structured_notes.tsv', sep='\t')
df.head()


actual_col_names = ['Brief background (if discussed)',
       'Use Cases for ML - types of models (Is monitoring important etc?)',
       'Current Stack for ML Deployments and pipeline',
       'Problems being faced where looking for solutions',
       'Questions asked wrt Product',
       'Feedback wrt Product - what would make them possibly adopt it']

good_questions_from_them = ['Give me a brief about the company and the individual?',
                            'What are the most important use cases and types of models for Machine Learning?',
                            'What are the commonly used tech stack for ML deployments and pipeline?',
                            'What are the common challenges and what solutions are people looking for?',
                            'What are the most common questions asked about the product?',
                            'What is the feedback on the TrueFoundry product?'
                           ]
df.rename(columns = dict(zip(actual_col_names, good_questions_from_them)), inplace = True)

df = df.replace({np.nan: None})
record_dicts = df[good_questions_from_them].to_dict('records')

print(df[good_questions_from_them].shape)
len(record_dicts), len(record_dicts[0])
record_dicts[0]
PROMPT_END = ' \n\n###\n\n'


structured_pairs = []
for record_dict in record_dicts:
    for prompt, response in record_dict.items():
        if response is None:
            continue 
        pair = {"prompt": prompt+PROMPT_END, "completion": ' ' + response + "\n"}
        structured_pairs.append(pair)


(63, 6)


In [20]:
from os import walk
PROMPT_END = ' \n\n###\n\n'
pairs_dir_files = []
para_pairs_dir_files = []

for (dirpath, dirnames, filenames) in walk('pairs_dir'):
    pairs_dir_files.extend(filenames)
    
for (dirpath, dirnames, filenames) in walk('para_pairs_dir'):
    para_pairs_dir_files.extend(filenames)

JSONL_FILENAME = "final_dataset.jsonl"
with open(JSONL_FILENAME, "w") as f:
    for a_file in pairs_dir_files:
        file_path = f"pairs_dir/{a_file}"
        with open(file_path, 'r') as g:
            lines = g.readlines()
        for line in lines:
            data = json.loads(line)
            data["prompt"] = data["prompt"].strip() + PROMPT_END
            data["completion"] = ' ' + data["completion"].strip() + "\n"
            json.dump(data, f)
            f.write("\n")
            
    for a_file in para_pairs_dir_files:
        file_path = f"para_pairs_dir/{a_file}"
        with open(file_path, 'r') as g:
            lines = g.readlines()
        for line in lines:
            data = json.loads(line)
            data["prompt"] = data["prompt"].strip() + PROMPT_END
            data["completion"] = ' ' + data["completion"].strip() + "\n"
            json.dump(data, f)
            f.write("\n")
            
with open(JSONL_FILENAME, 'a') as f:
    for entry in structured_pairs:
        json.dump(entry, f)
        f.write('\n')

In [21]:
import openai
API_KEY = "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"
openai.api_key = API_KEY

upload_response = openai.File.create(
                      file=open(JSONL_FILENAME, "rb"),
                      purpose='fine-tune'
                )
file_id = upload_response.id

In [32]:
fine_tune_response = openai.FineTune.create(training_file=file_id, model='davinci')

In [33]:
fine_tune_events = openai.FineTune.list_events(id=fine_tune_response.id)
fine_tune_events

<OpenAIObject list at 0x117b93310> JSON: {
  "data": [
    {
      "created_at": 1679127365,
      "level": "info",
      "message": "Created fine-tune: ft-hULRb0bN73dOoW6ddnedo7UX",
      "object": "fine-tune-event"
    }
  ],
  "object": "list"
}

In [34]:
retrieve_response = openai.FineTune.retrieve(id=fine_tune_response.id)
retrieve_response

<FineTune fine-tune id=ft-hULRb0bN73dOoW6ddnedo7UX at 0x117b9c630> JSON: {
  "created_at": 1679127365,
  "events": [
    {
      "created_at": 1679127365,
      "level": "info",
      "message": "Created fine-tune: ft-hULRb0bN73dOoW6ddnedo7UX",
      "object": "fine-tune-event"
    }
  ],
  "fine_tuned_model": null,
  "hyperparams": {
    "batch_size": null,
    "learning_rate_multiplier": null,
    "n_epochs": 4,
    "prompt_loss_weight": 0.01
  },
  "id": "ft-hULRb0bN73dOoW6ddnedo7UX",
  "model": "davinci",
  "object": "fine-tune",
  "organization_id": "org-ojH41IdW0UR2VlysxKUx8AjA",
  "result_files": [],
  "status": "pending",
  "training_files": [
    {
      "bytes": 478941,
      "created_at": 1679126656,
      "filename": "file",
      "id": "file-f1OaWuKo0Naqv7GjenCnfU1w",
      "object": "file",
      "purpose": "fine-tune",
      "status": "processed",
      "status_details": null
    }
  ],
  "updated_at": 1679127365,
  "validation_files": []
}

In [36]:
fine_tune_list = openai.FineTune.list()
fine_tune_list['data'][-4:]

[<FineTune fine-tune id=ft-m6l8jZ5lr9ziZ92xE2017iCZ at 0x117bf9810> JSON: {
   "created_at": 1679087541,
   "fine_tuned_model": "ada:ft-truefoundry-2023-03-17-21-22-33",
   "hyperparams": {
     "batch_size": 1,
     "learning_rate_multiplier": 0.1,
     "n_epochs": 4,
     "prompt_loss_weight": 0.01
   },
   "id": "ft-m6l8jZ5lr9ziZ92xE2017iCZ",
   "model": "ada",
   "object": "fine-tune",
   "organization_id": "org-ojH41IdW0UR2VlysxKUx8AjA",
   "result_files": [
     {
       "bytes": 137102,
       "created_at": 1679088154,
       "filename": "compiled_results.csv",
       "id": "file-ITpHe8yutFEzrRM8fgkfgd1x",
       "object": "file",
       "purpose": "fine-tune-results",
       "status": "processed",
       "status_details": null
     }
   ],
   "status": "succeeded",
   "training_files": [
     {
       "bytes": 148134,
       "created_at": 1679087516,
       "filename": "file",
       "id": "file-8GOnd7VCyS2AikxEoP0K3ojp",
       "object": "file",
       "purpose": "fine-tune",
