In [7]:
import openai
import json
import csv
import numpy as np

In [8]:
# Load training data
with open('dataset/train.json') as f:
    data_dict = json.load(f)

dataset = np.array([
    ('\n'.join(data['pre_text'] 
                + data['post_text'] 
                + [json.dumps(data['table_ori']), json.dumps(data['table'])]) 
     + '\n' + data['qa']['question'], 
     data['qa']['answer'])
    for data in data_dict
])

# Remove questions without an answer
clean_dataset = np.array([(p, c) for (p, c) in dataset if c])

In [9]:
# Save to CSV
with open('formatted_data.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(["prompt", "completion"])  # write header
    writer.writerows(clean_dataset)  # write data

In [10]:
# Convert CSV into format usable by GPT
!openai tools fine_tunes.prepare_data -f "formatted_data.csv"

Analyzing...

- Based on your file extension, your file is formatted as a CSV file
- Your file contains 6203 prompt-completion pairs
- There are 47 duplicated prompt-completion sets. These are rows: [1511, 1786, 1863, 1899, 1915, 2067, 2360, 2701, 2886, 2995, 3103, 3351, 3372, 3426, 3461, 3493, 3585, 3715, 3719, 3740, 3759, 3773, 3799, 3906, 3908, 3923, 3971, 4035, 4041, 4229, 4332, 4451, 4638, 4700, 4728, 4745, 4912, 4913, 5030, 5175, 5273, 5324, 5367, 5419, 5451, 5484, 5511]
- There are 45 examples that are very long. These are rows: [66, 413, 494, 533, 972, 1121, 1281, 1308, 1319, 1391, 1465, 1568, 1573, 1655, 1975, 2137, 2140, 2223, 2519, 2799, 2912, 2923, 2963, 3249, 3282, 3534, 3947, 4018, 4194, 4244, 4270, 4297, 4343, 4413, 4633, 5702, 5704, 5795, 5876, 5929, 5969, 5992, 6023, 6167, 6189]
For conditional generation, and for classification the examples shouldn't be longer than 2048 tokens.^C



In [15]:
# Load validation data
with open('dataset/test.json') as f:
    valid_data_dict = json.load(f)

valid_dataset = np.array([
    ('\n'.join(data['pre_text'] 
                + data['post_text'] 
                + [json.dumps(data['table_ori']), json.dumps(data['table'])]) 
     + '\n' + data['qa']['question'], 
     data['qa']['answer'])
    for data in valid_data_dict
])

# Remove questions without an answer
clean_valid_dataset = np.array([(p, c) for (p, c) in valid_dataset if c])

# Save to CSV
with open('formatted_valid_data.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(["prompt", "completion"])  # write header
    writer.writerows(clean_valid_dataset)  # write data


In [16]:
# Convert CSV into format usable by GPT
!openai tools fine_tunes.prepare_data -f "formatted_valid_data.csv"

Analyzing...

- Based on your file extension, your file is formatted as a CSV file
- Your file contains 1133 prompt-completion pairs
- There are 9 duplicated prompt-completion sets. These are rows: [222, 264, 535, 662, 755, 760, 951, 953, 957]
- Your data does not contain a common separator at the end of your prompts. Having a separator string appended to the end of the prompt makes it clearer to the fine-tuned model where the completion should begin. See https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more detail and examples. If you intend to do open-ended generation, then you should leave the prompts empty
- Your data does not contain a common ending at the end of your completions. Having a common ending string appended to the end of the completion makes it clearer to the fine-tuned model where the completion should end. See https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more detail and examples.
- The completion should 

In [11]:
file = open("api_key.txt", "r")
api_key = file.read()
file.close()

In [40]:
openai.api_key = api_key

In [17]:
training_upload_response = openai.File.create(
  file=open("formatted_data_prepared.jsonl", "rb"),
  purpose='fine-tune'
)
training_file_id = training_upload_response.id
training_upload_response

<File file id=file-mjKGzWqB6QJgbR337S2mQTgf at 0x7fe478362900> JSON: {
  "bytes": 28792529,
  "created_at": 1684274350,
  "filename": "file",
  "id": "file-mjKGzWqB6QJgbR337S2mQTgf",
  "object": "file",
  "purpose": "fine-tune",
  "status": "uploaded",
  "status_details": null
}

In [19]:
valid_upload_response = openai.File.create(
  file=open("formatted_valid_data_prepared.jsonl", "rb"),
  purpose='fine-tune'
)
valid_file_id = valid_upload_response.id
valid_upload_response

<File file id=file-wJevgYfAUGFx50S5xR4Y9Od8 at 0x7fe4f8459c20> JSON: {
  "bytes": 5219225,
  "created_at": 1684274410,
  "filename": "file",
  "id": "file-wJevgYfAUGFx50S5xR4Y9Od8",
  "object": "file",
  "purpose": "fine-tune",
  "status": "uploaded",
  "status_details": null
}

In [20]:
fine_tune_response = openai.FineTune.create(training_file=training_file_id, validation_file=valid_file_id, model="ada")

<FineTune fine-tune id=ft-PzwpGKewlzXC1fDppl3IAiin at 0x7fe4bb475180> JSON: {
  "created_at": 1684274446,
  "events": [
    {
      "created_at": 1684274446,
      "level": "info",
      "message": "Created fine-tune: ft-PzwpGKewlzXC1fDppl3IAiin",
      "object": "fine-tune-event"
    }
  ],
  "fine_tuned_model": null,
  "hyperparams": {
    "batch_size": null,
    "learning_rate_multiplier": null,
    "n_epochs": 4,
    "prompt_loss_weight": 0.01
  },
  "id": "ft-PzwpGKewlzXC1fDppl3IAiin",
  "model": "ada",
  "object": "fine-tune",
  "organization_id": "org-rSvnkirO3SMFhS8NAtAozdxK",
  "result_files": [],
  "status": "pending",
  "training_files": [
    {
      "bytes": 28792529,
      "created_at": 1684274350,
      "filename": "file",
      "id": "file-mjKGzWqB6QJgbR337S2mQTgf",
      "object": "file",
      "purpose": "fine-tune",
      "status": "processed",
      "status_details": null
    }
  ],
  "updated_at": 1684274446,
  "validation_files": [
    {
      "bytes": 5219225,
  

In [21]:
job_id = fine_tune_response["id"]
status = fine_tune_response["status"]

print(f'Fine-tunning model with jobID: {job_id}.')
print(f"Training Response: {fine_tune_response}")
print(f"Training Status: {status}")

Fine-tunning model with jobID: ft-PzwpGKewlzXC1fDppl3IAiin.
Training Response: {
  "created_at": 1684274446,
  "events": [
    {
      "created_at": 1684274446,
      "level": "info",
      "message": "Created fine-tune: ft-PzwpGKewlzXC1fDppl3IAiin",
      "object": "fine-tune-event"
    }
  ],
  "fine_tuned_model": null,
  "hyperparams": {
    "batch_size": null,
    "learning_rate_multiplier": null,
    "n_epochs": 4,
    "prompt_loss_weight": 0.01
  },
  "id": "ft-PzwpGKewlzXC1fDppl3IAiin",
  "model": "ada",
  "object": "fine-tune",
  "organization_id": "org-rSvnkirO3SMFhS8NAtAozdxK",
  "result_files": [],
  "status": "pending",
  "training_files": [
    {
      "bytes": 28792529,
      "created_at": 1684274350,
      "filename": "file",
      "id": "file-mjKGzWqB6QJgbR337S2mQTgf",
      "object": "file",
      "purpose": "fine-tune",
      "status": "processed",
      "status_details": null
    }
  ],
  "updated_at": 1684274446,
  "validation_files": [
    {
      "bytes": 5219225,

In [27]:
import signal
import datetime

def signal_handler(sig, frame):
	status = openai.FineTune.retrieve(job_id).status
	print(f"Stream interrupted. Job is still {status}.")
	return

print(f'Streaming events for the fine-tuning job: {job_id}')
signal.signal(signal.SIGINT, signal_handler)

events = openai.FineTune.stream_events(job_id)
try:
    for event in events:
    	print(f'{datetime.datetime.fromtimestamp(event["created_at"])} {event["message"]}')
except Exception:
	print("Stream interrupted (client disconnected).")

Streaming events for the fine-tuning job: ft-PzwpGKewlzXC1fDppl3IAiin
2023-05-17 00:00:46 Created fine-tune: ft-PzwpGKewlzXC1fDppl3IAiin
2023-05-17 00:03:12 Fine-tune costs $10.10
2023-05-17 00:03:12 Fine-tune enqueued. Queue number: 12
2023-05-17 00:10:08 Fine-tune is in the queue. Queue number: 11
2023-05-17 00:10:09 Fine-tune is in the queue. Queue number: 10
2023-05-17 00:10:55 Fine-tune is in the queue. Queue number: 9
2023-05-17 00:12:13 Fine-tune is in the queue. Queue number: 8
2023-05-17 00:12:15 Fine-tune is in the queue. Queue number: 7
2023-05-17 00:12:49 Fine-tune is in the queue. Queue number: 6
2023-05-17 00:14:04 Fine-tune is in the queue. Queue number: 5
2023-05-17 00:14:22 Fine-tune is in the queue. Queue number: 4
2023-05-17 00:14:22 Fine-tune is in the queue. Queue number: 3
2023-05-17 00:14:24 Fine-tune is in the queue. Queue number: 2
2023-05-17 00:14:25 Fine-tune is in the queue. Queue number: 1
2023-05-17 00:14:26 Fine-tune is in the queue. Queue number: 0
2023-

In [29]:
import time

status = openai.FineTune.retrieve(id=job_id)["status"]
if status not in ["succeeded", "failed"]:
	print(f'Job not in terminal status: {status}. Waiting.')
	while status not in ["succeeded", "failed"]:
		time.sleep(2)
		status = openai.FineTune.retrieve(id=job_id)["status"]
		print(f'Status: {status}')
else:
	print(f'Finetune job {job_id} finished with status: {status}')

print('Checking other finetune jobs in the subscription.')
result = openai.FineTune.list()
print(f'Found {len(result.data)} finetune jobs.')

Finetune job ft-PzwpGKewlzXC1fDppl3IAiin finished with status: succeeded
Checking other finetune jobs in the subscription.
Found 1 finetune jobs.


In [54]:
retrieve_response = openai.FineTune.retrieve(job_id)
fine_tuned_model = retrieve_response.fine_tuned_model
fine_tuned_model

'ada:ft-personal-2023-05-16-23-02-15'

In [63]:
new_prompt = '''entergy corporation and subsidiaries management 2019s financial discussion and analysis a result of the entergy louisiana and entergy gulf states louisiana business combination , results of operations for 2015 also include two items that occurred in october 2015 : 1 ) a deferred tax asset and resulting net increase in tax basis of approximately $ 334 million and 2 ) a regulatory liability of $ 107 million ( $ 66 million net-of-tax ) as a result of customer credits to be realized by electric customers of entergy louisiana , consistent with the terms of the stipulated settlement in the business combination proceeding .
see note 2 to the financial statements for further discussion of the business combination and customer credits .
results of operations for 2015 also include the sale in december 2015 of the 583 mw rhode island state energy center for a realized gain of $ 154 million ( $ 100 million net-of-tax ) on the sale and the $ 77 million ( $ 47 million net-of-tax ) write-off and regulatory charges to recognize that a portion of the assets associated with the waterford 3 replacement steam generator project is no longer probable of recovery .
see note 14 to the financial statements for further discussion of the rhode island state energy center sale .
see note 2 to the financial statements for further discussion of the waterford 3 write-off .
results of operations for 2014 include $ 154 million ( $ 100 million net-of-tax ) of charges related to vermont yankee primarily resulting from the effects of an updated decommissioning cost study completed in the third quarter 2014 along with reassessment of the assumptions regarding the timing of decommissioning cash flows and severance and employee retention costs .
see note 14 to the financial statements for further discussion of the charges .
results of operations for 2014 also include the $ 56.2 million ( $ 36.7 million net-of-tax ) write-off in 2014 of entergy mississippi 2019s regulatory asset associated with new nuclear generation development costs as a result of a joint stipulation entered into with the mississippi public utilities staff , subsequently approved by the mpsc , in which entergy mississippi agreed not to pursue recovery of the costs deferred by an mpsc order in the new nuclear generation docket .
see note 2 to the financial statements for further discussion of the new nuclear generation development costs and the joint stipulation .
net revenue utility following is an analysis of the change in net revenue comparing 2015 to 2014 .
amount ( in millions ) .
the retail electric price variance is primarily due to : 2022 formula rate plan increases at entergy louisiana , as approved by the lpsc , effective december 2014 and january 2015 ; 2022 an increase in energy efficiency rider revenue primarily due to increases in the energy efficiency rider at entergy arkansas , as approved by the apsc , effective july 2015 and july 2014 , and new energy efficiency riders at entergy louisiana and entergy mississippi that began in the fourth quarter 2014 ; and 2022 an annual net rate increase at entergy mississippi of $ 16 million , effective february 2015 , as a result of the mpsc order in the june 2014 rate case .
see note 2 to the financial statements for a discussion of rate and regulatory proceedings. .
[["", "Amount (In Millions)"], ["2014 net revenue", "$5,735"], ["Retail electric price", "187"], ["Volume/weather", "95"], ["Waterford 3 replacement steam generator provision", "(32)"], ["MISO deferral", "(35)"], ["Louisiana business combination customer credits", "(107)"], ["Other", "(14)"], ["2015 net revenue", "$5,829"]]
[["", "amount ( in millions )"], ["2014 net revenue", "$ 5735"], ["retail electric price", "187"], ["volume/weather", "95"], ["waterford 3 replacement steam generator provision", "-32 ( 32 )"], ["miso deferral", "-35 ( 35 )"], ["louisiana business combination customer credits", "-107 ( 107 )"], ["other", "-14 ( 14 )"], ["2015 net revenue", "$ 5829"]]
what is the net change in net revenue during 2015 for entergy corporation?\n\n###\n\n'''

answer = openai.Completion.create(
  model=fine_tuned_model,
  prompt=new_prompt,
  max_tokens=10,
  temperature=0
)
answer['choices'][0]['text']

' -184\n\n###\n\n -17\n\n'

In [64]:
new_prompt = '''undesignated hedges was $ 41.2 million and $ 42.1 million , respectively .
the fair value of these hedging instruments in the company 2019s consolidated balance sheets as of october 29 , 2011 and october 30 , 2010 was immaterial .
interest rate exposure management 2014 on june 30 , 2009 , the company entered into interest rate swap transactions related to its outstanding 5.0% ( 5.0 % ) senior unsecured notes where the company swapped the notional amount of its $ 375 million of fixed rate debt at 5.0% ( 5.0 % ) into floating interest rate debt through july 1 , 2014 .
under the terms of the swaps , the company will ( i ) receive on the $ 375 million notional amount a 5.0% ( 5.0 % ) annual interest payment that is paid in two installments on the 1st of every january and july , commencing january 1 , 2010 through and ending on the maturity date ; and ( ii ) pay on the $ 375 million notional amount an annual three month libor plus 2.05% ( 2.05 % ) ( 2.42% ( 2.42 % ) as of october 29 , 2011 ) interest payment , payable in four installments on the 1st of every january , april , july and october , commencing on october 1 , 2009 and ending on the maturity date .
the libor- based rate is set quarterly three months prior to the date of the interest payment .
the company designated these swaps as fair value hedges .
the fair value of the swaps at inception was zero and subsequent changes in the fair value of the interest rate swaps were reflected in the carrying value of the interest rate swaps on the balance sheet .
the carrying value of the debt on the balance sheet was adjusted by an equal and offsetting amount .
the gain or loss on the hedged item ( that is , the fixed-rate borrowings ) attributable to the hedged benchmark interest rate risk and the offsetting gain or loss on the related interest rate swaps for fiscal year 2011 and fiscal year 2010 were as follows : statement of income .
the amounts earned and owed under the swap agreements are accrued each period and are reported in interest expense .
there was no ineffectiveness recognized in any of the periods presented .
the market risk associated with the company 2019s derivative instruments results from currency exchange rate or interest rate movements that are expected to offset the market risk of the underlying transactions , assets and liabilities being hedged .
the counterparties to the agreements relating to the company 2019s derivative instruments consist of a number of major international financial institutions with high credit ratings .
based on the credit ratings of our counterparties as of october 29 , 2011 , we do not believe that there is significant risk of nonperformance by them .
furthermore , none of the company 2019s derivative transactions are subject to collateral or other security arrangements and none contain provisions that are dependent on the company 2019s credit ratings from any credit rating agency .
while the contract or notional amounts of derivative financial instruments provide one measure of the volume of these transactions , they do not represent the amount of the company 2019s exposure to credit risk .
the amounts potentially subject to credit risk ( arising from the possible inability of counterparties to meet the terms of their contracts ) are generally limited to the amounts , if any , by which the counterparties 2019 obligations under the contracts exceed the obligations of the company to the counterparties .
as a result of the above considerations , the company does not consider the risk of counterparty default to be significant .
the company records the fair value of its derivative financial instruments in the consolidated financial statements in other current assets , other assets or accrued liabilities , depending on their net position , regardless of the purpose or intent for holding the derivative contract .
changes in the fair value of the derivative financial instruments are either recognized periodically in earnings or in shareholders 2019 equity as a component of oci .
changes in the fair value of cash flow hedges are recorded in oci and reclassified into earnings when the underlying contract matures .
changes in the fair values of derivatives not qualifying for hedge accounting are reported in earnings as they occur .
the total notional amounts of derivative instruments designated as hedging instruments as of october 29 , 2011 and october 30 , 2010 were $ 375 million of interest rate swap agreements accounted for as fair value hedges and $ 153.7 million and $ 139.9 million , respectively , of cash flow hedges denominated in euros , british pounds and analog devices , inc .
notes to consolidated financial statements 2014 ( continued ) .
[["Statement of Income", "October 29, 2011", "October 30, 2010"], ["Classification", "Loss on Swaps", "Gain on Note", "Net Income Effect", "Gain on Swaps", "Loss on Note", "Net Income Effect"], ["Other income", "$(4,614)", "$4,614", "$\u2014", "$20,692", "$(20,692)", "$\u2014"]]
[["statement of income classification", "statement of income loss on swaps", "statement of income gain on note", "statement of income net income effect", "statement of income gain on swaps", "loss on note", "net income effect"], ["other income", "$ -4614 ( 4614 )", "$ 4614", "$ 2014", "$ 20692", "$ -20692 ( 20692 )", "$ 2014"]]
what is the percentage change in cash flow hedges in 2011 compare to the 2010?\n\n###\n\n'''

answer = openai.Completion.create(
  model=fine_tuned_model,
  prompt=new_prompt,
  max_tokens=10,
  temperature=0
)
answer['choices'][0]['text']

' -21.0%\n\n###\n\n -'