In [34]:
import dspy
import pandas as pd
import numpy as np
import openai,os,sys
from time import sleep, time
from datetime import date
today = date.today()
from dspy.evaluate import Evaluate
from dspy.teleprompt import MIPRO

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [35]:
# Seed that selects which train/test split files this run reads.
seed = 78516

# Read the test split for this seed and preview its first rows.
test_path = f'../data/test/lab-manual-split-combine-test-{seed}.xlsx'
df = pd.read_excel(test_path)
df.head()

Unnamed: 0,index,sentence,year,label,orig_index
0,687,Setting the horizon on the interest rate caps ...,2006,2,666
1,595,"Nonetheless, employment is still 9.5 million b...",1999,0,576
2,824,The shifting balance of domestic demand and po...,2017,1,801
3,133,"By 2009, the forecasts for both the headline a...",2022,2,130
4,297,"In Japan, private consumption rebounded strong...",2006,2,283


In [None]:
# Read the training split that matches `seed` (assigned in the test-loading cell).
train = pd.read_excel(f'../data/train/lab-manual-split-combine-train-{seed}.xlsx')

# Draw up to 10 rows per label. random_state pins the draw so that restarting
# the kernel selects the same rows; without it every run evaluated and
# optimised against a different sample.
sample = (
    train.groupby('label', group_keys=False)
    .apply(lambda x: x.sample(min(len(x), 10), random_state=seed))
    .reset_index(drop=True)
)

# Wrap each sampled row as a dspy.Example: `sentence` is the only input field,
# and `answer` carries the value of the `label` column.
training_set = [
    dspy.Example(sentence=sentence, answer=label).with_inputs("sentence")
    for sentence, label in zip(sample['sentence'], sample['label'])
]

In [None]:
# Wrap every test row as a dspy.Example: `sentence` is the only input field,
# and `answer` carries the value of the `label` column.
testing_set = [
    dspy.Example(sentence=sentence, answer=label).with_inputs("sentence")
    for sentence, label in zip(df['sentence'], df['label'])
]

# Preview only the first few examples; echoing the whole list would dump every
# test sentence into the notebook output.
testing_set[:3]

In [39]:
# Take the first test sentence as a quick sample input and show it.
sent = df.loc[0, 'sentence']
print(sent)

Setting the horizon on the interest rate caps to reinforce forward guidance on the policy rate would augment the credibility of the yield curve caps and thereby diminish concerns about an open-ended balance sheet commitment.


In [40]:
# `api_key` is not assigned in any visible cell of this notebook, so on a fresh
# kernel the dspy.OpenAI call below raised NameError. Keep a value that was
# already defined; otherwise read it from the environment (never hard-code it).
try:
    api_key
except NameError:
    api_key = os.environ["OPENAI_API_KEY"]

# Configure gpt-3.5-turbo-instruct as the default LM in dspy.settings.
turbo = dspy.OpenAI(model='gpt-3.5-turbo-instruct', max_tokens=1000, api_key=api_key)
dspy.settings.configure(lm=turbo)

# Maps the integer labels in the data files to the stance names used in the prompt.
d = {0: 'dovish', 1: 'hawkish', 2: 'neutral'}

In [41]:
# DSPy signature for the stance-classification task: one input (`sentence`),
# one output (`stance`). DSPy uses the class docstring as the signature's
# instructions, so editing that text changes the prompt sent to the LM.
class StanceAnalysis(dspy.Signature):
    """Classify the sentence's stance on the monetary policy between hawkish, neutral, and dovish."""
    
    # Input field: the sentence to classify.
    sentence = dspy.InputField()
    # Output field: the predicted stance; `desc` lists the three expected values.
    stance = dspy.OutputField(desc = "hawkish, neutral, or dovish")

class Analysis(dspy.Module):
    """DSPy module that classifies one sentence with a single `StanceAnalysis` predictor."""

    def __init__(self):
        """Create the module and its one `dspy.Predict` step."""
        super().__init__()
        self.predict = dspy.Predict(StanceAnalysis)

    def forward(self, sentence):
        """Run the predictor on `sentence` and return its result unchanged."""
        prediction = self.predict(sentence=sentence)
        return prediction

In [42]:
# Smoke test 1: call the bare predictor on the sample sentence. The result is
# printed explicitly because only a cell's last expression is echoed; the
# original call spent an LM request whose answer was never shown.
classify = dspy.Predict(StanceAnalysis)
print(classify(sentence=sent))

# Smoke test 2: run the module wrapper on another test sentence (row 4).
analyze = Analysis()
analyze(df.loc[4].sentence)

Prediction(
    stance='hawkish'
)

In [44]:
def answer_match(example, pred, trace=None):
    """Metric: does the predicted stance equal the gold label's stance name?

    Args:
        example: object whose `answer` is a key of the module-level label map `d`.
        pred: object whose `stance` holds the model's text output.
        trace: accepted so the function fits DSPy's metric call signature; unused.

    Returns:
        bool: True when the two names match, ignoring case and surrounding
        whitespace; False otherwise (including when `pred.stance` is empty or None).
    """
    expected = d[example.answer].strip().lower()
    # Completions can carry a leading space or trailing newline; strip them so
    # an otherwise-correct answer is not scored as a miss. A missing stance is
    # treated as a wrong answer instead of raising AttributeError.
    predicted = (pred.stance or "").strip().lower()
    return expected == predicted

In [14]:
# Baseline: score the un-optimised Analysis module on the sampled training set.
evaluate = Evaluate(
    devset=training_set,
    metric=answer_match,
    display_progress=True,
    display_table=0,
)
baseline_program = Analysis()
evaluate(baseline_program)

Average Metric: 0 / 1  (0.0):   3%|▎         | 1/30 [00:00<00:09,  3.17it/s]

Actual: dovish, Predicted: neutral


Average Metric: 1 / 2  (50.0):   7%|▋         | 2/30 [00:00<00:10,  2.80it/s]

Actual: dovish, Predicted: dovish


Average Metric: 1 / 3  (33.3):  10%|█         | 3/30 [00:00<00:08,  3.30it/s]

Actual: dovish, Predicted: neutral


Average Metric: 2 / 4  (50.0):  13%|█▎        | 4/30 [00:01<00:06,  3.75it/s]

Actual: dovish, Predicted: dovish


Average Metric: 3 / 5  (60.0):  17%|█▋        | 5/30 [00:01<00:07,  3.16it/s]

Actual: dovish, Predicted: dovish


Average Metric: 4 / 6  (66.7):  20%|██        | 6/30 [00:02<00:12,  1.94it/s]

Actual: dovish, Predicted: dovish


Average Metric: 4 / 7  (57.1):  23%|██▎       | 7/30 [00:02<00:10,  2.13it/s]

Actual: dovish, Predicted: neutral


Average Metric: 4 / 8  (50.0):  27%|██▋       | 8/30 [00:03<00:09,  2.27it/s]

Actual: dovish, Predicted: neutral


Average Metric: 5 / 9  (55.6):  30%|███       | 9/30 [00:03<00:07,  2.70it/s]

Actual: dovish, Predicted: dovish


Average Metric: 6 / 10  (60.0):  33%|███▎      | 10/30 [00:03<00:06,  3.06it/s]

Actual: dovish, Predicted: dovish


Average Metric: 7 / 11  (63.6):  37%|███▋      | 11/30 [00:04<00:06,  2.96it/s]

Actual: hawkish, Predicted: hawkish


Average Metric: 8 / 12  (66.7):  40%|████      | 12/30 [00:04<00:05,  3.28it/s]

Actual: hawkish, Predicted: hawkish


Average Metric: 9 / 13  (69.2):  43%|████▎     | 13/30 [00:04<00:04,  3.59it/s]

Actual: hawkish, Predicted: hawkish


Average Metric: 10 / 14  (71.4):  47%|████▋     | 14/30 [00:04<00:04,  3.83it/s]

Actual: hawkish, Predicted: hawkish


Average Metric: 10 / 15  (66.7):  50%|█████     | 15/30 [00:04<00:03,  3.94it/s]

Actual: hawkish, Predicted: dovish


Average Metric: 11 / 16  (68.8):  53%|█████▎    | 16/30 [00:05<00:03,  4.07it/s]

Actual: hawkish, Predicted: hawkish


Average Metric: 11 / 17  (64.7):  57%|█████▋    | 17/30 [00:05<00:03,  3.57it/s]

Actual: hawkish, Predicted: neutral


Average Metric: 11 / 18  (61.1):  60%|██████    | 18/30 [00:05<00:03,  3.28it/s]

Actual: hawkish, Predicted: neutral


Average Metric: 12 / 19  (63.2):  63%|██████▎   | 19/30 [00:06<00:03,  3.56it/s]

Actual: hawkish, Predicted: hawkish


Average Metric: 13 / 20  (65.0):  67%|██████▋   | 20/30 [00:06<00:02,  3.74it/s]

Actual: hawkish, Predicted: hawkish


Average Metric: 14 / 21  (66.7):  70%|███████   | 21/30 [00:06<00:02,  3.41it/s]

Actual: neutral, Predicted: neutral


Average Metric: 15 / 22  (68.2):  73%|███████▎  | 22/30 [00:06<00:02,  3.61it/s]

Actual: neutral, Predicted: neutral


Average Metric: 16 / 23  (69.6):  77%|███████▋  | 23/30 [00:07<00:02,  3.35it/s]

Actual: neutral, Predicted: neutral


Average Metric: 17 / 24  (70.8):  80%|████████  | 24/30 [00:07<00:02,  2.97it/s]

Actual: neutral, Predicted: neutral


Average Metric: 18 / 25  (72.0):  83%|████████▎ | 25/30 [00:07<00:01,  3.26it/s]

Actual: neutral, Predicted: neutral


Average Metric: 18 / 26  (69.2):  87%|████████▋ | 26/30 [00:08<00:01,  3.57it/s]

Actual: neutral, Predicted: hawkish


Average Metric: 19 / 27  (70.4):  90%|█████████ | 27/30 [00:08<00:00,  3.61it/s]

Actual: neutral, Predicted: neutral


Average Metric: 20 / 28  (71.4):  93%|█████████▎| 28/30 [00:08<00:00,  3.33it/s]

Actual: neutral, Predicted: neutral


Average Metric: 21 / 29  (72.4):  97%|█████████▋| 29/30 [00:08<00:00,  3.62it/s]

Actual: neutral, Predicted: neutral


Average Metric: 21 / 30  (70.0): 100%|██████████| 30/30 [00:09<00:00,  3.17it/s]

Actual: neutral, Predicted: dovish
Average Metric: 21 / 30  (70.0%)





70.0

In [45]:
# Optimise the Analysis module with MIPRO, using answer_match as the metric and
# the sampled training set as the trainset.
teleprompter = MIPRO(metric=answer_match)
compile_options = dict(
    num_trials=5,
    max_bootstrapped_demos=3,
    max_labeled_demos=3,
    eval_kwargs=dict(display_progress=True, display_table=0),
)
optimized_program = teleprompter.compile(Analysis(), trainset=training_set, **compile_options)


Please be advised that based on the parameters you have set, the maximum number of LM calls is projected as follows:

[93m- Task Model: [94m[1m30[0m[93m examples in dev set * [94m[1m5[0m[93m trials * [94m[1m# of LM calls in your program[0m[93m = ([94m[1m150 * # of LM calls in your program[0m[93m) task model calls[0m
[93m- Prompt Model: # data summarizer calls (max [94m[1m10[0m[93m) + [94m[1m10[0m[93m * [94m[1m1[0m[93m lm calls in program = [94m[1m20[0m[93m prompt model calls[0m

[93m[1mEstimated Cost Calculation:[0m

[93mTotal Cost = (Number of calls to task model * (Avg Input Token Length per Call * Task Model Price per Input Token + Avg Output Token Length per Call * Task Model Price per Output Token) 
            + (Number of calls to prompt model * (Avg Input Token Length per Call * Task Prompt Price per Input Token + Avg Output Token Length per Call * Prompt Model Price per Output Token).[0m

For a preliminary estimate of potential costs, 

 10%|█         | 3/30 [00:01<00:11,  2.34it/s]


Bootstrapped 3 full traces after 4 examples in round 0.


 10%|█         | 3/30 [00:00<00:06,  4.26it/s]


Bootstrapped 3 full traces after 4 examples in round 0.


 13%|█▎        | 4/30 [00:00<00:05,  4.96it/s]


Bootstrapped 3 full traces after 5 examples in round 0.


 13%|█▎        | 4/30 [00:01<00:08,  3.08it/s]


Bootstrapped 3 full traces after 5 examples in round 0.


 10%|█         | 3/30 [00:00<00:02, 11.41it/s]


Bootstrapped 3 full traces after 4 examples in round 0.


 10%|█         | 3/30 [00:00<00:03,  8.18it/s]


Bootstrapped 3 full traces after 4 examples in round 0.


 13%|█▎        | 4/30 [00:00<00:06,  4.06it/s]


Bootstrapped 3 full traces after 5 examples in round 0.


 17%|█▋        | 5/30 [00:00<00:01, 14.28it/s]


Bootstrapped 3 full traces after 6 examples in round 0.


 10%|█         | 3/30 [00:00<00:02, 12.22it/s]


Bootstrapped 3 full traces after 4 examples in round 0.


[I 2024-04-07 15:23:47,496] A new study created in memory with name: no-name-117fdb80-015e-464d-90b1-77fc31e966a4


Starting trial #0


Average Metric: 16 / 30  (53.3): 100%|██████████| 30/30 [00:07<00:00,  3.81it/s]
[I 2024-04-07 15:23:55,374] Trial 0 finished with value: 53.33 and parameters: {'2170764072224_predictor_instruction': 1, '2170764072224_predictor_demos': 1}. Best is trial 0 with value: 53.33.


Average Metric: 16 / 30  (53.3%)
Starting trial #1


Average Metric: 17 / 30  (56.7): 100%|██████████| 30/30 [00:08<00:00,  3.35it/s]
[I 2024-04-07 15:24:04,325] Trial 1 finished with value: 56.67 and parameters: {'2170764072224_predictor_instruction': 5, '2170764072224_predictor_demos': 4}. Best is trial 1 with value: 56.67.


Average Metric: 17 / 30  (56.7%)
Starting trial #2


Average Metric: 11 / 30  (36.7): 100%|██████████| 30/30 [00:09<00:00,  3.21it/s]
[I 2024-04-07 15:24:13,670] Trial 2 finished with value: 36.67 and parameters: {'2170764072224_predictor_instruction': 3, '2170764072224_predictor_demos': 0}. Best is trial 1 with value: 56.67.


Average Metric: 11 / 30  (36.7%)
Starting trial #3


Average Metric: 19 / 30  (63.3): 100%|██████████| 30/30 [00:09<00:00,  3.18it/s]
[I 2024-04-07 15:24:23,116] Trial 3 finished with value: 63.33 and parameters: {'2170764072224_predictor_instruction': 9, '2170764072224_predictor_demos': 3}. Best is trial 3 with value: 63.33.


Average Metric: 19 / 30  (63.3%)
Starting trial #4


Average Metric: 19 / 30  (63.3): 100%|██████████| 30/30 [00:09<00:00,  3.33it/s]
[I 2024-04-07 15:24:32,130] Trial 4 finished with value: 63.33 and parameters: {'2170764072224_predictor_instruction': 8, '2170764072224_predictor_demos': 4}. Best is trial 3 with value: 63.33.


Average Metric: 19 / 30  (63.3%)
Returning predict = Predict(StringSignature(sentence -> stance
    instructions="classify the sentence's stance on the monetary policy between supportive, neutral, and opposing."
    sentence = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Sentence:', 'desc': '${sentence}'})
    stance = Field(annotation=str required=True json_schema_extra={'desc': 'hawkish, neutral, or dovish', '__dspy_field_type': 'output', 'prefix': '[Classification]", followed by the suggested stance for the prompt, for example: "[Classification] hawkish", "[Classification] neutral", or "[Classification] dovish'})
)) from continue_program


In [46]:
# Echo the compiled program to inspect the instruction text and output prefix it ended up with.
optimized_program

predict = Predict(StringSignature(sentence -> stance
    instructions="classify the sentence's stance on the monetary policy between supportive, neutral, and opposing."
    sentence = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Sentence:', 'desc': '${sentence}'})
    stance = Field(annotation=str required=True json_schema_extra={'desc': 'hawkish, neutral, or dovish', '__dspy_field_type': 'output', 'prefix': '[Classification]", followed by the suggested stance for the prompt, for example: "[Classification] hawkish", "[Classification] neutral", or "[Classification] dovish'})
))

In [47]:
# Score the optimised program on the full test set. return_outputs=True is
# requested so the call's result can be unpacked into per-example rows in the
# next cell.
evaluate = Evaluate(
    devset=testing_set,
    metric=answer_match,
    display_progress=True,
    return_outputs=True,
)
outputs = evaluate(optimized_program)

Average Metric: 323 / 496  (65.1): 100%|██████████| 496/496 [02:32<00:00,  3.26it/s]

Average Metric: 323 / 496  (65.1%)





In [48]:
# Collect one record per evaluated example from `outputs[1]`; in each item,
# element 0 is the example and element 1 the prediction. Building the frame
# once replaces the row-by-row `outdf.loc[len(outdf)] = ...` append, which
# enlarges the DataFrame on every insert.
records = [
    [item[0].sentence, d[item[0].answer].lower(), item[1].stance.lower()]
    for item in outputs[1]
]
outdf = pd.DataFrame(records, columns=['Sentence', 'Actual', 'Predicted'])

# Persist the predictions for this seed, then report accuracy and weighted F1.
outdf.to_csv(f"../data/llm_prompt_outputs/dspy_{seed}.csv", index=False)
print(f"Accuracy: {accuracy_score(outdf['Actual'], outdf['Predicted'])}")
print(f"F1: {f1_score(outdf['Actual'], outdf['Predicted'], average='weighted')}")
outdf.head()

Accuracy: 0.6512096774193549
F1: 0.6398555274413625


Unnamed: 0,Sentence,Actual,Predicted
0,Setting the horizon on the interest rate caps ...,neutral,neutral
1,"Nonetheless, employment is still 9.5 million b...",dovish,dovish
2,The shifting balance of domestic demand and po...,hawkish,neutral
3,"By 2009, the forecasts for both the headline a...",neutral,neutral
4,"In Japan, private consumption rebounded strong...",neutral,neutral


In [49]:
# Hard-coded F1 scores being summarised; the middle value matches the weighted
# F1 printed for this run. Presumably one score per seed -- TODO confirm.
f1_scores = np.array([0.6068066709, 0.6398555274, 0.6768750959])

# Mean and standard deviation (NumPy default, ddof=0) across the scores.
print(f1_scores.mean())
print(f1_scores.std())

0.6411790980666666
0.028620621062180572
