In [3]:
import pandas as pd

In [13]:
evaluation_questions = dict()

with open("dataset/ques_ans.txt", "r") as file:
    lines = file.readlines()
    question_type = "Stock Performance"
    for line in lines:
        processed_line = line.strip()
        if question_type not in evaluation_questions:
            evaluation_questions[question_type] = []
        if processed_line != "":
            if processed_line.startswith("Q:"):
                ques_ans = dict()
                ques_ans["query"] = processed_line.removeprefix("Q: ")
            elif processed_line.startswith("A:"):
                ques_ans["response"] = processed_line.removeprefix("A: ")
                evaluation_questions[question_type].append(ques_ans)
            else:
                question_type = processed_line
    

In [14]:
import json
json.dump(evaluation_questions, open("dataset/ques_ans.json", 'w'))

In [15]:
json_file = open("dataset/ques_ans.json")
evaluation_queries = json.load(json_file)

In [16]:
eval_df = pd.DataFrame({
    'Category': [category for category in evaluation_queries for _ in evaluation_queries[category]],
    'Query': [query['query'] for category in evaluation_queries.values() for query in category],
    'Expectation': [query['response'] for category in evaluation_queries.values() for query in category]
})

In [22]:
import dspy
from pydantic import BaseModel, Field

class Score(BaseModel):
    commentary: str = Field(desc="The analysis of the score")
    Score: int = Field(desc="The score")

class Scorer(dspy.Signature):
    """
    You are a code evaluating agent. You take a query and code for evaluation. 
    You need to +1 for each of these attributes in the code

    {'correct_column_names','title','Annotations', 'Aggregation used',
    'correct axis label','Plotly_white theme','Correct chart type'}

    You are provided with a {query}
    and Plotly code {code}
    You need to tell me the total score
    
    """
    query = dspy.InputField(desc="user query which includes information about data and chart they want to plot")
    code = dspy.InputField(desc="The agent generated code")
    output: Score = dspy.OutputField(desc='The score after evaluating the code')


def check_code_run(code):
    score = 0
    try:
        code = code.split('```')[1]
        exec(code)
        score += 10
        return score

    except:
        return score
    
import re

def extract_integers(text):
    return [int(num) for num in re.findall(r'\d+', text)]

def evaluating_response(code, query):
    score = 0
    scorer = dspy.Predict(Scorer)
# Feeds the code and query to the LLM for evaluation
    response = scorer(query=query, code=code)
# Parses the LLM output, your implementation may vary
    print(response)
    score+=extract_integers(response.output)[0]
    return score

In [18]:
from pydantic import BaseModel, Field

class Plotly_code(BaseModel):
    commentary: str = Field(desc="The comments about the code")
    Code: str = Field(desc="The Plotly Code")

class AgentSig(dspy.Signature):
    """
    You are AI agent who uses the {query} to generate data visualizations in Plotly.
    You must give an output as code, in case there is no relevant columns, just state
    that you don't have the relevant information
    """
    query = dspy.InputField(desc="user query which includes information about data and chart they want to plot")
    code: Plotly_code = dspy.OutputField(desc="Plotly code that visualizes what the user needs according to the query & dataframe_index & styling_context")


class AI_data_viz_agent(dspy.Module):
    def __init__(self):
        super().__init__()
        self.agent = dspy.ChainOfThought(AgentSig)
    
    def forward(self, query):
        prediction = self.agent(query=query)
        return dspy.Prediction(code=prediction.code)


lm = dspy.OpenAI(model='gpt-3.5-turbo', api_key="sk-Vo7jCT5lrwMyJ1YeqpzmYvDaS9sYF4Xt_BPLSaiOywT3BlbkFJVP2JXFoP36HSMWqWluMD88AkB7t0KHJ8j-FM0BUngA")
dspy.configure(lm=lm)
agent=AI_data_viz_agent()
print(agent('What is the moving average of AAL stock?').code)

 		You are using the client GPT3, which will be removed in DSPy 2.6.
 		Changing the client is straightforward and will let you use new features (Adapters) that improve the consistency of LM outputs, especially when using chat LMs. 

 		Learn more about the changes and how to migrate at
 		https://github.com/stanfordnlp/dspy/blob/main/examples/migration.ipynb


```python
import plotly.express as px

# Assuming you have a dataframe named df with columns 'Date' and 'AAL'
df['Moving Average'] = df['AAL'].rolling(window=30).mean()

fig = px.line(df, x='Date', y='Moving Average', title='Moving Average of AAL Stock Prices')
fig.show()
```


In [23]:
code_list =[]
for q in eval_df['Query']:
    code_list.append(agent(q).code)
eval_df['Code'] = code_list

eval_df['check_run'] = [check_code_run(code) for code in eval_df['Code']]

eval_df['Attribute_Score'] = [evaluating_response(code,query) for code,query in zip(eval_df['Code'],eval_df['Query'])]

eval_df['Answerable'] = [1 if x.strip().lower()!='no relevant information' else 0 for x in eval_df['Expectation']]


Prediction(
    output='Total Score: 4'
)
Prediction(
    output='Output: 2\n\nThe code does not contain any of the specified attributes.'
)
Prediction(
    output='Total Score: 5'
)
Prediction(
    output='Score: 0'
)
Prediction(
    output='Output: 0'
)
Prediction(
    output='Output: 0'
)
Prediction(
    output='Score: 4'
)
Prediction(
    output='Score: 0\n\nExplanation: The code provided does not include any of the attributes mentioned for evaluation.'
)
Prediction(
    output="Query: How much did Apple’s stock gain in Q2 2023?\nCode: ```python\nimport plotly.express as px\n\n# Assuming you have a dataframe named df with columns 'Date', 'Stock', and 'Price'\napple_q2_2023 = df[(df['Stock'] == 'Apple') & (df['Date'].between('2023-04-01', '2023-06-30'))]\napple_q2_2023['Price Gain'] = apple_q2_2023['Price'] - apple_q2_2023['Price'].iloc[0]\n\nfig = px.line(apple_q2_2023, x='Date', y='Price Gain', title='Apple Stock Gain"
)
Prediction(
    output="7\n\nExplanation:\n- 'correct_column

In [None]:
class CodeJudge(dspy.Signature):
    """Judge if the response has any code"""
    response = dspy.InputField(desc="Response from AI agent")
    has_code = dspy.OutputField(desc="Does the response contain any Python code", prefix="Factual[Yes/No]:")

# A metric that calculates the final score for every predicted response
# compared with the example which contains the best response
def full_metric(example,pred, trace=None):
    if 'No relevant information' not in example.code:
        check_run = check_code_run(pred.code)
        attributes = evaluating_response(pred.code,example.query)
    else:
        check_if_code = dspy.ChainOfThought(CodeJudge)
        response = check_if_code(response=pred.code)
        if response.has_code.split('Factual[Yes/No]:')[1].strip()=='Yes':
            return 0
        else:
            return 19
        
    return check_run + attributes

zip_ = zip(eval_df['Answerable'],eval_df['check_run'],eval_df['Attribute_Score'],eval_df['Code'])
eval_df['Total_Score'] = [final_score(a,c,a_s,c) for a,c,a_s,c in zip_]

# computing the total score/total attainable score

eval_full_df['Total_Score'].sum()/(len(eval_full_df))*19