In [None]:
import os
import re
import time
import json
import openai
import numpy as np
import traceback
import matplotlib.pyplot as plt
from dotenv import load_dotenv, find_dotenv

import langchain.chat_models
import langchain.schema
from langchain.callbacks import get_openai_callback
from langchain import OpenAI
from langchain.schema import (HumanMessage, AIMessage)

import pandas as pd
# Show every row and column when a DataFrame is displayed (no truncation).
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# Disable pandas' chained-assignment warning for the whole notebook.
pd.options.mode.chained_assignment = None

# Load variables from the .env file located by find_dotenv() into the environment.
_ = load_dotenv(find_dotenv())
# Raises KeyError if OPENAI_API_KEY is not set; the value is not referenced
# again in the visible cells.
open_api_key = os.environ["OPENAI_API_KEY"]

from openai_textgen import TextGenerator
from summarizer import Summarizer
from datamodel import Message, LLMResponse, Summary

# Test TextGenerator Class

In [None]:
# Settings forwarded to TextGenerator.generate as `config`.
config = dict(temperature=0, max_tokens=300, n=3)
chatbot = TextGenerator()

prompt = "It seems like today is cold in bay area time to get cozy in the house. what do you think?"

# Chat history: a system message that sets the tone, followed by the user's prompt.
messages = [
    dict(role="system", content="You are a sassy funny assistant tasked to respond in a sarcastic tone"),
    dict(role="user", content=prompt),
]

res = chatbot.generate(messages=messages, config=config)
# Last expression of the cell: display the generated text.
res.text

# Test Summarizer Class

In [None]:
# Generator settings and instance that the summary cells below pass to the summarizer.
text_gen_config = dict(max_tokens=2000)
text_gen = TextGenerator()

# Load the sample workbook, using its first column as the index.
df = pd.read_excel("../data/ROBERT_KING.xlsx", index_col=0)

get_summary = Summarizer()

In [None]:
# Summarize the DataFrame with the "default" summary method and render the result.
summary_default = get_summary.summarize(
    data=df,
    n_samples=3,
    text_gen=text_gen,
    textgen_config=text_gen_config,
    summary_method="default",
)
display(Summary(**summary_default))

In [None]:
# Summarize the DataFrame with the "llm" summary method and render the result.
summary_llm = get_summary.summarize(
    data=df,
    n_samples=3,
    text_gen=text_gen,
    textgen_config=text_gen_config,
    summary_method="llm",
)
display(Summary(**summary_llm))

# Test Visualization Goal Generation

In [None]:
import json
import logging
from utils import clean_code_snippet
from openai_textgen import TextGenerator
from datamodel import Goal, Persona

# System message for goal generation: tells the model to act as a data analyst
# and to produce goals (question, visualization, rationale) that reference the
# exact fields of the data summary. Sent with role "system" by GoalExplorer.generate.
SYSTEM_INSTRUCTIONS = """
You are a an experienced data analyst who can generate a given number of insightful GOALS about data, when given a summary of the data, and a specified persona. The VISUALIZATIONS YOU RECOMMEND MUST FOLLOW VISUALIZATION BEST PRACTICES (e.g., must use bar charts instead of pie charts for comparing quantities) AND BE MEANINGFUL (e.g., plot longitude and latitude on maps where appropriate). They must also be relevant to the specified persona. Each goal must include a question, a visualization (THE VISUALIZATION MUST REFERENCE THE EXACT COLUMN FIELDS FROM THE SUMMARY), and a rationale (JUSTIFICATION FOR WHICH dataset FIELDS ARE USED and what we will learn from the visualization). Each goal MUST mention the exact fields from the dataset summary above
"""

# Output-format instructions appended to the prompt by GoalExplorer.generate:
# the reply must be a fenced code snippet containing a JSON list of objects with
# the keys "index", "question", "visualization" and "rationale".
FORMAT_INSTRUCTIONS = """
THE OUTPUT MUST BE A CODE SNIPPET OF A VALID LIST OF JSON OBJECTS. IT MUST USE THE FOLLOWING FORMAT:

```[
    { "index": 0,  "question": "What is the distribution of X", "visualization": "histogram of X", "rationale": "This tells about "} ..
    ]
```
THE OUTPUT SHOULD ONLY USE THE JSON FORMAT ABOVE.
"""

# Logger used by GoalExplorer to report JSON decoding failures.
logger = logging.getLogger("chat2vis")


class GoalExplorer():
    """Generate visualization goals from a data summary using a text generator."""

    def __init__(self) -> None:
        pass

    def generate(self, summary: dict, textgen_config: dict,
                 text_gen: TextGenerator, n=5, persona: Persona = None) -> list[Goal]:
        """Ask the text generator for ``n`` goals about ``summary``.

        Args:
            summary: Data summary; interpolated verbatim into the prompt.
            textgen_config: Passed to ``text_gen.generate`` as ``config``.
            text_gen: Generator whose response exposes ``.text[0].content``.
            n: Number of goals requested in the prompt.
            persona: Perspective the goals should target. When falsy, a default
                "highly skilled data analyst" persona is used.

        Returns:
            A list of ``Goal`` objects built from the JSON the model returned.
            A single JSON object is wrapped into a one-element list.

        Raises:
            ValueError: If the model output cannot be decoded as JSON. The
                original ``JSONDecodeError`` is attached as ``__cause__``.
        """

        user_prompt = f"""The number of GOALS to generate is {n}. The goals should be based on the data summary below, \n\n .
        {summary} \n\n"""

        if not persona:
            persona = Persona(
                persona="A highly skilled data analyst who can come up with complex, insightful goals about data",
                rationale="")

        # The quote around the persona text is now closed; the original prompt
        # opened it but never closed it.
        user_prompt += f"""\n The generated goals SHOULD BE FOCUSED ON THE INTERESTS AND PERSPECTIVE of a '{persona.persona}' persona, who is insterested in complex, insightful goals about the data. \n"""

        # NOTE(review): the request text is sent with role "assistant" rather
        # than "user"; kept as-is to preserve behavior — confirm this is intended.
        messages = [
            {"role": "system", "content": SYSTEM_INSTRUCTIONS},
            {"role": "assistant",
             "content":
             f"{user_prompt}\n\n {FORMAT_INSTRUCTIONS} \n\n. The generated {n} goals are: \n "}]

        # Keep the raw response, the parsed JSON and the final goals under
        # separate names so the error path never depends on assignment order.
        response = text_gen.generate(messages=messages, config=textgen_config)
        raw_content = response.text[0].content

        try:
            parsed = json.loads(clean_code_snippet(raw_content))
        except json.decoder.JSONDecodeError as err:
            logger.info("Error decoding JSON: %s", raw_content)
            print(f"Error decoding JSON: {raw_content}")
            raise ValueError(
                "The model did not return a valid JSON object while attempting generate goals. Consider using a larger model or a model with higher max token length.") from err

        # A lone JSON object is treated as a list with one goal.
        if isinstance(parsed, dict):
            parsed = [parsed]
        # Cast each item in the list to a Goal object.
        return [Goal(**item) for item in parsed]

In [None]:
# Generator settings and model used for the goal-generation request.
text_gen_config = dict(max_tokens=2000)
text_gen = TextGenerator(model_name='gpt-3.5-turbo')
df = pd.read_excel("../data/ROBERT_KING.xlsx", index_col=0)

# Generate three goals from the LLM summary produced in an earlier cell.
goals_explorer = GoalExplorer()
goals = goals_explorer.generate(
    summary=summary_llm,
    textgen_config=text_gen_config,
    text_gen=text_gen,
    n=3,
)
for goal in goals:
    display(goal)

In [None]:
class VizGenerator:
    """Generate visualizations from prompt"""
    def __init__(self) -> None:
        pass

    def get_template(self, goal: str, library: str = 'matplotlib'):
        general_instructions = f"If the solution requires a single value (e.g. max, min, median, first, last etc), ALWAYS add a line (axvline or axhline) to the chart, ALWAYS with a legend containing the single value (formatted with 0.2F). If using a <field> where semantic_type=date, YOU MUST APPLY the following transform before using that column i) convert date fields to date types using data[''] = pd.to_datetime(data[<field>], errors='coerce'), ALWAYS use  errors='coerce' ii) drop the rows with NaT values data = data[pd.notna(data[<field>])] iii) convert field to right time format for plotting.  ALWAYS make sure the x-axis labels are legible (e.g., rotate when needed). Solve the task  carefully by completing ONLY the <imports> AND <stub> section. Given the dataset summary, the plot(data) method should generate a {library} chart ({goal.visualization}) that addresses this goal: {goal.question}. DO NOT WRITE ANY CODE TO LOAD THE DATA. The data is already loaded and available in the variable data."

        matplotlib_instructions = f" {general_instructions} DO NOT include plt.show(). The plot method must return a matplotlib object (plt). Think step by step. \n"

        if library == "matplotlib":
            instructions = {
                "role": "assistant",
                "content": f"  {matplotlib_instructions}. Use BaseMap for charts that require a map. "}
            template = \
                f"""
import matplotlib.pyplot as plt
import pandas as pd
<imports>
# plan -
def plot(data: pd.DataFrame):
    <stub> # only modify this section
    plt.title('{goal.question}', wrap=True)
    return plt;

chart = plot(data) # data already contains the data to be plotted. Always include this line. No additional code beyond this line."""
        else:
            raise ValueError("Unsupported library. Only 'matplotlib' is supported.")
        return template, instructions



In [None]:
# System prompt for visualization code generation: instructs the model to
# complete a code template, follow visualization best practices, and write a
# brief plan first. It is not referenced by VizGenerator in the visible cells —
# presumably sent as the "system" message alongside the template; confirm.
system_prompt = """
You are a helpful assistant highly skilled in writing PERFECT code for visualizations. Given some code template, you complete the template to generate a visualization given the dataset and the goal described. The code you write MUST FOLLOW VISUALIZATION BEST PRACTICES ie. meet the specified goal, apply the right transformation, use the right visualization type, use the right data encoding, and use the right aesthetics (e.g., ensure axis are legible). The transformations you apply MUST be correct and the fields you use MUST be correct. The visualization CODE MUST BE CORRECT and MUST NOT CONTAIN ANY SYNTAX OR LOGIC ERRORS (e.g., it must consider the field types and use them correctly). You MUST first generate a brief plan for how you would solve the task e.g. what transformations you would apply e.g. if you need to construct a new column, what fields you would use, what visualization type you would use, what aesthetics you would use, etc. .
"""

In [None]:
# Print the prompt text so it can be read in full in the cell output.
print(system_prompt)