In [1]:
# load packages
from langchain.llms import OpenAI
from langchain import PromptTemplate
import pandas as pd
import re

In [4]:
# get openai api key
def load_secret(file_path):
    with open(file_path, 'r') as file:
        secret = file.read().strip()
    return secret

secret_file_path = "./secret.txt"  # Replace with the path to your secret file if different

import os
os.environ["OPENAI_API_KEY"] = load_secret(secret_file_path)

In [5]:
# load model
llm = OpenAI(temperature=0, model_name = "gpt-3.5-turbo")



In [19]:
# load data
df = pd.read_csv('./Demo_Data.csv')
df['Date'] = pd.to_datetime(df['Date'])

# create prompt
df_template = """
    You are an expert data scientist, working with a pandas dataframe in Python. The name of the dataframe is `df`.
    If a package is needed, call `import` at the top of the code.

    Your task is to create a plan of steps to take to answer the question below. 
    Do not return blocks of code, only the plan of steps needed to get the answer. 

    This is the results of `print(df.head())`:
    {df_head}

    This is the results of `print(df.describe())`:
    {df_describe}

    This is the results of `print(df.dtypes)`:
    {df_dtypes}

    This is the results of `print(df.columns)`:
    {df_columns}

    Begin!

    Question: {input}
    Answer:
"""

prompt_template = PromptTemplate.from_template(df_template)

final_prompt = prompt_template.format(df_head = str(df.head().to_markdown()), 
                                      df_describe = str(df.describe().to_markdown()),
                                      df_dtypes = str(df.dtypes.to_markdown()),
                                      df_columns = str(df.columns),
                                      input = "calculate autocrrelation for each country and product and graph them")

print(final_prompt)



    You are an expert data scientist, working with a pandas dataframe in Python. The name of the dataframe is `df`.
    If a package is needed, call `import` at the top of the code.

    Your task is to create a plan of steps to take to answer the question below. 
    Do not return blocks of code, only the plan of steps needed to get the answer. 

    This is the results of `print(df.head())`:
    |    | Country       | Product   | Date                |   Revenue |
|---:|:--------------|:----------|:--------------------|----------:|
|  0 | United States | Cookies   | 2015-07-01 00:00:00 |       100 |
|  1 | United States | Cookies   | 2015-08-01 00:00:00 |        90 |
|  2 | United States | Cookies   | 2015-09-01 00:00:00 |        80 |
|  3 | United States | Cookies   | 2015-10-01 00:00:00 |       110 |
|  4 | United States | Cookies   | 2015-11-01 00:00:00 |       100 |

    This is the results of `print(df.describe())`:
    |       | Date                          |   Revenue |
|:---

In [20]:
# call model
answer = llm(final_prompt)

formatted_answer = f"""{answer.strip()}"""

print(formatted_answer)

1. Group the dataframe by 'Country' and 'Product'
2. For each group, calculate the autocorrelation of the 'Revenue' column using the pandas autocorr() function
3. Store the autocorrelation values in a new column in the group dataframe
4. Reset the index of the group dataframe
5. Use seaborn or matplotlib to create a line plot for each group, with the x-axis as the lag and the y-axis as the autocorrelation value
6. Add a legend to the plot to differentiate between the different groups (based on 'Country' and 'Product')
7. Display the plot.


In [21]:
# load data
df = pd.read_csv('./Demo_Data.csv')
df['Date'] = pd.to_datetime(df['Date'])

# create prompt
df_template = """
    You are an expert data scientist, working with a pandas dataframe in Python. The name of the dataframe is `df`.
    If a package is needed, call `import` at the top of the code.

    Your task is to execute the list of steps below to answer the question below. Only write code to execute the steps, 
    inside of a python code block like ```python```, with comments to help understand what you are doing and what step this code is executing.
    If given a choice between two or more packages to use in a step, choose the first one. 
    Any output from the code should be formatted to work in a streamlit app. 

    This is the results of `print(df.head())`:
    {df_head}

    This is the results of `print(df.describe())`:
    {df_describe}

    This is the results of `print(df.dtypes)`:
    {df_dtypes}

    This is the results of `print(df.columns)`:
    {df_columns}

    Begin!

    Question: {input}
    Steps: {steps}
    Answer:
    """

prompt_template = PromptTemplate.from_template(df_template)

final_prompt = prompt_template.format(df_head = str(df.head().to_markdown()), 
                                      df_describe = str(df.describe().to_markdown()),
                                      df_dtypes = str(df.dtypes.to_markdown()),
                                      df_columns = str(df.columns),
                                      steps = formatted_answer,
                                      input = "calculate autocrrelation for each country and product and graph them")

print(final_prompt)



    You are an expert data scientist, working with a pandas dataframe in Python. The name of the dataframe is `df`.
    If a package is needed, call `import` at the top of the code.

    Your task is to execute the list of steps below to answer the question below. Only write code to execute the steps, 
    inside of a python code block like ```python```, with comments to help understand what you are doing and what step this code is executing.
    If given a choice between two or more packages to use in a step, choose the first one. 
    Any output from the code should be formatted to work in a streamlit app. 

    This is the results of `print(df.head())`:
    |    | Country       | Product   | Date                |   Revenue |
|---:|:--------------|:----------|:--------------------|----------:|
|  0 | United States | Cookies   | 2015-07-01 00:00:00 |       100 |
|  1 | United States | Cookies   | 2015-08-01 00:00:00 |        90 |
|  2 | United States | Cookies   | 2015-09-01 00:00:00

In [22]:
# call model
answer = llm(final_prompt)

formatted_answer = f"""{answer.strip()}"""

print(formatted_answer)

```python
# Step 1: Group the dataframe by 'Country' and 'Product'
grouped_df = df.groupby(['Country', 'Product'])

# Step 2: For each group, calculate the autocorrelation of the 'Revenue' column using the pandas autocorr() function
autocorr_values = grouped_df['Revenue'].apply(lambda x: x.autocorr())

# Step 3: Store the autocorrelation values in a new column in the group dataframe
grouped_df['Autocorrelation'] = autocorr_values

# Step 4: Reset the index of the group dataframe
grouped_df = grouped_df.reset_index()

# Step 5: Use seaborn or matplotlib to create a line plot for each group, with the x-axis as the lag and the y-axis as the autocorrelation value
import seaborn as sns
import matplotlib.pyplot as plt

sns.set_style('darkgrid')
fig, ax = plt.subplots(figsize=(10, 6))

for name, group in grouped_df.groupby(['Country', 'Product']):
    ax.plot(group.index, group['Autocorrelation'], label=name)

# Step 6: Add a legend to the plot to differentiate between the different groups (b

In [23]:
exec(formatted_answer)

SyntaxError: invalid syntax (<string>, line 1)