In [1]:
import os

from dotenv import load_dotenv
import pandas as pd

from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.chains import SequentialChain
from langchain.agents import create_pandas_dataframe_agent
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler



In [8]:
# Preview three randomly chosen rows of the DataFrame.
# NOTE(review): `df` is not created in any cell shown here, and this cell's
# execution count (8) is out of order with the cells around it — load the data
# in an earlier cell so the notebook survives Restart Kernel -> Run All.
# NOTE(review): the rendered rows include names and street addresses; clear or
# redact the output before sharing the notebook.
df.sample(3)

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
1258596,2020-06-07 22:50:08,4745996322265,fraud_Harris Group,food_dining,8.97,Carrie,Washington,F,6114 Adams Harbor Suite 096,Kingsford Heights,...,41.4802,-86.6919,1423,"Psychologist, forensic",1998-10-07,0c674f6470e07e6fcfe9c080b0be5c6d,1370645408,41.744401,-86.527846,0
489008,2019-08-02 01:15:15,6506116513503136,fraud_Reynolds-Schinner,misc_pos,257.4,Kimberly,Rice,F,63991 Destiny Rue Apt. 651,Tyler,...,32.2768,-95.3031,144160,Sports development officer,1984-05-04,0ca1fa6105005db4f0633c09fb99253c,1343870115,32.957138,-94.450739,0
569350,2019-08-30 14:18:04,377993105397617,fraud_Friesen Inc,shopping_pos,2.85,Nathan,Martinez,M,586 Thomas Cliffs,Oconto Falls,...,44.8755,-88.1555,5548,Mining engineer,1975-09-11,6df34075a2e98576fb0b38e128223c49,1346336284,44.204083,-88.578745,0


In [3]:
# Load variables from a local .env file into the process environment, so the
# API key never has to be hard-coded in the notebook.
load_dotenv()
# Direct indexing raises KeyError when OPENAI_API_KEY is unset, so a missing
# key surfaces in this cell.
# NOTE(review): `open_api_key` is not referenced by any later cell shown here;
# presumably the LangChain clients read OPENAI_API_KEY from the environment
# themselves — confirm before removing.
open_api_key = os.environ['OPENAI_API_KEY']

In [4]:
# Column summary of the dataset, kept as a plain string so it can be
# substituted into the `{schema}` placeholder of the analysis prompt.
# NOTE(review): the layout looks like `DataFrame.info()` output pasted by hand.
# It is a hard-coded snapshot (23 columns, 1000 non-null values each) —
# presumably taken from a 1000-row sample; regenerate it if the data changes.
schema = """
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             1000 non-null   int64  
 1   trans_date_trans_time  1000 non-null   object 
 2   cc_num                 1000 non-null   int64  
 3   merchant               1000 non-null   object 
 4   category               1000 non-null   object 
 5   amt                    1000 non-null   float64
 6   first                  1000 non-null   object 
 7   last                   1000 non-null   object 
 8   gender                 1000 non-null   object 
 9   street                 1000 non-null   object 
 10  city                   1000 non-null   object 
 11  state                  1000 non-null   object 
 12  zip                    1000 non-null   int64  
 13  lat                    1000 non-null   float64
 14  long                   1000 non-null   float64
 15  city_pop               1000 non-null   int64  
 16  job                    1000 non-null   object 
 17  dob                    1000 non-null   object 
 18  trans_num              1000 non-null   object 
 19  unix_time              1000 non-null   int64  
 20  merch_lat              1000 non-null   float64
 21  merch_long             1000 non-null   float64
 22  is_fraud               1000 non-null   int64  
dtypes: float64(5), int64(6), object(12)
"""

In [9]:
# Build and run an LLMChain that asks the model for a step-by-step EDA recipe
# tailored to the dataset described by `schema`.

# Prompt text. `{schema}` is the only placeholder and is filled in when the
# chain runs. The wording is sent to the model verbatim.
analysis_template = """
You are an expert in data analysis with 20 years of professional experience, holding multiple PhDs, and who excels at exploratory 
data analysis (EDA). 
Based on this schema: {schema}, Provide a detailed and well step by step recipe for executing EDA for this dataset.
Each item in the list should be clear and explicitly refering to the data and/or variables in the data.
"""
# Prompt Template
prompt_template = PromptTemplate(
    input_variables=['schema'],
    template=analysis_template
)

# Define the LLM to use.
# gpt-3.5-turbo is a chat model, so it is wrapped with ChatOpenAI rather than
# the completion-style OpenAI class (which warns and redirects for chat models).
# A throwaway `OpenAI(temperature=.2)` instance that used to be created first
# was overwritten before use and has been dropped.
# streaming=True plus the stdout callback prints tokens as they arrive.
llm = ChatOpenAI(model_name="gpt-3.5-turbo",
                 temperature=0.2,
                 streaming=True,
                 callbacks=[StreamingStdOutCallbackHandler()],
                 verbose=True
                 )
chain = LLMChain(llm=llm, prompt=prompt_template, verbose=True)
# `response` holds the full generated recipe once streaming has finished.
response = chain.run(schema=schema)





[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
You are an expert in data analysis with 20 years of professional experience, holding multiple PhDs, and who excels at exploratory 
data analysis (EDA). 
Based on this schema: 
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             1000 non-null   int64  
 1   trans_date_trans_time  1000 non-null   object 
 2   cc_num                 1000 non-null   int64  
 3   merchant               1000 non-null   object 
 4   category               1000 non-null   object 
 5   amt                    1000 non-null   float64
 6   first                  1000 non-null   object 
 7   last                   1000 non-null   object 
 8   gender                 1000 non-null   object 
 9   street                 1000 non-null   object 
 10  city                   1000 non-null   object 
 11  state                  1000 

In [None]:
# Pipeline: generate an EDA recipe from the schema, then hand that recipe to a
# pandas DataFrame agent as its task.

# Step 1 — recipe generation. The result is stored under the key "input"
# because that is the key the agent executor in step 2 reads its task from.
prompt_template = PromptTemplate(input_variables=['schema'], template=analysis_template)
synopsis_chain = LLMChain(llm=llm, prompt=prompt_template, output_key="input")

# Step 2 — an OpenAI-functions agent operating on `df`.
# NOTE(review): `df` is not created in any cell shown here; it must be loaded
# before this cell runs.
# NOTE(review): the agent acts on model-generated instructions against `df`;
# presumably it executes generated Python — run it only in an environment
# where that is acceptable.
agent = create_pandas_dataframe_agent(
    ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613"),
    df,
    agent_type="openai-functions",
    agent_executor_kwargs={"handle_parsing_errors": True},
    verbose=True
)

# This is the overall chain where we run these two chains in sequence.
overall_chain = SequentialChain(
    chains=[synopsis_chain, agent],
    input_variables=["schema"],
    # Return both intermediate and final results. The names must be keys the
    # sub-chains actually produce: "input" (the generated recipe) and "output"
    # (the agent's answer). Unknown names are rejected when the chain is built.
    output_variables=["input", "output"],
    verbose=True)