In [3]:
import os

import yaml
import pandas as pd
from langchain.llms import Cohere, OpenAI, AI21
from langchain.embeddings import CohereEmbeddings,OpenAIEmbeddings
from langchain.vectorstores import FAISS,Pinecone
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.document_loaders import DataFrameLoader
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate

#### MY OWN DATALOADERS

In [117]:
class CSVLoader_v1(BaseLoader):
    """Load one CSV file as a single document tagged with ticker metadata.

    The raw text of the whole file becomes the ``page_content`` of exactly one
    ``Document``; rows are NOT split into separate documents.

    The document metadata has three keys:

    * ``ticker``    -- element 2 of ``file_path.split('/')``, e.g. ``TSLA``
      for ``../ticker/TSLA/cash.csv``.
    * ``metadata``  -- a label for the file: the file stem mapped through
      ``_STATEMENT_LABELS`` when it is a known name, otherwise the stem itself.
    * ``file_path`` -- the path exactly as given to the constructor.

    Args:
        file_path: Path of the CSV file. It must contain at least three
            ``/``-separated components and a ``.`` before the extension,
            otherwise ``load`` raises ``IndexError``.
        source_column: Stored on the instance; not used by ``load``.
        encoding: Text encoding used to read the file. ``None`` keeps the
            platform default.
    """

    # Maps a CSV file stem to the human-readable label stored in the metadata.
    _STATEMENT_LABELS = {
        'balance': 'Balance Sheet',
        'cash': 'Cash Flow',
        'income': 'Income Statement',
        'ratios': 'Key Financial Ratios',
        'est': 'Analyst Estimates',
        'fraud': 'Fraud Ratios',
        'c_news': 'News',
        's_news': 'Sentiment News',
    }

    def __init__(
        self,
        file_path: str,
        source_column= None,
        encoding = None,
    ):
        self.file_path = file_path
        self.source_column = source_column
        self.encoding = encoding

    def load(self):
        """Read the file and return it as a one-element list of ``Document``.

        Returns:
            list: A list holding a single ``Document`` whose ``page_content``
            is the full file text.

        Raises:
            OSError: If the file cannot be opened.
            IndexError: If ``file_path`` does not have the expected shape.
        """
        # Honour the encoding passed to the constructor (previously ignored).
        with open(self.file_path, 'r', encoding=self.encoding) as f:
            file_content = f.read()

        ticker = self.file_path.split('/')[2]
        # Text between the last two dots, with any directory part removed.
        stem = self.file_path.split('.')[-2].split('/')[-1]
        metadata = {
            "ticker": ticker,
            "metadata": self._STATEMENT_LABELS.get(stem, stem),
            "file_path": self.file_path,
        }
        return [Document(page_content=file_content, metadata=metadata)]

In [118]:
class TextLoader_v1(BaseLoader):
    """Load one text file as a single document tagged with ticker metadata.

    The document metadata has three keys:

    * ``ticker``    -- element 2 of ``file_path.split('/')``.
    * ``metadata``  -- ``'SEC analysis summary'`` when the file stem is
      ``analysis_sec``, otherwise the stem itself.
    * ``file_path`` -- the path exactly as given to the constructor.

    Args:
        file_path: Path of the text file. It must contain at least three
            ``/``-separated components and a ``.`` before the extension,
            otherwise ``load`` raises ``IndexError``.
        source_column: Stored on the instance; not used by ``load``.
        encoding: Text encoding used to read the file. ``None`` keeps the
            platform default.
    """

    # Maps a text file stem to the human-readable label stored in the metadata.
    _CONTENT_LABELS = {'analysis_sec': 'SEC analysis summary'}

    def __init__(
        self,
        file_path: str,
        source_column= None,
        encoding = None,
    ):
        self.file_path = file_path
        self.source_column = source_column
        self.encoding = encoding

    def load(self):
        """Read the file and return it as a one-element list of ``Document``.

        Returns:
            list: A list holding a single ``Document`` whose ``page_content``
            is the full file text.

        Raises:
            OSError: If the file cannot be opened.
            IndexError: If ``file_path`` does not have the expected shape.
        """
        # Honour the encoding passed to the constructor (previously ignored).
        with open(self.file_path, 'r', encoding=self.encoding) as f:
            file_content = f.read()

        ticker = self.file_path.split('/')[2]
        # Text between the last two dots, with any directory part removed.
        stem = self.file_path.split('.')[-2].split('/')[-1]
        metadata = {
            "ticker": ticker,
            "metadata": self._CONTENT_LABELS.get(stem, stem),
            "file_path": self.file_path,
        }
        return [Document(page_content=file_content, metadata=metadata)]

In [5]:
# Provider API keys live in the local ``apis.yaml`` file under the ``LLMS`` key.
with open("apis.yaml", "r") as file:
    yaml_data = yaml.full_load(file)

# Keyword arguments that are unpacked into the OpenAI / Cohere / AI21 wrappers.
open_ai_params = dict(
    max_tokens=1000,
    openai_api_key=yaml_data['LLMS']['OPENAI_API_KEY'],
    temperature=0,
    model_name='text-davinci-003',
)
cohere_params = dict(
    model="command-xlarge-nightly",
    max_tokens=2202,
    cohere_api_key=yaml_data["LLMS"]["COHERE_API_KEY"],
    temperature=0,
    k=0,
)
ai21_params = dict(
    model="j2-jumbo-instruct",
    numResults=1,
    temperature=0,
    topP=1,
    ai21_api_key=yaml_data["LLMS"]["AI21_API_KEY"],
    maxTokens=25,
)


In [7]:
# Load the FAISS vector store from the local folder 'entiredocument'.
# NOTE(review): `oai` (the OpenAIEmbeddings instance) is created in a cell that
# appears further down in this notebook, so this cell fails on a fresh
# Restart & Run All unless that cell is moved above it.
faiss_db = FAISS.load_local('entiredocument',oai)

In [102]:
# Retrieval sanity check: ask the FAISS retriever for the 2 documents it
# considers most relevant to the question.
query = "How is Tesla's cash flow?"
faiss_db.as_retriever(search_kwargs={"k": 2}).get_relevant_documents(query)

[Document(page_content=',ttm,2022-12-31,2021-12-31,2020-12-31,2019-12-31,ticker\nNet income(figures in $M),12583.0,12583.0,5519.0,690.0,-862.0,TSLA\nDepreciation & amortisation(figures in $M),3747.0,3747.0,2911.0,2322.0,2154.0,TSLA\nStock-based compensation(figures in $M),1560.0,1560.0,2121.0,1734.0,898.0,TSLA\nChange in working capital(figures in $M),-3908.0,-3908.0,518.0,184.0,-349.0,TSLA\nAccounts receivable(figures in $M),-1124.0,-1124.0,-130.0,-652.0,-367.0,TSLA\nInventory(figures in $M),-6465.0,-6465.0,-1709.0,-422.0,-429.0,TSLA\nOther working capital(figures in $M),7552.0,7552.0,3483.0,2701.0,968.0,TSLA\nOther non-cash items(figures in $M),340.0,340.0,245.0,408.0,477.0,TSLA\nNet cash provided by operating activities(figures in $M),14724.0,14724.0,11497.0,5943.0,2405.0,TSLA\n"Investments in property, plant and equipment(figures in $M)",-7172.0,-7172.0,-8014.0,-3242.0,-1437.0,TSLA\n"Acquisitions, net(figures in $M)",0.0,0.0,0.0,-13.0,-45.0,TSLA\nPurchases of investments(figures in

In [88]:
def qachain(vectorstore, query, k=2):
    """Build an LLM context string from the documents most relevant to ``query``.

    Each retrieved document contributes a one-line header naming its ticker
    and content label, followed by the document text. The query itself is
    appended as the final line.

    Args:
        vectorstore: Object exposing ``as_retriever(search_kwargs=...)``; the
            retriever must provide ``get_relevant_documents(query)``. Every
            returned document needs ``page_content`` and a ``metadata`` dict
            with the keys ``'ticker'``, ``'metadata'`` and ``'file_path'``.
        query: The user question.
        k: Number of documents to retrieve. Defaults to 2, the value that was
            previously hard-coded.

    Returns:
        tuple: ``(context, file_names)`` where ``context`` is the assembled
        text and ``file_names`` lists the ``file_path`` of each retrieved
        document in retrieval order.

    Raises:
        KeyError: If a retrieved document lacks one of the metadata keys.
    """
    retriever = vectorstore.as_retriever(search_kwargs={"k": k})
    documents = retriever.get_relevant_documents(query)

    context_parts = []
    file_names = []
    for doc in documents:
        header = 'The below contains information about {} and the information is {}'.format(
            doc.metadata['ticker'], doc.metadata['metadata']
        )
        # The newline followed by eight spaces is intentional: it keeps the
        # generated context byte-identical to what this function always emitted.
        context_parts.append('{}\n        {}'.format(header, doc.page_content))
        file_names.append(doc.metadata['file_path'])

    context_parts.append(query)
    return '\n'.join(context_parts), file_names


In [92]:
# Re-run the top-2 retrieval for the current value of `query`.
faiss_db.as_retriever(search_kwargs={"k": 2}).get_relevant_documents(query)

[Document(page_content='Our cash flows provided by operating activities during the three months ended March 2023 and 2022 were $2.51 billion and $4.00 billion, respectively, representing a decrease of $1.48 billion.There have been no material changes to our critical accounting policies and estimates since our Annual Report on Form 10-K for the year ended December 31, 2022.Gross margin for total automotive decreased from 32.9% to 21.1% in the three months ended March 31, 2023 as compared to the three months ended March 31, 2022.Gross margin for total automotive & services and other segment decreased from 30.5% to 19.9% in the three months ended March 31, 2023 as compared to the three months ended March 31, 2022, primarily due to the automotive gross margin decrease discussed above, partially offset by an improvement in our services and other gross margin.R & D expenses as a percentage of revenue decreased from 5% to 3% in the three months ended March 31, 2023 as compared to the three mo

In [103]:
# Assemble the context text and collect the source file paths for the question.
# NOTE(review): despite its name, `prompt_type` receives the context string that
# qachain returns as its first element.
prompt_type,file_names = qachain(faiss_db,"How is Tesla's cash flow?")

In [65]:
def process_file_names(file_names):
    """Load the first CSV among ``file_names`` into a DataFrame.

    A path counts as a CSV when it contains the substring ``'.csv'``. The
    unnamed first column that pandas labels ``'Unnamed: 0'`` is renamed to
    ``'Description'``.

    Args:
        file_names: Iterable of file path strings.

    Returns:
        pandas.DataFrame: Contents of the first matching CSV file.

    Raises:
        IndexError: If none of ``file_names`` contains ``'.csv'``. The type is
            kept from the original list-indexing failure so existing handlers
            still work, but the message now states the cause.
    """
    csv_path = next((name for name in file_names if '.csv' in name), None)
    if csv_path is None:
        raise IndexError("process_file_names: no '.csv' path found in file_names")
    return pd.read_csv(csv_path).rename(columns={'Unnamed: 0': 'Description'})

In [108]:
# NOTE(review): `llm` is rebound on the very next line, so the Cohere client
# built here is discarded and the OpenAI model is the one left in `llm`.
llm = Cohere(**cohere_params)
llm = OpenAI(**open_ai_params)
#llm = AI21(**ai21_params)

#### Pinecone sampling

In [112]:
# Read the Pinecone credentials from the environment instead of hard-coding them.
# SECURITY: an API key was previously committed in this cell; treat that key as
# leaked and rotate it in the Pinecone console.
pinecone_key = os.environ.get('PINECONE_API_KEY')
pinecone_env = os.environ.get('PINECONE_ENVIRONMENT', 'us-west1-gcp-free')

In [114]:
import pinecone 

# Initialise the Pinecone client with `pinecone_key` and `pinecone_env`.
pinecone.init(
    api_key=pinecone_key,  # find at app.pinecone.io
    environment=pinecone_env  # next to api key in console
)

In [122]:
# Name of the Pinecone index that the from_documents / from_existing_index calls use.
index_name = "financial-analysis"

In [6]:
# Recursively load every .csv and .txt file under ../ticker.
# NOTE(review): this uses the stock CSVLoader/TextLoader, not the CSVLoader_v1 /
# TextLoader_v1 classes defined earlier in this notebook. qachain() reads the
# 'ticker', 'metadata' and 'file_path' metadata keys that the _v1 loaders set;
# presumably the stock loaders do not provide all of them — confirm which
# loader is intended here.
csv_loader = DirectoryLoader('../ticker', glob="**/*.csv", loader_cls=CSVLoader)
text_loader = DirectoryLoader('../ticker', glob="**/*.txt", loader_cls=TextLoader)
# Embedding clients; `oai` is the one passed to the FAISS and Pinecone stores.
co = CohereEmbeddings(cohere_api_key=cohere_params["cohere_api_key"])
oai = OpenAIEmbeddings(openai_api_key = yaml_data["LLMS"]['OPENAI_API_KEY'])
# Collect the documents from both loaders into one list.
final_docs = []
for loader in [csv_loader,text_loader]:
    docs = loader.load()
    final_docs.extend(docs)
# Split the documents into chunks (chunk_size=1000, chunk_overlap=200).
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
documents = text_splitter.split_documents(final_docs)

In [167]:
from langchain.vectorstores import FAISS

In [12]:
# Number of chunks produced by the text splitter.
len(documents)

109749

In [None]:
# Embed the chunks with `oai` and store them in the Pinecone index `index_name`.
# NOTE(review): `documents` held 109,749 chunks in the recorded run, so this cell
# is likely slow and incurs embedding cost — consider guarding it against re-runs.
docsearch = Pinecone.from_documents(documents, oai, index_name=index_name)

In [None]:
# Display the Pinecone vector store object.
docsearch

In [129]:

# Similarity search restricted by a metadata filter.
# NOTE(review): the question asks about AAPL but the filter restricts results to
# ticker 'TSLA' — confirm which ticker is intended.
query = "What do you know about AAPL?"
docs = docsearch.similarity_search(query,filter = {'ticker':'TSLA'})

In [None]:
# Inspect the search method on the vector store. The previous version misspelled
# the attribute name and looked it up on `docs`, which raises AttributeError.
docsearch.similarity_search

In [131]:
# Attach to the existing Pinecone index `index_name` using the `oai` embeddings.
# NOTE(review): this rebinds `docs`, which an earlier cell used for a list of
# search results, to a vector store — consider a distinct name.
docs = Pinecone.from_existing_index(index_name=index_name, embedding=oai)

In [13]:
# Switch the active LLM to AI21 (ai21_params sets maxTokens to 25).
llm = AI21(**ai21_params)

In [16]:
# Smoke-test the active LLM with a trivial prompt.
print(llm('How are you doing?'))


[object Object]
