In [2]:
import yaml
import pandas as pd
from langchain.llms import Cohere, OpenAI, AI21
from langchain.embeddings import CohereEmbeddings,OpenAIEmbeddings
from langchain.vectorstores import FAISS,Pinecone
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.document_loaders import DataFrameLoader
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
import pinecone

  from tqdm.autonotebook import tqdm


#### MY OWN DATALOADER

In [None]:
class CSVLoader_v1(BaseLoader):
    """Load one file from a ticker directory as a single ``Document``.

    Despite the name, the file is not parsed as CSV: its entire text becomes
    the ``page_content`` of one document. The document metadata is derived
    from the path:

    * ``ticker``    -- the third ``/``-separated component of ``file_path``
      (``AAPL`` for ``../ticker/AAPL/balance.csv``).
    * ``metadata``  -- the file stem, replaced by a readable statement name
      when it is one of the known short names (``balance`` -> ``Balance Sheet``).
    * ``file_path`` -- the path exactly as it was given.

    Args:
        file_path: Path of the file to read. It needs at least three
            ``/``-separated components and a file extension; otherwise
            ``load`` raises ``IndexError``.
        source_column: Stored on the instance; ``load`` does not use it.
        encoding: Text encoding handed to ``open``. ``None`` keeps the
            platform default.
    """

    # Short file stems and the statement names they stand for.
    _STATEMENT_NAMES = {
        'balance': 'Balance Sheet',
        'cash': 'Cash Flow',
        'income': 'Income Statement',
        'ratios': 'Key Financial Ratios',
        'est': 'Analyst Estimates',
        'fraud': 'Fraud Ratios',
        'c_news': 'News',
        's_news': 'Sentiment News',
    }

    def __init__(
        self,
        file_path: str,
        source_column= None,
        encoding = None,
    ):
        self.file_path = file_path
        self.source_column = source_column
        self.encoding = encoding

    def load(self):
        """Read the file and return it as a one-element list of documents."""
        # Path layout is positional: index 2 is the ticker directory.
        ticker = self.file_path.split('/')[2]
        # Text between the last '/' and the final extension, e.g. 'balance'.
        stem = self.file_path.split('.')[-2].split('/')[-1]
        metadata = {
            "ticker": ticker,
            "metadata": self._STATEMENT_NAMES.get(stem, stem),
            "file_path": self.file_path,
        }
        # Honour the encoding given to the constructor (it was previously ignored).
        with open(self.file_path, 'r', encoding=self.encoding) as f:
            file_content = f.read()
        return [Document(page_content=file_content, metadata=metadata)]

In [None]:
# Load API keys from apis.yaml and build the LLM parameter sets, the directory
# loaders and the embedding clients used by the rest of the notebook.
with open("apis.yaml", "r") as file:
    yaml_data = yaml.load(file, Loader=yaml.FullLoader)
# Keyword arguments for the OpenAI LLM wrapper.
open_ai_params = {'max_tokens':2000,'openai_api_key' : yaml_data['LLMS']['OPENAI_API_KEY'],'temperature' :0,'model_name':'text-davinci-003'}
# Keyword arguments for the Cohere LLM wrapper.
cohere_params = {
    "model": "command-xlarge-nightly",
    "max_tokens": 2202,
    "cohere_api_key": yaml_data["LLMS"]["COHERE_API_KEY"],
    "temperature": 0,
    "k": 0,
}
# NOTE(review): these loaders use the stock CSVLoader / TextLoader; the
# CSVLoader_v1 class defined in this notebook is not used here -- confirm.
csv_loader = DirectoryLoader('../ticker', glob="**/*.csv", loader_cls=CSVLoader)
text_loader = DirectoryLoader('../ticker', glob="**/*.txt", loader_cls=TextLoader)
co = CohereEmbeddings(cohere_api_key=cohere_params["cohere_api_key"])
oai = OpenAIEmbeddings(openai_api_key = yaml_data["LLMS"]['OPENAI_API_KEY'])
# NOTE(review): the load/split steps below are commented out, yet later cells
# read `docs` and `documents`. On a fresh kernel those names are undefined
# unless these lines are restored.
# final_docs = []
# for loader in [csv_loader,text_loader]:
#     docs = loader.load()
#     final_docs.extend(docs)
# text_splitter = CharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
# documents = text_splitter.split_documents(final_docs)


In [None]:
# Peek at the first loaded document.
# NOTE(review): `docs` is only assigned in commented-out code, so this cell
# fails on a fresh kernel.
docs[0]

In [None]:
def metadatagenerator(documents):
    """Replace each document's metadata with ticker, statement and path fields.

    Every document must carry its path under ``metadata['source']``. The
    ticker is the third ``/``-separated path component; the statement label is
    the file stem, swapped for a readable name when it is a known short name.
    Documents are modified in place and the same iterable is returned.
    """
    readable_names = {
        'balance': 'Balance Sheet',
        'cash': 'Cash Flow',
        'income': 'Income Statement',
        'ratios': 'Key Financial Ratios',
        'est': 'Analyst Estimates',
        'fraud': 'Fraud Ratios',
        'c_news': 'News',
        's_news': 'Sentiment News',
    }
    for document in documents:
        source_path = document.metadata['source']
        symbol = source_path.split('/')[2]
        # Text between the last '/' and the final extension.
        stem = source_path.split('.')[-2].split('/')[-1]
        document.metadata = {
            "ticker": symbol,
            "metadata": readable_names.get(stem, stem),
            "file_path": source_path,
        }
    return documents

In [None]:
# Rewrite every document's metadata to the ticker / statement / path layout.
# NOTE(review): `documents` is only assigned in commented-out code, so it must
# already exist in the kernel for this cell to run.
documents = metadatagenerator(documents)

In [None]:
# Connect to Pinecone with the credentials from apis.yaml and index
# `documents` under 'financial-analysis' using the OpenAI embeddings.
index_name = 'financial-analysis'
pinecone_key = yaml_data['PINECONE']['API_KEY']
pinecone_env = yaml_data['PINECONE']['ENV']
pinecone.init(
    api_key=pinecone_key,  # find at app.pinecone.io
    environment=pinecone_env  # next to api key in console
)
# NOTE(review): re-running this cell sends `documents` to the index again --
# confirm whether that duplicates vectors.
docsearch = Pinecone.from_documents(documents, oai, index_name=index_name)

In [None]:
# Build an in-memory FAISS index with the OpenAI embeddings.
# NOTE(review): `docs_cta` is not assigned in any cell shown here --
# presumably the documents for ticker CTA; confirm where it comes from.
faiss = FAISS.from_documents(docs_cta,oai)

In [None]:
# Spot check: request 10 documents for a sample question and show the text of the first one.
faiss.as_retriever(search_kwargs={"k": 10}).get_relevant_documents("What is increase in CTA's cash flow from FY 2023 to FY 2021?")[0].page_content

In [None]:
# Persist the FAISS index to the local folder 'entiredocument'.
faiss.save_local('entiredocument')

In [None]:
# Reload the saved index from 'entiredocument'.
# NOTE(review): the cells that follow keep querying `faiss`, not `faiss_db`.
faiss_db = FAISS.load_local('entiredocument',oai)

In [None]:
# Sample question reused by the next cell; show the 5 documents retrieved for it.
query = "What is increase in CTA's payables from FY 2022 to FY 2021?"
faiss.as_retriever(search_kwargs={"k": 5}).get_relevant_documents(query)

In [None]:
# Same question, limited to a single retrieved document.
faiss.as_retriever(search_kwargs={"k": 1}).get_relevant_documents(query)

In [None]:
def qachain(vectorstore, query, ticker=None):
    """Build the question-answering prompt for ``query`` from retrieved documents.

    Five documents are retrieved first. If they come from more than one file,
    the search is repeated with five documents per distinct file (at most
    three files, so 10 or 15 documents) using the same filter.

    Args:
        vectorstore: Object exposing ``as_retriever(search_kwargs=...)`` whose
            retriever has ``get_relevant_documents(query)``. Each returned
            document needs ``page_content`` and a ``metadata`` dict with
            ``'ticker'`` and ``'file_path'`` keys.
        query: The question to answer.
        ticker: Optional ticker symbol. When given, retrieval is restricted to
            that ticker and documents labelled ``'Sentiment News'`` are
            excluded. When omitted, no filter is sent.

    Returns:
        The prompt text: instructions, a context line naming the ticker of the
        first retrieved document, the retrieved text and the question.

    Raises:
        ValueError: If retrieval returns no documents.
    """
    search_kwargs = {"k": 5}
    if ticker is not None:
        # Operator-style metadata filter (the syntax Pinecone uses).
        # NOTE(review): confirm that other stores passed in here, such as the
        # FAISS index queried in this notebook, understand these operators.
        search_kwargs["filter"] = {
            '$and': [{'ticker': ticker}, {'metadata': {'$ne': 'Sentiment News'}}]
        }

    def _retrieve(kwargs):
        # Run one retrieval and fail loudly instead of indexing an empty list later.
        found = vectorstore.as_retriever(search_kwargs=kwargs).get_relevant_documents(query)
        if not found:
            raise ValueError("No documents were retrieved for query: {!r}".format(query))
        return found

    documents = _retrieve(search_kwargs)

    # Five documents per distinct source file, capped at three files.
    distinct_files = {doc.metadata['file_path'] for doc in documents}
    k_count = min(len(distinct_files), 3) * 5
    if k_count != 5:
        # Keep the filter on the widened search; only k changes.
        documents = _retrieve(dict(search_kwargs, k=k_count))

    page_content = '\n\n'.join(doc.page_content for doc in documents)
    meta_data = documents[0].metadata
    context_precursor = '''The below contains information about {} and you are a financial analyst'''.format(meta_data['ticker'])
    # Adjacent literals spell out the original triple-quoted template,
    # including the trailing space at the end of its first line.
    prompt_template = (
        "Use the following information to answer the question at the end in a coherent summary. \n"
        "{context_precursor}\n"
        "{page_content}\n"
        "Question: {question}\n"
        "Think step by step. If there is not sufficient information provided, just say you don't know.\n"
    )
    prompt = prompt_template.format(context_precursor = context_precursor,page_content = page_content,question = query)
    return prompt

In [None]:
# Assemble the prompt for a sample question from the in-memory FAISS index.
prompt_type = qachain(faiss,"What is increase in CTA's payables from FY 2022 to FY 2021?")

In [None]:
# Print the assembled prompt so its newlines render as line breaks.
print(prompt_type)

In [None]:
# NOTE(review): the Cohere client is replaced on the very next line, so `llm`
# ends up bound to the OpenAI client only.
llm = Cohere(**cohere_params)
llm = OpenAI(**open_ai_params)
# `ai21_params` is not defined in any cell shown here.
#llm = AI21(**ai21_params)

In [None]:
# Show only the structure of the loaded config (section -> key names).
# Echoing `yaml_data` itself would render the API keys into the saved
# notebook output.
{
    section: sorted(entries) if isinstance(entries, dict) else type(entries).__name__
    for section, entries in yaml_data.items()
}

In [None]:
# List the indexes available to the initialised Pinecone client.
pinecone.list_indexes()

In [None]:
# Handle to the 'financial-analysis' index; not used by the other cells shown here.
index = pinecone.Index("financial-analysis")

In [None]:
# Attach to the existing 'financial-analysis' index without re-sending documents.
docsearch = Pinecone.from_existing_index('financial-analysis', oai)

In [None]:
# Smoke test of metadata filtering: any query text, with a filter on the
# `metadata` field set to 'Key Financial Ratios'.
docsearch.similarity_search('How are you doing?',filter = {"metadata":'Key Financial Ratios'})

In [None]:

# Request up to 100 filtered documents for a sample question.
# NOTE(review): `filter_dict` is only assigned inside qachain, so it is not
# defined at notebook level -- this cell needs its own filter.
docsearch.as_retriever(search_kwargs={"k": 100,"filter":filter_dict}).get_relevant_documents('Selling and administrative expenses as a percent of revenue for the three months?')

In [None]:
# NOTE(review): this cell looks like an earlier draft of the two cells that
# follow it (same setup and same loop). As written it cannot run on a fresh
# kernel; see the notes below.
with open("apis.yaml", "r") as file:
    yaml_data = yaml.load(file, Loader=yaml.FullLoader)
# NOTE(review): `openbb` is imported three lines further down, and `data_dict`
# is only assigned in a later cell.
openbb.keys.finnhub(key=data_dict["OPENBB"]["FINNHUB_KEY"], persist=True)
import os
list1 = []
from openbb_terminal.sdk import openbb
from datetime import datetime as datetime
from dateutil.relativedelta import relativedelta
# News window: the two months ending today, as YYYY-MM-DD strings.
end_date = datetime.today().strftime("%Y-%m-%d")
start_date = (datetime.today() - relativedelta(months=2)).strftime(
    "%Y-%m-%d"
)
import pandas as pd
# One sub-directory of ../ticker per ticker symbol.
for ticker in os.listdir('../ticker'):
    try:
        # The "BRK-B" directory is looked up as "BRK.A".
        if ticker == "BRK-B":
            df = openbb.stocks.ba.cnews(
                "BRK.A", start_date=start_date, end_date=end_date
            )
            df.insert(0,'ticker',ticker)
        else:
            df = openbb.stocks.ba.cnews(
                ticker, start_date=start_date, end_date=end_date
            )
        # NOTE(review): `time` is not imported in this cell.
        time.sleep(1)
        df = pd.DataFrame(df)[["related", "datetime", "headline", "summary"]]
        df["datetime"] = df["datetime"].apply(
            lambda x: datetime.fromtimestamp(x)
        )
        # NOTE(review): `stock_summary` and `process` are only defined in a later cell.
        choices = list(
            stock_summary[stock_summary["Ticker"] == ticker].values[0]
        )
        result = pd.DataFrame(
            [
                process.extract(headline, choices, limit=2)
                for headline in df["headline"]
            ]
        )
        result.columns = choices
        # Keep only the score from each (choice, score) pair.
        result[choices[0]] = [x[1] for x in result[choices[0]]]
        result[choices[1]] = [x[1] for x in result[choices[1]]]
        result["headline"] = df["headline"]
        result["final_score"] = [
            max(x, y) for x, y in zip(result[choices[0]], result[choices[1]])
        ]
        # Keep headlines whose best score is above 50.
        result = result[result["final_score"] > 50][["headline", "final_score"]]
        result["datetime"] = df["datetime"]
        list1.append(result)
    # NOTE(review): the bare except hides every failure above, including the
    # NameErrors noted, so the loop finishes silently with nothing collected.
    except:
        pass
#         list1.append(result)

In [5]:
# Setup for the news collection: imports, date window, API key registration
# and the company-name lookup table.
import yaml
from openbb_terminal.sdk import openbb
from datetime import datetime as datetime
from dateutil.relativedelta import relativedelta
# News window: the two months ending today, as YYYY-MM-DD strings.
end_date = datetime.today().strftime("%Y-%m-%d")
start_date = (datetime.today() - relativedelta(months=2)).strftime(
    "%Y-%m-%d"
)
import pandas as pd
from fuzzywuzzy import process
with open("apis.yaml", "r") as file:
    yaml_data = yaml.load(file, Loader=yaml.FullLoader)
    data_dict = dict(yaml_data)
# persist=True is passed when registering the Finnhub key with OpenBB.
openbb.keys.finnhub(key=data_dict["OPENBB"]["FINNHUB_KEY"], persist=True)
import os
import time
# Company name / ticker pairs downloaded from the SEC; the JSON is transposed
# and reduced to the two columns used for headline matching.
stock_summary = pd.read_json("https://www.sec.gov/files/company_tickers.json").T
stock_summary = stock_summary[["title", "ticker"]]
stock_summary.columns = ["Company", "Ticker"]
# Accumulates one DataFrame of matched headlines per ticker.
list1 = []

In [None]:
# For every ticker directory, fetch the company news for the date window and
# keep the headlines that fuzzy-match the company name or ticker symbol.
for ticker in os.listdir('../ticker'):
    # The "BRK-B" directory is looked up as "BRK.A"; every other directory
    # name is used as the symbol unchanged.
    symbol = "BRK.A" if ticker == "BRK-B" else ticker
    try:
        df = openbb.stocks.ba.cnews(
            symbol, start_date=start_date, end_date=end_date
        )
        # Pause between requests.
        time.sleep(1)

        # The earlier `df.insert(0, 'ticker', ticker)` for BRK-B is dropped:
        # this column selection discarded that column anyway, and the ticker
        # is attached to `result` below for every symbol.
        df = pd.DataFrame(df)[["related", "datetime", "headline", "summary"]]
        df["datetime"] = df["datetime"].apply(
            lambda x: datetime.fromtimestamp(x)
        )
        # [company name, ticker] for this directory; IndexError if the ticker
        # is missing from stock_summary.
        choices = list(
            stock_summary[stock_summary["Ticker"] == ticker].values[0]
        )
        # One row per headline, holding a (choice, score) pair per column.
        result = pd.DataFrame(
            [
                process.extract(headline, choices, limit=2)
                for headline in df["headline"]
            ]
        )
        result.columns = choices
        # Keep only the score from each pair.
        result[choices[0]] = [x[1] for x in result[choices[0]]]
        result[choices[1]] = [x[1] for x in result[choices[1]]]
        result["headline"] = df["headline"]
        result["final_score"] = [
            max(x, y) for x, y in zip(result[choices[0]], result[choices[1]])
        ]
        # Keep headlines whose best score is above 50.
        result = result[result["final_score"] > 50][["headline", "final_score"]]
        result["datetime"] = df["datetime"]
        result.insert(0,'ticker',ticker)
        list1.append(result)
    except Exception as exc:
        # Still best-effort: one failing ticker must not stop the run, but the
        # reason is reported. Catching Exception instead of using a bare
        # except also lets Ctrl-C interrupt the loop.
        print(f"Skipping {ticker}: {exc!r}")

In [None]:
# Stack the per-ticker frames into one table; each frame keeps its own row
# index. pd.concat raises ValueError if list1 is empty (every ticker skipped).
x = pd.concat(list1)

In [6]:
# Rebuild the OpenAI embedding client so this part of the notebook can run
# without the first cells (yaml_data must already be loaded).
from langchain.embeddings import CohereEmbeddings,OpenAIEmbeddings
oai = OpenAIEmbeddings(openai_api_key = yaml_data["LLMS"]['OPENAI_API_KEY'])


In [None]:
# Save the matched headlines. The row index is written as well, which is why
# the reload in a later cell drops an 'Unnamed: 0' column.
x.to_csv('sample_input.csv')

In [7]:
# Reference sentences that a headline is compared against, listed from the
# most positive wording to the most negative.
classifications = [
    'This is extremly positive news for my stock and it will rise highly today.',
    'This will have a positive impact on my stock',
    'There is no impact on the share price',
    'This will have a negative impact on my stock',
    'This is terrible news and the share price will drop significantly',
]
# Embed the reference sentences into a small FAISS index.
faiss_classifications = FAISS.from_texts(classifications, oai)
similarity_scores = []
# Reload the saved headlines, discarding the index column written by to_csv.
x = pd.read_csv('sample_input.csv').drop(columns='Unnamed: 0')


In [9]:
from tqdm import tqdm

In [10]:
# Label each headline with the text of its single nearest reference sentence.
# One similarity search is issued per headline, in order.
sentiments = [
    faiss_classifications.similarity_search_with_score(title, k=1)[0][0].page_content
    for title in tqdm(list(x['headline']))
]
    

100%|█████████████████████████████████████████████████████████████████████████████████████| 32352/32352 [3:11:13<00:00,  2.82it/s]


In [12]:
# Attach the labels; `sentiments` has one entry per row of x, in row order.
x['sentiment'] = sentiments

In [13]:
# Overwrite sample_input.csv with the labelled table. The index is written
# again, so a later read_csv sees an 'Unnamed: 0' column once more.
x.to_csv('sample_input.csv')

In [15]:
# Scratch arithmetic: twice the square root of 1231940608.
# NOTE(review): what the constant represents is not stated anywhere in the notebook.
(1231940608**0.5)*2

70198.02299210428

In [None]:
# Same scratch calculation as the previous cell, left unexecuted.
(1231940608**0.5)*2