Plan: 
1. Do overall sentiment analysis with nltk (and get price from the Factset datafeed, and then graph sentiment + stock price over time?)
3. Do keyness testing per document with nltk
3. Do BERTopic modeling on the sentences? pages? 
4. Could I do the sentiment analysis on each page or sentence and then tie that back to the original source? Say, for instance, the topic is "government" and the sentiment is generally negative? Maybe use nltk or VADER? 
5. Then use AI to give a summary of the page? 
6. Put in company names as stop words? 

*Automated pipeline: 
You put in the transcripts (get from Factset or Bloomberg), automated pipeline produces a) summary of transcript, 
This should all be hosted in a Python Shiny or node js. Or maybe it just outputs a nice little text file with a: 
1. Title
2. Brief AI Summary (use RAG?)
3. Overall Sentiment Score  
4. Keyness Testing Output
5. Topic Modeling + Sentiment Scores per Topic [with GPT-generated topic names... maybe pull out the financial topics and see which companies have the worst sentiment score? Use few-shot prompting] - How do I choose the number of topics...?
6. Wordcloud generated
7. Use ticker as check for sentiment analysis? 


for chunking: 
https://www.geeksforgeeks.org/how-to-chunk-text-data-a-comparative-analysis/


#Create a node.js application:  
https://plotly.com/nodejs/line-and-scatter/


In [None]:
#########################################################################################################################################
# Package Installation: 
#########################################################################################################################################

#%pip install transformers torch
import numpy as np
import pandas as pd
from pypdf import PdfReader
from bertopic import BERTopic
import os

# Import torch first to avoid circular import issues
import torch
from transformers import pipeline
import nltk
from nltk.tokenize import sent_tokenize
from multiprocessing import Pool
import time
import math
import nltk
import ssl
import string
import pandas as pd
from collections import Counter
from typing import List
from keyness import log_likelihood
import re
from nltk.corpus import stopwords
from nltk.probability import FreqDist
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import spacy


from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.collocations import TrigramCollocationFinder
from nltk.metrics import TrigramAssocMeasures
from nltk.text import Text  
from nltk import word_tokenize
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
#Import vader sentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

#Import vader sentiment
from sklearn.feature_extraction.text import CountVectorizer
import argparse
from json import JSONDecodeError
import os
import requests
from requests.exceptions import HTTPError
import sys
import json
from sklearn.cluster import KMeans



In [None]:
#Load the credentials file (JSON) and pull out the Azure subscription key
with open("C:\\Users\\lschlake\\secrets.txt") as f:
    secrets = json.load(f)

#The same key is used for the environment variable, the request header and api_key
api_key = secrets['azure_api_key']

#Gateway settings for the model
rand_api_base = "https://apigw.rand.org/openai/RAND/inference/"
rand_api_version = '2024-02-01'

#Expose the Azure OpenAI settings through the process environment
os.environ.update({
    "OPENAI_API_TYPE": "azure",
    "OPENAI_API_KEY": api_key,
})

#Subscription header in the form the API gateway expects
header = {"Ocp-Apim-Subscription-Key": api_key}


In [None]:
#Define two functions to be used for text splitting

#Function for chunking text into chunks based on a defined chunk size
def chunk_text(text, chunk_size):
    """Cut ``text`` into consecutive slices of at most ``chunk_size`` items.

    Slices are taken left to right without overlap; the final slice holds
    whatever remains and may be shorter than ``chunk_size``.
    """
    pieces = []
    for start in range(0, len(text), chunk_size):
        stop = start + chunk_size
        pieces.append(text[start:stop])
    return pieces
#Chunk size, in characters, to pass as the chunk_size argument of chunk_text.
#Only needed when the chunk_text function is used.
cs = 800

#Function to split text into paragraphs. Use if you want to do paragraph splitting:
def split_text(text):
    """Break ``text`` into parts at every occurrence of " \\n \\n".

    The separator itself is dropped; a text without the separator comes back
    as a single-element list.
    """
    paragraph_separator = " \n \n"
    return text.split(paragraph_separator)

In [None]:
class Transcript:
    """Sentiment, topic and summary analysis for one transcript PDF.

    Call :meth:`processing` first; it extracts the PDF text, splits it into
    sentences and scores each sentence with VADER. The other public methods
    read the results it stores on the instance and raise ``RuntimeError`` if
    it has not been run.

    The chat-completion methods (:meth:`get_topics`, :meth:`get_summary`,
    :meth:`get_ai_sentiment`) need an endpoint URL and a subscription key.
    Each is taken from the constructor argument when given, otherwise from a
    module-level variable named ``endpoint`` / ``api_key``.
    """

    #Seconds to wait on a chat-completion request before giving up
    REQUEST_TIMEOUT = 120

    def __init__(self, filepath, ticker, endpoint=None, api_key=None):
        """Store the PDF path, the label for this transcript and API settings.

        Args:
            filepath: Path of the transcript PDF.
            ticker: Label written to the "file" column of the sentiment table.
            endpoint: Optional chat-completion URL; falls back to a
                module-level ``endpoint`` variable.
            api_key: Optional subscription key; falls back to a module-level
                ``api_key`` variable.
        """
        self.filepath = filepath
        self.ticker = ticker
        self.endpoint = endpoint
        self.api_key = api_key

        #Filled in by processing(); None means processing() has not run yet
        self.dataframe = None
        self.corpus = None
        self.indv_doc_text = None

    #Always run the processing functions first to convert PDF --> text, chunk text, and do VADER sentiment analysis
    def processing(self):
        """Extract the PDF text, split it into sentences and score each one.

        Sets three attributes:
            dataframe: one row per kept sentence with the VADER scores.
            corpus: list of the kept sentences, in document order.
            indv_doc_text: the kept sentences joined into one string.

        Sentences containing "www.callstreet.com" are skipped.
        """
        sentiment_analyzer = SentimentIntensityAnalyzer()

        #Read in PDF as text, one page after another
        #If you want to do this for each PDF in a folder, loop over the folder here
        reader = PdfReader(self.filepath)
        document_text = "".join(page.extract_text() for page in reader.pages)

        #NOTE(review): the "document" column has always received the index of
        #the last PDF page (the page-loop variable was reused). The value is
        #kept for backward compatibility; confirm whether a document number
        #was intended instead.
        document_index = len(reader.pages) - 1

        #Chunks the document one of three ways:
        paragraph_chunk = sent_tokenize(document_text) #For sentence chunking
        #paragraph_chunk = split_text(document_text) #For paragraph chunking
        #paragraph_chunk = chunk_text(document_text, cs) #For naive chunking

        substring = "www.callstreet.com"
        corpus_text = []
        rows = []

        #Collect plain dicts and build the table once: concatenating a
        #DataFrame per sentence is quadratic and leaves the column dtypes
        #dependent on how pandas treats the empty starting frame
        for paragraph in paragraph_chunk:

            #Avoids processing sentences with "www.callstreet.com"
            if substring in paragraph:
                continue

            corpus_text.append(paragraph)
            score = sentiment_analyzer.polarity_scores(paragraph)
            rows.append({
                "file": f"{self.ticker}",
                "document": document_index,
                "chunk": paragraph,
                "vader_pos_score": score["pos"],
                "vader_neg_score": score["neg"],
                "vader_neutral_score": score["neu"],
                "vader_compound": score["compound"],
            })

        #Set some characteristics of the class to use in other functions:
        self.dataframe = pd.DataFrame(
            rows,
            columns=["file", "document", "chunk", "vader_pos_score",
                     "vader_neg_score", "vader_neutral_score", "vader_compound"],
        )
        self.corpus = corpus_text
        #Join with a space so the last word of one sentence does not fuse with
        #the first word of the next in the summary prompt
        self.indv_doc_text = " ".join(corpus_text)

    #Returns the sentiment table
    def get_sentiment(self):
        """Return the per-sentence VADER table built by :meth:`processing`.

        Raises:
            RuntimeError: if :meth:`processing` has not been run.
        """
        self._require_processed()
        return self.dataframe

    #Gets the BERTopics
    def get_topics(self, cluster_number):
        """Fit a topic model and return the mean VADER compound score per topic.

        A BERTopic model is fitted on the sentences using KMeans with
        ``cluster_number`` clusters, the chat-completion API is asked to
        rename the topics, and the sentence scores are averaged per new name.

        Args:
            cluster_number: Number of KMeans clusters (topics).

        Returns:
            A pandas Series indexed by "New Topic Name" holding the mean
            "vader_compound" of the sentences assigned to that topic.

        Raises:
            RuntimeError: if :meth:`processing` has not been run or the API
                endpoint / key is not configured.
            requests.HTTPError: if the API answers with an error status.
        """
        self.cluster = cluster_number
        self._require_processed()

        vectorizer_model = CountVectorizer(stop_words="english")
        cluster_model = KMeans(n_clusters=cluster_number)

        #KMeans is passed in the slot BERTopic uses for its clustering model
        topic_model = BERTopic(
            vectorizer_model=vectorizer_model,
            hdbscan_model=cluster_model
        )

        #Fit the BERTopic model
        topic_model.fit_transform(self.corpus)

        #Define prompt
        names = """
        This is a list of topics from a BERTopic model that analyses text from earnings call transcripts of public companies.
          Please generate new names for each of these topics and return a list. 
          Seperate the topic numbers from the topic names using a colon.  
          Output the original topic number along with the name. 
          Put each topic name on a seprate line. Include no other text in your response."""

        #Append every topic label to the prompt, each followed by "; "
        topic_labels = topic_model.topic_labels_
        names += "".join(f"{label}; " for label in topic_labels.values())

        #Retrieve the GPT-generated new names, one "number: name" per line
        result = self._chat_completion(names).splitlines()
        new_topic_names = self._parse_topic_names(result)

        #Store the BERTopic model output in a data table:
        topic_categories = pd.DataFrame(topic_model.get_document_info(self.corpus))
        #Both tables have one row per sentence in corpus order, so join on the row index
        table1 = pd.merge(topic_categories, self.dataframe, how="outer", left_index=True, right_index=True)
        #Attach the new topic names by topic number
        table2 = pd.merge(table1, new_topic_names, how="right", on="Topic")

        return table2.groupby("New Topic Name")["vader_compound"].mean()

    #Gets a summary for the text
    def get_summary(self):
        """Ask the chat-completion API for a summary, print it and return it.

        Returns:
            The summary text returned by the API.

        Raises:
            RuntimeError: if :meth:`processing` has not been run or the API
                endpoint / key is not configured.
            requests.HTTPError: if the API answers with an error status.
        """
        self._require_processed()

        #Pause before the request, as the original code did
        time.sleep(2)
        prompt = "Produce a short summary of this text. Include a description of key points and the sentiment of each of those points:" + self.indv_doc_text

        summary = self._chat_completion(prompt)
        print(f"{summary}\n")
        return summary

    #Does AI sentiment analysis
    def get_ai_sentiment(self):
        """Score every sentence with the chat-completion API (few-shot prompt).

        Code in development for sentiment analysis using RANDChat API.

        Returns:
            A DataFrame with columns "chunk_number" (1-based position of the
            sentence in the corpus) and "sentiment_score" (the raw text of
            the API reply).

        Raises:
            RuntimeError: if :meth:`processing` has not been run or the API
                endpoint / key is not configured.
            requests.HTTPError: if the API answers with an error status.
        """
        self._require_processed()

        #The few-shot examples are the same for every sentence, so build them once
        context = """
                I want you to perform a sentiment analysis on sentences. 
                Here are some examples of how to go about the sentiment analysis. The sentiment scores should range from 
                -1 (completely negative) to 1 (completely positive). 0 represents a sentiment nuetral chunk. 

                Example #1
                Chunk: "Our revenue exceeded expectations and all projections from the previous quarter." 
                Sentiment Score: 0.8


                Example #2
                Chunk: "We are experienced a slowdown in our supply chain as a result of geopolitical tension."
                Sentiment Score: -0.6

                Example #3
                Chunk: "All forward-looking statements are merely projects"
                Sentiment Score: 0.0
                """

        rows = []
        for paragraph_number, sentence in enumerate(self.corpus, start=1):
            #Report the sentence that is about to be sent
            print("Loading Sentence #", paragraph_number)

            #Pause between requests, as the original code did
            time.sleep(3)

            prompt = context + " . Now classify this body of text. Only return the sentiment score in your respond. Do not include any other text: " + sentence
            rows.append({
                "chunk_number": paragraph_number,
                "sentiment_score": self._chat_completion(prompt),
            })

        return pd.DataFrame(rows, columns=["chunk_number", "sentiment_score"])

    def _require_processed(self):
        """Raise ``RuntimeError`` unless :meth:`processing` has been run."""
        if self.corpus is None:
            raise RuntimeError("You must run processing() before calling this method")

    def _chat_completion(self, prompt):
        """Send ``prompt`` as a single user message and return the reply text."""
        url = self.endpoint if self.endpoint is not None else globals().get("endpoint")
        key = self.api_key if self.api_key is not None else globals().get("api_key")
        if not url:
            raise RuntimeError(
                "No API endpoint configured: pass endpoint=... to Transcript "
                "or define a module-level 'endpoint' variable"
            )
        if not key:
            raise RuntimeError(
                "No API key configured: pass api_key=... to Transcript "
                "or define a module-level 'api_key' variable"
            )

        response = requests.post(
            url,
            headers={'Ocp-Apim-Subscription-Key': key},
            json={'messages': [{'role': 'user', 'content': prompt}]},
            timeout=self.REQUEST_TIMEOUT,
        )
        #Fail on an HTTP error status instead of a KeyError on the error body
        response.raise_for_status()
        return response.json()['choices'][0]['message']['content']

    @staticmethod
    def _parse_topic_names(lines):
        """Turn "number: name" lines into a table of topic numbers and names.

        Lines without a colon or without an integer before the first colon
        are skipped. Names are stripped of surrounding whitespace.
        """
        rows = []
        for line in lines:
            number, colon, name = line.partition(":")
            if not colon:
                continue
            try:
                topic = int(number.strip())
            except ValueError:
                continue
            rows.append({"Topic": topic, "New Topic Name": name.strip()})

        table = pd.DataFrame(rows, columns=["Topic", "New Topic Name"])
        #Keep the key integer even when no line parsed, so the merge on "Topic" still works
        return table.astype({"Topic": "int64"})
        

In [None]:
#Transcript to analyse; the second argument is the label shown in the banner below
d1 = Transcript("C:\\Users\\lschlake\\Documents\\2024-2025Courses\\Text Analysis\\Project\\Real Transcripts\\L3Harris Technologies Inc (2).pdf", "L3 Harris")

#Report banner: rule, title line, rule
print(
    "-----------------------------------------------",
    f"           {d1.ticker} Report                 ",
    "-----------------------------------------------",
    sep="\n",
)

#processing() must run before any of the other methods
d1.processing()
d1.get_summary()
#Topic model with the KMeans clusterer set to 30 topics
d1.get_topics(30)
