In [1]:
# to work with json files
import json

# to work with regular expressions. 
# use cases for this script include: extract date from text
import re

file_to_test = r'C:\MIDS\FromGrzegorz\json 2\json\3m-company-q4-2021.json'


# Goal 1:
Create three lists: one for presentation speeches, one for questions, and one for question-and-answer (Q&A) units.


In [2]:
# Read the transcript JSON into a dict.
# The encoding is given explicitly: JSON is UTF-8, while open() would otherwise
# fall back to the platform default encoding.
with open(file_to_test, 'r', encoding='utf-8') as file:
    data = json.load(file)

# type(data) # dict
data




{'title': '3M Company, Q4 2021 Earnings Call, Jan 25, 2022',
 'company_id': '289194',
 'company_name': '3M Company (NYSE:MMM)',
 'year': 2022,
 'month': 1,
 'content': [{'flow_of_call': '0',
   'transcriptcomponenttypename': 'Presentation Operator Message',
   'transcriptpersonname': 'Operator',
   'speakertypename': 'Operator',
   'componenttext': 'Ladies and gentlemen, thank you for standing by. Welcome to the 3M fourth quarter earnings conference call. [Operator Instructions] As a reminder, this conference is being recorded, Tuesday, January 25, 2022. \nI would now like to turn the call over to Bruce Jermeland, Senior Vice President of Investor Relations at 3M.'},
  {'flow_of_call': '1',
   'transcriptcomponenttypename': 'Presenter Speech',
   'transcriptpersonname': 'Bruce  Jermeland',
   'speakertypename': 'Executives',
   'componenttext': "Thank you, and good morning, everyone, and welcome to our fourth quarter earnings conference call. With me today are Mike Roman, 3M's Chairman

In [3]:
# capture company, event type, date, companyname, company id, year, month
company = data['company_name']


def extractDateFromText(text):
    '''
    Extract the first date written as "Mon D, YYYY" (e.g. 'Jan 25, 2022') from text.

    Arg:
    text - title that has the date within it

    Returns:
    date as string, exactly as it appears in text

    Raises:
    ValueError - if text contains no date in the expected format
    '''

    # Three-letter month abbreviation, 1-2 digit day, comma, 4 digit year.
    date_pattern = r"\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s\d{1,2},\s\d{4}\b"
    date_match = re.search(date_pattern, text)

    # re.search returns None when nothing matches; report that clearly instead
    # of failing with an AttributeError on None.group().
    if date_match is None:
        raise ValueError(f'No date of the form "Mon D, YYYY" found in: {text!r}')

    return date_match.group()

event_date = extractDateFromText(data['title'])


def extract_event_type(text):
    '''
    Pull the event description out of a ', '-separated title.

    Arg:
    text - title string whose fields are separated by ', '

    Returns:
    str - the second ', '-separated field of text
    (for '3M Company, Q4 2021 Earnings Call, Jan 25, 2022' this is
    'Q4 2021 Earnings Call')
    '''
    # At most two splits are made; only the field at index 1 is needed.
    title_fields = text.split(', ', maxsplit=2)
    return title_fields[1]

event = extract_event_type(data['title'])

company_id, year, month = data['company_id'], data['year'], data['month']

company, event_date, event, company_id, year, month

('3M Company (NYSE:MMM)',
 'Jan 25, 2022',
 'Q4 2021 Earnings Call',
 '289194',
 2022,
 1)

In [4]:
# generate presentation chunks, questions

content_list = data['content']
# type(content_list) list


def listBasedOfTranscriptType (contentList, componentType):
    '''
    Collect the 'componenttext' of every entry whose
    'transcriptcomponenttypename' equals componentType.

    Args:
    contentList = list of dictionaries, each with the keys
                  'transcriptcomponenttypename' and 'componenttext'
    componentType = the 'transcriptcomponenttypename' value to select

    Returns:
    list of 'componenttext' values of the matching entries, in their original order
    '''
    matchingTexts = []

    for entry in contentList:
        if entry['transcriptcomponenttypename'] == componentType:
            matchingTexts.append(entry['componenttext'])

    return matchingTexts


presentationList = listBasedOfTranscriptType (content_list, 'Presenter Speech')
questionList = listBasedOfTranscriptType (content_list, 'Question')

# presentationList = []

# presentationList = [c['componenttext']  for c in content_list if c['transcriptcomponenttypename'] == 'Presenter Speech' ]
# presentationList



In [5]:
presentationList

["Thank you, and good morning, everyone, and welcome to our fourth quarter earnings conference call. With me today are Mike Roman, 3M's Chairman and Chief Executive Officer; and Monish Patolawala, our Chief Financial and Transformation Officer. Mike and Monish will make some formal comments, and then we will take your questions. Please note that today's earnings release and slide presentation accompanying this call are posted on our Investor Relations website at 3m.com under the heading Quarterly Earnings. \nPlease turn to Slide 2. Before we begin, I would like to announce our next 2 investor events. On the morning of February 14, we will be having a virtual investor meeting where we will be providing a near-term strategic update along with our 2022 guidance. Also, please mark your calendars for our first quarter earnings conference call which will take place on Tuesday, April 26. \nPlease take a moment to read the forward-looking statement on Slide 3. During today's conference call, w

In [6]:
questionList

["I wasn't expecting the first question. So as much upside to your early December guidance, Monish. You called out a number of factors, but you didn't call out N95, which given all the talk we've got from the federal government about free masks and the new guidance and I'm just curious what you're seeing from that side of the business given all the commentary we've seen?",
 "Okay. And then just -- I know you're going to give guidance on Feb 14. But just curious, just given the lots of moving parts at the margin line, just wondering how we think about the takeoff point into 2022, specifically 1Q '22. Normal seasonality would have you up slightly from 4Q. Just wondering how you think about that. And have we seen the peak of the inflation curve at this point?",
 'Of the inflation curve.',
 "A couple here from me. First, just -- I was wondering if you could level set us actually now on the actual size of the respirator business. I think we were 600 million pre-COVID. I feel like we're in t

In [7]:
len(content_list)

71

In [14]:
# get units of question and answers

# list_qa - list of units of questions and answers. A unit is a dictionary of 2 keys - question, answer.
# Value for answer is a list of all the answers to a particular question
list_qa = []

for i, component in enumerate(content_list):
    if component['transcriptcomponenttypename'] != 'Question':
        continue

    qa_dict = {'Question': component['componenttext'], 'Answer(s)': []}

    # Collect the run of 'Answer' components that directly follows the question.
    # The bounds check keeps the scan from indexing past the end of the list
    # when the transcript ends with a question or an answer.
    j = i + 1
    while j < len(content_list) and content_list[j]['transcriptcomponenttypename'] == 'Answer':
        qa_dict['Answer(s)'].append(content_list[j]['componenttext'])
        j += 1

    list_qa.append(qa_dict)

list_qa

[{'Question': "I wasn't expecting the first question. So as much upside to your early December guidance, Monish. You called out a number of factors, but you didn't call out N95, which given all the talk we've got from the federal government about free masks and the new guidance and I'm just curious what you're seeing from that side of the business given all the commentary we've seen?",
  'Answer(s)': ["Yes. So Nigel, I would say when we gave you the guidance in December, at that time, we had not seen the pickup of N95s. And one of the factors that made us deliver better than what we thought in December was the pickup of the respirator business. We came in $40 million better than what we had originally predicted. So we have seen that pickup. But I would still say it's volatile. We'll see how this plays itself out. We are pleased with the partnership that we have with the federal government right now as regards to this. We've had a lot of dialogue with them. And as things evolve, we'll k

In [9]:
len(list_qa)

25

# Get sentiment on the three categories

See Text splitter details in langchain


In [10]:
# length of items in lists - presentation, questions, qa
pll = [] # presentationlistlength
qll = [] # questionslistlength
qall = [] # questionanswerslistlength

def lengthItemsInList (catList):
    '''
    Measure every item of a list with len(), print the lengths and return them.

    Arg:
    catList: list whose items each support len()

    Returns:
    list with the length of each item, in the same order as catList

    '''
    itemLengths = list(map(len, catList))
    print (itemLengths)

    return itemLengths


pll = lengthItemsInList(presentationList)
qll = lengthItemsInList(questionList)
qall = lengthItemsInList(list_qa)


# pll - [3095, 1778, 15409, 5759]
# qll - [371, 388, 23, 305, 470, 314, 204, 542, 589, 427, 486, 554, 320, 190, 339, 258, 292, 102, 493, 314, 408, 308, 297, 289, 406]
# qall - [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2] # fpr a dictionary it only counts the number of items/keys

# find length of answers in qa list
# llal - lengths of every answer, flattened across all question/answer units
llal = []
for qa_unit in list_qa:
    llal.extend(len(answer) for answer in qa_unit['Answer(s)'])

# llal
# [609,  42,  13,  3123,  416, 1529, 1026,  275,  654,  2341,  1167,  2835,  1569,  594,  245,  1077,  228,  815,  293,  959,
# 1430,  967,  441,  535,  623,  1763,  697]

[3095, 1778, 15409, 5759]
[371, 388, 23, 305, 470, 314, 204, 542, 589, 427, 486, 554, 320, 190, 339, 258, 292, 102, 493, 314, 408, 308, 297, 289, 406]
[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]


In [11]:
# process presentation list through llm

# for chunking the text
from langchain.text_splitter import CharacterTextSplitter
# to use chat model
from langchain.chat_models import AzureChatOpenAI
# to define roles in chat
from langchain.schema import HumanMessage, SystemMessage, AIMessage
# to create prompt template
from langchain.prompts import (
    ChatPromptTemplate,
    PromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate
)
# set up langchain
from langchain.chains import LLMChain

# to reference environment variables
from dotenv import load_dotenv
import os

load_dotenv()

# set chatModel
# Chat model used by every chain in this notebook (Azure deployment named 'gpt-4').
chat_model = AzureChatOpenAI(deployment_name='gpt-4')

# create prompt
# System prompt: the only placeholder is {speechType}, which describes the kind
# of speech being classified. The model is asked to answer with one word:
# Positive, Negative or Neutral.
prompt = PromptTemplate(
    template = 'You are very good at determinining sentiment based on transcripts \
of human conversations of the following type of speech - {speechType}. \
For a given piece of text you can respond with 1 word describing \
the sentiment of the text. The word choice is 1 of these three: Positive, Negative, Neutral',
    input_variables = ['speechType']
)

# Human message: the transcript text is passed through unchanged via {text}.
human_template = '{text}'

human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)

system_message_prompt = SystemMessagePromptTemplate(prompt=prompt)

# create chat prompt template
# Message order: system prompt first, then the human message.
chat_prompt = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])

# chat_prompt ...ChatPromptTemplate(input_variables=['text', 'speechType']

# chat_prompt.format(speechType='Presentation Speech', text= presentationList[0])
# shows the content of system and human with variables populated in template

# Chain that fills chat_prompt and sends it to chat_model; it is run with the
# keys 'speechType' and 'text'.
chain = LLMChain(llm=chat_model, prompt= chat_prompt)

# define the sentiment analyzer as a function
def determineSentimentForTranscriptType (transcriptList, variableValueForSystemPromptTemplate, theLLMChain):
    '''
    Function return a list of sentiment for each item in transcript list

    Arg:

    transcriptList: items of type str from a list
    variableValueForSystemPromptTemplate : the variable value that is to be passed to the System Prompt Template
                                           (sent to the chain under the key 'speechType')
    theLLMChain : the langchain chain used to determine sentiment; its run() method is
                  called once per item with the keys 'speechType' and 'text'

    Returns:
    List of sentiment for each item in transcript list, in the same order as transcriptList

    '''
    sentimentList = []

    for transcriptText in transcriptList:
        # Use the chain passed in by the caller. The previous body called the
        # module-level `chain`, which silently ignored this argument.
        response = theLLMChain.run({
            'speechType': variableValueForSystemPromptTemplate,
            'text': transcriptText
        })
        sentimentList.append(response)

    return sentimentList
    

sentiment_for_presentationList = determineSentimentForTranscriptType(presentationList,
                                                                     'Presentation Speech at an Investor relations call',
                                                                    chain)
sentiment_for_questionList = determineSentimentForTranscriptType(questionList,
                                                                     'Questions raised by Analysts at an Investor relations call',
                                                                    chain)

print(f'sentiment_for_presentationList:\n{sentiment_for_presentationList}')
print(f'sentiment_for_questionList:\n{sentiment_for_questionList}')

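# to collect the sentiment for the presentation list (earlier inline version, superseded by determineSentimentForTranscriptType above)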
# sentiment_for_presentationList = []

# for c in presentationList:
#     response = chain.run({
#     'speechType':'Presentation Speech at an Investor relations call',
#     'text': c
#         }
#         )
    
#     sentiment_for_presentationList.append(response)

# sentiment_for_presentationList


Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised RateLimitError: Requests to the Creates a completion for the chat message Operation under Azure OpenAI API version 2023-03-15-preview have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 1 second. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit..
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised RateLimitError: Requests to the Creates a completion for the chat message Operation under Azure OpenAI API version 2023-03-15-preview have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 3 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit..
Retrying langchain.chat_models.openai.ChatOpenA

sentiment_for_presentationList:
['Positive', 'Positive', 'Positive', 'Positive']
sentiment_for_questionList:
['Neutral', 'Neutral', 'Neutral', 'Neutral', 'Neutral', 'Neutral', 'Neutral', 'Negative', 'Negative', 'Neutral', 'Negative', 'Neutral', 'Negative', 'Neutral', 'Neutral', 'Neutral', 'Neutral', 'Negative', 'Neutral', 'Neutral', 'Neutral', 'Neutral', 'Negative', 'Neutral', 'Neutral']


In [36]:
# sentiment for units of question and answer(s)

# System prompt for a whole question/answer exchange; the only placeholder is
# {speechContext}.
# NOTE(review): the backslash-continued lines below are indented, so those
# leading spaces are part of the template string that is sent to the model.
prompt_for_qa = PromptTemplate(
    template = 'You are very good at determining the sentiment of human conversation based on \
    transcripts of these conversations in the following context - {speechContext}. \
    For a given conversation, can you please respond with 1 word that indicates the overall sentiment of the conversation. \
    The response is to be in 1 of the 3 following words: Positive, Negative, Neutral',
    input_variables = ['speechContext']
)

# Human message: one question followed by all of its answers.
human_qa_template = 'Question by the analyst: {question}. Responses by Company executives - {answers}'

system_message_qa_prompt = SystemMessagePromptTemplate(prompt = prompt_for_qa)

human_message_qa_prompt = HumanMessagePromptTemplate.from_template(human_qa_template)

# Message order: system prompt first, then the human message.
qa_chat_prompt = ChatPromptTemplate.from_messages([system_message_qa_prompt, human_message_qa_prompt])

# Reuses the chat_model created earlier in the notebook.
qa_chain = LLMChain(llm = chat_model, prompt = qa_chat_prompt)

# One response per question/answer unit, in the same order as list_qa.
sentiment_for_list_qa = []

for c in list_qa:
    q = c['Question']
    # All answers to this question are merged into one newline-separated string.
    answers_from_list = '\n'.join(c['Answer(s)'])
    
    # One chain call per question/answer unit.
    response = qa_chain.run(
        {
        'speechContext': 'Conversation between Analyst and Executives at an Investor relations call.',
        'question': q,
        'answers': answers_from_list
        }
        
    )
    sentiment_for_list_qa.append(response)

sentiment_for_list_qa
    


Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised RateLimitError: Requests to the Creates a completion for the chat message Operation under Azure OpenAI API version 2023-03-15-preview have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 3 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit..
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 2.0 seconds as it raised RateLimitError: Requests to the Creates a completion for the chat message Operation under Azure OpenAI API version 2023-03-15-preview have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 2 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit..
Retrying langchain.chat_models.openai.ChatOpen

['Positive',
 'Neutral',
 'Positive',
 'Negative',
 'Positive',
 'Neutral',
 'Neutral',
 'Negative',
 'Neutral',
 'Neutral',
 'Positive',
 'Positive',
 'Neutral',
 'Neutral',
 'Positive',
 'Positive',
 'Neutral',
 'Positive',
 'Positive',
 'Positive',
 'Positive',
 'Neutral',
 'Neutral',
 'Positive',
 'Positive']

# Rough

In [19]:
# get units of question and answers

def get_list_qa(contentListData):
    '''
    Group every question with the answers that immediately follow it.

    From the list of dictionaries, create another list of dictionaries. Each item's dictionary has 3 keys -
    flowcallid, Question, Answer(s). Answer(s) is a list of all the consecutive 'Answer' components that
    directly follow the question.

    Arg:
    contentListData - list of dictionaries, each with the keys 'flow_of_call',
                      'transcriptcomponenttypename' and 'componenttext'

    Returns:
    list of dictionaries of fewer keys. A unit is a dictionary of 3 keys:
      'flowcallid' - the 'flow_of_call' value of the question
      'Question'   - the 'componenttext' of the question
      'Answer(s)'  - list of the 'componenttext' of each answer (empty if the question
                     is not directly followed by an answer)

    '''

    list_qa = []
    componentCount = len(contentListData)

    for i, component in enumerate(contentListData):
        if component['transcriptcomponenttypename'] != 'Question':
            continue

        qa_dict = {
            'flowcallid': component['flow_of_call'],
            'Question': component['componenttext'],
            'Answer(s)': [],
        }

        # Walk forward over the run of answers that follows the question.
        # The bounds check keeps the scan from indexing past the end of the
        # list when the transcript ends with a question or an answer.
        j = i + 1
        while j < componentCount and contentListData[j]['transcriptcomponenttypename'] == 'Answer':
            qa_dict['Answer(s)'].append(contentListData[j]['componenttext'])
            j += 1

        list_qa.append(qa_dict)

    return list_qa
        
content_list = data['content']        

qalist = get_list_qa(content_list)

In [20]:
qalist

[{'flowcallid': '6',
  'Question': "I wasn't expecting the first question. So as much upside to your early December guidance, Monish. You called out a number of factors, but you didn't call out N95, which given all the talk we've got from the federal government about free masks and the new guidance and I'm just curious what you're seeing from that side of the business given all the commentary we've seen?",
  'Answer(s)': ["Yes. So Nigel, I would say when we gave you the guidance in December, at that time, we had not seen the pickup of N95s. And one of the factors that made us deliver better than what we thought in December was the pickup of the respirator business. We came in $40 million better than what we had originally predicted. So we have seen that pickup. But I would still say it's volatile. We'll see how this plays itself out. We are pleased with the partnership that we have with the federal government right now as regards to this. We've had a lot of dialogue with them. And as t