# Preliminaries

## Load Packages

In [1]:
import pandas as pd
import re
import textstat
import numpy as np
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import nltk
from nltk.tokenize import sent_tokenize
from bertopic.representation import KeyBERTInspired
import spacy

  from .autonotebook import tqdm as notebook_tqdm


## Load Data

In [2]:
# Load data
csv_file_path = '../OutputData/output.csv'
df = pd.read_csv(csv_file_path)

# Add unique ID
df.reset_index(inplace = True)
df.rename(columns = {'index': 'ID'}, inplace = True)
df.head()

Unnamed: 0,ID,speaker,text,type,speaker_type,speaker_company,speaker_role,company_name,date
0,0,Operator,"Good day, and welcome to the Apple Inc. Second...",presentation,Operator,,,AAPL.OQ,2019-Apr-30
1,1,Nancy Paxton,"Thank you. Good afternoon, and thanks to every...",presentation,Corporate Participant,Apple Inc.,Senior Director of IR and Treasury,AAPL.OQ,2019-Apr-30
2,2,Timothy D. Cook,"Thanks, Nancy. Good afternoon, and thanks to a...",presentation,Corporate Participant,Apple Inc.,CEO & Director,AAPL.OQ,2019-Apr-30
3,3,Luca Maestri,"Thank you, Tim. Good afternoon, everyone.\nRev...",presentation,Corporate Participant,Apple Inc.,CFO & Senior VP,AAPL.OQ,2019-Apr-30
4,4,Nancy Paxton,"Thank you, Luca. (Operator Instructions) Opera...",presentation,Corporate Participant,Apple Inc.,Senior Director of IR and Treasury,AAPL.OQ,2019-Apr-30


# Readability Metrics

In [3]:
# Calculate readability metrics
def calculate_readability_metrics(df, text_var, keep_all = False):

    # Define readability functions
    readability_functions = {
        'automated_readability_index': textstat.automated_readability_index,
        'coleman_liau_index': textstat.coleman_liau_index,
        'dale_chall_readability_score': textstat.dale_chall_readability_score,
        'flesch_reading_ease': textstat.flesch_reading_ease,
        'flesch_kincaid_grade': textstat.flesch_kincaid_grade,
        'gunning_fog': textstat.gunning_fog,
        'smog_index': textstat.smog_index,
    }

    # Evaluate readability metrics
    for metric, function in readability_functions.items():
        df[f'r_{metric}_{text_var}'] = df[text_var].apply(function)

    # Calculate overall readability
    metric_columns = [f'r_{metric}_{text_var}' for metric in readability_functions]
    df[f'readability_overall_{text_var}'] = df[metric_columns].mean(axis = 1)

    # Drop columns (if required)
    if not keep_all:
        df.drop(metric_columns, axis=1, inplace=True)
        if not keep_all:
            df.drop([text_var], axis=1, inplace=True)

    # end
    return df

# Create readability features
def create_readability_features(df):

    # Aggregate data by speaker
    grouped = df.groupby(['company_name', 'date', 'speaker_type', 'type'])['text'].apply(' '.join).reset_index()

    # Pivot data to wide format
    pivot_table = grouped.pivot(index = ['company_name', 'date'], columns = ['speaker_type', 'type'], values = 'text').reset_index()
    pivot_table = pivot_table.fillna('')
    pivot_table.columns = ['company_name', 'date', 'conf_qna', 'corp_pres', 'corp_qna', 'op_pres', 'op_qna']

    # Calculate readability by speaker type
    for var in ['conf_qna', 'corp_pres', 'corp_qna', 'op_pres', 'op_qna']:
        pivot_table = calculate_readability_metrics(pivot_table, text_var = var)

    # end
    return pivot_table

In [4]:
# Test function
test = create_readability_features(df)
test

Unnamed: 0,company_name,date,readability_overall_conf_qna,readability_overall_corp_pres,readability_overall_corp_qna,readability_overall_op_pres,readability_overall_op_qna
0,AAPL.OQ,2019-Apr-30,17.500000,17.454286,17.360000,16.772857,17.032857
1,AAPL.OQ,2019-Jan-29,17.531429,17.424286,17.351429,16.741429,17.718571
2,AAPL.OQ,2019-Jul-30,17.428571,17.588571,17.438571,16.727143,18.014286
3,AAPL.OQ,2019-Oct-30,17.571429,17.465714,17.024286,16.825714,18.670000
4,AAPL.OQ,2020-Apr-30,17.785714,17.788571,17.137143,16.771429,18.474286
...,...,...,...,...,...,...,...
75,NVDA.OQ,2022-May-25,17.344286,17.235714,16.954286,15.260000,17.530000
76,NVDA.OQ,2023-Aug-23,17.561429,16.322857,17.152857,15.304286,17.420000
77,NVDA.OQ,2023-May-24,16.807143,16.730000,16.981429,15.687143,18.357143
78,NVDA.OQ,2023-Nov-21,18.104286,16.544286,16.851429,15.935714,18.038571


In [None]:
test.to_csv('OutputData/readability.csv', index = False)

# Topic Extraction

In [None]:
# Load the spaCy model
nlp = spacy.load('en_core_web_sm')
nltk.download('punkt')

# Redact text for topic models
def redact_text_with_spacy(text):

    # Process text
    doc = nlp(text)
    redacted_text = text
    sorted_entities = sorted(doc.ents, key = lambda ent: ent.start_char, reverse = True)

    # Replace entities with their label
    for ent in sorted_entities:
        if ent.label_ not in ['ORDINAL', 'CARDINAL']:
            redacted_text = redacted_text[:ent.start_char] + '[' + ent.label_ + ']' + redacted_text[ent.end_char:]

    return redacted_text, text

In [4]:
df['text'] = df['text'].apply(sent_tokenize)

In [6]:
df = df.explode('text').reset_index(drop=True)


In [7]:
df.head()

Unnamed: 0,ID,speaker,text,type,speaker_type,speaker_company,speaker_role,company_name,date
0,0,Operator,"Good day, and welcome to the Apple Inc. Second...",presentation,Operator,,,AAPL.OQ,2019-Apr-30
1,0,Operator,Today's call is being recorded.,presentation,Operator,,,AAPL.OQ,2019-Apr-30
2,0,Operator,"At this time, for opening remarks and introduc...",presentation,Operator,,,AAPL.OQ,2019-Apr-30
3,0,Operator,Please go ahead.,presentation,Operator,,,AAPL.OQ,2019-Apr-30
4,1,Nancy Paxton,Thank you.,presentation,Corporate Participant,Apple Inc.,Senior Director of IR and Treasury,AAPL.OQ,2019-Apr-30


In [None]:
def create_sentence_topics(df):

    # Redact text and store original text
    df['redacted_text'], df['original_text'] = zip(*df['text'].apply(redact_text_with_spacy))

    # Subset data for corporate participant text
    subset_df = df[df['speaker_type'] == "Corporate Participant"]

    # Tokenize text into sentences for both redacted and original texts
    subset_df['sentences'] = subset_df['redacted_text'].apply(sent_tokenize)
    subset_df['original_sentences'] = subset_df['original_text'].apply(sent_tokenize)

    # Flatten list of lists into a single list of sentences along with their original texts and IDs
    all_sentences = [(sent, orig, full_text, ID) for full_text, sents, origs, ID in zip(subset_df['original_text'], subset_df['sentences'], subset_df['original_sentences'], subset_df['ID']) for sent, orig in zip(sents, origs)]

    # Remove duplicates
    unique_sentences = list(set(all_sentences))

    # Split the tuples for processing
    redacted_only = [sent[0] for sent in unique_sentences]
    original_only = [sent[1] for sent in unique_sentences]
    full_text_only = [sent[2] for sent in unique_sentences]
    ids_only = [sent[3] for sent in unique_sentences]

    # Define and fit the topic model
    topic_model = BERTopic(representation_model = "KeyBERTInspired", min_topic_size = 100, nr_topics = 'auto')
    topics, probabilities = topic_model.fit_transform(redacted_only)

    # Create DataFrame with topic mappings
    sentence_df = pd.DataFrame({
        'Sentence': redacted_only,
        'Topic': topics,
        'Original_Sentence': original_only,
        'Original_Text': full_text_only,
        'ID': ids_only
    })

    # Include topic names
    topic_info = topic_model.get_topic_info()
    topic_info.rename(columns = {'Name': 'Topic_Name'}, inplace = True)
    topic_info['Topic'] = topic_info.index - 1
    sentence_df = pd.merge(sentence_df, topic_info[['Topic', 'Topic_Name']], on = 'Topic', how = 'left')

    # Finalze output
    selected_df = sentence_df.loc[:, ['ID', 'Sentence', 'Original_Sentence', 'Topic', 'Topic_Name']]
    sorted_df = selected_df.sort_values(by = 'ID')

    return sorted_df

In [None]:
sentence_df = create_sentence_topics(df)

In [None]:
sentence_df.head(20)

Unnamed: 0,ID,Sentence,Original_Sentence,Topic,Topic_Name
3216,1,Thank you.,Thank you.,20,20_thank_you_contact_408
12070,1,"After that, we'll open the call to questions f...","After that, we'll open the call to questions f...",-1,-1_and_the_to_of
14937,1,"Good [TIME], and thanks to everyone for joinin...","Good afternoon, and thanks to everyone for joi...",-1,-1_and_the_to_of
16685,1,Please note that some of the information you'l...,Please note that some of the information you'l...,-1,-1_and_the_to_of
15262,1,I'd now like to turn the call over to [PERSON]...,I'd now like to turn the call over to Tim for ...,-1,-1_and_the_to_of
13019,1,"Speaking first is [ORG]'s CEO, [PERSON]; and h...","Speaking first is Apple's CEO, Tim Cook; and h...",-1,-1_and_the_to_of
7338,1,[ORG] assumes no obligation to update any forw...,Apple assumes no obligation to update any forw...,-1,-1_and_the_to_of
11257,1,"For more information, please refer to the risk...","For more information, please refer to the risk...",-1,-1_and_the_to_of
27901,1,Actual results or trends could differ material...,Actual results or trends could differ material...,-1,-1_and_the_to_of
1164,2,"In [DATE], we announced that we are working wi...","In February, we announced that we are working ...",3,3_org_and_the_to


In [None]:
sentence_df.to_csv('sentence_with_topics.csv', index = False)

In [None]:
# # Create mapping from original texts to sentences
# text_to_sentences = subset_df['text'].apply(sent_tokenize).to_dict()

# # Reverse mapping: sentence to text index
# sentence_to_text = {}
# for text_idx, sentences in text_to_sentences.items():
#     for sentence in sentences:
#         if sentence in unique_sentences:
#             sentence_to_text[sentence] = text_idx

# # Initialize DataFrame to store topic probabilities for each text
# avg_topic_probabilities = pd.DataFrame(0, index = subset_df.index, columns = np.unique(topics))
# avg_topic_probabilities['text'] = subset_df['text']
# avg_topic_probabilities['num_sentences'] = 0

# # Iterate through sentences and update total probabilities
# for sentence, text_idx in sentence_to_text.items():
#     for i in range(-1, num_topics - 2):
#         avg_topic_probabilities.at[text_idx, i] += sentence_to_probability[sentence][i]
#     avg_topic_probabilities.at[text_idx, 'num_sentences'] += 1

# # Normalize data
# for i in range(-1, num_topics - 2):
#     avg_topic_probabilities[i] = avg_topic_probabilities[i] / avg_topic_probabilities['num_sentences']

# # Clean data
# avg_topic_probabilities.columns = ['T-1', 'T0', 'T1', 'T2', 'T3', 'text', 'num_sentences']
# avg_topic_probabilities = avg_topic_probabilities.drop('num_sentences', axis = 1)
# cleaned_topic_df = avg_topic_probabilities.dropna()

# # Merge to main data
# result = pd.merge(df, cleaned_topic_df, on = 'text', how = 'left')
# result.fillna({'T-1': -99, 'T0': -99, 'T1': -99, 'T2': -99, 'T3': -99}, inplace = True)
# result.head(20)

# result.to_csv('output_with_topics.csv', index = False)