# Earnings Call Transcripts Sentiment

For: Tan Cheen Hao!

The transcripts are already provided per company and per quarter, so no aggregation is needed.

In its most basic form, the output should be a CSV file in the format below (ideally ordered by `quarter_year` and then by `ticker`, but this is not essential). `transcript_sentiment` should either be a value between 0 and 1 that roughly represents the probability of a positive sentiment, or a value between -1 and 1 where -1 is negative and 1 is positive. The choice is yours, but _make it clear with a markdown cell at the end._

| ticker | quarter_year | transcript_sentiment |
| ------ | ------------ | -------------------- |
| BAC    | Q1 2001      | 0.2                  |
| JPM    | Q1 2001      | 0.67                 |
| WFC    | Q1 2001      | 0.97                 |

You could also explore using LLMs and prompt engineering to extract specific information from the text first. For example, you could use an LLM to separate company-specific information from market information, or ask it to judge how "confident" the speaker is before extracting the sentiment.

For earnings calls, instead of only determining whether the sentiment is positive or negative, you could also measure the degree of complexity or the degree of confidence. Also look into **aspect-based sentiment analysis**; it could be useful. Ideally, you should have 2 output files: 1 for revenue and 1 for CAR.

Be creative!


In [None]:
import os
import json
import numpy as np
from collections import Counter
import pandas as pd
from wordcloud import WordCloud 
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer
import gensim
from gensim import corpora
from gensim.models.ldamodel import LdaModel

In [None]:
# Directory containing the JSON files.
# Built with os.path.join so the separator is right on every OS: the previous
# literal 'ECC Transcript\Banks' depended on "\B" not being a recognised escape
# sequence (which Python warns about) and only resolved to a folder on Windows.
json_folder_path = os.path.join('ECC Transcript', 'Banks')

# List to store transcripts (one dict per JSON file)
transcripts = []

# Loop through all files in the folder
for filename in os.listdir(json_folder_path):
    # Anything that is not a .json file is ignored
    if not filename.endswith(".json"):
        continue

    file_path = os.path.join(json_folder_path, filename)

    # Decode explicitly as UTF-8 so the result does not depend on the
    # machine's locale encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Combine all component texts into one document; components without a
    # "text" key are skipped, and a missing "components" key yields "".
    components = data.get("components", [])
    full_text = " ".join(component["text"] for component in components if "text" in component)

    transcripts.append({
        "filename": filename,
        "transcript": full_text
    })

# Create DataFrame with one row per transcript file
df = pd.DataFrame(transcripts)

# Ensure you have the necessary NLTK data files
nltk.download('stopwords')
nltk.download('wordnet')

# Text Preprocessing
def preprocess_text(text):
    """Normalise one document for bag-of-words analysis.

    The text is lower-cased and split on whitespace; tokens found in NLTK's
    English stop-word list are dropped and the remaining tokens are passed
    through ``WordNetLemmatizer.lemmatize``. Punctuation is not stripped, so
    it stays attached to whatever token it was written next to.

    Args:
        text: The raw document string.

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    english_stops = set(stopwords.words('english'))
    wnl = WordNetLemmatizer()

    kept = []
    for token in text.lower().split():
        if token in english_stops:
            continue
        kept.append(wnl.lemmatize(token))
    return ' '.join(kept)

# Store the cleaned version of every transcript next to the raw text
df['processed_text'] = df['transcript'].map(preprocess_text)


In [None]:
def _temp_fn(top_words: list[tuple]):
    """Build a callable that deletes every listed word from a document string.

    Args:
        top_words: ``(word, count)`` pairs; only the word of each pair is
            used, the second element is ignored.

    Returns:
        A one-argument function that takes a string and returns it with every
        occurrence of each word removed. Removal is plain substring
        replacement (``str.replace``), so a word is also removed when it
        appears inside a longer token.
    """
    def _replace(doc):
        cleaned = doc
        for target, _count in top_words:
            cleaned = cleaned.replace(target, "")
        return cleaned

    return _replace
    
def remove_popular_words(df, top_words: list[tuple]):
    """Return a copy of ``df`` with an extra ``no_pop_words`` column.

    Args:
        df: DataFrame that has a ``processed_text`` column of strings.
        top_words: ``(word, count)`` pairs naming the words to strip; they are
            handed to ``_temp_fn`` to build the per-document remover.

    Returns:
        A new DataFrame (the input is left untouched) whose ``no_pop_words``
        column holds ``processed_text`` with the listed words removed.
    """
    strip_words = _temp_fn(top_words)
    result = df.copy()
    result["no_pop_words"] = result["processed_text"].apply(strip_words)
    return result

#######################################################################

# Fit a TF-IDF model on the processed transcripts (vocabulary capped at 1000 terms)
vectorizer = TfidfVectorizer(max_features=1000)
tfidf_matrix = vectorizer.fit_transform(df['processed_text'])

# Dense document-by-term table, one column per vocabulary term
feature_names = vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

# Keep the source filename next to each row for later lookup
tfidf_df['filename'] = df['filename']

# Whitespace-token count of each processed document
df['num_tokens'] = df['processed_text'].apply(lambda doc: len(doc.split()))

# Strip the "--" marker from every document (adds the 'no_pop_words' column)
df = remove_popular_words(df, [("--", 1)])

# Token count again, this time on the stripped text
df['num_tokens_no_pop'] = df['no_pop_words'].apply(lambda doc: len(doc.split()))

# One flat token list across the whole corpus, taken from the stripped text
all_tokens = " ".join(df['no_pop_words']).split()

vocab = set(all_tokens)
vocab_size = len(vocab)

# Corpus-level statistics
token_counts = df['num_tokens_no_pop']
total_documents = len(df)
total_tokens = token_counts.sum()
avg_doc_length = token_counts.mean()

# 50 most frequent raw tokens (plain counts, not TF-IDF)
top_words = Counter(all_tokens).most_common(50)

# Vocabulary terms ranked by their mean TF-IDF weight over all documents
avg_tfidf = tfidf_df.drop(columns=['filename']).mean()
avg_tfidf = avg_tfidf.sort_values(ascending=False)
top_tfidf_words = avg_tfidf.head(50)

# Report the summary
print(f"Total documents      : {total_documents}")
print(f"Total tokens         : {total_tokens}")
print(f"Average doc length   : {avg_doc_length:.2f} tokens")
print(f"Vocabulary size      : {vocab_size}")

print("\nTop 50 most common words (raw frequency):")
for word, freq in top_words:
    print(f"{word:<15} {freq}")

print("\nTop 50 words by TF-IDF importance:")
print(top_tfidf_words)


In [None]:
# Mean TF-IDF weight of every vocabulary term, keyed by the term itself
tfidf_scores = tfidf_df.drop(columns=['filename']).mean().to_dict()

# Both clouds are rendered with the same options
cloud_options = dict(max_words=80, max_font_size=100, width=800, height=400, background_color='white')

# Cloud sized by raw token frequency
wordcloud = WordCloud(**cloud_options)
wordcloud.generate_from_frequencies(dict(top_words))

# Cloud sized by average TF-IDF weight
wordcloud1 = WordCloud(**cloud_options)
wordcloud1.generate_from_frequencies(tfidf_scores)

# Show each cloud in its own figure, raw counts first
for cloud, heading in ((wordcloud, "Word Count"), (wordcloud1, "Average Word TF-IDF")):
    plt.figure(figsize=(12, 6))
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.title(heading, fontsize=16)
    plt.show()

Feed the transcripts through the FinBERT NLP model


In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# The tokenizer and the sequence classifier are loaded from the same checkpoint
finbert_checkpoint = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(finbert_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(finbert_checkpoint)

In [None]:
import textstat
import torch
from torch.nn.functional import softmax
from sklearn.preprocessing import StandardScaler
import re
import pandas as pd


scale = StandardScaler()

# Generate synthetic text for each document based on TF-IDF scores:
# every vocabulary term is repeated int(score * 100) times, so a term whose
# weight is below 0.01 contributes no words at all.
synthetic_texts = []
for idx, row in tfidf_df.iterrows():
    tfidf_scores = row.drop('filename').to_dict()
    synthetic_text = " ".join([
        f"{word} " * int(score * 100)
        for word, score in tfidf_scores.items()
        if score > 0
    ])
    synthetic_texts.append(synthetic_text)

# Compute raw complexity scores (one-element rows because the scaler needs 2D input).
# NOTE(review): Flesch reading ease is normally "higher = easier to read", so a
# larger value in the "complexity" column presumably means simpler text —
# confirm that this is the intended direction.
raw_complexities = [
    [textstat.flesch_reading_ease(synthetic_text)]
    for synthetic_text in synthetic_texts
]

# Scale complexity scores across documents
scaled_complexities = scale.fit_transform(raw_complexities)

# Resolve the class positions from the model's own label map instead of
# hard-coding them. The previous code treated index 0 as negative and index 2
# as positive, but the published ProsusAI/finbert config lists
# 0=positive, 1=negative, 2=neutral, so that score was P(neutral) - P(positive).
label_to_index = {
    str(label).lower(): int(index)
    for index, label in model.config.id2label.items()
}
positive_index = label_to_index["positive"]
negative_index = label_to_index["negative"]

# Filename patterns, compiled once rather than on every iteration
quarter_pattern = re.compile(r'(Q[1-4]) (\d{4})')
company_pattern = re.compile(r'companyid_(\d+)')

# Sentiment analysis and result assembly
sentiment_results = []
for idx, synthetic_text in enumerate(synthetic_texts):
    # Tokenize the synthetic text.
    # NOTE(review): truncation keeps only the first 512 tokens, so only the
    # terms that come first in the TF-IDF column order reach the model.
    inputs = tokenizer(synthetic_text, return_tensors="pt", truncation=True, max_length=512, padding=True)

    # Perform inference without tracking gradients
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        probs = softmax(logits, dim=1)

    # Score in [-1, 1]: P(positive) - P(negative); -1 is fully negative,
    # +1 is fully positive, and a neutral prediction lands near 0.
    class_probs = probs[0]
    sentiment_score = class_probs[positive_index].item() - class_probs[negative_index].item()
    # Confidence is the probability of the most likely class
    confidence = torch.max(probs).item()

    complexity = scaled_complexities[idx][0]

    # Parse filename to get company, quarter, and year
    filename = tfidf_df.loc[idx, 'filename'].replace(".json", "")
    quarter_match = quarter_pattern.search(filename)

    if quarter_match:
        quarter = quarter_match.group(1)
        year = quarter_match.group(2)
    else:
        # Filename carries no "Q<n> <yyyy>" fragment
        quarter = "Unknown"
        year = "Unknown"

    company_match = company_pattern.search(filename)
    company_id = company_match.group(1) if company_match else "Unknown"

    sentiment_results.append({
        "company": company_id,
        "quarter": quarter,
        "year": year,
        "sentiment_score": sentiment_score,
        "confidence": confidence,
        "complexity": complexity
    })

    print(f"{company_id} | {quarter} | {year} | Sentiment: {sentiment_score:.3f} | Confidence: {confidence:.3f} | Complexity: {complexity:.2f}")


# One row per transcript, written out for downstream use
sentiment_df = pd.DataFrame(sentiment_results)
sentiment_df.to_csv("sentiment_results.csv",index=False)


PAOPAO: your code goes at the bottom


In [1]:
import pandas as pd
import numpy as np
import os
import json
from tqdm.auto import tqdm

In [7]:
# Directory containing the JSON files
json_folder_path = 'data/text/earning_call_transcripts'

# List to store transcripts (one dict per JSON file)
transcripts = []

# Loop through all files in the folder, with a progress bar
for filename in tqdm(os.listdir(json_folder_path)):
    # Anything that is not a .json file is ignored
    if not filename.endswith(".json"):
        continue

    file_path = os.path.join(json_folder_path, filename)

    # Decode explicitly as UTF-8 so the result does not depend on the
    # machine's locale encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Combine all component texts into one document, keeping the component
    # boundaries visible through the "[BREAK]" separator.
    components = data.get("components", [])
    full_text = "[BREAK]".join(component["text"] for component in components if "text" in component)

    # Missing metadata is recorded as NaN rather than raising
    most_important_date = data.get("mostimportantdate", np.nan)
    company_id = data.get("companyid", np.nan)

    transcripts.append({
        "company_id": company_id,
        "date": most_important_date,
        "transcript": full_text,
    })

# Create DataFrame with one row per transcript file
transcripts_data = pd.DataFrame(transcripts)


  0%|          | 0/3869 [00:00<?, ?it/s]

In [8]:
# Summarise the date span and the size of the loaded transcript collection
transcript_dates = transcripts_data["date"]
earliest_date = transcript_dates.min()
latest_date = transcript_dates.max()

print("first transcript date:", earliest_date)
print("last transcript date:", latest_date)
print("number of transcripts:", len(transcripts_data))

first transcript date: 2006-10-19
last transcript date: 2025-02-05
number of transcripts: 3868


In [9]:
def text_preprocessing_transcript(text):
    """Placeholder for transcript cleaning; currently hands the input back unchanged.

    The real preprocessing logic is meant to be written here, in a form that
    can be passed to ``df.apply()``.

    Args:
        text: The value to preprocess.

    Returns:
        The same object that was passed in.
    """
    cleaned = text  # no transformation implemented yet
    return cleaned

In [10]:
def sentiment_analysis_transcript(transcripts_data: pd.DataFrame):
    """Placeholder for the transcript sentiment pipeline.

    Meant to turn the transcripts DataFrame into the DataFrame that is saved
    as the final CSV. No scoring is implemented yet, so the result is simply
    an independent copy of the input.

    Args:
        transcripts_data: The loaded transcripts.

    Returns:
        A copy of ``transcripts_data``; the caller's frame is not modified.
    """
    return transcripts_data.copy()

In [11]:
## save the final output

# output_data = sentiment_analysis_transcript(transcripts_data)
# output_data.to_csv("output_transcript_sentiment.csv", index=False)