In [33]:
import re
import nltk
import spacy
import textstat
from transformers import pipeline
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer

nltk.download("vader_lexicon")
nlp = spacy.load("en_core_web_sm")
sid = SentimentIntensityAnalyzer()
sentiment_model = pipeline("sentiment-analysis")

nltk.download('punkt')
nltk.download('vader_lexicon')


financial_terms = ["growth", "risk", "guidance", "margin", "revenue", "cash flow", "EPS", "outlook"]
positive_words = set(["strong", "growth", "positive", "profit", "beat", "up", "optimistic", "confident"])
negative_words = set(["decline", "loss", "negative", "down", "miss", "risk", "uncertain", "challenge"])

def count_forward_looking_statements(text):
    patterns = [r"\bexpect\b", r"\bforecast\b", r"\bproject\b", r"\bwill\b", r"\blooking ahead\b"]
    return sum(len(re.findall(pat, text, re.IGNORECASE)) for pat in patterns)

def extract_sections(text):
    """Roughly split transcript into opening, Q&A, and closing."""
    qna_start = re.search(r"Q&A|Question-and-Answer|Questions and Answers", text, re.IGNORECASE)
    closing_start = re.search(r"Closing Remarks|Final Remarks", text, re.IGNORECASE)

    start_qna = qna_start.start() if qna_start else int(0.6 * len(text))
    start_close = closing_start.start() if closing_start else int(0.85 * len(text))

    return {
        "opening": text[:start_qna],
        "qna": text[start_qna:start_close],
        "closing": text[start_close:]
    }

def sentiment_scores(text):
    return sid.polarity_scores(text)

def sentiment_volatility(sentences):
    scores = [sid.polarity_scores(s)["compound"] for s in sentences]
    return {
        "variance": float(np.var(scores)),
        "shift": float(scores[-1] - scores[0]) if scores else 0.0
    }

def extract_features(transcript):
    sections = extract_sections(transcript)
    sentences = nltk.sent_tokenize(transcript)

    # Sentiment scores
    total_sentiment = sentiment_scores(transcript)
    section_sentiments = {k: sentiment_scores(v) for k, v in sections.items()}

    # Sentiment volatility
    volatility = sentiment_volatility(sentences)

    # Count words
    word_tokens = nltk.word_tokenize(transcript.lower())
    pos_freq = sum(word in positive_words for word in word_tokens)
    neg_freq = sum(word in negative_words for word in word_tokens)

    # Financial term frequency
    term_freq = {term: transcript.lower().count(term) for term in financial_terms}

    # Forward-looking statements
    fwd_looking_count = count_forward_looking_statements(transcript)

    # Readability
    readability = {
        "flesch": textstat.flesch_reading_ease(transcript),
        "gunning_fog": textstat.gunning_fog(transcript)
    }

    # Analyst questions
    num_questions = transcript.count("?")
    question_sentences = [s for s in sentences if "?" in s]
    question_sentiments = [sid.polarity_scores(q)["compound"] for q in question_sentences]

    # Sentiment shift
    shift = volatility["shift"]

    return {
        "avg_sentiment": total_sentiment,
        "sentiment_by_section": section_sentiments,
        "positive_word_freq": pos_freq,
        "negative_word_freq": neg_freq,
        "forward_looking_statements": fwd_looking_count,
        "sentiment_volatility": volatility["variance"],
        "sentiment_shift": shift,
        "financial_term_freq": term_freq,
        "num_questions": len(question_sentences),
        "avg_question_sentiment": np.mean(question_sentiments) if question_sentiments else 0,
        "readability": readability,
        "transcript_length": len(word_tokens),
        "guidance_mentions": transcript.lower().count("guidance"),
        "surprise_announcement": "surprise" in transcript.lower() or "unexpected" in transcript.lower()
    }




[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/new_work/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
[nltk_data] Downloading package punkt to /Users/new_work/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/new_work/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [73]:

import pandas as pd


transcripts_df = pd.read_csv('data/transcripts_train.csv')
stock_prices_df = pd.read_csv('data/stock_prices_train.csv')

# Merge the DataFrames on a common column 'company' and 'date'
stock_prices_and_call_transcripts_df = pd.merge(transcripts_df, stock_prices_df, on=['company', 'date'], how='inner')



def flatten_dict(d, parent_key='', sep='_'):
    """Flatten nested dictionary."""
    items = []
    for k, v in d.items():
        new_key = f"{parent_key}{sep}{k}" if parent_key else k
        if isinstance(v, dict):
            items.extend(flatten_dict(v, new_key, sep=sep).items())
        else:
            items.append((new_key, v))
    return dict(items)



feature_rows = []

for idx, row in stock_prices_and_call_transcripts_df.iterrows():
    transcript = row['transcript']
    
    try:
        # Extract and flatten features
        features = extract_features(transcript)
        flat_features = flatten_dict(features)
        feature_rows.append(flat_features)
    except Exception as e:
        print(f"Error at index {idx}: {e}")
        feature_rows.append({})  # Placeholder if something breaks

features_df = pd.DataFrame(feature_rows)

df_with_sentiment_features = pd.concat([stock_prices_and_call_transcripts_df.reset_index(drop=True), features_df.reset_index(drop=True)], axis=1)


df_with_sentiment_features.head()

output_file = "data/appended_with_sentiment_features.csv"
df_with_sentiment_features.to_csv(output_file, index=False)
print(f"Merged file saved to: {output_file}")



Merged file saved to: data/appended_with_sentiment_features.csv
