In [None]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler

"""
In this notebook, we assign a sentiment score to each transcript following the procedure developed in Garcia et al. (2022).

Strictly, the TF-IDF weights should be recomputed one transcript at a time as each new transcript is released, continuously updating the entire document-term matrix (dtm).
To limit time complexity, we instead run the vectorization once, on the whole dataset, at the cost of some look-ahead.
"""

In [None]:
def vectorize_transcripts(docs, include_unigrams=False, lm=False, ghr=False):
    """
    Build TF-IDF document-term matrices for the positive and negative vocabularies.

    The base vocabularies are the bigrams stored in ``../data/bigrams_dict.txt``
    (positive terms, then ``#``, then negative terms, each list separated by ", ").
    They can optionally be extended with unigrams.

    Parameters
    ----------
    docs : list of str
        The transcripts to vectorize.
    include_unigrams : bool, default False
        When True, extend both vocabularies with the unigrams selected by
        ``lm`` or ``ghr``. If neither flag is set, nothing is added.
    lm : bool, default False
        Add the Loughran-McDonald positive/negative words. Takes precedence
        over ``ghr`` when both are True.
    ghr : bool, default False
        Add the Garcia, Hu and Rohrer (2022) unigram lists.

    Returns
    -------
    tuple
        ``(pos_dtm, neg_dtm)`` as returned by ``TfidfVectorizer.fit_transform``:
        one row per entry of ``docs``, one column per vocabulary term.
    """
    # Forward slashes resolve on every OS. The previous backslash literals relied on
    # invalid escape sequences (a SyntaxWarning since Python 3.12) and were Windows-only.
    with open("../data/bigrams_dict.txt") as file:  # Get bigrams from txt file
        string = file.read()

    # Strip each term: a term carrying a stray newline or extra space (e.g. the last
    # entry before end-of-file) can never match an n-gram produced by the vectorizer.
    posneg = string.split("#")
    positive_voc = [term.strip() for term in posneg[0].split(", ")]
    negative_voc = [term.strip() for term in posneg[1].split(", ")]

    # We use the NLTK stop-word list, as scikit-learn warns that its own list is unreliable.
    sws = list(set(stopwords.words('english')))

    if include_unigrams:
        if lm:  # Use Loughran-McDonald unigrams
            lm_dict = pd.read_csv("../data/Loughran-McDonald_MasterDictionary_1993-2023.csv")
            positive_voc.extend(token.lower() for token in lm_dict[lm_dict["Positive"] > 0]["Word"].tolist())
            negative_voc.extend(token.lower() for token in lm_dict[lm_dict["Negative"] > 0]["Word"].tolist())

        elif ghr:  # Use Garcia, Hu, Rohrer (2022) unigrams
            # NOTE(review): several of these unigrams (e.g. "up", "down", "not") appear to be in
            # NLTK's English stop-word list; because stop words are removed before matching,
            # such terms would never be counted — confirm this is intended.
            positive_voc.extend([ "above", "across", "basis", "benefit", "cash", "congrats", "congratulations", "continue", "continued", "continues", "curious", "delivered", "driving", "drove", "exceeded", "exceeding", "expansion", "flow", "generated", "great", "grew", "growing", "growth", "helped", "helping", "income", "increase", "increased", "increasing", "job", "leverage", "lot", "margin", "momentum", "nice", "nicely", "operating", "outperformance", "outstanding", "over", "performance", "pretty", "proud", "raising", "really", "record", "repurchase", "results", "share", "solid", "sustainable", "terrific", "think", "up", "upside", "well", "years"])
            negative_voc.extend(["actions", "address", "affected", "affecting", "anticipated", "associated", "back", "believe", "below", "caused", "causing", "certain", "change", "changed", "changes", "confident", "costs", "decision", "decrease", "decreased", "down", "due", "dynamics", "expectations", "expected", "experienced", "factors", "fell", "goodwill", "happened", "headwinds", "however", "impact", "impacted", "impacting", "impacts", "issue", "issues", "longer", "lower", "necessary", "need", "not", "offset", "pressure", "pressures", "pronounced", "pushed", "related", "resolve", "revised", "short", "slipped", "soft", "softer", "softness", "steps", "taking", "temporary", "term", "timing", "transition", "trying", "understand"])

    # Drop empty entries and duplicates while keeping first-seen order:
    # TfidfVectorizer rejects a vocabulary list that contains the same term twice.
    positive_voc = list(dict.fromkeys(term for term in positive_voc if term))
    negative_voc = list(dict.fromkeys(term for term in negative_voc if term))

    # The vectorizers 1) lowercase, 2) tokenize, 3) remove stop words and 4) compute TF-IDF
    # weights for the uni-/bigrams that appear in the given vocabulary only.
    pos_vectorizer = TfidfVectorizer(stop_words=sws, ngram_range=(1, 2), vocabulary=positive_voc)
    neg_vectorizer = TfidfVectorizer(stop_words=sws, ngram_range=(1, 2), vocabulary=negative_voc)

    # Fit the IDF weights on, and transform, the supplied documents
    pos_dtm = pos_vectorizer.fit_transform(docs)
    neg_dtm = neg_vectorizer.fit_transform(docs)
    return pos_dtm, neg_dtm

def calculate_doc_sentiment(pos_dtm_j, neg_dtm_j, Nj):
    """
    Length-normalise the positive and negative term weights of one document.

    pos_dtm_j / neg_dtm_j hold the weights of document j for the positive and
    negative vocabulary (in this notebook the caller passes rows of the TF-IDF
    matrices). Nj is the document-length normaliser supplied by the caller.

    Returns the pair (positive score, negative score), each being the sum of
    the corresponding weights divided by Nj.
    """
    positive_score = np.sum(pos_dtm_j) / Nj
    negative_score = np.sum(neg_dtm_j) / Nj
    return positive_score, negative_score

def get_transcripts(filepath, transcript_column="transcript"):
    """
    Read the CSV at ``filepath`` and return one of its columns as a list.

    filepath is anything ``pandas.read_csv`` accepts; transcript_column names
    the column holding the transcripts (default "transcript").

    Example use: load the transcripts of training_set.csv and pass the
    resulting list to construct_bigram_scores.
    """
    return pd.read_csv(filepath)[transcript_column].tolist()

def construct_bigram_scores(transcripts=None, include_unigrams=False, lm=False, ghr=False):
    """
    Compute one sentiment (positivity) score per transcript.

    Builds the positive and negative TF-IDF document-term matrices for the
    whole list at once, then combines the length-normalised positive and
    negative scores of each transcript as (pos - neg) / (pos + neg + 1).

    Parameters
    ----------
    transcripts : list of str, optional
        The transcripts to score; assumed to be sorted by date. ``None`` or an
        empty list yields an empty result.
    include_unigrams, lm, ghr : bool
        Forwarded unchanged to ``vectorize_transcripts`` to choose the vocabulary.

    Returns
    -------
    list
        The sentiment scores, in the same order as ``transcripts``.
    """
    # The default used to be the builtin `list` type itself (not an empty list),
    # so calling without arguments crashed. Treat "nothing to score" explicitly.
    if transcripts is None or len(transcripts) == 0:
        return []

    # Vectorize all transcripts in one pass:
    dtm_positive, dtm_negative = vectorize_transcripts(transcripts, include_unigrams, lm, ghr)

    train_sentiment_scores = []  # list of final sentiments
    for j, transcript in enumerate(transcripts):
        # NOTE(review): len() of a string counts characters, not words. If Nj is meant
        # to be the number of terms in the transcript, this should be a token count — confirm.
        Nj = len(transcript)

        score_positive, score_negative = calculate_doc_sentiment(dtm_positive[j], dtm_negative[j], Nj)

        # Positivity index; the +1 keeps the denominator away from zero when no term matches.
        final_sentiment = (score_positive - score_negative) / (score_positive + score_negative + 1)
        train_sentiment_scores.append(final_sentiment)

    return train_sentiment_scores

Create column in final_dataset with GHR-bigram TF-IDFs:

In [None]:
# Score every transcript with the GHR bigram dictionary only (no unigrams).
# NOTE: the TF-IDF weights are fitted on the full dataset, which introduces look-ahead;
# fit on the training data only to avoid it.
# Forward slashes resolve on every OS; the previous backslash literal relied on an
# invalid escape sequence (a SyntaxWarning since Python 3.12) and was Windows-only.
dataset_path = "../data/final_dataset.csv"
sentiment_scores = construct_bigram_scores(transcripts=get_transcripts(dataset_path))

sentiment_scores_b = sentiment_scores  # keep the raw (unscaled) list of scores

# Min-max rescale the scores so that they lie between -1 and 1:
sentiment_scores = np.array(sentiment_scores)

scaler = MinMaxScaler(feature_range=(-1, 1))
standardized_scores = scaler.fit_transform(sentiment_scores.reshape(-1, 1))

df = pd.read_csv(dataset_path)
# fit_transform returns an (n, 1) array; flatten it so the column receives a 1-D vector.
df["GHR-bigram-tf-idf"] = standardized_scores.ravel()
df.to_csv(dataset_path, index=False)  # overwrites the dataset file with the new column added

Create column in final_dataset with GHR-bigram-unigram TF-IDFs:

In [None]:
# Score every transcript with the GHR bigrams extended by the GHR unigram lists.
# NOTE: the TF-IDF weights are fitted on the full dataset, which introduces look-ahead;
# fit on the training data only to avoid it.
# Forward slashes resolve on every OS; the previous backslash literal relied on an
# invalid escape sequence (a SyntaxWarning since Python 3.12) and was Windows-only.
dataset_path = "../data/final_dataset.csv"
sentiment_scores = construct_bigram_scores(
    transcripts=get_transcripts(dataset_path), include_unigrams=True, ghr=True
)

sentiment_scores_b = sentiment_scores  # keep the raw (unscaled) list of scores

# Min-max rescale the scores so that they lie between -1 and 1:
sentiment_scores = np.array(sentiment_scores)

scaler = MinMaxScaler(feature_range=(-1, 1))
standardized_scores = scaler.fit_transform(sentiment_scores.reshape(-1, 1))

df = pd.read_csv(dataset_path)
# fit_transform returns an (n, 1) array; flatten it so the column receives a 1-D vector.
df["GHR-bigram-unigram-tf-idf"] = standardized_scores.ravel()
df.to_csv(dataset_path, index=False)  # overwrites the dataset file with the new column added

Create column in final_dataset with GHR-bigram-LM-unigram TF-IDFs:

In [None]:
# Score every transcript with the GHR bigrams extended by the Loughran-McDonald unigrams.
# NOTE: the TF-IDF weights are fitted on the full dataset, which introduces look-ahead;
# fit on the training data only to avoid it.
# Forward slashes resolve on every OS; the previous backslash literal relied on an
# invalid escape sequence (a SyntaxWarning since Python 3.12) and was Windows-only.
dataset_path = "../data/final_dataset.csv"
sentiment_scores = construct_bigram_scores(
    transcripts=get_transcripts(dataset_path), include_unigrams=True, lm=True
)

sentiment_scores_b = sentiment_scores  # keep the raw (unscaled) list of scores

# Min-max rescale the scores so that they lie between -1 and 1:
sentiment_scores = np.array(sentiment_scores)

scaler = MinMaxScaler(feature_range=(-1, 1))
standardized_scores = scaler.fit_transform(sentiment_scores.reshape(-1, 1))

df = pd.read_csv(dataset_path)
# fit_transform returns an (n, 1) array; flatten it so the column receives a 1-D vector.
df["GHR-bigram-LM-unigram-tf-idf"] = standardized_scores.ravel()
df.to_csv(dataset_path, index=False)  # Save as column in final_dataset (file is overwritten)