In [None]:
import ast

import pandas as pd

# Load the Loughran-McDonald master dictionary and derive the two sentiment word lists.
# The path uses forward slashes: they work on Windows as well as POSIX, and avoid the
# invalid "\d" / "\L" escape sequences that the backslash-separated literal contained.
master_dictionary = pd.read_csv("../data/Loughran-McDonald_MasterDictionary_1993-2023.csv")

# A word belongs to a category when its category column is > 0. Words are lower-cased
# so they can be compared against lower-case tokens. dropna() skips any entry whose
# "Word" cell was parsed as missing, which would otherwise crash on .lower().
positive_words = (
    master_dictionary.loc[master_dictionary["Positive"] > 0, "Word"].dropna().str.lower().tolist()
)
negative_words = (
    master_dictionary.loc[master_dictionary["Negative"] > 0, "Word"].dropna().str.lower().tolist()
)

In [None]:
def get_count(document, filter=False, positive_lexicon=None, negative_lexicon=None):
    """Count the positive and negative dictionary words in one tokenized document.

    Parameters
    ----------
    document : str or iterable of str
        The document's tokens, given either as a sequence of tokens or as the
        string representation of a Python list (the form a token list takes
        after being written to and read back from a CSV file).
    filter : bool, default False
        If False, every token found in a lexicon is counted (naive BoW).
        If True, a positive word is NOT counted as positive when one of the
        (up to) three preceding tokens is a negation word or contains the
        contraction "n't", following
        https://www.zora.uzh.ch/id/eprint/199785/1/SSRN-id2559157.pdf (p. 8).
        Negative words are counted regardless of negations.
    positive_lexicon, negative_lexicon : iterable of str, optional
        Word lists to match against. When omitted, the module-level
        ``positive_words`` / ``negative_words`` are used.

    Returns
    -------
    tuple of (int, int)
        ``(positive_count, negative_count)``.

    Notes
    -----
    This could be extended to other cases, e.g. screening for "special"
    n-grams like "good morning", as proposed in the paper linked above.
    The negation filter will not lead to considerable differences, but it
    serves as an illustrative example of how to improve on the naive BoW
    approach without breaking its framework. The enhanced variant is the one
    used going forward and is treated as part of the preprocessing step.
    """
    # Token lists read from CSV arrive as text. literal_eval parses list/str
    # literals only, so - unlike eval - a crafted cell cannot execute code.
    if isinstance(document, str):
        tokens = ast.literal_eval(document)
    else:
        tokens = list(document)

    # Sets give O(1) membership tests; the lexicons hold thousands of words and
    # are probed once per token.
    positive_set = set(positive_words if positive_lexicon is None else positive_lexicon)
    negative_set = set(negative_words if negative_lexicon is None else negative_lexicon)

    negations = {"no", "not", "none", "neither", "never", "nobody"}
    negation_window = 3  # number of preceding tokens inspected for a negation

    positive_count, negative_count = 0, 0
    for i, word in enumerate(tokens):
        if word in positive_set:
            negated = False
            if filter:
                # Slice from max(0, ...) so that tokens near the start of the
                # document (fewer than three predecessors) are checked as well.
                preceding = tokens[max(0, i - negation_window):i]
                negated = any(
                    prev in negations or (isinstance(prev, str) and "n't" in prev)
                    for prev in preceding
                )
            if not negated:
                positive_count += 1
                continue
        # Reached for non-positive tokens and for negated positive tokens.
        if word in negative_set:
            negative_count += 1

    return positive_count, negative_count

In [None]:
def get_BoW_score(document):
    """Compute the naive bag-of-words polarity score of one document.

    The score is (positive - negative) / (positive + negative + 1), using the
    unfiltered counts from ``get_count``. The +1 keeps the denominator at
    least 1, so the score is defined even when no dictionary word matches.
    """
    n_pos, n_neg = get_count(document)
    net = n_pos - n_neg
    total = n_neg + n_pos + 1
    return net / total

def Improved_BoW_score(document):
    """Compute the negation-aware bag-of-words polarity score of one document.

    Same formula as the naive score, (positive - negative) /
    (positive + negative + 1), but the counts come from ``get_count`` with
    ``filter=True``, i.e. controlling for negations in the prior three tokens.
    This is the score used for inferences.
    """
    n_pos, n_neg = get_count(document, filter=True)
    net = n_pos - n_neg
    total = n_neg + n_pos + 1
    return net / total

def run_BoW(data):
    """Score every document of a cleaned, tokenized CSV with the enhanced BoW measure.

    Parameters
    ----------
    data : path or buffer accepted by ``pd.read_csv``
        CSV file with the columns "tokenized_transcript", "ticker" and
        "Origin_Date".

    Returns
    -------
    tuple of (list, list, list)
        Three parallel lists in row order: the enhanced positivity scores,
        the tickers and the dates.
    """
    df = pd.read_csv(data)
    n_rows = len(df)

    transcripts = df["tokenized_transcript"]
    tickers = df["ticker"]
    dates = df["Origin_Date"]

    enhanced_positivity_scores, ticker, date = [], [], []
    for row in range(n_rows):
        enhanced_positivity_scores.append(Improved_BoW_score(transcripts[row]))
        ticker.append(tickers[row])
        date.append(dates[row])

        # Report progress once every 100 rows.
        if row % 100 == 0:
            print(f"Percentage done: {round(row*100/n_rows, 4)}%.")

    return enhanced_positivity_scores, ticker, date

In [None]:
# Score all tokenized transcripts. Forward slashes keep the relative path portable
# (they also work on Windows) and avoid the invalid "\d" / "\B" escape sequences.
enhanced_positivity_scores, ticker, date = run_BoW("../data/BoW-tokenized-transcripts.csv")

In [None]:
# Using the preprocessed transcripts rather than the raw data, we achieve a speedup of around 90% in the encoding.

# Rescale sentiment scores linearly (min-max scaling) so that they lie between -1 and 1:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Raw scores as a 1-D array; the scaler needs a single-feature column, i.e. shape (n, 1).
sentiment_scores = np.array(enhanced_positivity_scores)

# Map the smallest observed score to -1 and the largest to 1.
scaler = MinMaxScaler(feature_range=(-1, 1))
lst = scaler.fit_transform(sentiment_scores[:, np.newaxis]).tolist()

# Flatten the (n, 1) result back into a plain list of floats.
standardized_scores = [row[0] for row in lst]

# One row per document, keyed by ticker and date for the later merge.
bow_scores = pd.DataFrame(
    {
        "ticker": ticker,
        "Origin_Date": date,
        "enhanced-bow": standardized_scores,
    }
)

In [None]:
# Read in final_dataset and attach the BoW score to each (date_time, ticker) row.
# Forward slashes keep the path portable and avoid the invalid "\d" escape sequence.
df = pd.read_csv("../data/final_dataset.csv")

# The result is written back to final_dataset.csv, so on a re-run the file already
# carries an "enhanced-bow" column. Drop it before merging so the score is replaced
# instead of being duplicated as "enhanced-bow_x" / "enhanced-bow_y".
new_df = pd.merge(
    df.drop(columns=["enhanced-bow"], errors="ignore"),
    bow_scores,
    how="left",
    left_on=["date_time", "ticker"],
    right_on=["Origin_Date", "ticker"],
).drop(columns=["Origin_Date"])  # the right-hand date key duplicates date_time
# TODO: dropna, should have 15803 rows or so

In [None]:
# Save final_dataset (forward slashes: portable, and no invalid "\d" escape sequence).
new_df.to_csv("../data/final_dataset.csv", index=False)

In [None]:
# Create final_dataset without transcripts.
# Forward slashes keep the path portable and avoid the invalid "\d" escape sequence;
# drop() returns a new frame, so re-running the cell is safe.
fds = pd.read_csv("../data/final_dataset.csv").drop(columns=["transcript"])
fds.head()  # preview only

In [None]:
# Save no_transcript_final_dataset (forward slashes: portable, no invalid "\d" escape sequence).
fds.to_csv("../data/no_transcript_final_dataset.csv", index=False)