In [None]:
import nltk
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Fetch the VADER lexicon, then build the shared analyzer used by run_vader.
nltk.download('vader_lexicon')
analyzer = SentimentIntensityAnalyzer()

# Forward slashes resolve on both Windows and POSIX, and avoid the invalid
# "\d" escape sequence that a backslash-separated literal would contain.
whole_dataset = pd.read_csv("../data/final_dataset.csv")

In [None]:
def run_vader(dataframe, sentiment_analyzer=None):
    """Generate a VADER-based sentiment score for each earnings call transcript.

    Every transcript is passed to the analyzer's ``polarity_scores`` and its
    'pos' and 'neg' entries are combined into ``(pos - neg) / (neg + pos + 1)``.
    The 'compound' entry is not used.

    Args:
        dataframe: DataFrame with a "transcript" column holding the text to score.
        sentiment_analyzer: Optional object exposing ``polarity_scores(text)``
            that returns a mapping with 'neg' and 'pos' entries. When omitted,
            the notebook-level ``analyzer`` is used.

    Returns:
        list: One score per transcript, in the order of the "transcript" column.
    """
    if sentiment_analyzer is None:
        sentiment_analyzer = analyzer

    transcripts = dataframe["transcript"]
    # Take the total from the column actually being iterated, so no unrelated
    # column is required just to report progress.
    total = len(transcripts)
    vader_scores = []

    for count, transcript in enumerate(transcripts, start=1):
        polarity = sentiment_analyzer.polarity_scores(transcript)
        neg = polarity['neg']
        pos = polarity['pos']

        # Positivity score; the +1 keeps the ratio defined when pos and neg are both 0.
        vader_scores.append((pos - neg) / (neg + pos + 1))

        # Report progress every 100 transcripts.
        if count % 100 == 0:
            print(f"Percentage done: {round(count*100/total, 3)}%.")

    return vader_scores


In [None]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

vader_scores = run_vader(whole_dataset)
sentiment_scores = np.array(vader_scores)

# Why min-max scaling instead of VADER's compound score:
# the scores are rescaled to [-1, 1] to stay consistent with our other
# sentiment indices. The scaler is fit on the whole time series, so the
# magnitudes carry lookahead bias (each score is placed relative to the global
# min and max). Relative scores within a given time bucket are unaffected,
# because the scaling preserves order. Since we only care about how an earnings
# call's sentiment compares with others in the same period (the portfolio
# holding time), this is not a substantial problem.
scaler = MinMaxScaler(feature_range=(-1, 1))
# The scaler expects a 2-D (n_samples, 1) column, hence the added axis.
normalized_scores = scaler.fit_transform(sentiment_scores[:, np.newaxis])

In [None]:
# Flatten the (n, 1) scaler output into a plain list of Python floats.
# np.asarray(...).ravel() also accepts an already-flattened list, so this cell
# can be re-run without raising.
normalized_scores = np.asarray(normalized_scores).ravel().tolist()

In [None]:
# Store the min-max scaled VADER scores as a new column.
# NOTE: the column name says "standardized", but the values are min-max
# normalized to [-1, 1]; "vader-normalized" would be the accurate name.
whole_dataset['vader-standardized'] = normalized_scores

Update `final_dataset.csv` with the new VADER score column

In [None]:
# Overwrite the source CSV with the dataset that now includes the VADER column.
# Forward slashes resolve on both Windows and POSIX, and avoid the invalid "\d" escape.
whole_dataset.to_csv("../data/final_dataset.csv", index=False)

Create a copy of the final dataset without the transcript column

In [None]:
# Re-read the saved dataset and write a copy without the "transcript" column.
# Forward slashes resolve on both Windows and POSIX, and avoid the invalid "\d" escape.
df = pd.read_csv("../data/final_dataset.csv").drop(columns="transcript")
df.to_csv("../data/no_transcript_final_dataset.csv", index=False)