In [159]:
import pandas as pd
import numpy as np
import transformers
from tqdm import tqdm
import pysentiment2 as ps

In [160]:
data = pd.read_csv('EarningCallData/output.csv')

In [161]:
model_name = 'ProsusAI/finbert'
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
model = transformers.AutoModelForSequenceClassification.from_pretrained(model_name)

test = " NVIDIA stock is going very bad, I am very sad"
tokenized = tokenizer(test, return_tensors='pt')
output = model(**tokenized)
# scores softmax
scores = output.logits.softmax(dim=1).detach().numpy()
scores[0]

array([0.01014265, 0.95804197, 0.03181531], dtype=float32)

In [162]:
def get_sentiment_sentence_bert(text):
    sentences = text.split('.')
    list_sentiment = []
    for sentence in sentences:
        tokenized = tokenizer(sentence, return_tensors='pt')
        output = model(**tokenized)
        scores = output.logits.softmax(dim=1).detach().numpy()
        list_sentiment.append(scores[0])
    return np.mean(list_sentiment, axis=0)

In [163]:
def get_sentiment_bert(data):
    texts = data['text']
    positives = []
    negatives = []
    neutrals = []
    polarities = []
    for i in tqdm(range(len(data))):
        text = texts[i]
        positive, negative, neutral = get_sentiment_sentence_bert(text)
        positives.append(positive)
        negatives.append(negative)
        neutrals.append(neutral)
        polarity = (positive - negative) / (positive + negative + neutral)
        polarities.append(polarity)

    
    # new column for sentiment
    data['positive_sentiment_bert'] = positives
    data['negative_sentiment_bert'] = negatives
    data['neutral_sentiment_bert'] = neutrals
    data['polarity_bert'] = polarities
    return data

In [164]:
get_sentiment_sentence_bert('''Good day, and welcome to the Apple Inc. Second Quarter Fiscal Year 2019 Earnings Conference Call. Today's call is being recorded. At this time, for opening remarks and introductions, I would like to turn the call over to Nancy Paxton, Senior Director of Investor Relations. Please go ahead.''')

array([0.11321773, 0.06799968, 0.81878257], dtype=float32)

In [165]:
get_sentiment_sentence_bert('''Thank you. Good afternoon, and thanks to everyone for joining us today. Speaking first is Apple's CEO, Tim Cook; and he'll be followed by CFO, Luca Maestri. After that, we'll open the call to questions from analysts.
Please note that some of the information you'll hear during our discussion today will consist of forward-looking statements, including without limitation, those regarding revenue, gross margin, operating expenses, other income and expense, taxes, capital allocation and future business outlook. Actual results or trends could differ materially from our forecast. For more information, please refer to the risk factors discussed in Apple's most recently filed periodic reports on Form 10-K and Form 10-Q and the Form 8-K filed with the SEC today along with the associated press release. Apple assumes no obligation to update any forward-looking statements or information, which speak as of their respective dates.
I'd now like to turn the call over to Tim for introductory remarks.''')

array([0.15933189, 0.0639887 , 0.77667946], dtype=float32)

In [166]:
words = ['margin', 'cost', 'revenue', 'earnings', 'growth', 'debt', 'dividend', 'cashflow']

def get_sentiment_topic(data, words):
    texts = data['text'].apply(lambda x: x.lower())
    positives = {}
    negatives = {}
    neutrals = {}
    polarities = {}
    for word in words:
        positives[word] = []
        negatives[word] = []
        neutrals[word] = []
        polarities[word] = []
    for i in tqdm(range(len(data))):
        text = texts[i]
        for word in words:

            if word in text:
                positive, negative, neutral = get_sentiment_sentence_bert(text)
                polarity = (positive - negative) / (positive + negative + neutral)
            else:
                positive, negative, neutral = -1, -1, -1
                polarity = -1
            positives[word].append(positive)
            negatives[word].append(negative)
            neutrals[word].append(neutral)
            polarities[word].append(polarity)
    for word in words:
        data[f'positive_sentiment_bert_{word}'] = positives[word]
        data[f'negative_sentiment_bert_{word}'] = negatives[word]
        data[f'neutral_sentiment_bert_{word}'] = neutrals[word]
        data[f'polarity_bert_{word}'] = polarities[word]
    return data

In [91]:
data = get_sentiment_bert(data)
data = get_sentiment_topic(data, words)

100%|██████████| 3537/3537 [23:54<00:00,  2.47it/s]  
100%|██████████| 3537/3537 [38:37<00:00,  1.53it/s]  


In [92]:
data.to_csv('EarningCallData/output_sentiment.csv')

### Merging

In [127]:
data = pd.read_csv('EarningCallData/output_sentiment.csv')

In [128]:
data.columns

Index(['Unnamed: 0', 'speaker', 'text', 'type', 'speaker_type',
       'speaker_company', 'speaker_role', 'company_name', 'date',
       'positive_sentiment_bert', 'negative_sentiment_bert',
       'neutral_sentiment_bert', 'polarity_bert',
       'positive_sentiment_bert_margin', 'negative_sentiment_bert_margin',
       'neutral_sentiment_bert_margin', 'polarity_bert_margin',
       'positive_sentiment_bert_cost', 'negative_sentiment_bert_cost',
       'neutral_sentiment_bert_cost', 'polarity_bert_cost',
       'positive_sentiment_bert_revenue', 'negative_sentiment_bert_revenue',
       'neutral_sentiment_bert_revenue', 'polarity_bert_revenue',
       'positive_sentiment_bert_earnings', 'negative_sentiment_bert_earnings',
       'neutral_sentiment_bert_earnings', 'polarity_bert_earnings',
       'positive_sentiment_bert_growth', 'negative_sentiment_bert_growth',
       'neutral_sentiment_bert_growth', 'polarity_bert_growth',
       'positive_sentiment_bert_debt', 'negative_sentiment_b

In [129]:
global_sentiment_cols = ['positive_sentiment_bert', 'negative_sentiment_bert', 'neutral_sentiment_bert', 'polarity_bert']

topic_sentiment_cols = [f'positive_sentiment_bert_{word}' for word in words] + [f'negative_sentiment_bert_{word}' for word in words] + [f'neutral_sentiment_bert_{word}' for word in words] + [f'polarity_bert_{word}' for word in words]

In [130]:
def mean_company_sentiment(col):
    return col[data['speaker_type'] == 'Corporate Participant'].mean()

def mean_analyst_sentiment(col):
    return col[data['speaker_type'] == 'Conference Participant'].mean()

def mean_presentation_sentiment(col):
    return col[data['type'] == 'presentation'].mean()

def mean_qa_sentiment(col):
    return col[data['type'] == 'qna'].mean()

def mean_topic_sentiment(col):
    return col[col != -1].mean()

In [131]:
mean_topic_sentiment(data['positive_sentiment_bert'])

0.30850450994062767

In [132]:
# for each transcript, average global sentiment, and average sentiment per section and per speaker

aggregations = dict()
for col in global_sentiment_cols:
    aggregations[col] = ['mean', 'std', mean_company_sentiment, mean_analyst_sentiment, mean_presentation_sentiment, mean_qa_sentiment]

for col in topic_sentiment_cols:
    aggregations[col] = [mean_topic_sentiment]

data = data.groupby(['company_name', 'date'])[global_sentiment_cols + topic_sentiment_cols].agg(
    aggregations,
)


In [133]:
data.columns = ['_'.join(col).strip() for col in data.columns.values]


In [134]:
data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,positive_sentiment_bert_mean,positive_sentiment_bert_std,positive_sentiment_bert_mean_company_sentiment,positive_sentiment_bert_mean_analyst_sentiment,positive_sentiment_bert_mean_presentation_sentiment,positive_sentiment_bert_mean_qa_sentiment,negative_sentiment_bert_mean,negative_sentiment_bert_std,negative_sentiment_bert_mean_company_sentiment,negative_sentiment_bert_mean_analyst_sentiment,...,neutral_sentiment_bert_dividend_mean_topic_sentiment,neutral_sentiment_bert_cashflow_mean_topic_sentiment,polarity_bert_margin_mean_topic_sentiment,polarity_bert_cost_mean_topic_sentiment,polarity_bert_revenue_mean_topic_sentiment,polarity_bert_earnings_mean_topic_sentiment,polarity_bert_growth_mean_topic_sentiment,polarity_bert_debt_mean_topic_sentiment,polarity_bert_dividend_mean_topic_sentiment,polarity_bert_cashflow_mean_topic_sentiment
company_name,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
AAPL.OQ,2019-Apr-30,0.315635,0.107936,0.341883,0.247137,0.293498,0.318709,0.208156,0.119192,0.206948,0.204041,...,0.424185,,0.011091,0.142511,0.112086,0.077188,0.142511,0.142511,0.142511,
AAPL.OQ,2019-Jan-29,0.302514,0.120256,0.33685,0.211356,0.292223,0.30442,0.198709,0.094536,0.205133,0.176514,...,0.424185,,0.132753,0.101544,0.124304,0.142511,0.12885,0.142511,0.142511,
AAPL.OQ,2019-Jul-30,0.286711,0.130435,0.341883,0.155914,0.293498,0.285768,0.174392,0.078805,0.206948,0.097553,...,0.424185,,0.110102,0.142511,0.129643,0.142511,0.099923,0.142511,0.142511,
AAPL.OQ,2019-Oct-30,0.289265,0.128467,0.344239,0.174357,0.293498,0.288761,0.177545,0.073984,0.208271,0.1133,...,0.424185,,0.110133,0.142511,0.089168,0.142511,0.107666,0.142511,0.142511,
AAPL.OQ,2020-Apr-30,0.306463,0.112452,0.342747,0.226466,0.293498,0.308126,0.194835,0.078818,0.207433,0.164946,...,0.424185,,0.148952,0.142511,0.142511,0.142511,0.142511,0.142511,0.142511,


In [135]:
data.to_csv('EarningCallData/output_sentiment_aggregated.csv')

## TEST

In [82]:
# pysentiment analysis

# using HIV-4
def get_sentiment_text_hiv4(text):
    sentences = text.split('.')
    list_sentiment = []
    for sentence in sentences:
        hiv4 = ps.HIV4()
        tokens = hiv4.tokenize(sentence)
        score = list(hiv4.get_score(tokens).values())
        list_sentiment.append(score)
    return np.mean(list_sentiment, axis=0)
        

# using loughran mcdonald
def get_sentiment_text_lm(text):
    sentences = text.split('.')
    list_sentiment = []
    for sentence in sentences:
        lm = ps.LM()
        tokens = lm.tokenize(sentence)
        score = list(lm.get_score(tokens).values())
        list_sentiment.append(score)
    return np.mean(list_sentiment, axis=0)

In [83]:
def get_sentiment_pysentiment(data):
    texts = data['text']
    positives_hiv4 = []
    negatives_hiv4 = []
    polarities_hiv4 = []
    subjectivities_hiv4 = []
    positives_lm = []
    negatives_lm = []
    polarities_lm = []
    subjectivities_lm = []
    for i in tqdm(range(len(data))):
        text = texts[i]
        positive_hiv4, negative_hiv4, polarity_hiv4, subjectivity_hiv4 = get_sentiment_text_hiv4(text)
        positive_lm, negative_lm, polarity_lm, subjectivity_lm = get_sentiment_text_lm(text)
        positives_hiv4.append(positive_hiv4)
        negatives_hiv4.append(negative_hiv4)
        polarities_hiv4.append(polarity_hiv4)
        subjectivities_hiv4.append(subjectivity_hiv4)
        positives_lm.append(positive_lm)
        negatives_lm.append(negative_lm)
        polarities_lm.append(polarity_lm)
        subjectivities_lm.append(subjectivity_lm)
    data['positive_sentiment_hiv4'] = positives_hiv4
    data['negative_sentiment_hiv4'] = negatives_hiv4
    data['polarity_hiv4'] = polarities_hiv4
    data['subjectivity_hiv4'] = subjectivities_hiv4
    data['positive_sentiment_lm'] = positives_lm
    data['negative_sentiment_lm'] = negatives_lm
    data['polarity_lm'] = polarities_lm
    data['subjectivity_lm'] = subjectivities_lm
    return data

In [84]:
data = get_sentiment_pysentiment(data)

  0%|          | 2/3537 [01:00<29:29:36, 30.04s/it]


KeyboardInterrupt: 

In [73]:
test = " HELLOOOOOy"
get_sentiment_text_hiv4(test)

TypeError: unsupported operand type(s) for /: 'dict_values' and 'int'

In [75]:
get_sentiment_text_lm(test)

TypeError: unsupported operand type(s) for /: 'dict_values' and 'int'