In [None]:
from transformers import pipeline
import os
import pandas as pd
import time

In [None]:
# Sentiment classifiers: one Hugging Face pipeline per pretrained checkpoint.
#
# classifier_bert previously relied on the library's implicit default model for the
# 'sentiment-analysis' task. The checkpoint is now pinned explicitly so that results stay
# reproducible across transformers versions; the code below that consumes this classifier
# compares its label against "POSITIVE", which is specific to this checkpoint.
classifier_bert = pipeline('sentiment-analysis', model="distilbert-base-uncased-finetuned-sst-2-english")
classifier_finbert = pipeline('sentiment-analysis', model="ProsusAI/finbert")
classifier_roberta = pipeline('sentiment-analysis', model="cardiffnlp/twitter-roberta-base-sentiment")
classifier_roberta2 = pipeline('sentiment-analysis', model="textattack/roberta-base-SST-2")
classifier_roberta_imdb = pipeline('sentiment-analysis', model="textattack/roberta-base-imdb")
classifier_roberta_imdb2 = pipeline('sentiment-analysis', model="aychang/roberta-base-imdb")
classifier_xlnet = pipeline('sentiment-analysis', model="edwardgowsmith/xlnet-base-cased-best")
classifier_xlnet2 = pipeline('sentiment-analysis', model="textattack/xlnet-base-cased-SST-2")
classifier_electra = pipeline('sentiment-analysis', model="howey/electra-large-sst2")

# Authors whose speeches are kept, split into three groups.
us_fed_chairs = [
    "Jerome H Powell",
    "Janet L Yellen",
    "Ben S Bernanke",
    "Alan Greenspan",
]
# NOTE(review): "Robert W Ferguson Jr" may be meant to be "Roger W Ferguson Jr" --
# verify against the spelling used in the "author" column of the speech index.
us_board_speakers = [
    "Alice M Rivlin",
    "Daniel K Tarullo",
    "Edward M Gramlich",
    "Edward W Kelley Jr",
    "Elizabeth A Duke",
    "Frederic S Mishkin",
    "Jeremy C Stein",
    "Kevin M Warsh",
    "Lael Brainard",
    "Laurence H Meyer",
    "Mark W Olson",
    "Randall S Kroszner",
    "Robert W Ferguson Jr",
    "Sarah Bloom Raskin",
    "Susan Schmidt Bies",
]
us_president = [
    "Thomas M Hoenig",
    "Timothy F Geithner",
    "William C Dudley",
    "William J McDonough",
    "Brian P Sack",
    "Charles I Plosser",
    "Ernest T Patrikis",
    "James McAndrews",
    "Joseph S Tracy",
    "Narayana Kocherlakota",
    "Simon M Potter",
]
# Full allow-list: chairs first, then board speakers, then the third group.
us_speekers_to_keep = [*us_fed_chairs, *us_board_speakers, *us_president]

# Load the manually maintained speech index and keep only rows whose author is on the
# allow-list. The explicit .copy() makes the filtered frame independent of the frame read
# from disk: sentiment columns are later added with df.loc[...] = ..., and writing to a
# bare slice can trigger pandas' SettingWithCopyWarning.
df = pd.read_csv("source/speech_index_manual.csv")
df = df.loc[df["author"].isin(us_speekers_to_keep)].copy()
df.shape

In [None]:
# Maximum number of words of a filtered speech that is handed to the classifiers.
max_word_intro_length = 315
# Progress bookkeeping for the speech-scoring loop: number of files handled so far
# and the wall-clock start time used in the progress messages.
count = 0
t_init = time.time()

def del_content(i, i_init, text):
    """Return the indexes of the contiguous run of deletable tokens around ``text[i]``.

    A token is deletable when its lower-cased form is "bis", "review" or a single
    digit "0".."9", or when its last five characters are "/2000".

    Parameters:
        i: index of the token to start from.
        i_init: index the overall scan was started from. The scan extends leftward
            when ``i <= i_init`` and rightward when ``i >= i_init`` (so both ways when
            they are equal, which is how external callers invoke it).
        text: list of word tokens.

    Returns:
        A list of indexes: ``i`` first, then the leftward neighbours (nearest first),
        then the rightward neighbours (nearest first). Empty when ``i`` is outside the
        list or ``text[i]`` is not deletable.

    The scan never leaves ``0 <= index < len(text)``. The earlier recursive version
    stepped to index -1, which Python resolves to the *last* token: it could mark
    unrelated trailing tokens for deletion or raise IndexError on short inputs.
    """
    del_words = {"bis", "review"} | {str(digit) for digit in range(10)}
    # NOTE(review): only the "/2000" suffix is matched, so issue numbers of other years
    # (e.g. "12/2001") are not removed -- confirm whether that is intended.
    del_end = ("/2000",)

    def _is_deletable(index):
        # Out-of-range positions are simply "not deletable"; no negative wrap-around.
        if not 0 <= index < len(text):
            return False
        token = text[index]
        return token.lower() in del_words or token[-5:] in del_end

    if not _is_deletable(i):
        return []

    list_to_delete = [i]
    if i <= i_init:  # extend the run leftward
        left = i - 1
        while _is_deletable(left):
            list_to_delete.append(left)
            left -= 1
    if i >= i_init:  # extend the run rightward
        right = i + 1
        while _is_deletable(right):
            list_to_delete.append(right)
            right += 1
    return list_to_delete

def filter_content(text):
    """Strip the leading header and "BIS Review"-style markers, then truncate.

    Steps:
      1. If the text contains at least three "*" characters, everything up to and
         including the third one is dropped (presumably a header separator -- the
         file format is not visible here).
      2. The remainder is split on whitespace. For every token equal to "bis"
         (case-insensitive), ``del_content`` supplies the indexes of the surrounding
         run of marker tokens, and those tokens are removed.
      3. Only the first ``max_word_intro_length`` words are kept (module-level setting),
         because of the classifiers' tokenization limit.

    Parameters:
        text: raw speech text.

    Returns:
        The kept words joined by single spaces.
    """
    star_counter = 0
    for i, letter in enumerate(text):
        if letter == "*":
            star_counter += 1
            if star_counter >= 3:
                text = text[i + 1:]
                break

    split_text = text.split()

    # Collect indexes in a set: two "bis" tokens inside the same run report the same
    # indexes, and popping a duplicated index would remove an unrelated word.
    indexes_to_delete = set()
    for i, word in enumerate(split_text):
        if word.lower() == "bis" and i not in indexes_to_delete:
            indexes_to_delete.update(del_content(i, i, split_text))

    # Build a new list instead of popping in place; indexes outside the list
    # (including negative ones) can never match and are therefore ignored.
    kept_words = [
        word for i, word in enumerate(split_text) if i not in indexes_to_delete
    ]

    # TEMP TODO - only the first max_word_intro_length words are used because of the
    # tokenization limit
    return " ".join(kept_words[:max_word_intro_length])

def sentiment_analysis(speech_file):
    """Score one speech file with all nine classifiers and record the results in ``df``.

    The file ``source/txt/<speech_file>`` is read as UTF-8 and cleaned with
    ``filter_content``. Every classifier is run on the cleaned text; then, for the rows
    of the module-level ``df`` whose "speech" value equals the file name up to its first
    ".", two columns per classifier are written: ``*_sent`` (2, 1 or 0 derived from the
    predicted label) and ``*_score`` (the score reported with that label).

    Parameters:
        speech_file: file name (not a path) inside ``source/txt``.
    """
    with open("source/txt/" + speech_file, "r", encoding="utf8") as content:
        filtered_content = filter_content(content.read())

        # Run every model before touching df, so a failing model leaves df unchanged.
        predictions = {
            "bert": classifier_bert(filtered_content),
            "finbert": classifier_finbert(filtered_content),
            "roberta": classifier_roberta(filtered_content),
            "roberta2": classifier_roberta2(filtered_content),
            "roberta_imdb": classifier_roberta_imdb(filtered_content),
            "roberta_imdb2": classifier_roberta_imdb2(filtered_content),
            "xlnet": classifier_xlnet(filtered_content),
            "xlnet2": classifier_xlnet2(filtered_content),
            "electra": classifier_electra(filtered_content),
        }

        # (prediction key, sentiment column, score column, label -> code).
        # Any label missing from the mapping is coded 0. The sequence below is the
        # order in which the columns get created in df.
        column_plan = (
            ("bert", "bert_sent", "bert_score", {"POSITIVE": 2}),
            ("roberta", "roberta_sent", "roberta_score", {"LABEL_2": 2, "LABEL_1": 1}),
            ("finbert", "finbert_sent", "finbert_score", {"positive": 2, "neutral": 1}),
            ("roberta2", "roberta_sent2", "roberta_score2", {"LABEL_1": 2}),
            ("roberta_imdb", "roberta_imdb_sent", "roberta_imdb_score", {"LABEL_1": 2}),
            ("roberta_imdb2", "roberta_imdb_sent2", "roberta_imdb_score2", {"pos": 2}),
            ("xlnet", "xlnet_sent", "xlnet_score", {"LABEL_1": 2}),
            ("xlnet2", "xlnet_sent2", "xlnet_score2", {"LABEL_1": 2}),
            ("electra", "electra_sent", "electra_score", {"LABEL_1": 2}),
        )

        speech_rows = df["speech"] == speech_file.split(".")[0]
        for key, sent_column, score_column, label_codes in column_plan:
            top_prediction = predictions[key][0]
            df.loc[speech_rows, [sent_column]] = label_codes.get(top_prediction["label"], 0)
            df.loc[speech_rows, [score_column]] = top_prediction["score"]

# Make sure the output directory exists *before* the long scoring run, so the results
# are not lost to a failing to_csv() at the very end.
os.makedirs("data_files_generated", exist_ok=True)

# Score every file in source/txt whose name (up to the first ".") appears in df.speech.
for root, dirst, files in os.walk("source/txt"):
    # Do not descend into sub-directories: sentiment_analysis() opens files relative to
    # "source/txt/" only, so a file found deeper in the tree could not be opened.
    dirst[:] = []
    for speech_file in files:
        if speech_file.split(".")[0] in df.speech.values:
            count += 1
            if count % 25 == 0:
                # progress: files handled so far and seconds elapsed since t_init
                print("count: " + str(count) + " - time: " + str(time.time() - t_init))
            sentiment_analysis(speech_file)

df.to_csv("data_files_generated/genered_speech_sentiment.csv", index=False)

In [None]:
# Load the news data set used by the cells below.
df_news = pd.read_csv("source/usnews.csv")
# NOTE: this is not the last expression of the cell, so the head() result is discarded.
df_news.head()
# Print the row labelled 1 (the second row when read_csv assigns its default index).
print(df_news.loc[1])

In [None]:
def filter_content_news(text, max_words=300):
    """Return the first ``max_words`` whitespace-separated words of ``text``.

    Parameters:
        text: article text. Anything that is not a string (for example ``None`` or the
            NaN that pandas uses for an empty CSV cell) is treated as empty text
            instead of raising AttributeError.
        max_words: maximum number of words to keep (default 300, the previous
            hard-coded limit).

    Returns:
        The kept words joined by single spaces; "" for empty or non-string input.
    """
    if not isinstance(text, str):
        return ""
    return " ".join(text.split()[:max_words])

# Make sure the output directory exists *before* the long scoring run, so the results
# are not lost to a failing to_csv() at the very end.
os.makedirs("data_files_generated", exist_ok=True)

# One entry per model: (classifier, sentiment column, score column, label -> code).
# A label that is missing from the mapping is coded 0.
news_models = [
    (classifier_bert, "news_bert_sent", "news_bert_score", {"POSITIVE": 2}),
    (classifier_roberta, "news_roberta_sent", "news_roberta_score", {"LABEL_2": 2, "LABEL_1": 1}),
    (classifier_xlnet, "news_xlnet_sent", "news_xlnet_score", {"LABEL_1": 2}),
    (classifier_electra, "news_electra_sent", "news_electra_score", {"LABEL_1": 2}),
]

t_init = time.time()
count = 0
for i, row in df_news.iterrows():
    count += 1
    if count % 100 == 0:
        # progress: rows handled so far and seconds elapsed since t_init
        print("count: " + str(count) + " - time: " + str(time.time() - t_init))
    filtered_content = filter_content_news(row["texts"])

    # Run every model first, so a failing model leaves this row untouched.
    predictions = [classifier(filtered_content) for classifier, _, _, _ in news_models]

    for (_, sent_column, score_column, label_codes), prediction in zip(news_models, predictions):
        top_prediction = prediction[0]
        df_news.loc[i, [sent_column]] = label_codes.get(top_prediction["label"], 0)
        df_news.loc[i, [score_column]] = top_prediction["score"]

df_news.to_csv("data_files_generated/generated_news_sentiment_1.csv", index=False)

In [None]:
# Calendar frame with a single "date" column: 10000 consecutive days from 01/01/1995.
df_news_generated = pd.DataFrame(
    {'date': pd.date_range(start='01/01/1995', periods=10000, freq='D')}
)
df_news_generated.head()

In [None]:

def get_prob(sent, score):
    """Placeholder: ignores both ``sent`` and ``score`` and always returns 1."""
    return 1

#print(df_news["date"])
# Column names of the per-article model outputs. These lists (and weighted_score_list)
# are not used by the aggregation below.
sent_list = ["news_bert_sent", "news_roberta_sent", "news_xlnet_sent", "news_electra_sent"]
score_list = ["news_bert_score", "news_roberta_score", "news_xlnet_score", "news_electra_score"]
weighted_score_list = [0 for i in range(len(sent_list))]

# Make sure the output directory exists before the aggregation starts.
os.makedirs("data_files_generated", exist_ok=True)

# (output column stem, per-article sentiment column). A sentiment code s in {0, 2}
# contributes s / 2 to "<stem>_positive" and 1 - s / 2 to "<stem>_negative".
# Fix: electra_positive used to be summed from news_bert_sent (copy-paste slip); it is
# now computed from news_electra_sent, like electra_negative.
sentiment_sources = [
    ("bert", "news_bert_sent"),
    ("xlnet", "news_xlnet_sent"),
    ("electra", "news_electra_sent"),
]
# (output column, per-article column) pairs that are summed as they are.
count_sources = [
    ("wsj", "wsj"),
    ("wapo", "wapo"),
    ("econ", "economy"),
    ("nonecon", "noneconomy"),
]

# Aggregate the articles of each calendar day; days without articles are left untouched.
for i, row in df_news_generated.iterrows():
    day = str(row["date"].date())
    # A single comparison serves as both the "any article on this day?" test and the
    # row selector.
    is_day = df_news["date"] == day
    if is_day.any():
        articles = df_news[is_day]

        for stem, sent_column in sentiment_sources:
            positive_share = articles[sent_column].to_numpy() / 2
            df_news_generated.loc[i, stem + "_positive"] = positive_share.sum()
            df_news_generated.loc[i, stem + "_negative"] = (1 - positive_share).sum()

        for out_column, source_column in count_sources:
            df_news_generated.loc[i, out_column] = articles[source_column].to_numpy().sum()

df_news_generated.to_csv("data_files_generated/generated_news_sentiment_2.csv", index=False)