In [None]:
import os
os.environ["TRANSFORMERS_NO_TF"] = "1"
os.environ["TRANSFORMERS_NO_FLAX"] = "1"
import numpy as np 
import pandas as pd 
from Extract import LabelSimilar 
from Extract import Extract 
from numpy.dtypes import StringDType
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA 
import matplotlib.pyplot as plt 
import time 
import torch 

## Running on GPU 
device  = torch.device('cpu')



model = SentenceTransformer("FinLang/finance-embeddings-investopedia",device=device)
folder_path = 'clean_merged_data/'
from transformers import pipeline

pipe = pipeline("text-classification", model="ProsusAI/finbert", framework="pt",device=device) 


  from .autonotebook import tqdm as notebook_tqdm
Device set to use mps


In [2]:
# setting up initial data frame 
data_path = "1430_data/speeches.csv"
# putting it into dataframe 
speeches = pd.read_csv(data_path)
speeches = speeches.sample(frac=1).reset_index(drop=True)

unique_countries = speeches['country'].unique()
unique_countries
speeches


Unnamed: 0,reference,country,date,title,author,is_gov,text
0,r191212c_SNB,switzerland,2019-12-12,"Introductory remarks, news conference",maechler,0,I would like to begin by reviewing development...
1,r220524a_FOMC,united states,2022-05-24,Welcoming Remarks,powell,1,"Good morning, and welcome. It is a great pleas..."
2,r010402a_ECB,euro area,2001-04-02,Consolidation of the payment systems industry:...,london,0,I would first like to thank you for giving me ...
3,r171207a_BOE,united kingdom,2017-12-07,Making banks resolvable: the key to making res...,gracie,0,Resolution has come a long way since G20 Leade...
4,r170913a_ECB,euro area,2017-09-13,Karl-Otto Pöhl Lecture,praet,0,The monetary policy measures introduced by the...
...,...,...,...,...,...,...,...
7716,r171214a_SNB,switzerland,2017-12-14,"Introductory remarks, news conference",zurbrugg,0,"In my remarks today, I would like to address s..."
7717,r991019a_BOJ,japan,1999-10-19,On Recent Monetary Policy,yamaguchi,0,I am honored to be invited to this conference ...
7718,r131122a_ECB,euro area,2013-11-22,Opening speech at the European Banking Congres...,draghi,1,"Ladies and gentlemen, Thank you for inviting m..."
7719,r970313a_FOMC,united states,1997-03-13,we,rivlin,0,I am extremely pleased to have the opportunity...


In [3]:
speech_texts = speeches['text'].to_list()
cleaned_speeches = []
remove_index = []
# iterating and extracting sentences
i =0
for speech in speech_texts:
    extract = Extract(speech)
    sentences = extract.extract_sentences()

    if sentences==[]:
      remove_index.append(i)
    elif sentences!=[]:
      cleaned_speeches.append(sentences)
    i +=1




In [4]:
speeches = speeches.drop(index= remove_index)
speeches = speeches.reset_index(drop=True)
speeches

Unnamed: 0,reference,country,date,title,author,is_gov,text
0,r191212c_SNB,switzerland,2019-12-12,"Introductory remarks, news conference",maechler,0,I would like to begin by reviewing development...
1,r220524a_FOMC,united states,2022-05-24,Welcoming Remarks,powell,1,"Good morning, and welcome. It is a great pleas..."
2,r010402a_ECB,euro area,2001-04-02,Consolidation of the payment systems industry:...,london,0,I would first like to thank you for giving me ...
3,r171207a_BOE,united kingdom,2017-12-07,Making banks resolvable: the key to making res...,gracie,0,Resolution has come a long way since G20 Leade...
4,r170913a_ECB,euro area,2017-09-13,Karl-Otto Pöhl Lecture,praet,0,The monetary policy measures introduced by the...
...,...,...,...,...,...,...,...
7713,r171214a_SNB,switzerland,2017-12-14,"Introductory remarks, news conference",zurbrugg,0,"In my remarks today, I would like to address s..."
7714,r991019a_BOJ,japan,1999-10-19,On Recent Monetary Policy,yamaguchi,0,I am honored to be invited to this conference ...
7715,r131122a_ECB,euro area,2013-11-22,Opening speech at the European Banking Congres...,draghi,1,"Ladies and gentlemen, Thank you for inviting m..."
7716,r970313a_FOMC,united states,1997-03-13,we,rivlin,0,I am extremely pleased to have the opportunity...


In [None]:
# deriving vectors for each of the sentences 
speech_vecs = [np.zeros(shape=(len(cleaned_speeches[i]), 384)) for i in range(0,len(cleaned_speeches))]
# iterating and deriving setence level embeddings 
num_iters = 100
times = []
for i in range(num_iters):
    curr_time = time.time()
    curr_speech = cleaned_speeches[i] 
    embeddings = model.encode(curr_speech)
    speech_vecs[i]= embeddings
    end_time = time.time()
    it_time = curr_time - end_time
    times.append(it_time)
    print(f'speech: {i} encoded, avg time{np.mean(times)} seconds ')


output: (34, 768)
speech: 0 encoded, avg time-0.4997861385345459 seconds 
output: (17, 768)
speech: 1 encoded, avg time-0.30783402919769287 seconds 
output: (88, 768)
speech: 2 encoded, avg time-0.4586190382639567 seconds 
output: (65, 768)
speech: 3 encoded, avg time-0.47978848218917847 seconds 
output: (52, 768)
speech: 4 encoded, avg time-0.4598729610443115 seconds 
output: (134, 768)
speech: 5 encoded, avg time-0.5111297766367594 seconds 
output: (63, 768)
speech: 6 encoded, avg time-0.5073113782065255 seconds 
output: (94, 768)
speech: 7 encoded, avg time-0.5178166925907135 seconds 
output: (91, 768)
speech: 8 encoded, avg time-0.5350361929999458 seconds 


KeyboardInterrupt: 

In [6]:
# whether using subset and need to slice the embeddings list

using_slice = True ## edit to false when using full dataset 
# slicing dataset 
if using_slice:
    data_set = speech_vecs[:num_iters]
    assert(len(data_set)== num_iters)

elif using_slice == False:
    data_set = speech_vecs

# now generating embeddings for different topic areas 
key_sentences = pd.read_csv("key_vecs/1430_true_keys.csv")
topics = key_sentences['Factor'].unique()
topic_sentence_dict = {topic:[] for topic in topics}
for index, row in key_sentences.iterrows():
    curr_topic = row['Factor']
    curr_list = topic_sentence_dict[curr_topic]
    curr_list.append(row['Sentence'])

# map containing topic and key vector to compare to for similarity 
key_vecs = []

for term in topics :
    sentences = topic_sentence_dict[term]
    out_encodings = model.encode(sentences=sentences)
    mean_vec = np.mean(out_encodings, axis=0)

    key_vecs.append((term, mean_vec))

topics

array(['Inflation', 'Domestic Growth', 'Trade Balance',
       'Value of Currency'], dtype=object)

In [7]:
## finding the closest and labelling them 

label = LabelSimilar(key_vecs)
list_topics = [np.zeros(data_set[i].shape[0], dtype=StringDType()) for i in range (0, len(data_set))]
scores = [np.zeros(data_set[i].shape[0], dtype=np.float32) for i in range (0, len(data_set))]

a = 0 
for entry in data_set:
    curr_mat = list_topics[a]
    curr_score = scores[a]
    a +=1
    k = 0 
    for sentence in entry:
        dot_prod = label.cosine_similarity(sentence)
        curr_mat[k] = dot_prod[0]
        curr_score[k] = dot_prod[1]
        k +=1
        
topics_map = {topic:i for (topic,i) in zip(topics, range(0,len(topics)))}
topics_nums_arr =  [np.zeros(data_set[i].shape[0]) for i in range (0, len(data_set))]
j = 0 
for arr in list_topics:
    topic_arr = topics_nums_arr[j]
    j +=1 
    for i in range(0, arr.shape[0]):
        topic_arr[i] = topics_map[arr[i]]
        
print(topic_arr)


[3. 0. 0. 2. 0. 2. 2. 1. 1. 2. 1. 2. 1. 1. 1. 1. 1. 3. 3. 3. 0. 0. 3. 3.
 3. 1. 3. 3. 2. 3. 0. 1. 0. 3. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 0. 1. 0.
 0. 0. 0. 3. 0. 3. 1. 1. 3. 1. 1. 1. 0. 3. 1. 1. 0. 0. 0. 0. 1. 1. 1. 0.
 1. 0. 1. 1. 0. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 0. 3. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 2. 1. 0. 1. 0.
 2. 3. 1. 3. 1. 0. 1. 1.]


In [8]:
# initial grouping of vectors 
for i in range(0, len(data_set)):
    curr_matrix = data_set[i]
    curr_labels = topics_nums_arr[i]
    curr_labels= curr_labels.reshape((curr_labels.shape[0],1))
    print(curr_labels.shape)
    curr_scores = scores[i]
    curr_scores = curr_scores.reshape((curr_scores.shape[0],1))
    data_set[i] = np.concatenate((curr_matrix, curr_labels, curr_scores), axis=1)
    print(data_set[i].shape)


(54, 1)
(54, 386)
(100, 1)
(100, 386)
(947, 1)
(947, 386)
(33, 1)
(33, 386)
(126, 1)
(126, 386)
(185, 1)
(185, 386)
(17, 1)
(17, 386)
(110, 1)
(110, 386)
(42, 1)
(42, 386)
(128, 1)
(128, 386)


In [9]:
# Retrieving indexes of sentences matching the groupings by topic 

topic_group_index = []
for i in range(0, len(data_set)):
    curr_matrix = data_set[i]
    m1 = np.where(curr_matrix[:,384]==0)
    m2 = np.where(curr_matrix[:,384] ==1)
    m3 = np.where(curr_matrix[:,384]==2)
    m4 = np.where(curr_matrix[:,384]== 3)
    m5 = np.where(curr_matrix[:,384]== 4)
    m6 = np.where(curr_matrix[:,384]== 5)
    m7 = np.where(curr_matrix[:,384]== 6)
    m8 = np.where(curr_matrix[:,384]== 7)
    m9 = np.where(curr_matrix[:,384]== 8)
    m10 = np.where(curr_matrix[:,384]== 9)
    topic_indices = [m1, m2,m3,m4, m5,m6,m7,m8,m9,m10]
    topic_group_index.append(topic_indices)

len(topic_group_index[i])

10

In [10]:
import sys 
sys.executable

'/Users/prottoyachowdhury/anaconda3/envs/text310/bin/python'

In [11]:
topic_sentences = [[] for i in range(0, len(topics))]
for i in range(0, len(data_set)):
    curr_text = cleaned_speeches[i]
    curr_indices = topic_group_index[i]
    for k in range(0, len(topics)):
        indices:np.ndarray = curr_indices[k][0]
        
        related_sentences = [curr_text[j] for j in indices.astype(int)]
        topic_sentences[k].append(related_sentences)

sorted_data = pd.DataFrame(topic_sentences).T
sorted_data = sorted_data.rename(columns={i: topics[i] for i in range(0, len(topics))})
sorted_data['date'] = pd.to_datetime(speeches['date'][0:sorted_data.shape[0]])
sorted_data['country'] = speeches['country'][0:sorted_data.shape[0]]


In [12]:
# Assembling the dataframes that will contain data on the yields 
# declaring them as list then mapping through hashmap 

rate_df_list = []
for file in os.listdir('1430_rates'):
    rate_df = pd.read_csv('1430_rates/'+file)
    rate_df_list.append(rate_df)

country_names = [file.split(" ")[0].lower() for file in os.listdir('1430_rates')]
country_names[1] = 'euro area'
country_names[4] = 'united kingdom'
country_names[5] = 'united states'

# Declaring country and interest rate mapper 


In [13]:
# calculating percent change values in yields 

# Columns to drop 
drop_cols = ['Unnamed: 0', 'date']

def df_convert_pct(df: pd.DataFrame, period:int)-> pd.DataFrame:
    dates = df['date'].copy(deep=True) 
    cleaned_df = df.drop(columns=drop_cols)
    dates = pd.to_datetime(dates)
    percent_change = cleaned_df.pct_change(periods = period ).reset_index(drop=True)
    
    percent_change = percent_change.iloc[period:, :].reset_index(drop=True)
    num_rows = percent_change.shape[0]
    #appending dates to the final df 
    percent_change['date'] = dates[0:num_rows]
    return percent_change






In [14]:
# Creating time lagged df lists for rates across all countries and regions 
one_month = {country_names[i]: df_convert_pct(rate_df_list[i], 1) for i in range(0, len(rate_df_list))}
two_month = {country_names[i]:df_convert_pct(rate_df_list[i],2) for i in range(0,len(rate_df_list))}
three_month = {country_names[i]: df_convert_pct(rate_df_list[i],3) for i in range(0, len(rate_df_list))}
five_month = {country_names[i]: df_convert_pct(rate_df_list[i],5) for i in range(0, len(rate_df_list))}
six_month = {country_names[i]: df_convert_pct(rate_df_list[i],6) for i in range(0, len(rate_df_list))}
seven_month = {country_names[i]: df_convert_pct(rate_df_list[i],7) for i in range(0, len(rate_df_list))}
eigth_month = {country_names[i]: df_convert_pct(rate_df_list[i],8) for i in range(0, len(rate_df_list))}
nine_month = {country_names[i]: df_convert_pct(rate_df_list[i],9) for i in range(0, len(rate_df_list))}
ten_month = {country_names[i]: df_convert_pct(rate_df_list[i],10) for i in range(0, len(rate_df_list))}
eleven_month = {country_names[i]: df_convert_pct(rate_df_list[i],11) for i in range(0, len(rate_df_list))}
twelve_month = {country_names[i]: df_convert_pct(rate_df_list[i],12) for i in range(0, len(rate_df_list))}

one_month['australia']

  percent_change = cleaned_df.pct_change(periods = period ).reset_index(drop=True)
  percent_change = cleaned_df.pct_change(periods = period ).reset_index(drop=True)
  percent_change = cleaned_df.pct_change(periods = period ).reset_index(drop=True)
  percent_change = cleaned_df.pct_change(periods = period ).reset_index(drop=True)
  percent_change = cleaned_df.pct_change(periods = period ).reset_index(drop=True)
  percent_change = cleaned_df.pct_change(periods = period ).reset_index(drop=True)
  percent_change = cleaned_df.pct_change(periods = period ).reset_index(drop=True)
  percent_change = cleaned_df.pct_change(periods = period ).reset_index(drop=True)
  percent_change = cleaned_df.pct_change(periods = period ).reset_index(drop=True)
  percent_change = cleaned_df.pct_change(periods = period ).reset_index(drop=True)
  percent_change = cleaned_df.pct_change(periods = period ).reset_index(drop=True)
  percent_change = cleaned_df.pct_change(periods = period ).reset_index(drop=True)
  pe

Unnamed: 0,10y,3m_ib,3m_bb,24h_immed,date
0,-0.045568,-0.021329,-0.073345,-0.046105,1990-12-01
1,0.000868,-0.025985,-0.074530,0.000000,1991-01-01
2,-0.013010,-0.011188,-0.163062,0.000000,1991-02-01
3,-0.035149,-0.011314,-0.106163,-0.037500,1991-03-01
4,-0.020947,-0.056338,-0.103203,-0.049351,1991-04-01
...,...,...,...,...,...
395,-0.085408,-0.006849,,0.013986,2023-11-01
396,-0.009315,0.000000,,0.000000,2023-12-01
397,-0.001688,-0.002299,,0.000000,2024-01-01
398,-0.023183,0.002304,,0.000000,2024-02-01


In [15]:
# Creating mapping function that maps lagged rates to the speeches 
sorted_dates = sorted_data['date'].tolist()
sorted_countries = sorted_data['country'].tolist()

# Function to map rates 

def rate_mapper(column_name: str, lagged_rates:dict, lag_amount:str)->pd.DataFrame:
    rate_list = []
    ## iterating through list of dates and countries 
    for date, country in zip(sorted_dates, sorted_countries):
        curr_df:pd.DataFrame = lagged_rates[country]
        month = date.month 
        year = date.year
        try:
            row_slice = curr_df.loc[(curr_df['date'].dt.month == month) & (curr_df['date'].dt.year==year)]
          
            val_to_add = row_slice[column_name].values
            rate_list.append(float(val_to_add[0]))
        except:
            rate_list.append(np.nan)
            print('nan value')
    
    sorted_data[lag_amount+"_"+column_name] = rate_list
    return rate_list
    

rate_mapper('10y', one_month, 'one_month')
sorted_data


rate_mapper('10y', three_month, 'three_month')
rate_mapper('10y', six_month, 'six_month')
rate_mapper('10y', twelve_month, 'twelve_month')
rate_mapper('3m_ib', one_month, 'one_month')
rate_mapper('3m_ib', two_month, 'two_month')
rate_mapper('3m_ib', three_month, 'three_month')
rate_mapper('3m_ib', six_month, 'six_month')
rate_mapper('3m_ib', twelve_month, 'twelve_month')




[0.42477644492912514,
 0.22181146025877996,
 2.1999999999999997,
 0.8226371848456189,
 3.0,
 -0.20143884892086317,
 -0.9320754716981132,
 0.13547237076648844,
 0.034482758620689724,
 23.804337794185432]

In [25]:
sorted_data.iloc[0,3]

['On Christmas Eve 1818 "Silent Night" was performed for the first time, in a chapel in Oberndorf, near Salzburg and subsequently made its way throughout Europe and to the rest of the world.',
 "For this reason, I had welcomed the European Commission's decision to organise a public hearing on the issue, which took place last Wednesday.",
 'However, I am confident that the European Commission will make this possible through a legislative proposal.',
 'This would give banks a range of options to choose from as regards the processing of card payments.',
 'Third, various market initiatives have developed implementation standards and specifications for the various domains of card payments.',
 'However, I personally believe that the European cards market would also benefit from the power of choice.',
 'The Eurosystem has been calling for an additional European card scheme for four years.',
 'I would strongly urge the appropriate authorities to work on this issue so that the clarity needed at

In [17]:
# Compling list of strings into one string 
subset_text_df = sorted_data.iloc[:, 0:4]

def string_join(list_str: list[str]):
    out = ""
    for i in range(0, len(list_str)):
        curr_string = list_str[i]
        out += curr_string
    return out 


file_path = "clean_merged_data/"
subset_text_df.to_csv(file_path+"/initial.csv")

In [18]:
sentiment_df

NameError: name 'sentiment_df' is not defined

In [None]:
# Now using the finbert model to create vector outputs for sentiment 
sentiment_df = subset_text_df.copy()
sentiment_df

def sentiment_map(sentiment:tuple):
    label = sentiment['label']
    score = sentiment['score']
    if label == 'positive':
        return 30 * score 
    elif label == 'neutral':
        return 20 * score 
    elif label == 'negative':
        return 10 * score 
    

sentiment_df.iloc[0,0]
count =1
for index, row in sentiment_df.iterrows():
    start = time.time()
    for i in range(0, len(row)):
       
        values = []
        curr_list = row[i]
        for j in range(len(curr_list)):
            out = pipe(curr_list[j], num_workers=10)[0]
       
            val_add = sentiment_map(out)
            values.append(val_add)
     
        row[i] = np.mean(values)
        
   
    end = time.time()
    print(f'speech {count} sentiment done \n, time:{end-start} sec')
    count +=1 

  curr_list = row[i]
  row[i] = np.mean(values)


speech 1 sentiment done 
, time:1.1761541366577148 sec
speech 2 sentiment done 
, time:0.8228631019592285 sec
speech 3 sentiment done 
, time:1.1783568859100342 sec
speech 4 sentiment done 
, time:1.5702719688415527 sec
speech 5 sentiment done 
, time:1.914783000946045 sec
speech 6 sentiment done 
, time:1.5502479076385498 sec
speech 7 sentiment done 
, time:2.0138680934906006 sec
speech 8 sentiment done 
, time:0.5818321704864502 sec
speech 9 sentiment done 
, time:7.019974946975708 sec
speech 10 sentiment done 
, time:1.5091650485992432 sec


In [None]:
sentiment_df.to_csv('1.csv')

In [None]:
sentiment_df



Unnamed: 0,Inflation,Domestic Growth,Trade Balance,Value of Currency
0,22.893559,21.883068,18.465516,16.366768
1,15.93239,16.981548,16.572093,15.72388
2,12.645284,13.283747,16.620332,13.054433
3,17.127409,16.278965,18.98627,18.149541
4,15.819385,16.777114,15.697898,13.409928
5,14.477584,16.069378,14.656512,13.219095
6,14.312674,16.832058,15.810309,16.060913
7,12.838652,21.087051,19.533621,14.108101
8,16.575795,16.517491,18.001249,17.616931
9,14.760858,15.152376,14.420749,15.980881


In [None]:
sorted_data.iloc[:, 0:4]= sentiment_df
sorted_data
file_name = 'cleaned_sentiment.csv'
sorted_data.to_csv(file_name)