In [1]:
"""
PLEASE FOLLOW INSTRUCTIONS ON THIS CELL BEFORE RUNNING THIS NOTEBOOK.
"""

# One-time setup: this cell must be run before any other cell in the notebook.
# 1) Remove the leading '#' from the last two lines of this cell and run the cell.
# 2) Wait until ALL commands have finished running.
# 3) Restart the runtime (Runtime -> Restart Runtime).
# 4) Comment the last two lines out again by putting '#' back at the beginning.

#!pip install spacy --upgrade
#!python -m spacy download en_core_web_md

'\nPLEASE FOLLOW INSTRUCTIONS ON THIS CELL BEFORE RUNNING THIS NOTEBOOK.\n'

In [2]:
import os
import spacy
import en_core_web_md
from io import open
import pandas as pd

# Introduction

This is one of three notebooks supporting this thesis.

This notebook contains the code for every part of the methodology section that comes before machine learning model training.

# 5.1 Data Collection

## 5.1.1 Text Retrieval

In [3]:
# Get files from github repository.
# The clone creates the 'central_bank_thesis' folder that the paths in the cells below point to.
!git clone https://github.com/luissejas/central_bank_thesis.git

Cloning into 'central_bank_thesis'...
remote: Enumerating objects: 311, done.[K
remote: Counting objects: 100% (204/204), done.[K
remote: Compressing objects: 100% (186/186), done.[K
remote: Total 311 (delta 15), reused 135 (delta 6), pack-reused 107[K
Receiving objects: 100% (311/311), 17.08 MiB | 12.40 MiB/s, done.
Resolving deltas: 100% (55/55), done.


In [4]:
# Folder of the cloned repository that holds the text files.
# NOTE: this absolute path assumes the repository was cloned into /content
# (presumably the default working directory of the hosted runtime) - adjust it otherwise.
path = r'/content/central_bank_thesis/text_files'

## 5.1.2 Labeling

In [5]:
# Build one list of full file paths per category
adversity_files = []
prosperity_files = []

# Years labeled as adversity; a file from any other year is labeled as prosperity
adversity_years = ['2006', '2007', '2008', '2009', '2010', '2020', '2021', '2022']

# os.listdir returns the entries in arbitrary order, so they are sorted to make the order
# of the files (and of everything derived from it, such as the text blocks) identical on
# every run and on every machine.
for file in sorted(os.listdir(path)):
    file_year = file[0:4] # the first four characters of the file name are the year
    category_files = adversity_files if file_year in adversity_years else prosperity_files
    category_files.append(os.path.join(path, file))

# 5.2 Vocabulary Creation

Now that we have lists containing the full paths of the files, we will iterate through them and 'read' each file with spaCy.

Spacy will read the file and annotate the text with part of speech tags (POS).

A complete list of POS tags can be found in the following link, https://universaldependencies.org/u/pos/

In this case, I do not want the model to be bound by context, so I will focus on four part-of-speech tags: adjective, adverb, noun and verb.

I also want to remove stopwords. Stopwords are words that do not add any predictive power to the text.

Lastly, I want to lemmatize each word. Lemmatization means to get the core meaning of each word, this reduces vocabulary size and provides more predictive power to the NLP model. For example "made" is a form from the verb "make", therefore with lemmatization "made" -> "make". Instead of using two words that have the same core meaning, it is more efficient to keep one reference with the same core meaning.

The code below will do the following:

1) Open the file

2) Let SpaCy read the file (also considered a document [doc])

3) Go through each word (token) in the doc and determine whether it passes the filters: it has one of the required part-of-speech tags, its lemma is not a stopword, and its lemma is not already in the vocabulary.

4) If the word passes all filters, it is added to the vocabulary

## 5.2.1 Lemmatization and Part-of-Speech Filters

In [6]:
# SpaCy requires the language pipeline to be loaded
nlp = en_core_web_md.load()

# Stopwords from spaCy, kept as a set: they are only used for membership tests
# ('lemma in stopwords'), which are constant-time on a set but a linear scan on a list.
stopwords = set(nlp.Defaults.stop_words)

# Only tokens carrying one of these part-of-speech tags are kept
pos_list = ['ADJ', 'ADV', 'VERB', 'NOUN']

# Create an empty instance of the vocabulary; it is filled by the vocabulary-creation loop
vocabulary = []

In [7]:
# Full paths of every file in the text folder, so that later cells can loop over all of
# the files regardless of their category.
# os.listdir returns the entries in arbitrary order; sorting keeps the processing order,
# and therefore the row order of the exported CSV files, identical across runs and machines.
file_path_list = [os.path.join(path, file_name) for file_name in sorted(os.listdir(path))]

In [8]:
# Loop through files and create vocabulary

# Lemmas already in the vocabulary, mirrored in a set so that the duplicate check is a
# constant-time lookup. The 'vocabulary' list keeps the order in which lemmas were first seen.
seen_lemmas = set(vocabulary)

for file in file_path_list:

    # Read the text; the 'with' block closes the file handle as soon as the text is read
    with open(file, mode='r', encoding='utf-8') as text_file:
        file_content = text_file.read()

    file_content_doc = nlp(file_content) # applying spacy to read the file

    # Go through each token (word) in the document, get its lemma and, if it passes all
    # filters, add it to the vocabulary
    for token in file_content_doc:
        lemma = token.lemma_

        if lemma in stopwords: # stopword: skip and proceed to the next word
            continue
        if token.pos_ not in pos_list: # not one of the four part-of-speech tags: skip
            continue
        if lemma in seen_lemmas: # already in the vocabulary: skip
            continue

        seen_lemmas.add(lemma)
        vocabulary.append(lemma)

In [9]:
# Once the vocabulary list is complete, put it in a single-column ('word') pandas dataframe
vocabulary_df = pd.DataFrame({'word':vocabulary})

# Export it to csv in the current working directory.
# The file is deliberately named 'vocabulary_v1.csv' to keep it apart from the revised
# vocabulary ('vocabulary.csv') that the next section loads.
vocabulary_df.to_csv('vocabulary_v1.csv')

## 5.2.2 Vocabulary Refinement

In [10]:
# Load the revised vocabulary from the cloned repository.
# The path is absolute: adjust it if the repository was cloned somewhere else.
revised_vocabulary = pd.read_csv('/content/central_bank_thesis/methodology_before_training/vocabulary.csv')

accepted_words = revised_vocabulary['word'].values.tolist()

excluded_words = []

# We have to loop through all files again to collect every lemma that passes the stopword
# and part-of-speech filters but is NOT in the accepted vocabulary. Listing the excluded
# words next to the accepted ones allows more transparency during the process.

# Every lemma that is already on one of the two lists, kept in a set so that each check is
# a constant-time lookup instead of a scan of both lists.
already_listed = set(accepted_words)

for file in file_path_list:

    # Read the text; the 'with' block closes the file handle as soon as the text is read
    with open(file, mode='r', encoding='utf-8') as text_file:
        file_content = text_file.read()

    file_content_doc = nlp(file_content) # applying spacy to read the file

    for token in file_content_doc:
        lemma = token.lemma_

        if lemma in stopwords: # stopword: skip and proceed to the next word
            continue
        if token.pos_ not in pos_list: # not one of the four part-of-speech tags: skip
            continue
        if lemma in already_listed: # accepted, or already recorded as excluded: skip
            continue

        already_listed.add(lemma)
        excluded_words.append(lemma)


In [11]:
# Exporting the excluded words for comparison with the accepted words.
# The csv has a single column ('excluded_words') and is written to the current working directory.

excluded_words_df = pd.DataFrame({'excluded_words':excluded_words})
excluded_words_df.to_csv('excluded_words.csv')

# 5.3 Data Transformation

## 5.3.1 Text Transformation

Text transformation needs to be carried out twice: once for the adversity files and once for the prosperity files.

 It comprises five parts:

- 1) Sentence Filter

- 2) Dataframe Creator

- 3) Dataframe Cleaning

- 4) Text Block Creation

- 5) Labeled Dataset Creation

### 5.3.1.1 Sentence Filter

Sentence Filter extracts approved words from each sentence and returns a filtered sentence. Every word in the document will go through if statements that will check if the word's lemma is in the vocabulary and if the word has the desired part-of-speech tag. Then, the function will return the sentence with the approved words.

This function will only be called inside the next function (5.3.1.2)

In [12]:
def filter_fed_sentence (vocabulary: list, pos_list: list, sentence):
    """
    Reduce one sentence to the lemmas of its approved words.

    A token is approved when its lemma is in 'vocabulary' and its part-of-speech tag is in
    'pos_list'. 'sentence' is any iterable of tokens exposing the 'lemma_' and 'pos_'
    attributes (create_dataframe passes the sentences produced by SpaCy).

    Returns the approved lemmas, in sentence order, as one string separated by spaces.

    If no token is approved, the marker string '---filler---' is returned instead. This key
    word is used later to filter out the sentences that are no longer useful.
    """
    approved_lemmas = [
        word.lemma_
        for word in sentence
        if word.lemma_ in vocabulary and word.pos_ in pos_list
    ]

    # Nothing survived the filters: hand back the marker instead of an empty string
    if not approved_lemmas:
        return "---filler---"

    return " ".join(approved_lemmas)

### 5.3.1.2 Dataframe Creator

The Dataframe Creator builds a pandas dataframe that holds each original sentence in one column and the corresponding extracted (filtered) sentence in another column. This means that the original sentence will be on the same row as the extracted sentence. This process allows me to examine the results of my approved vocabulary, giving me the flexibility to adjust the vocabulary if I spot a word I do not consider relevant to the machine learning model.

In [13]:
def create_dataframe (filepath: list, vocabulary: list, pos_list: list, isAdversity: bool):
    """
    Build a dataframe that pairs every original sentence with its filtered version.

    Parameters:

    filepath: list with the full paths of the text files to process

    vocabulary: list of approved lemmas, forwarded to 'filter_fed_sentence'

    pos_list: list of approved part-of-speech tags, forwarded to 'filter_fed_sentence'

    isAdversity: True if the files are from the adversity years, False if they are from
    the prosperity years. It only selects the name of the exported csv file.

    The function does the following steps:

    1) Loop through each file in the file list

        1.1) Open the file, read its text and close the file again

        1.2) Process its contents with SpaCy (the module-level 'nlp' pipeline)

            1.2.1) Loop through each sentence (as separated by SpaCy)

            1.2.2) Store the original sentence in the 'original_sentence' column

            1.2.3) Store the result of 'filter_fed_sentence' in the 'filtered_sentence'
            column, on the same row

    2) Convert the collected columns into a pandas dataframe

    3) Export the dataframe as 'adversity_sentences.csv' or 'prosperity_sentences.csv'
    (in the current working directory), depending on 'isAdversity'

    4) Return the generated pandas dataframe

    Note that the first row of the dataframe is always a '---filler---' placeholder in
    both columns.
    """

    # The columns are collected in a dictionary and converted to a dataframe at the end.
    # Both columns start with the '---filler---' placeholder row.
    dataset_dict = {
    'original_sentence':['---filler---'],
    'filtered_sentence':['---filler---']
    }

    for file in filepath:
        # The 'with' block closes the file handle as soon as the text has been read,
        # instead of leaving one open handle per processed file.
        with open(file, mode='r', encoding='utf-8') as text_file:
            file_content = text_file.read()

        file_content_doc = nlp(file_content)

        for sentence in file_content_doc.sents:
            dataset_dict['original_sentence'].append(sentence.text)
            dataset_dict['filtered_sentence'].append(
                filter_fed_sentence(vocabulary=vocabulary, pos_list=pos_list, sentence=sentence)
            )

    dataset_df = pd.DataFrame(dataset_dict)

    # Export the sentence pairs so that the result of the vocabulary can be examined
    if isAdversity:
        dataset_df.to_csv('adversity_sentences.csv')
    else:
        dataset_df.to_csv('prosperity_sentences.csv')

    return dataset_df

### 5.3.1.3 Dataframe Cleaning

The Dataframe Cleaning cleans the dataframe to only keep the filtered sentences. SpaCy processes the text from its pre-trained model and both the FOMC meeting and press conference transcripts do not have an ordinary text structure. This means there will be a number of sentences with non-relevant words and they need to be taken out of our dataset.

In [14]:
def clean_dataframe (df) -> list:
    """
    Turn the sentence dataframe into a plain list of usable filtered sentences.

    This function is to be used only after the contents of the sentences are considered
    satisfactory.

    It takes a pandas dataframe (df variable) with the columns 'original_sentence' and
    'filtered_sentence', discards the original sentences and every row whose filtered
    sentence is the '---filler---' marker, and returns the remaining filtered sentences
    as a list. The input dataframe itself is not modified.
    """
    filtered_only = df.drop(columns=['original_sentence'])

    # Rows carrying the marker had no approved words and are left out
    has_approved_words = filtered_only['filtered_sentence'].ne('---filler---')

    return filtered_only.loc[has_approved_words, 'filtered_sentence'].values.tolist()

### 5.3.1.4 Text Block Creation

Text Block Creation aims to avoid a problem that is specific to the Federal Reserve's case. Regardless of whether the United States economy is in a period of prosperity or adversity, words such as inflation and interest rates will be mentioned. The word inflation by itself does not provide information about the Federal Reserve's future actions or perspectives regarding inflation. Terms such as "high inflation" or "rising inflation" are more informative than the word inflation. Given the size of the data collected, it is not feasible to manually revise every filtered sentence. With some data exploration, I found out that the longest filtered sentence consisted of 72 words. I considered a text length of 72 words to be long enough to provide the machine learning model with predictive power and allow the model to analyze detailed FOMC announcements.

In [15]:
def group_text_into_blocks (sentence_list: list, max_block_size: int = 72) -> list:
    """
    Group consecutive filtered sentences into text blocks of at most 'max_block_size' words.

    Parameters:

    sentence_list: list of filtered sentences (strings of words separated by whitespace)

    max_block_size: maximum number of words per block. The default is 72, the length of
    the longest filtered sentence found during data exploration.

    Returns a list of strings, one per block, with the words separated by a single space.

    The following steps are taken:

    1) Loop through each sentence and split it into a list of words

    2) If the words still fit into the current block (block size + sentence size
    <= max_block_size), add them to the current block

    3) Otherwise, store the current block in the dataset and start a new block with the
    current sentence. A sentence is never split across two blocks, so a single sentence
    that is longer than max_block_size becomes a block of its own.

    4) After the last sentence, store the block that is still being filled. Without this
    step the sentences of the last (partially filled) block would be silently lost, so
    the last block of the output can be shorter than the others.

    Empty blocks are never stored.
    """

    dataset_ = []
    block_ = []

    for sentence in sentence_list:
        words = sentence.split()

        # The sentence does not fit anymore: close the current block first.
        # The 'block_' check avoids storing an empty block when the very first sentence
        # of a block is already longer than the maximum size.
        if block_ and len(block_) + len(words) > max_block_size:
            dataset_.append(" ".join(block_))
            block_ = []

        block_.extend(words)

    # Store the trailing block, which the loop above never closes
    if block_:
        dataset_.append(" ".join(block_))

    return dataset_

### 5.3.1.5 Labeled Dataset Creation

Labeled Dataset Creation merges the two types of text (adversity and prosperity) into one file. The labeled dataset consists of the previously built block of texts with a label according to their period. 

In [16]:
def create_labeled_dataset(text_list: list, isAdversity: bool):
    """
    Attach the class label to the text blocks of one category.

    text_list holds the texts that are already made into blocks. The function returns a
    pandas dataframe with the columns 'text' and 'label', where every row gets the label
    0 if isAdversity is true and the label 1 otherwise.
    """
    label_value = 0 if isAdversity else 1

    return pd.DataFrame({
        'text':text_list,
        'label':[label_value] * len(text_list)
    })

### 5.3.1.6 Execution

In [17]:
# Loading revised vocabulary.
# This rebinds 'vocabulary' (until now the list built in section 5.2.1) to the words of the
# revised vocabulary csv, which is what the transformation functions below receive.
vocabulary = revised_vocabulary['word'].values.tolist()

In [18]:
# Execution of all functions on the two groups of files: adversity and prosperity files.
# The same four steps run for each group:
#   1) create_dataframe:       original sentence / filtered sentence pairs (also exported as csv)
#   2) clean_dataframe:        list of the filtered sentences without the '---filler---' rows
#   3) group_text_into_blocks: filtered sentences grouped into text blocks
#   4) create_labeled_dataset: text blocks with their label (0 = adversity, 1 = prosperity)
# The labeled blocks are then exported as '<group>_dataset_unbalanced.csv'. to_csv is called
# with its defaults, so the dataframe index is written as an extra first column.

# Adversity files
adversity_df = create_dataframe(filepath=adversity_files, vocabulary=vocabulary, pos_list=pos_list, isAdversity=True)
clean_adversity_df_as_list = clean_dataframe(adversity_df)
adversity_text_blocks = group_text_into_blocks(clean_adversity_df_as_list)
adversity_dataset = create_labeled_dataset(text_list=adversity_text_blocks, isAdversity=True)
adversity_dataset.to_csv('adversity_dataset_unbalanced.csv')

# Prosperity files
prosperity_df = create_dataframe(filepath=prosperity_files, vocabulary=vocabulary, pos_list=pos_list, isAdversity=False)
clean_prosperity_df_as_list = clean_dataframe(prosperity_df)
prosperity_text_blocks = group_text_into_blocks(clean_prosperity_df_as_list)
prosperity_dataset = create_labeled_dataset(text_list=prosperity_text_blocks, isAdversity=False)
prosperity_dataset.to_csv('prosperity_dataset_unbalanced.csv')

## 5.3.2 Data Balancing and Shuffling

In [19]:
def balance_labeled_datasets(prosperity_dataset, adversity_dataset, random_state=45):
    """
    Function that compares the size of both datasets and downsamples the larger one.

    Parameters:

    prosperity_dataset: pandas dataframe with the prosperity data points

    adversity_dataset: pandas dataframe with the adversity data points

    random_state: seed forwarded to DataFrame.sample so that the same rows are kept on
    every run (the default matches the seed used for shuffling). Pass None for a
    different random sample on every call.

    Returns a tuple of two dataframes with the same number of rows:

    - If one dataset has more data points than the other: (downsampled larger dataset,
      untouched smaller dataset)

    - If both datasets already have the same size: (prosperity_dataset, adversity_dataset),
      both untouched. A tuple is returned in this case as well, so that callers can always
      unpack the result into two variables.
    """
    prosperity_size = len(prosperity_dataset)
    adversity_size = len(adversity_dataset)

    if prosperity_size == adversity_size:
        print("Both datasets are equal in size.")
        return prosperity_dataset, adversity_dataset

    if prosperity_size > adversity_size:
        print("Prosperity dataset larger than adversity dataset, it will be downsampled.")
        downsampled_labeled_df = prosperity_dataset.sample(adversity_size, random_state=random_state)
        return downsampled_labeled_df, adversity_dataset

    print("adversity dataset larger than the prosperity dataset, it will be downsampled.")
    downsampled_labeled_df = adversity_dataset.sample(prosperity_size, random_state=random_state)
    return downsampled_labeled_df, prosperity_dataset


def join_and_shuffle_datasets(prosperity_dataset, adversity_dataset):
    """
    Function joins the downsampled dataset with the other one and forms the final dataset.

    Both inputs are pandas dataframes with the same columns. They are concatenated and the
    rows are shuffled with a fixed seed (random_state=45), so the order is the same on
    every run. Which of the two datasets is passed first does not matter for the content
    of the result.

    Returns the shuffled dataframe with a fresh 0..n-1 index. The 'Unnamed: 0' column,
    which appears when the datasets were reloaded from a csv that stored its index, is
    removed if it is present; dataframes that never went through a csv are accepted as
    well.
    """
    joint_dataset = pd.concat([prosperity_dataset, adversity_dataset])

    # drop=True discards the old index instead of turning it into an 'index' column
    shuffled_dataset = joint_dataset.sample(frac=1, random_state=45).reset_index(drop=True)

    # errors='ignore': the column only exists for datasets that were reloaded from csv
    return shuffled_dataset.drop(columns=['Unnamed: 0'], errors='ignore')


def check_dataset_balance(balanced_dataset):
    """
    Verify that both labels are equally represented and export the dataset.

    The rows of the balanced dataset are split by their 'label' value (1 and 0) and the
    two row counts are compared. If the counts differ, an AssertionError is raised and
    nothing is exported. If they match, the dataset is written to 'balanced_dataset.csv'
    in the current working directory.

    The function returns nothing.
    """
    labels = balanced_dataset['label']

    positive_count = len(balanced_dataset[labels == 1])
    negative_count = len(balanced_dataset[labels == 0])

    assert positive_count == negative_count

    balanced_dataset.to_csv('balanced_dataset.csv')

    return

In [20]:
# Reload the labeled datasets exported in section 5.3.1.6.
# Do not forget to check that the file names match the names used when exporting.
# Note that this rebinds 'prosperity_df' and 'adversity_df', which previously held the
# sentence-level dataframes returned by create_dataframe.
# index_col=False tells pandas not to use the first column as the index, so the index
# that was stored in the csv is loaded as an ordinary column.
prosperity_df = pd.read_csv('prosperity_dataset_unbalanced.csv', index_col=False)
adversity_df = pd.read_csv('adversity_dataset_unbalanced.csv', index_col=False)

In [21]:
# 1) Downsample the larger dataset so that both labels have the same number of rows
downsampled_dataset, normal_sample_dataset = balance_labeled_datasets(prosperity_dataset=prosperity_df, adversity_dataset=adversity_df)
# 2) Join and shuffle both datasets. The two dataframes are passed by position; the function
#    simply concatenates them, so it does not matter which one is the prosperity dataset.
machine_learning_dataset = join_and_shuffle_datasets(downsampled_dataset, normal_sample_dataset)
# 3) Check that both labels are equally represented and export 'balanced_dataset.csv'
check_dataset_balance(machine_learning_dataset)

Prosperity dataset larger than adversity dataset, it will be downsampled.
