In [2]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import re
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [4]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [5]:
root_folder = '/content/drive/My Drive/WordGeneration'

In [6]:
data = pd.read_csv(root_folder+'/FOMC2021.txt', sep="\n")

In [7]:
data.rename(columns={"Action to Adopt Changes to the Committee's Rules Regarding Availability of Information": "text"},
          inplace=True)
data["text"] = data["text"].str.replace("United States", "US")
data["text"] = data["text"].str.replace("U.S.", "US")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 486 entries, 0 to 485
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    486 non-null    object
dtypes: object(1)
memory usage: 3.9+ KB


  after removing the cwd from sys.path.


In [8]:
len_text = 3
len_result = 1

In [9]:
func_folder = '/content/drive/My Drive/Colab Notebooks'

In [10]:
import sys
sys.path.append(func_folder)

In [11]:
import Contractions
from Contractions import *

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
def clean(text):
    text = re.sub('[0-9]+.\t', '', str(text)) # removing paragraph numbers
    text = re.sub('U.S.', 'USA', str(text))
    text = ' '.join([contraction_mapping[t] if t in contraction_mapping else t for t in text.split(' ')])
    text = re.sub('\n ', '', str(text))
    text = re.sub('\n', ' ', str(text))
    text = re.sub("'s", '', str(text))
    text = re.sub("-", ' ', str(text))
    text = re.sub("— ", '', str(text))
    text = re.sub('\"', '', str(text))
    text = re.sub("Mr\.", 'Mr', str(text))
    text = re.sub("Mrs\.", 'Mrs', str(text))
    text = re.sub("[\(\[].*?[\)\]]", "", str(text))

    return text

In [13]:
data['text_clean'] = data['text'].apply(clean)

In [14]:
def sequence_generator(texts,
                      training_length, 
                      result_length, 
                      max_train=100000,
                      start_end_tokens=False,
                      lower=True):

    tokenizer = Tokenizer(lower=lower)
    tokenizer.fit_on_texts(texts)

    word_idx = tokenizer.word_index
    idx_word = tokenizer.index_word
    num_words = len(word_idx) + 1
    word_counts = tokenizer.word_counts

    print(f'There are {num_words} unique words.')

    # import pickle
    # with open('tokenizer.pickle', 'wb') as handle:
    #     pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

    sequences = tokenizer.texts_to_sequences(texts)

    # Start-End tokens
    # x = word_idx["start_token"]
    # y = word_idx["end_token"]

    # Limit to sequences with more than training length tokens
    seq_lengths = [len(x) for x in sequences]
    over_idx = [
        i for i, l in enumerate(seq_lengths) if l > (training_length + result_length + 3)]

    new_texts = []
    new_sequences = []

    # Only keep sequences with more than training length tokens
    for i in over_idx:
        new_texts.append(texts[i])
        new_sequences.append(sequences[i])

    training_seq = []
    labels = []
    training_seq_words = []
    labels_words = []

    for seq in new_sequences:

        if len(training_seq) < max_train:
            for i in range(training_length, len(seq) - result_length):
                # Extract the features and label
                extract = seq[i - training_length:i + result_length]
                training_seq.append(extract[:training_length])
                if start_end_tokens:
                    label_adj = [x] + extract[training_length:] + [y]
                else: label_adj = extract[training_length:]
                labels.append(label_adj)

                training_seq_words.append([idx_word[j] for j in extract[:training_length]])
                labels_words.append([idx_word[j] for j in extract[training_length:]])

    print(f'There are {len(training_seq)} training sequences.')

    return word_idx, idx_word, num_words, word_counts, new_texts, new_sequences, training_seq, labels, \
           training_seq_words, labels_words

In [15]:
word_idx, idx_word, num_words, word_counts, new_texts, sequences, features, labels, training_seq_words, labels_words = \
    sequence_generator(
    data['text_clean'].tolist(), training_length = len_text, result_length = len_result, lower=True)

There are 2954 unique words.
There are 55802 training sequences.


In [32]:
df = pd.DataFrame({'features': training_seq_words, 'labels': labels_words})

In [33]:
df['labels'] = df['labels'].map(lambda x: x[0])

In [34]:
df

Unnamed: 0,features,labels
0,"[by, unanimous, vote]",the
1,"[unanimous, vote, the]",committee
2,"[vote, the, committee]",approved
3,"[the, committee, approved]",a
4,"[committee, approved, a]",final
...,...,...
55797,"[pressures, and, inflation]",expectations
55798,"[and, inflation, expectations]",and
55799,"[inflation, expectations, and]",financial
55800,"[expectations, and, financial]",and


In [35]:
markov_matrix = pd.concat([df, pd.get_dummies(df['labels'])], axis=1)

In [36]:
markov_matrix = markov_matrix.drop(['labels'], axis=1)

In [37]:
markov_matrix['features'] = markov_matrix['features'].apply(" ".join)

In [39]:
transition_matrix = markov_matrix.groupby(markov_matrix.columns.tolist(),as_index=False).sum()

  return self._wrap_agged_manager(new_mgr)
