# Markov Word Generator from Scratch

In [1]:
# Colab-only setup: mount Google Drive at /content/drive so later cells can read
# files stored there.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
import re
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [3]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [4]:
# Drive folder that contains the input corpus (FOMC2021.txt is read from here).
root_folder = '/content/drive/My Drive/WordGeneration'

In [5]:
# Load the corpus as one row per non-blank line. The first line becomes the column
# name (the same layout an implicit CSV header row produces), so the rename cell that
# follows keeps working unchanged.
# A newline is not used as a `read_csv` separator because recent pandas versions
# reject it.
with open(root_folder + '/FOMC2021.txt', encoding='utf-8') as corpus_file:
    corpus_lines = [line.rstrip('\n') for line in corpus_file if line.strip()]
if not corpus_lines:
    raise ValueError('FOMC2021.txt contains no text lines')
data = pd.DataFrame({corpus_lines[0]: corpus_lines[1:]})

In [6]:
# Name the single text column "text" regardless of which header line the file starts
# with, then normalise the country spellings.
# `regex=False` makes both replacements literal on every pandas version; read as a
# regex, the dots in "U.S." would match any character.
data = data.rename(columns={data.columns[0]: "text"})
data["text"] = (
    data["text"]
    .str.replace("United States", "US", regex=False)
    .str.replace("U.S.", "US", regex=False)
)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 486 entries, 0 to 485
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    486 non-null    object
dtypes: object(1)
memory usage: 3.9+ KB


  after removing the cwd from sys.path.


In [7]:
# N-gram sizes: these are passed to `sequence_generator` as `training_length`
# (number of context words) and `result_length` (number of words to predict).
len_text = 3
len_result = 1

In [8]:
# Drive folder that is appended to sys.path so the local `Contractions` module can be imported.
func_folder = '/content/drive/My Drive/Colab Notebooks'

In [9]:
import sys
# Make modules stored in `func_folder` importable from this notebook.
sys.path.append(func_folder)

In [10]:
import Contractions
from Contractions import *

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [11]:
def clean(text):
    """Normalise one raw line of text before tokenisation.

    The steps run in this order:

    1. drop leading paragraph numbers (digits, one character, a tab);
    2. rewrite the literal abbreviation "U.S." as "USA";
    3. expand space-separated tokens found in ``contraction_mapping``;
    4. flatten newlines, drop possessive ``'s``, turn hyphens into spaces,
       remove "— " and double quotes, and strip the dot from "Mr." / "Mrs.";
    5. remove bracketed or parenthesised asides.

    Args:
        text: The raw text; any non-string value is converted with ``str()``.

    Returns:
        The cleaned string.
    """
    text = str(text)

    # Paragraph numbers such as "12.<TAB>". The "." is left as a wildcard so variants
    # like "12)<TAB>" are still removed.
    text = re.sub(r'[0-9]+.\t', '', text)
    # Escaped dots: only the literal abbreviation is rewritten. Unescaped, the pattern
    # also matched unrelated words such as "USSR".
    text = re.sub(r'U\.S\.', 'USA', text)
    text = ' '.join([contraction_mapping[t] if t in contraction_mapping else t
                     for t in text.split(' ')])

    # Plain-text substitutions; none of these needs regex semantics.
    text = text.replace('\n ', '')
    text = text.replace('\n', ' ')
    text = text.replace("'s", '')
    text = text.replace("-", ' ')
    text = text.replace("— ", '')
    text = text.replace('"', '')
    text = text.replace("Mr.", 'Mr')
    text = text.replace("Mrs.", 'Mrs')

    # Non-greedy, so each (...) or [...] aside is removed on its own.
    text = re.sub(r"[\(\[].*?[\)\]]", "", text)

    return text

In [12]:
# Build the cleaned column by running `clean` over every raw line.
data['text_clean'] = data['text'].map(clean)

In [13]:
def sequence_generator(texts,
                      training_length,
                      result_length,
                      max_train=100000,
                      start_end_tokens=False,
                      lower=True):
    """Turn texts into sliding-window (context, next words) training examples.

    A ``Tokenizer`` is fitted on ``texts``; every text whose token count exceeds
    ``training_length + result_length + 3`` is then scanned with a window of
    ``training_length`` context tokens followed by ``result_length`` label tokens.
    Two progress lines are printed (vocabulary size, number of sequences).

    Args:
        texts: Indexable collection of strings (for example a list).
        training_length: Number of context tokens per example.
        result_length: Number of label tokens per example.
        max_train: Once at least this many examples exist, no further text is
            scanned. The text being scanned when the limit is crossed is finished,
            so the result can exceed ``max_train``.
        start_end_tokens: When True, every label is wrapped in the indices of the
            vocabulary entries "start_token" and "end_token".
        lower: Passed to ``Tokenizer(lower=...)``.

    Returns:
        A 10-tuple ``(word_idx, idx_word, num_words, word_counts, new_texts,
        new_sequences, training_seq, labels, training_seq_words, labels_words)``.
        ``new_texts`` / ``new_sequences`` are the texts that were long enough and
        their index sequences. ``training_seq`` / ``labels`` are index lists, and
        ``training_seq_words`` / ``labels_words`` are the same examples as words
        (without start/end markers).

    Raises:
        ValueError: If ``start_end_tokens`` is True but "start_token" or
            "end_token" is missing from the fitted vocabulary.
    """
    tokenizer = Tokenizer(lower=lower)
    tokenizer.fit_on_texts(texts)

    word_idx = tokenizer.word_index
    idx_word = tokenizer.index_word
    num_words = len(word_idx) + 1
    word_counts = tokenizer.word_counts

    print(f'There are {num_words} unique words.')

    sequences = tokenizer.texts_to_sequences(texts)

    # Resolve the marker indices up front so a missing marker fails with a clear
    # message before any sequence is built.
    # NOTE(review): the tokenizer's character filters may split "start_token" on the
    # underscore; confirm the markers survive tokenisation.
    start_idx = end_idx = None
    if start_end_tokens:
        missing = [tok for tok in ("start_token", "end_token") if tok not in word_idx]
        if missing:
            raise ValueError(
                f"start_end_tokens=True requires {missing} in the tokenizer vocabulary")
        start_idx = word_idx["start_token"]
        end_idx = word_idx["end_token"]

    # Only keep texts with more tokens than one full window plus a margin of 3.
    # NOTE(review): the reason for the margin of 3 is not visible here; confirm.
    min_tokens = training_length + result_length + 3
    new_texts = []
    new_sequences = []
    for position, seq in enumerate(sequences):
        if len(seq) > min_tokens:
            new_texts.append(texts[position])
            new_sequences.append(seq)

    training_seq = []
    labels = []
    training_seq_words = []
    labels_words = []

    for seq in new_sequences:
        # The limit is only checked between texts; nothing more can be added once
        # it is reached, so stop scanning.
        if len(training_seq) >= max_train:
            break

        # NOTE(review): the window that ends on the very last token is not emitted
        # (the range stops one position early); confirm that is intended.
        for i in range(training_length, len(seq) - result_length):
            context = seq[i - training_length:i]
            target = seq[i:i + result_length]

            training_seq.append(context)
            if start_end_tokens:
                labels.append([start_idx] + target + [end_idx])
            else:
                labels.append(target)

            training_seq_words.append([idx_word[j] for j in context])
            labels_words.append([idx_word[j] for j in target])

    print(f'There are {len(training_seq)} training sequences.')

    return word_idx, idx_word, num_words, word_counts, new_texts, new_sequences, training_seq, labels, \
           training_seq_words, labels_words

In [14]:
# Build trigram -> next-word training examples from the cleaned corpus.
(word_idx, idx_word, num_words, word_counts, new_texts, sequences,
 features, labels, training_seq_words, labels_words) = sequence_generator(
    data['text_clean'].tolist(),
    training_length=len_text,
    result_length=len_result,
    lower=True,
)

There are 2954 unique words.
There are 55802 training sequences.


In [15]:
# One row per training example: the context words and the word(s) that follow them.
df = pd.DataFrame(dict(features=training_seq_words, labels=labels_words))

In [16]:
# Keep only the first predicted word of every example. The column is rebuilt from
# `labels_words` instead of from itself, so re-running this cell is harmless
# (mapping `x[0]` over the already-flattened column would cut each word down to its
# first letter).
df['labels'] = [words[0] for words in labels_words]

In [17]:
# Preview the (context words, next word) training pairs.
df

Unnamed: 0,features,labels
0,"[by, unanimous, vote]",the
1,"[unanimous, vote, the]",committee
2,"[vote, the, committee]",approved
3,"[the, committee, approved]",a
4,"[committee, approved, a]",final
...,...,...
55797,"[pressures, and, inflation]",expectations
55798,"[and, inflation, expectations]",and
55799,"[inflation, expectations, and]",financial
55800,"[expectations, and, financial]",and


In [21]:
# One-hot encode the next word: one indicator column per distinct label, placed to the
# right of the existing `features` and `labels` columns.
# NOTE(review): a corpus word spelled "features" or "labels" would produce a duplicate
# column name here - confirm this cannot happen for the corpus in use.
markov_matrix = pd.concat([df, pd.get_dummies(df['labels'])], axis=1)

In [22]:
# The label is now encoded in the indicator columns, so the original column can go.
# errors="ignore" keeps the cell re-runnable after the column is already gone.
markov_matrix = markov_matrix.drop(columns=['labels'], errors='ignore')

In [26]:
# Turn each context word list into a single space-separated phrase. The phrase is
# derived from `df['features']` (which stays a list column), so re-running the cell
# does not join the characters of an already-joined string.
markov_matrix['features'] = df['features'].map(" ".join)

In [48]:
# Sum the one-hot indicators per context phrase: every row holds, for one phrase, how
# often each word followed it. With as_index=False the grouping column `features` is
# already returned as the first column, so it is not selected a second time.
transition_matrix = markov_matrix.groupby('features', as_index=False).sum()

In [53]:
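# Row normalisation is left disabled: `return_next_word` converts a row of raw counts
# into probabilities at sampling time. As written, the line below would also include
# the text column `features` in the division.
# transition_matrix = transition_matrix.div(transition_matrix.sum(axis=1), axis=0)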

In [49]:
# phrase_dict: row index label -> context phrase.
# word_dict: column position -> column name. `return_next_word` skips position 0,
# which it treats as the `features` column, so word columns start at 1.
phrase_dict = transition_matrix['features'].to_dict()
word_dict = dict(enumerate(transition_matrix.columns.tolist()))

In [54]:
# Inverse lookup: context phrase -> row index of that phrase in `transition_matrix`.
phrase_dict_reversed = dict(zip(phrase_dict.values(), phrase_dict.keys()))

In [33]:
from scipy import sparse

In [None]:
# Sparse (CSR) copy of the count columns, one row per context phrase. It gets its own
# name because `return_next_word` indexes the DataFrame `transition_matrix` with
# `.iloc`, and because the text column `features` (column 0) cannot be part of a
# numeric sparse matrix.
transition_counts_sparse = sparse.csr_matrix(
    transition_matrix.iloc[:, 1:].to_numpy(dtype='float64')
)

In [37]:
def add_weights_temperature(input_weights, temperature):
    """Re-weight a distribution with a softmax temperature.

    Every positive weight ``w`` is replaced by ``w ** (1 / temperature)`` and the
    result is normalised to sum to 1. Temperatures below 1 sharpen the
    distribution and temperatures above 1 flatten it.

    Entries that are zero (or negative) stay at exactly zero, so a transition
    that was never observed cannot be sampled.

    Args:
        input_weights: 1-D array-like of non-negative weights or probabilities.
        temperature: Positive scaling factor.

    Returns:
        A float64 numpy array of the same length that sums to 1.

    Raises:
        ValueError: If ``temperature`` is not positive or no weight is positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    weights = np.asarray(input_weights, dtype='float64')
    positive = weights > 0
    if not positive.any():
        raise ValueError("input_weights must contain at least one positive value")

    log_weights = np.log(weights[positive]) / temperature
    # Shift by the maximum before exponentiating: the normalised result is unchanged
    # and large counts combined with a small temperature cannot overflow.
    log_weights -= log_weights.max()

    scaled = np.zeros_like(weights)
    scaled[positive] = np.exp(log_weights)
    return scaled / scaled.sum()

In [138]:
def return_next_word(prefix, temperature=1):
    """Sample the word that follows ``prefix`` from the transition counts.

    Args:
        prefix: Space-separated context phrase; it must be a key of
            ``phrase_dict_reversed``.
        temperature: Sampling temperature. 1 samples from the observed
            frequencies; any other value re-weights them with
            ``add_weights_temperature`` first.

    Returns:
        The sampled next word (a column name of ``transition_matrix``).

    Raises:
        KeyError: If ``prefix`` is not a known context phrase.
    """
    prefix_ind = phrase_dict_reversed[prefix]
    # Column 0 holds the phrase text; the remaining columns are next-word counts.
    weights = transition_matrix.iloc[prefix_ind].values[1:].astype('float64')
    prob = weights / weights.sum()

    if temperature != 1:
        tempered = np.asarray(add_weights_temperature(prob, temperature), dtype='float64')
        # Words never observed after this prefix must stay impossible, whatever the
        # re-weighting returned for them.
        tempered = np.where(weights > 0, tempered, 0.0)
        prob = tempered / tempered.sum()

    # +1 maps the position within `weights` back to the column position, because
    # column 0 was skipped above.
    token_ind = np.random.choice(len(prob), p=prob) + 1
    return word_dict[token_ind]

In [147]:
def generate_words(seed, length, temperature=1):
    """Generate text by repeatedly sampling the next word of a Markov chain.

    The context size equals the number of words in ``seed``, so the seed must
    have as many words as the phrases the model was built with.

    Args:
        seed: Space-separated starting phrase.
        length: Maximum number of words to append to the seed.
        temperature: Forwarded to ``return_next_word``.

    Returns:
        The seed followed by up to ``length`` generated words, joined by spaces.
        Generation stops early when the current context has no known successor.

    Raises:
        KeyError: If ``seed`` itself is not a known context phrase.
    """
    sentence = seed.split()
    context_size = len(sentence)
    prefix = seed

    for step in range(length):
        try:
            next_word = return_next_word(prefix, temperature=temperature)
        except KeyError:
            if step == 0:
                # An unknown seed is a caller error; report it.
                raise
            # Dead end: this context only occurs at the very end of a text.
            break
        sentence.append(next_word)
        prefix = " ".join(sentence[-context_size:])

    return " ".join(sentence)

In [149]:
# Generate 100 words from a three-word seed. No random seed is set in the visible
# cells, so the generated text differs from run to run.
generate_words("accumulated by households", 100)

"accumulated by households since the beginning of the period to negotiations on the debt limit by 480 billion market participants' estimates of the new date when the treasury would exhaust its extraordinary measures and cash balance were wide ranging but some estimates suggested the date might be as early as mid december most market participants anticipated adjustments to the pace of purchases if warranted by changes in the composition of the federal reserve bank of new york until instructed otherwise to execute transactions in the soma in accordance with the committee assessments of maximum employment and inflation and posed considerable risks to the"

In [150]:
# Second sample, started from a phrase that occurs in the previous output.
generate_words("early as mid", 100)

'early as mid december most market participants anticipated that the economy were to evolve broadly as they anticipated they judged that the release of pent up demand could boost consumption growth further as social distancing restrictions were imposed to rein in a new wave of covid 19 in the us amid this progress and strong policy support indicators of economic activity and employment had continued to surge and expected that it would likely be appropriate in each subsequent month some participants preferred a somewhat faster pace of reductions that would result in reducing the monthly pace of the recovery indicators of economic activity'

# Extra: unexecuted reference snippets (class-based sparse variant and scratch experiments)

In [None]:
def create_transition_matrix(self):
    """Count n-gram -> next-token transitions into a sparse COO matrix.

    Reads ``self.tokens`` (token list), ``self.n`` (n-gram size),
    ``self.ngram2ind`` (space-joined n-gram -> row index) and
    ``self.token2ind`` (token -> column index).

    Returns:
        A ``scipy.sparse.coo_matrix`` of shape
        ``(len(self.ngram2ind), len(self.token2ind))`` with one entry of value 1
        per observed transition. Repeated transitions are stored as duplicate
        entries, which scipy sums when the matrix is converted.
    """
    n = self.n
    # Same number of windows as len(self.tokens[:-n]), without copying the list.
    n_windows = max(len(self.tokens) - n, 0) if n > 0 else 0

    row_ind = [self.ngram2ind[' '.join(self.tokens[i:i + n])] for i in range(n_windows)]
    col_ind = [self.token2ind[self.tokens[i + n]] for i in range(n_windows)]
    values = [1] * n_windows

    # `sparse` is the name this notebook imports (`from scipy import sparse`).
    return sparse.coo_matrix(
        (values, (row_ind, col_ind)),
        shape=(len(self.ngram2ind), len(self.token2ind)),
    )

In [None]:
def return_next_word(self, prefix, temperature=1):
    """Sample the token that follows ``prefix`` using the object's probability matrix.

    The prefix is first passed through ``self.check_prefix``; its row of
    ``self.transition_matrix_prob`` is used as the sampling distribution, after
    ``self.add_weights_temperature`` when ``temperature`` differs from 1.

    NOTE(review): at module level this definition has the same name as the earlier
    ``return_next_word(prefix, temperature=1)`` and replaces it once this cell runs.
    """
    checked_prefix = self.check_prefix(prefix)
    row = self.transition_matrix_prob[self.ngram2ind[checked_prefix]]
    probabilities = row.toarray()[0]

    if temperature != 1:
        probabilities = self.add_weights_temperature(probabilities, temperature)

    chosen = np.random.choice(range(len(probabilities)), p=probabilities)
    return self.ind2token[chosen]

In [None]:
# Push a 3-state distribution through the transition matrix `P` 50 times and plot how
# the distribution evolves.
# NOTE(review): `P` is not defined in any visible cell. The dot product with a 1x3
# state implies a 3x3 matrix, presumably row-stochastic - confirm before running.
state=np.array([[1.0, 0.0, 0.0]])
stateHist=state
dfStateHist=pd.DataFrame(state)
distr_hist = [[0,0,0]]
# Collect the states in a list and stack once, instead of re-allocating the history
# array on every step.
state_steps = [state]
for x in range(50):
    state=np.dot(state,P)
    print(state)
    state_steps.append(state)
stateHist=np.concatenate(state_steps, axis=0)
dfDistrHist = pd.DataFrame(stateHist)
# Plot the full history once; plotting inside the loop opened one figure per step.
dfDistrHist.plot()
plt.show()

In [None]:
import random
def get_next_term(t_s):
    """Draw one label from the index of ``t_s``, weighted by the values of ``t_s``."""
    (picked,) = random.choices(t_s.index, weights=t_s, k=1)
    return picked

def make_chain(t_m, start_term, n):
    """Build a chain of terms that starts at ``start_term``.

    Each following term is drawn with ``get_next_term`` from ``t_m[<current term>]``.
    The chain has ``n`` terms, and always contains at least ``start_term``.
    """
    chain = [start_term]
    current = start_term
    for _ in range(n - 1):
        current = get_next_term(t_m[current])
        chain.append(current)
    return chain