This notebook demonstrates how to build the word-embedding matrix that will be used in later analysis. It is based on the 'CompaniesPostStemming.csv' file produced in the previous steps.

In [1]:
from functools import partial

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from nltk import word_tokenize
from nltk.util import skipgrams
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer

# Read in the dataset and do some preprocessing

In [None]:
# Load the stemmed filings produced by the earlier preprocessing steps.
companies = pd.read_csv('CompaniesPostStemming.csv')

# Distinct tickers, in order of first appearance; the filings of each company
# are later joined into one long string.
tickers = pd.unique(companies['ticker'])

In [None]:
# Build one row per company whose `text` is every filing of that ticker joined
# by single spaces.
#
# A single groupby replaces the original row-by-row `.loc` appends, which
# re-filtered the whole frame for every ticker. `sort=False` keeps tickers in
# order of first appearance (the same order `unique()` yields), and
# `str.cat` skips missing filings exactly as before.
# NOTE(review): rows whose ticker is missing are dropped by groupby; the loop
# produced an empty-text row for them instead.
companies2 = (
    companies
    .groupby('ticker', sort=False)['finalText']
    .apply(lambda filings: filings.str.cat(sep=' '))
    .rename('text')
    .reset_index()
)

# Number of space-separated pieces in each company's combined text.
companies2['count'] = [len(text.split(' ')) for text in companies2['text']]

# Compute PMI matrix

To compute numeric vectors for words, we first construct the pointwise mutual information (PMI) matrix, where $\mathrm{PMI}(w_1, w_2) = \log \frac{p(w_1, w_2)}{p(w_1)\,p(w_2)}$. This requires the frequency of skip-grams, a generalization of n-grams that allows tokens to be skipped within a specified window. In other words, we count how often a pair of words is observed together when at most a fixed number of other tokens separates them.

The following code shows how we can use the CountVectorizer from sklearn to count skip-grams of two words, allowing up to four tokens to be skipped between them (`k = 4`):

In [None]:
# Two-word skip-grams with up to four skipped tokens between the two words.
skip4_bigrams = partial(skipgrams, n = 2, k = 4)

def my_analyzer(text):
    """Tokenize ``text`` with NLTK and return its (term1, term2) skip-gram pairs."""
    return skip4_bigrams(word_tokenize(text))

# A callable analyzer replaces CountVectorizer's own tokenization, so
# `token_pattern` is ignored in that mode (scikit-learn documents it as only
# used when analyzer == 'word'). The argument is therefore omitted: it did not
# filter anything and only suggested that non-alphabetic tokens were removed.
skipgram_vectorizer = CountVectorizer(analyzer = my_analyzer)
skipgram_count_matrix = skipgram_vectorizer.fit_transform(companies2.text)

For ease, we'll convert this into a DataFrame and see how exactly it's stored:

In [None]:
# Column labels are the (term1, term2) skip-gram tuples, ordered by their
# column position in the count matrix. They are rebuilt from `vocabulary_`
# because each feature is a 2-tuple: converting a list of equal-length tuples
# to a NumPy array gives a 2-D array, which get_feature_names_out() may return
# and which cannot serve as flat column labels.
skipgram_vocabulary = skipgram_vectorizer.vocabulary_
skipgram_labels = pd.Index(sorted(skipgram_vocabulary, key = skipgram_vocabulary.get),
                           tupleize_cols = False)

# One row per document, one column per skip-gram pair.
skipgram_df = pd.DataFrame(skipgram_count_matrix.toarray(),
                           columns = skipgram_labels)

#Remove all the columns with underscore inside
def contains_underscore(x):
    """Return True when either term of the skip-gram pair ``x`` contains '_'."""
    #x is a tuple of two elements: (term1, term2)
    return (('_' in x[0]) or ('_' in x[1]))

# Get a list of column names without underscores
columns_to_keep = [col for col in skipgram_df.columns if not contains_underscore(col)]

skipgram_df_2 = skipgram_df[columns_to_keep]

In [None]:
# Total count of each retained skip-gram pair across all documents.
pair_totals = skipgram_df_2.sum(axis = 0).to_numpy()
skipgram_long = pd.DataFrame({'counts': pair_totals})

# Split each (term1, term2) column label into two columns, row-aligned with
# the counts above.
pair_terms = pd.DataFrame(skipgram_df_2.columns.tolist(),
                          index = skipgram_long.index)
skipgram_long[['term1', 'term2']] = pair_terms

In [None]:
# Rename the long-format pair table: `full_pairs_df` now refers to the same
# DataFrame object (no copy is made), and the old name is removed.
# Note: because `skipgram_long` is deleted here, re-running this cell alone
# raises NameError until the cell that builds `skipgram_long` is run again.
full_pairs_df = skipgram_long
del skipgram_long

In [None]:
# Relative frequency of each skip-gram pair among all retained pairs.
full_pairs_df['skip_p'] = full_pairs_df['counts'] / full_pairs_df['counts'].sum()

# Single-word counts over the same documents.
# NOTE(review): this vectorizer tokenizes with the '[a-zA-Z]+' pattern, while
# the skip-gram terms come from NLTK's word_tokenize; skip-gram terms that do
# not appear in this vocabulary get NaN frequencies in the later left merges.
# Confirm this is intended.
unigram_vectorizer = CountVectorizer(token_pattern = '[a-zA-Z]+')
unigram_count_matrix = unigram_vectorizer.fit_transform(companies2.text)

# Sum the sparse matrix column-wise instead of densifying it first: same
# totals, without allocating a full documents x vocabulary array.
unigram_totals = np.asarray(unigram_count_matrix.sum(axis = 0)).ravel()
unigram_long = pd.DataFrame({'uni_freq1': unigram_totals,
                             'term1': unigram_vectorizer.get_feature_names_out()})

# Convert raw counts to relative frequencies.
unigram_long['uni_freq1'] = unigram_long['uni_freq1'] / unigram_long['uni_freq1'].sum()

# Same table keyed for the second term of each pair.
unigram_long2 = unigram_long.rename(columns = {"uni_freq1": "uni_freq2", "term1": "term2"})

In [None]:
# Attach the unigram frequency of each term in the pair: term1 first, then
# term2. Left joins keep every skip-gram row even without a unigram match.
for freq_table, term_col in ((unigram_long, "term1"), (unigram_long2, "term2")):
    full_pairs_df = full_pairs_df.merge(freq_table, how = "left",
                                        left_on = term_col, right_on = term_col)

# PMI = log( p(term1, term2) / p(term1) / p(term2) ).
pmi_ratio = full_pairs_df['skip_p'] / full_pairs_df['uni_freq1'] / full_pairs_df['uni_freq2']

# Keep only positive PMI: values below 0 become 0, so the focus is on pairs
# that co-occur more often than expected.
full_pairs_df['pmi'] = np.log(pmi_ratio).clip(lower = 0)

In [None]:
# Pivot the long pair table into a term1 x term2 PMI table; pairs that never
# occur are filled with 0.
# The labelled frame is kept (not just its values) so that rows and columns of
# the matrix -- and therefore rows of the later embeddings -- can still be
# mapped back to words through `skipgram_wide_df.index` / `.columns`.
skipgram_wide_df = pd.pivot_table(full_pairs_df, index='term1', columns='term2', values='pmi',
                                  fill_value = 0)

# Plain NumPy array used by the SVD step, in the same row/column order as
# `skipgram_wide_df`.
skipgram_wide_matrix = skipgram_wide_df.values

# SVD to extract embeddings

In [None]:
# Convert the PMI array to CSR format before factorizing it.
skipgram_sparse = csr_matrix(skipgram_wide_matrix)

# Truncated SVD down to 500 components; the fixed random_state makes the
# factorization repeatable.
svd_settings = {'n_components': 500, 'n_iter': 10, 'random_state': 42}
svd = TruncatedSVD(**svd_settings)
svd_embeddings = svd.fit_transform(skipgram_sparse)

# Transposed layout: one row per SVD component, one column per row of the
# PMI matrix.
word_embeddings_df = pd.DataFrame(svd_embeddings.T)