# Bag-Of-Words Baseline Model

In this notebook, we will be using a Bag-of-Words model as a baseline sentiment model.

https://www.nber.org/system/files/working_papers/w25084/w25084.pdf

In [6]:
import numpy as np
import pandas as pd
import os
import pickle
import string

In [7]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lucaskrenn/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
from string import punctuation
from os import listdir
from collections import Counter
#from keras.preprocessing.text import Tokenizer

In [9]:
with open('../Step1-Data/6-link_dict.pickle', 'rb') as f:
    parsed = pickle.load(f)

In [62]:
f = open("1-Sentences_75Agree.txt", 'r', encoding = "ISO-8859-1")
training_string = f.read()
training_string[:500]

"According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .@neutral\nWith the new production plant the company would increase its capacity to meet the expected increase in demand and would improve the use of raw materials and therefore increase the production profitability .@positive\nFor the last quarter of 2010 , Componenta 's net sales doubled to EUR131m from EUR76m for the same period a year earlier , while it moved to a zero "

In [70]:
training = training_string.split('\n')
training[:5]

['According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .@neutral',
 'With the new production plant the company would increase its capacity to meet the expected increase in demand and would improve the use of raw materials and therefore increase the production profitability .@positive',
 "For the last quarter of 2010 , Componenta 's net sales doubled to EUR131m from EUR76m for the same period a year earlier , while it moved to a zero pre-tax profit from a pre-tax loss of EUR7m .@positive",
 'In the third quarter of 2010 , net sales increased by 5.2 % to EUR 205.5 mn , and operating profit by 34.9 % to EUR 23.5 mn .@positive',
 'Operating profit rose to EUR 13.1 mn from EUR 8.7 mn in the corresponding period in 2007 representing 7.7 % of net sales .@positive']

In [71]:
training = [x.split(' .@') for x in training]
training[0]

['According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing',
 'neutral']

In [73]:
train_df = pd.DataFrame(training, columns = ['sentence', 'sentiment'])
train_df.head()

Unnamed: 0,sentence,sentiment
0,"According to Gran , the company has no plans t...",neutral
1,With the new production plant the company woul...,positive
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,"In the third quarter of 2010 , net sales incre...",positive
4,Operating profit rose to EUR 13.1 mn from EUR ...,positive


In [76]:
positive = train_df[train_df['sentiment'] == 'positive']
positive.head()

Unnamed: 0,sentence,sentiment
1,With the new production plant the company woul...,positive
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,"In the third quarter of 2010 , net sales incre...",positive
4,Operating profit rose to EUR 13.1 mn from EUR ...,positive
5,"Operating profit totalled EUR 21.1 mn , up fro...",positive


In [78]:
len(positive)

862

In [80]:
negative = train_df[train_df['sentiment'] == 'negative']
negative.head()

Unnamed: 0,sentence,sentiment
393,Jan. 6 -- Ford is struggling in the face of sl...,negative
396,Pharmaceuticals group Orion Corp reported a fa...,negative
397,"However , the growth margin slowed down due to...",negative
482,2009 3 February 2010 - Finland-based steel mak...,negative
484,Result before taxes decreased to nearly EUR 14...,negative


In [82]:
len(negative)

405

# Tokenize the Training Data

In this section, I will be borrowing heavily from the following bag-of-words model tutorial: 

https://machinelearningmastery.com/deep-learning-bag-of-words-model-sentiment-analysis/

In [114]:
tokens = ' '.join(positive['sentence'].values).split()
# remove punctuation from each token
table = str.maketrans('', '', punctuation)
tokens = [w.translate(table) for w in tokens]
# remove remaining tokens that are not alphabetic
tokens = [word for word in tokens if word.isalpha()]
# filter out stop words
stop_words = set(stopwords.words('english'))
tokens = [w for w in tokens if not w in stop_words]
# filter out short tokens
tokens = [word for word in tokens if len(word) > 1]

In [119]:
tokens[:10]

['With',
 'new',
 'production',
 'plant',
 'company',
 'would',
 'increase',
 'capacity',
 'meet',
 'expected']

In [116]:
# define vocab
vocab = Counter()

In [117]:
vocab.update(tokens)

In [118]:
# print the top words in the vocab
print(vocab.most_common(50))

[('EUR', 356), ('mn', 239), ('profit', 170), ('The', 165), ('net', 158), ('sales', 146), ('said', 144), ('Finnish', 140), ('period', 136), ('million', 131), ('company', 130), ('year', 123), ('mln', 115), ('quarter', 103), ('rose', 85), ('increased', 81), ('compared', 68), ('loss', 67), ('operating', 64), ('corresponding', 64), ('Oyj', 59), ('percent', 58), ('increase', 57), ('share', 56), ('euro', 53), ('first', 52), ('Operating', 50), ('eur', 50), ('per', 42), ('market', 42), ('In', 41), ('Finland', 41), ('also', 39), ('operations', 38), ('HEL', 37), ('contract', 37), ('group', 37), ('new', 36), ('business', 35), ('services', 35), ('earlier', 34), ('grew', 32), ('today', 31), ('order', 30), ('third', 29), ('Group', 29), ('second', 29), ('last', 27), ('agreement', 26), ('yearonyear', 26)]


Let's try the sample given in the paper.

In [230]:
tokens = 'We expect demand to increase.'.split()
# remove punctuation from each token
table = str.maketrans('', '', punctuation)
tokens = [w.translate(table) for w in tokens]
# remove remaining tokens that are not alphabetic
tokens = [word for word in tokens if word.isalpha()]
# filter out stop words
stop_words = set(stopwords.words('english'))
#tokens = [w for w in tokens if not w in stop_words]
# filter out short tokens
tokens = [word for word in tokens if len(word) > 1]

In [231]:
tokens[:5]

['We', 'expect', 'demand', 'to', 'increase']

In [232]:
freqs1 = Counter()
freqs1.update(tokens)
freqs1.most_common()

[('We', 1), ('expect', 1), ('demand', 1), ('to', 1), ('increase', 1)]

In [233]:
tokens = 'We expect worldwide demand to increase.'.split()
# remove punctuation from each token
table = str.maketrans('', '', punctuation)
tokens = [w.translate(table) for w in tokens]
# remove remaining tokens that are not alphabetic
tokens = [word for word in tokens if word.isalpha()]
# filter out stop words
stop_words = set(stopwords.words('english'))
#tokens = [w for w in tokens if not w in stop_words]
# filter out short tokens
tokens = [word for word in tokens if len(word) > 1]

In [234]:
freqs2 = Counter()
freqs2.update(tokens)
freqs2.most_common()

[('We', 1),
 ('expect', 1),
 ('worldwide', 1),
 ('demand', 1),
 ('to', 1),
 ('increase', 1)]

In [235]:
all_words = set(freqs1.keys()) | set(freqs2.keys())

In [236]:
keep_track = pd.DataFrame(all_words, columns=['words'])
keep_track.head()

Unnamed: 0,words
0,to
1,worldwide
2,demand
3,increase
4,expect


In [237]:
keep_track['freqs1'] = [freqs1[x] for x in keep_track['words']]
keep_track.head()

Unnamed: 0,words,freqs1
0,to,1
1,worldwide,0
2,demand,1
3,increase,1
4,expect,1


In [238]:
keep_track['freqs2'] = [freqs2[x] for x in keep_track['words']]
keep_track.head()

Unnamed: 0,words,freqs1,freqs2
0,to,1,1
1,worldwide,0,1
2,demand,1,1
3,increase,1,1
4,expect,1,1


In [239]:
numerator = np.dot(keep_track['freqs1'].values,keep_track['freqs2'].values)

In [240]:
def euclidean(a):
    return np.sqrt(sum([x**2 for x in a]))

In [241]:
e1 = euclidean(keep_track['freqs1'].values)
e1

2.23606797749979

In [242]:
e2 = euclidean(keep_track['freqs2'].values)
e2

2.449489742783178

In [243]:
numerator / (e1*e2)

0.9128709291752769

Sanity Check:

In [230]:
tokens = 'We expect demand to increase.'.split()
# remove punctuation from each token
table = str.maketrans('', '', punctuation)
tokens = [w.translate(table) for w in tokens]
# remove remaining tokens that are not alphabetic
tokens = [word for word in tokens if word.isalpha()]
# filter out stop words
stop_words = set(stopwords.words('english'))
#tokens = [w for w in tokens if not w in stop_words]
# filter out short tokens
tokens = [word for word in tokens if len(word) > 1]

In [231]:
tokens[:5]

['We', 'expect', 'demand', 'to', 'increase']

In [232]:
freqs1 = Counter()
freqs1.update(tokens)
freqs1.most_common()

[('We', 1), ('expect', 1), ('demand', 1), ('to', 1), ('increase', 1)]

In [244]:
tokens = 'We expect weakness in sales.'.split()
# remove punctuation from each token
table = str.maketrans('', '', punctuation)
tokens = [w.translate(table) for w in tokens]
# remove remaining tokens that are not alphabetic
tokens = [word for word in tokens if word.isalpha()]
# filter out stop words
stop_words = set(stopwords.words('english'))
#tokens = [w for w in tokens if not w in stop_words]
# filter out short tokens
tokens = [word for word in tokens if len(word) > 1]

In [245]:
freqs2 = Counter()
freqs2.update(tokens)
freqs2.most_common()

[('We', 1), ('expect', 1), ('weakness', 1), ('in', 1), ('sales', 1)]

In [246]:
all_words = set(freqs1.keys()) | set(freqs2.keys())

In [247]:
keep_track = pd.DataFrame(all_words, columns=['words'])
keep_track.head()

Unnamed: 0,words
0,to
1,demand
2,sales
3,in
4,increase


In [248]:
keep_track['freqs1'] = [freqs1[x] for x in keep_track['words']]
keep_track.head()

Unnamed: 0,words,freqs1
0,to,1
1,demand,1
2,sales,0
3,in,0
4,increase,1


In [249]:
keep_track['freqs2'] = [freqs2[x] for x in keep_track['words']]
keep_track.head()

Unnamed: 0,words,freqs1,freqs2
0,to,1,0
1,demand,1,0
2,sales,0,1
3,in,0,1
4,increase,1,0


In [250]:
numerator = np.dot(keep_track['freqs1'].values,keep_track['freqs2'].values)

In [251]:
def euclidean(a):
    return np.sqrt(sum([x**2 for x in a]))

In [252]:
e1 = euclidean(keep_track['freqs1'].values)
e1

2.23606797749979

In [253]:
e2 = euclidean(keep_track['freqs2'].values)
e2

2.23606797749979

In [254]:
numerator / (e1*e2)

0.3999999999999999

Ok, now let's look at some real documents:

In [260]:
parsed['0000006201']

Unnamed: 0,accessionNumber,filingDate,reportDate,acceptanceDateTime,act,form,fileNumber,filmNumber,items,size,isXBRL,isInlineXBRL,primaryDocument,primaryDocDescription,Link,Loc7,Loc8,Item7
49,0000006201-21-000014,2021-02-17,2020-12-31,2021-02-17T17:17:57.000Z,34,10-K,001-08400,21646186,,43925703,1,1,aal-20201231.htm,10-K 2020 02.17.21,https://www.sec.gov/Archives/edgar/data/6201/0...,"[[(158292, 158310), Item 7. Management], [(167...","[[(502641, 502662), ITEM 8A. CONSOLIDATED]]",ITEM 7. MANAGEMENT’S DISCUSSION AND ANALYSIS ...
150,0000006201-20-000023,2020-02-19,2019-12-31,2020-02-19T07:31:30.000Z,34,10-K,001-08400,20627428,,30851334,1,1,a10k123119.htm,10-K 2019 02.19.20,https://www.sec.gov/Archives/edgar/data/6201/0...,"[[(153128, 153146), Item 7. Management], [(156...","[[(414897, 414918), ITEM 8A. CONSOLIDATED]]",ITEM 7. MANAGEMENT’S DISCUSSION AND ANALYSIS ...
225,0000006201-19-000009,2019-02-25,2018-12-31,2019-02-25T07:31:34.000Z,34,10-K,001-08400,19628071,,30572408,1,0,a10k123118.htm,10-K 2018 02.25.19,https://www.sec.gov/Archives/edgar/data/6201/0...,"[[(9505, 9523), Item 7. Management], [(12796, ...","[[(300867, 300888), ITEM 8A. CONSOLIDATED]]",ITEM 7. MANAGEMENT’S DISCUSSION AND ANALYSIS ...
315,0000006201-18-000009,2018-02-21,2017-12-31,2018-02-21T08:02:40.000Z,34,10-K,001-08400,18627088,,27914491,1,0,a10k123117.htm,10-K,https://www.sec.gov/Archives/edgar/data/6201/0...,"[[(9554, 9572), Item 7. Management], [(13606, ...","[[(293380, 293401), ITEM 8A. CONSOLIDATED]]",ITEM 7. MANAGEMENT’S DISCUSSION AND ANALYSIS ...
412,0001193125-17-051216,2017-02-22,2016-12-31,2017-02-22T08:01:43.000Z,34,10-K,001-08400,17627073,,24888480,1,0,d286458d10k.htm,FORM 10-K,https://www.sec.gov/Archives/edgar/data/6201/0...,"[[(9935, 9953), Item 7. Management], [(14047, ...","[[(297249, 297270), ITEM 8A. CONSOLIDATED]]",ITEM 7. MANAGEMENT’S DISCUSSION AND ANALYSIS ...
538,0001193125-16-474605,2016-02-24,2015-12-31,2016-02-24T08:04:10.000Z,34,10-K,001-08400,161450518,,26170400,1,0,d78287d10k.htm,FORM 10-K,https://www.sec.gov/Archives/edgar/data/6201/0...,"[[(17027, 17045), Item 7. Management], [(21453...","[[(398001, 398022), ITEM 8A. CONSOLIDATED]]",ITEM 7. MANAGEMENT’S DISCUSSION AND ANALYSIS ...
651,0001193125-15-061145,2015-02-25,2014-12-31,2015-02-25T08:02:34.000Z,34,10-K,001-08400,15645918,,39524925,1,0,d829913d10k.htm,FORM 10-K,https://www.sec.gov/Archives/edgar/data/6201/0...,"[[(16174, 16192), Item 7. Management], [(23008...","[[(452689, 452710), ITEM 8A. CONSOLIDATED]]",
750,0000006201-14-000004,2014-02-28,2013-12-31,2014-02-28T07:52:16.000Z,34,10-K,001-08400,14651496,,47888955,1,0,aagaa10k-20131231.htm,10-K,https://www.sec.gov/Archives/edgar/data/6201/0...,"[[(15590, 15608), Item 7. Management], [(23363...",[],ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS O...


They appear to be in order...

In [266]:
# 2021
sample1 = parsed['0000006201']['Item7'].values[0]
sample1[:200]

'ITEM 7.  MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS  2020 Financial Overview Impact of Coronavirus (COVID-19)  COVID-19 has been declared a global health pan'

In [267]:
tokens = sample1.split()
# remove punctuation from each token
table = str.maketrans('', '', punctuation)
tokens = [w.translate(table) for w in tokens]
# remove remaining tokens that are not alphabetic
tokens = [word for word in tokens if word.isalpha()]
# filter out stop words
stop_words = set(stopwords.words('english'))
tokens = [w for w in tokens if not w in stop_words]
# filter out short tokens
tokens = [word for word in tokens if len(word) > 1]

In [268]:
tokens[:5]

['ITEM', 'DISCUSSION', 'AND', 'ANALYSIS', 'OF']

In [269]:
freqs1 = Counter()
freqs1.update(tokens)
freqs1.most_common(5)

[('billion', 111),
 ('million', 105),
 ('aircraft', 91),
 ('net', 74),
 ('cash', 66)]

In [271]:
# 2020
sample2 = parsed['0000006201']['Item7'].values[1]
sample2[:200]

'ITEM 7.  MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS  Background Together with  our wholly-owned  regional  airline  subsidiaries  and  third-party  regional '

In [272]:
tokens = sample2.split()
# remove punctuation from each token
table = str.maketrans('', '', punctuation)
tokens = [w.translate(table) for w in tokens]
# remove remaining tokens that are not alphabetic
tokens = [word for word in tokens if word.isalpha()]
# filter out stop words
stop_words = set(stopwords.words('english'))
tokens = [w for w in tokens if not w in stop_words]
# filter out short tokens
tokens = [word for word in tokens if len(word) > 1]

In [274]:
freqs2 = Counter()
freqs2.update(tokens)
freqs2.most_common(5)

[('aircraft', 87), ('million', 85), ('billion', 76), ('net', 61), ('fuel', 55)]

In [275]:
all_words = set(freqs1.keys()) | set(freqs2.keys())

In [276]:
keep_track = pd.DataFrame(all_words, columns=['words'])
keep_track.head()

Unnamed: 0,words
0,indefinite
1,delivery
2,utilization
3,exist
4,training


In [277]:
keep_track['freqs1'] = [freqs1[x] for x in keep_track['words']]
keep_track.head()

Unnamed: 0,words,freqs1
0,indefinite,2
1,delivery,6
2,utilization,0
3,exist,2
4,training,1


In [278]:
keep_track['freqs2'] = [freqs2[x] for x in keep_track['words']]
keep_track.head()

Unnamed: 0,words,freqs1,freqs2
0,indefinite,2,1
1,delivery,6,7
2,utilization,0,2
3,exist,2,1
4,training,1,0


In [279]:
numerator = np.dot(keep_track['freqs1'].values,keep_track['freqs2'].values)

In [280]:
def euclidean(a):
    return np.sqrt(sum([x**2 for x in a]))

In [281]:
e1 = euclidean(keep_track['freqs1'].values)
e1

407.40152184300933

In [282]:
e2 = euclidean(keep_track['freqs2'].values)
e2

340.4966960192125

In [283]:
numerator / (e1*e2)

0.9253247090256499

Finally, let's see 2020 versus 2019. Hopefully there will be a large difference in scores!

In [303]:
# 2020
sample1 = parsed['0000006201']['Item7'].values[1]
sample1[:200]

'ITEM 7.  MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS  Background Together with  our wholly-owned  regional  airline  subsidiaries  and  third-party  regional '

In [304]:
tokens = sample1.split()
# remove punctuation from each token
table = str.maketrans('', '', punctuation)
tokens = [w.translate(table) for w in tokens]
# remove remaining tokens that are not alphabetic
tokens = [word for word in tokens if word.isalpha()]
# filter out stop words
stop_words = set(stopwords.words('english'))
tokens = [w for w in tokens if not w in stop_words]
# filter out short tokens
tokens = [word for word in tokens if len(word) > 1]

In [305]:
tokens[:5]

['ITEM', 'DISCUSSION', 'AND', 'ANALYSIS', 'OF']

In [306]:
freqs1 = Counter()
freqs1.update(tokens)
freqs1.most_common(5)

[('aircraft', 87), ('million', 85), ('billion', 76), ('net', 61), ('fuel', 55)]

In [307]:
# 2019
sample2 = parsed['0000006201']['Item7'].values[2]
sample2[:200]

'ITEM 7.  MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS  Background Together with our wholly-owned regional airline subsidiaries and third-party regional carrier'

In [308]:
tokens = sample2.split()
# remove punctuation from each token
table = str.maketrans('', '', punctuation)
tokens = [w.translate(table) for w in tokens]
# remove remaining tokens that are not alphabetic
tokens = [word for word in tokens if word.isalpha()]
# filter out stop words
stop_words = set(stopwords.words('english'))
tokens = [w for w in tokens if not w in stop_words]
# filter out short tokens
tokens = [word for word in tokens if len(word) > 1]

In [309]:
freqs2 = Counter()
freqs2.update(tokens)
freqs2.most_common(5)

[('billion', 155),
 ('income', 130),
 ('net', 129),
 ('million', 127),
 ('aircraft', 114)]

In [310]:
all_words = set(freqs1.keys()) | set(freqs2.keys())

In [311]:
keep_track = pd.DataFrame(all_words, columns=['words'])
keep_track.head()

Unnamed: 0,words
0,indefinite
1,delivery
2,utilization
3,exist
4,training


In [312]:
keep_track['freqs1'] = [freqs1[x] for x in keep_track['words']]
keep_track.head()

Unnamed: 0,words,freqs1
0,indefinite,1
1,delivery,7
2,utilization,2
3,exist,1
4,training,0


In [313]:
keep_track['freqs2'] = [freqs2[x] for x in keep_track['words']]
keep_track.head()

Unnamed: 0,words,freqs1,freqs2
0,indefinite,1,0
1,delivery,7,7
2,utilization,2,0
3,exist,1,0
4,training,0,2


In [314]:
numerator = np.dot(keep_track['freqs1'].values,keep_track['freqs2'].values)

In [11]:
def euclidean(a):
    return np.sqrt(sum([x**2 for x in a]))

In [316]:
e1 = euclidean(keep_track['freqs1'].values)
e1

340.4966960192125

In [317]:
e2 = euclidean(keep_track['freqs2'].values)
e2

542.7504030399241

In [318]:
numerator / (e1*e2)

0.9353549029323915

In [319]:
len(sample1)

78241

In [320]:
len(sample2)

113339

Well, less dissimilar than 2020-2021, I suppose the timing was well before covid. However, I expected one of the two numbers to be a little lower. No matter, let us continue:

# One Company Cosine Similarity

In [324]:
scores = []
for i in range(len(parsed['0000006201']['Item7'].values)-1):
    ##
    sample1 = parsed['0000006201']['Item7'].values[i]
    tokens = sample1.split()
    # remove punctuation from each token
    table = str.maketrans('', '', punctuation)
    tokens = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    # filter out short tokens
    tokens = [word for word in tokens if len(word) > 1]

    freqs1 = Counter()
    freqs1.update(tokens)

    ##
    sample2 = parsed['0000006201']['Item7'].values[i+1]
    tokens = sample2.split()
    # remove punctuation from each token
    table = str.maketrans('', '', punctuation)
    tokens = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    # filter out short tokens
    tokens = [word for word in tokens if len(word) > 1]

    freqs2 = Counter()
    freqs2.update(tokens)

    all_words = set(freqs1.keys()) | set(freqs2.keys())

    keep_track = pd.DataFrame(all_words, columns=['words'])
    keep_track['freqs1'] = [freqs1[x] for x in keep_track['words']]
    keep_track['freqs2'] = [freqs2[x] for x in keep_track['words']]

    numerator = np.dot(keep_track['freqs1'].values,keep_track['freqs2'].values)

    e1 = euclidean(keep_track['freqs1'].values)

    e2 = euclidean(keep_track['freqs2'].values)

    cos_sim = numerator / (e1*e2)
    scores.append(cos_sim)
scores.append(np.nan)



In [325]:
scores

[0.9253247090256499,
 0.9353549029323915,
 0.9699103723997726,
 0.9596219119325394,
 0.9180383864701249,
 nan,
 nan,
 nan]

In [326]:
parsed['0000006201']['Cos_Sim'] = scores

# Now All Companies

In [12]:
for company in parsed.keys():    
    scores = []
    for i in range(len(parsed[company]['Item7'].values)-1):
        ##
        sample1 = parsed[company]['Item1a'].values[i] + ' ' + parsed[company]['Item7'].values[i]
        tokens = sample1.split()
        # remove punctuation from each token
        table = str.maketrans('', '', punctuation)
        tokens = [w.translate(table) for w in tokens]
        # remove remaining tokens that are not alphabetic
        tokens = [word for word in tokens if word.isalpha()]
        # filter out stop words
        stop_words = set(stopwords.words('english'))
        tokens = [w for w in tokens if not w in stop_words]
        # filter out short tokens
        tokens = [word for word in tokens if len(word) > 1]

        freqs1 = Counter()
        freqs1.update(tokens)

        ##
        sample2 = parsed[company]['Item1a'].values[i+1] + ' ' + parsed[company]['Item7'].values[i+1]
        tokens = sample2.split()
        # remove punctuation from each token
        table = str.maketrans('', '', punctuation)
        tokens = [w.translate(table) for w in tokens]
        # remove remaining tokens that are not alphabetic
        tokens = [word for word in tokens if word.isalpha()]
        # filter out stop words
        stop_words = set(stopwords.words('english'))
        tokens = [w for w in tokens if not w in stop_words]
        # filter out short tokens
        tokens = [word for word in tokens if len(word) > 1]

        freqs2 = Counter()
        freqs2.update(tokens)

        all_words = set(freqs1.keys()) | set(freqs2.keys())

        keep_track = pd.DataFrame(all_words, columns=['words'])
        keep_track['freqs1'] = [freqs1[x] for x in keep_track['words']]
        keep_track['freqs2'] = [freqs2[x] for x in keep_track['words']]

        numerator = np.dot(keep_track['freqs1'].values,keep_track['freqs2'].values)

        e1 = euclidean(keep_track['freqs1'].values)

        e2 = euclidean(keep_track['freqs2'].values)

        cos_sim = numerator / (e1*e2)
        scores.append(cos_sim)
    scores.append(np.nan)
    parsed[company]['Cos_Sim'] = scores



In [13]:
with open('6-link_dict.pickle', 'wb') as handle:
    pickle.dump(parsed, handle, protocol=pickle.HIGHEST_PROTOCOL)