# 10-Q SEC Filing Sentiment and Similarity Metrics Example Notebook

...

In [None]:
import glob
import numpy as np
import pandas as pd
import seaborn as sns
import multiprocessing as mp
import matplotlib.pyplot as plt
from ast import literal_eval
from tqdm.notebook import tqdm
from finpie import historical_prices
from nltk.corpus import stopwords as nltk_stopwords
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# set plot styles
# NOTE(review): recent matplotlib releases renamed the bundled seaborn style
# sheets (e.g. 'seaborn-v0_8'); confirm 'seaborn' still resolves on the
# matplotlib version this notebook is run with.
plt.style.use('seaborn')
plt.rcParams['text.color'] = 'black'
plt.rcParams[ "figure.figsize"] = (20, 10)  # default size for every figure below

# load custom classes
from classes.sec_class import SecData
from classes.wordcloud_class import wordCloud
from classes.text_processing_class import textPreProcess

# Shared helper instances used by the cells below:
#   sec - its get_10qs() / get_mdna() methods are used to fetch and parse filings
#   wc  - its create_word_cloud() method is used for the word cloud
#   tpp - its lemmatize() method is used for text pre-processing
sec = SecData()
wc = wordCloud()
tpp = textPreProcess()


### To do:
- add 10-Ks to fill the gap
- cleaning:
    - clean docs of 'Table of Contents'
    - clean docs of 'Management’s Discussion and Analysis of Financial Condition and Results of Operations'
    - clean docs of forward-looking statements and other common boilerplate text
    - clean docs of table descriptions such as 'the following table shows'
    - clean docs of table footnotes such as '(1)'
    - clean other artefacts and improve the regexes

## Download 10-Q SEC Filings

In [None]:
# Ticker -> SEC CIK of the companies to analyse.
# Other tickers that can be added: 'XOM': '0000034088', 'TSLA': '0001318605', 'JNJ': '0000200406'
cik_dict = { 'AAPL': '0000320193' }

# Fetch the 10-Q filings of every configured ticker (progress bar per ticker).
ten_qs = { ticker: sec.get_10qs( cik ) for ticker, cik in tqdm(cik_dict.items()) }

In [None]:
# Build one dataframe per ticker holding the MD&A text of each filing.
ticker_dict = {}
for ticker in cik_dict:
    print(ticker)
    filings = tqdm(ten_qs[ticker].items())
    mdna = { file_date: sec.get_mdna(doc) for file_date, doc in filings }
    # transpose so that filing dates form the index and 'MDnA' is the only column
    df = pd.DataFrame(mdna, index = ['MDnA']).T
    ticker_dict[ticker] = df

# example: preview the last processed ticker and the first 1000 characters
# of its first filing's MD&A text
print(ticker, ' example')
display(df.head())
print(df.iloc[0, 0][:1000])

In [None]:
# Illustrative Example:
# pick the ticker whose MD&A dataframe the remaining cells work on
ticker = 'AAPL'
df = ticker_dict[ticker]

## Word Clouds

In [None]:
# create classes
wc = wordCloud()
tpp = textPreProcess()
tpp.textColumn = 'MDnA'  # presumably the column tpp.lemmatize reads - confirm in textPreProcess

# load stopwords: NLTK's English list extended by the first column of every
# custom list found in ./stopwords
# NOTE(review): pd.read_csv consumes the first line of each file as a header,
# so that line never becomes a stopword - confirm the files start with a header.
stopword_files = glob.glob('./stopwords/*.txt')
stopwords = list(nltk_stopwords.words('english'))
for path in stopword_files:
    stopwords.extend(pd.read_csv(path).iloc[:, 0])
# keep genuine strings only (read_csv yields NaN for blank / NA-like cells)
# and normalise them to lower case
stopwords = [ word.lower() for word in stopwords if isinstance(word, str) ]
tpp.stopwords = stopwords




In [None]:

# split dataframe into one chunk per CPU core and lemmatise the chunks in parallel
n_workers = mp.cpu_count()
dflist = np.array_split(df, n_workers)
with mp.Pool(n_workers) as pool:
    lemmatised_chunks = pool.imap( tpp.lemmatize, dflist )
    dfs = list(tqdm(lemmatised_chunks, total = len(dflist) ))
# stitch the processed chunks back together
df = pd.concat( dfs )

# draw a single word cloud from the lemmatised text, passing the ticker's logo file as mask
cloud_title = f'{ticker}: 10-Q Management\'s Discussion and Analysis'
wordCloudDict = { cloud_title: df.lemmatised_text }
logo_path = f'./logos/{ticker.lower()}_logo'
wc.create_word_cloud( wordCloudDict,  masks = logo_path, title = "company", columns = 1, rows = 1 )


...

## Lexicon-based sentiment example using the Loughran-McDonald dictionary

In [None]:
# load dictionary
# 'Word' plus the sentiment category columns that are scored below
columns = ['Word', 'Negative', 'Positive', 'Uncertainty', 'Litigious', 'Constraining', 'Superfluous', 'Interesting']
# keep only those columns, drop rows without a word and renumber the rows from 0
lmcd_dictionary = (
    pd.read_csv('LoughranMcDonald_MasterDictionary_2018.csv')[columns]
    .dropna(subset = ['Word'])
    .reset_index(drop = True)
)

def lmcd_sentiment(df_idx):
    """Add the dictionary's category values to the sentiment columns of a filing.

    The text of the first row of ``df_idx`` is upper-cased and split on
    whitespace; every dictionary word that equals one of the resulting tokens
    contributes its category values once (presence, not frequency).  Matching
    whole tokens avoids counting dictionary words that merely occur inside a
    longer word, which a plain substring test would do.

    Relies on the module-level ``lmcd_dictionary`` frame and ``columns`` list.

    Parameters
    ----------
    df_idx : pandas.DataFrame
        Slice of the filings frame with a ``text`` column and one column per
        entry of ``columns[1:]``; only the first row's text is read.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_idx`` with the category columns increased by the summed
        dictionary values of all matched words.
    """
    df_idx = df_idx.copy()
    text = df_idx.text.values[0]

    # Upper-case once (the dictionary words are compared as stored, so they are
    # assumed to be upper-case) and use a set for O(1) membership tests.
    tokens = set(text.upper().split())
    matched = lmcd_dictionary.Word.isin(tokens)

    category_cols = columns[1:]
    # NOTE(review): this adds whatever values the dictionary stores per
    # category; if those are not 0/1 flags the totals are not word counts -
    # confirm against the dictionary's documentation.
    df_idx[category_cols] += lmcd_dictionary.loc[matched, category_cols].sum().values
    return pd.DataFrame(df_idx)

In [None]:
# every sentiment category starts at zero
for col in columns[1:]:
    df[col] = 0
# lemmatised_text holds the string form of a token list: parse it and join the tokens
df['text'] = [ ' '.join(tokens) for tokens in map(literal_eval, df.lemmatised_text) ]

# calculate lexica based sentiment, one single-row frame per task
single_filings = np.array_split(df, len(df))
with mp.Pool(mp.cpu_count()) as pool:
    scored = pool.imap( lmcd_sentiment, single_filings )
    dfs = list(tqdm(scored, total = len(df) ))
df = pd.concat( dfs )
# order the filings chronologically
df.index = pd.to_datetime(df.index)
df = df.sort_index()
display(df.head())

In [None]:
# plot the raw (absolute) sentiment scores in chronological order
df.index = pd.to_datetime(df.index)
df = df.sort_index()
sentiment_ax = df[columns[1:]].plot()
sentiment_ax.set_title(f'Absolute sentiment scores of {ticker}\'s 10-Q filings', fontsize = 24)
plt.show()

# Get price data and plot it from the first filing date onwards
prices = historical_prices(ticker)
first_filing = df.index[0]
price_ax = prices['adj_close'].loc[first_filing:].plot()
price_ax.set_title(f'{ticker} stock price')
plt.show()

In [None]:
# add quarter on quarter returns to dataframe
# should really start at next days opening price but keeping this for simplicity for now
# NOTE(review): .loc[df.index] needs a price row for every filing date - confirm
# no filing date is missing from `prices`.
# NOTE(review): pct_change() yields the return up to each filing and shift(1)
# moves it one filing later; if the return *following* a filing is wanted this
# would need shift(-1) - confirm the intended alignment.
df['quarter_returns'] = prices['adj_close'].loc[df.index].pct_change().shift(1)

# rescale sentiment based on doc length -> "average sentiments"
# NOTE(review): lemmatised_text is parsed with literal_eval elsewhere in this
# notebook, so len(d) is presumably the character length of the stored string
# rather than a token count - confirm which is intended.
df['doc_length'] = [ len(d) for d in df.lemmatised_text ]
# NOTE: the division overwrites the columns, so re-running this cell rescales again.
for col in columns[1:]:
    df[col] = df[col] / df['doc_length']

# Show standardised sentiment
df.index = pd.to_datetime(df.index)
df.sort_index(inplace = True)
df[columns[1:]].plot()
plt.title(f'Sentiment scores of {ticker}\'s 10-Q filings', fontsize = 24)
plt.show()

# correlation between returns and average metrics
print('Correlation between returns and standardised metrics:')
display(df[columns[1:] + ['quarter_returns']].corr())
#sns.heatmap(df[columns[1:] + ['quarter_returns']].corr(), cmap = 'Blues')
#plt.show()

In [None]:
# calculating % change of metric from report to report
# superfluous has many zero values, still need to deal with this issue in percentage calculation,
# and for other companies other metrics might have missing values
sentiment_cols = columns[1:]
df[sentiment_cols] = df[sentiment_cols].pct_change()

# correlation between percentage changes and returns
# (computed once and reused for both the table and the heatmap)
correlations = df[sentiment_cols + ['quarter_returns']].corr()
display(correlations)
plt.title('Heatmap of correlations between sentiment percentage changes and quarterly returns', fontsize = 24)
sns.heatmap(correlations, cmap = 'Blues' )
plt.show()

# name the output after the analysed ticker; the previous hard-coded
# 'xom.csv' mislabelled the results of any other ticker
df.to_csv(f'{ticker.lower()}.csv')

## Similarity Metrics

### Cosine Similarity Scores of 10-Qs

In [None]:
# cosine similarity between the TF-IDF vectors of the filings
tfidf = TfidfVectorizer()
tfidf_df = tfidf.fit_transform(df.text).toarray()
# A single call with only one argument returns the similarity of every row
# with every other row, replacing the nested loop of single-pair calls.  It
# also no longer overwrites the global `columns` list that the sentiment
# cells depend on.
rows = cosine_similarity(tfidf_df)


In [None]:
# label both axes of the similarity matrix with the filings' index values
cosine_similiarity = pd.DataFrame(rows, index = df.index, columns = df.index )
display(cosine_similiarity)

# similarity heatmap plot
heatmap_ax = sns.heatmap(cosine_similiarity, cmap = 'Blues')
heatmap_ax.set_title('Cosine Similiarity of 10-Q\'s based on TFIDF', fontsize = 24 )
plt.show()

# similarity of every report to the one in the last column
# (the most recent filing when df is sorted by date)
latest_similarity = cosine_similiarity.iloc[:,-1]
plt.plot(latest_similarity)
plt.title('Cosine similarity of most recent 10-Q with previous 10-Qs', fontsize = 24 )
plt.show()

### Containment N-gram scores

In [None]:
# containment function
def calculate_containment(df1, df2, ngram_size):
    """Return the share of the first text's n-grams that also occur in the second.

    Both texts are turned into word n-gram counts with a shared vocabulary.
    For each n-gram the smaller of the two counts is the number of shared
    occurrences; their sum is divided by the first text's total n-gram count.

    Parameters
    ----------
    df1, df2 : objects with a ``text`` attribute
        The two documents to compare (e.g. rows of the filings frame).
    ngram_size : int
        Number of words per n-gram.

    Returns
    -------
    float
        Containment score; ``0.0`` when the first text yields no n-grams
        (previously this divided by zero and produced NaN with a warning).
    """
    counts = CountVectorizer(analyzer='word', ngram_range=(ngram_size, ngram_size))
    # row 0 holds the n-gram counts of df1.text, row 1 those of df2.text
    ngram_array = counts.fit_transform([df1.text, df2.text]).toarray()

    # numpy reductions instead of the builtin sum over numpy arrays
    common_ngrams = ngram_array.min(axis=0).sum()
    len_ngram_a = ngram_array[0].sum()

    if len_ngram_a == 0:
        return 0.0
    return float(common_ngrams / len_ngram_a)

In [None]:
# n-gram containment of every pair of filings
ngram = 2
filings = [ df.iloc[k] for k in range(len(df)) ]
# rows[j][i]: share of filing i's n-grams that also occur in filing j.
# Built without the per-iteration debug print and without overwriting the
# global `columns` list used by the sentiment cells.
rows = [
    [ calculate_containment( candidate, reference, ngram ) for candidate in filings ]
    for reference in filings
]
containment_df = pd.DataFrame(rows, index = pd.to_datetime(df.index), columns = df.index )

# containment df
display(containment_df)

# containment heatmap (title follows the configured n-gram size)
plt.title(f'Heatmap of {ngram}-gram containment scores')
sns.heatmap(containment_df, cmap = 'Blues')
plt.show()


# plot of containment of previous reports to most recent report
# (last column = last filing in df's order)
plt.plot(containment_df.iloc[:,-1])
plt.title(f'{ngram}-gram containment of most recent 10-Q with previous 10-Qs', fontsize = 24 )
plt.show()