In [1]:
# !pip install textblob
import os
import re
import unicodedata
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import requests
from bs4 import BeautifulSoup 
from lxml import html
from tqdm import tqdm
import glob
from inscriptis import get_text
import urllib.request
import copy
from textblob import TextBlob
import pickle
from sklearn.feature_extraction.text import CountVectorizer
import workdays

In [146]:
#Drop every stock ticker that contains a period, editing the existing list in place
nasdaq_tickers[:] = [symbol for symbol in nasdaq_tickers if '.' not in symbol]

In [6]:
# Define function to normalize text, put text in lower case, remove numbers
#and special characters, and remove breaks and extra spaces
def Clean(text):
    """Reduce raw text to lower-case ASCII words separated by single spaces.

    Steps, in order:
      1. NFKD-normalize, which splits accented letters into a base letter
         followed by combining marks.
      2. Drop the combining marks so the base letter stays attached to its
         word (otherwise the next step would turn the mark into a space and
         split the word in two, e.g. "naïve" -> "nai ve").
      3. Lower-case the text.
      4. Replace every run of characters outside a-z with one space.
      5. Remove words that are a single character long.
      6. Collapse whitespace and strip both ends.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text; an empty string when nothing survives.
    """
    text = unicodedata.normalize('NFKD', text)
    # combining() is non-zero only for combining marks such as accents
    text = ''.join(ch for ch in text if not unicodedata.combining(ch))
    text = text.lower()
    text = re.sub('[^a-z]+', ' ', text)
    # single-character words carry no signal for the later word counts
    text = re.sub(r'\b\w\b', ' ', text)
    return " ".join(text.split())

In [138]:
#Get the latest year on year cosine score and percent change in sentiment
def get_latest_yoy_metrics(tickers, k_or_q='k'):
    """Compare each company's latest SEC report with the one filed a year earlier.

    For every ticker the EDGAR Atom feed is queried, two filings are selected,
    the first matching ``.htm`` document of each filing index page is
    downloaded and cleaned with ``Clean``, and two metrics are computed: the
    cosine similarity of the stop-word-filtered word counts and the relative
    change in TextBlob polarity.

    Parameters
    ----------
    tickers : iterable
        Company identifiers; each one is converted with ``str`` and placed in
        the ``CIK=`` query parameter of the EDGAR URL.
    k_or_q : str, default 'k'
        ``'k'`` compares the two most recent 10-K filings in the feed. Any
        other value queries 10-Q filings and compares the latest one with the
        filing from the previous calendar year whose month is closest
        (at least four filings must be listed).

    Returns
    -------
    pandas.DataFrame
        One row per successfully processed ticker with the columns
        ``cosine``, ``sentiment_d``, ``format_change``, ``dates`` and
        ``date_delta``. Rows whose absolute ``date_delta`` exceeds 31 are
        dropped. ``sentiment_d`` is NaN when the earlier report has a polarity
        of exactly 0. An empty frame with these columns is returned when no
        ticker could be processed.

    Notes
    -----
    Tickers that could not be processed (too few filings, no prior-year
    filing, missing document link, failed download) are collected and printed
    at the end instead of aborting the run.
    """
    annual = k_or_q == 'k'
    form_type = '10-k' if annual else '10-q'
    scores = {}
    problems = []
    for ticker in tqdm(tickers):
        rss = ('https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=' + str(ticker)
               + '&type=' + form_type + '%25&dateb=&owner=exclude&start=0&count=40&output=atom')
        #obtain SEC edgar index page links (the feed is parsed only once)
        feed = BeautifulSoup(requests.get(rss).content, 'html.parser')
        filing_dates = [tag.string for tag in feed.find_all('filing-date')]
        indices = [tag.string for tag in feed.find_all('filing-href')]
        if annual and len(indices) >= 2:
            indices = indices[:2]
            filing_dates = filing_dates[:2]
        elif not annual and len(indices) >= 4:
            start = filing_dates[0]
            start_year = int(start.split('-')[0])
            start_month = int(start.split('-')[1])
            #filings dated in the calendar year before the latest filing
            correct_year = [d for d in filing_dates if start_year - int(d.split('-')[0]) == 1]
            if not correct_year:
                #nothing from the previous year to compare against
                problems.append(ticker)
                continue
            #first prior-year filing whose month is closest to the latest filing's month
            closest = min(correct_year, key=lambda d: abs(start_month - int(d.split('-')[1])))
            correct_report = filing_dates.index(closest)
            indices = [indices[0], indices[correct_report]]
            filing_dates = [filing_dates[0], filing_dates[correct_report]]
        else:
            problems.append(ticker)
            continue
        #obtain links to relevant reports
        report_links = []
        for index_url in indices:
            index_page = BeautifulSoup(requests.get(index_url).content, 'html.parser')
            index_links = [a.get('href') for a in index_page.find_all('a')]
            #anchors without an href give None and must be skipped before the substring test
            index_links = [h for h in index_links
                           if h and '/Archives/edgar/data' in h and h.endswith('.htm')]
            if index_links:
                report_links.append('https://www.sec.gov' + index_links[0])
        if len(report_links) != 2:
            #one of the two filings has no usable document link
            problems.append(ticker)
            continue
        #identify whether report has recently changed to an interactive format:
        #the first 22 characters are 'https://www.sec.gov/' plus two path characters
        #(presumably separating '/ix?doc=' viewer links from '/Archives' links - confirm)
        format_change_dummy = int(report_links[0][:22] != report_links[1][:22])
        #obtain texts
        try:
            texts = []
            for link in report_links:
                with urllib.request.urlopen(link) as response:
                    raw_html = response.read().decode('utf-8')
                texts.append(Clean(get_text(raw_html)))
        except (OSError, UnicodeDecodeError):
            #URLError/HTTPError derive from OSError; one bad download should not end the whole run
            problems.append(ticker)
            continue
        #calculate cosine score and yoy % change in sentiment
        latest_polarity = TextBlob(texts[0]).sentiment.polarity
        prior_polarity = TextBlob(texts[1]).sentiment.polarity
        if prior_polarity == 0:
            #relative change is undefined against a zero baseline
            sentiment_d = float('nan')
        else:
            sentiment_d = (latest_polarity - prior_polarity) / prior_polarity
        vec = CountVectorizer(input='content', stop_words='english')
        vectors = vec.fit_transform(texts)
        cosine = cosine_similarity(vectors[0], vectors[1]).item()
        scores.update({ticker:{'cosine':cosine,'sentiment_d':sentiment_d,'format_change':format_change_dummy,'dates':filing_dates}})
    if not scores:
        #without any row the frame would lack the metric columns used below
        print('Reports unavailable for: ', problems)
        return pd.DataFrame(columns=['cosine', 'sentiment_d', 'format_change', 'dates', 'date_delta'])
    #create dataframe and convert metrics to float
    df = pd.DataFrame.from_dict(scores).transpose()
    df['cosine'] = df['cosine'].astype(float)
    df['sentiment_d'] = df['sentiment_d'].astype(float)
    df['format_change'] = df['format_change'].astype(int)
    #calculate number of workdays between month/day-filing dates
    #(networkdays runs from the latest date to the earlier one; 262 is presumably
    #one year of working days used to cancel the year gap - confirm)
    df['date_delta'] = df.dates.apply(lambda x: workdays.networkdays(pd.to_datetime(x[0]),pd.to_datetime(x[1]))+262)
    #remove rows for reports filed more than one month after previous year's date to account for database errors
    problems += list(df[df.date_delta.abs()>31].index)
    df = df[df.date_delta.abs()<=31]
    print('Reports unavailable for: ', problems)
    return df

In [170]:
#Get the report document links (10-K or 10-Q) listed on each company's filing index pages
def get_all_links(tickers, k_or_q='k'):
    """Collect one report link per filing for every ticker.

    The EDGAR Atom feed of each ticker is read, every filing index page listed
    in it is fetched, and the first anchor whose href contains
    '/Archives/edgar/data' and ends in '.htm' is kept for that filing.

    Parameters
    ----------
    tickers : iterable
        Company identifiers; each one is converted with ``str`` and placed in
        the ``CIK=`` query parameter of the EDGAR URL.
    k_or_q : str, default 'k'
        ``'k'`` queries 10-K filings; any other value queries 10-Q filings.

    Returns
    -------
    dict
        ``{ticker: {filing_date: link}}``. Filings without a matching link are
        left out. Two filings with the same date share one key, so the later
        one in the feed overwrites the earlier one.
    """
    form_type = '10-k' if k_or_q == 'k' else '10-q'
    hrefs = {}
    for ticker in tqdm(tickers):
        rss = ('https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=' + str(ticker)
               + '&type=' + form_type + '%25&dateb=&owner=exclude&start=0&count=40&output=atom')
        #the feed is parsed once and queried for both tag types
        feed = BeautifulSoup(requests.get(rss).content, 'html.parser')
        filing_dates = [tag.string for tag in feed.find_all('filing-date')]
        indices = [tag.string for tag in feed.find_all('filing-href')]
        report_links = {}
        #zip pairs each index page with its date and stops at the shorter list
        for filing_date, index_url in zip(filing_dates, indices):
            index_page = BeautifulSoup(requests.get(index_url).content, 'html.parser')
            index_links = [a.get('href') for a in index_page.find_all('a')]
            #anchors without an href give None and must be skipped before the substring test
            index_links = [h for h in index_links
                           if h and '/Archives/edgar/data' in h and h.endswith('.htm')]
            if index_links:
                report_links[filing_date] = 'https://www.sec.gov' + index_links[0]
        hrefs[ticker] = report_links
    return hrefs

In [None]:
def get_metrics(links):
    """Compute sentiment changes and cosine scores between consecutive reports.

    Parameters
    ----------
    links : dict
        Maps each ticker to its report URLs, either as a sequence of URLs or
        as a dict whose values are the URLs (the shape ``get_all_links``
        returns). Consecutive URLs are compared pairwise in the given order.

    Returns
    -------
    dict
        ``{ticker: [{'sentiment_ds': [...]}, {'cosine_score': [...]}]}`` where
        entry ``i`` of each list compares report ``i`` with report ``i+1``.
        A sentiment delta is NaN when report ``i+1`` has a polarity of
        exactly 0. A ticker whose processing raised an error maps to the
        string ``'problem'``.
    """
    vec = CountVectorizer(input='content', stop_words='english')
    dic = {}
    for ticker in tqdm(links):
        try:
            entry = links[ticker]
            urls = list(entry.values()) if isinstance(entry, dict) else list(entry)
            #download and clean every report before any metric is computed
            texts = []
            for url in urls:
                with urllib.request.urlopen(url) as response:
                    raw_html = response.read().decode('utf-8')
                texts.append(Clean(get_text(raw_html)))
            sentiments = [TextBlob(report).sentiment.polarity for report in texts]
            sentiment_deltas = []
            for i in range(len(sentiments)-1):
                if sentiments[i+1] == 0:
                    #relative change is undefined against a zero baseline
                    sentiment_deltas.append(float('nan'))
                else:
                    sentiment_deltas.append((sentiments[i]-sentiments[i+1])/sentiments[i+1])
            vectors = vec.fit_transform(texts)
            cosines = [cosine_similarity(vectors[i],vectors[i+1]).item()
                       for i in range(vectors.shape[0]-1)]
            dic.update({ticker:[{'sentiment_ds':sentiment_deltas},{'cosine_score':cosines}]})
        except Exception:
            #best effort per ticker; Exception (not a bare except) keeps Ctrl-C working
            print('problem')
            dic.update({ticker:'problem'})
    return dic

In [162]:
#Compute sentiment deltas and cosine scores between consecutive reports of each ticker
#NOTE: the loop below iterates over an empty list, so as written no ticker is processed
vec = CountVectorizer(input='content', stop_words='english')
dic = {}
problems = {}
for ticker in tqdm([]):
    try:
        texts = []
        sentiments = []
        cosines = []
        #download and clean every report linked for this ticker
        for i in range(len(links_copy[ticker])):
            with urllib.request.urlopen(links_copy[ticker][i]) as response:
                html = response.read().decode('utf-8')
            text = get_text(html)
            text = Clean(text)
            texts.append(text)
        for i in texts:
            sentiments.append(TextBlob(i).sentiment.polarity)
        #entry i compares report i with report i+1
        sentiment_deltas = [(sentiments[i]-sentiments[i+1])/sentiments[i+1] for i in range(len(sentiments)-1)]
        vectors = vec.fit_transform(texts)
        for i in range(vectors.shape[0]-1):
            cosines.append(cosine_similarity(vectors[i],vectors[i+1]).item())
        dic.update({ticker:[{'sentiment_ds':sentiment_deltas},{'cosine_score':cosines}]})
    except Exception:
        #best effort per ticker; Exception (not a bare except) keeps Ctrl-C working
        print('problem')
        dic.update({ticker:'problem'})

100%|██████████| 491/491 [3:39:04<00:00, 21.44s/it]    


In [None]:
def ComputeJaccardSimilarity(words_A, words_B):
    """Return the Jaccard similarity |A ∩ B| / |A ∪ B| of two word collections.

    Parameters
    ----------
    words_A, words_B : iterable of hashable
        Words of each document. Sets are used as given in value; any other
        iterable is de-duplicated first.

    Returns
    -------
    float
        A value between 0.0 and 1.0. Two empty inputs return 1.0 (the usual
        convention that two empty sets are identical) rather than dividing by
        zero.
    """
    set_A = set(words_A)
    set_B = set(words_B)

    # Count number of words in both A and B
    words_intersect = len(set_A & set_B)

    # Count number of words in A or B (inclusion-exclusion avoids building the union)
    words_union = len(set_A) + len(set_B) - words_intersect

    if words_union == 0:
        return 1.0

    # Compute Jaccard similarity score
    return words_intersect / words_union

In [None]:
#Obtain Jaccard Similarity Scores for consecutive 10-k reports
#joinedpairs is defined outside this cell; items [0] and [1] of each element are
#passed to pd.read_csv (presumably the files of two consecutive reports - confirm).
jaccard_scores = []
error = []  #never written to inside this cell
for i in tqdm(joinedpairs):
    #Each text is the printed form of the loaded DataFrame with the first 26 and
    #last 11 characters cut off.
    #NOTE(review): str(DataFrame) follows the pandas display settings and may
    #abbreviate long content - confirm the full report text survives here.
    text1 = str(pd.read_csv(i[0]))[26:-11]
    text2 = str(pd.read_csv(i[1]))[26:-11]
    #whitespace-separated unique words of each report
    words1 = set(text1.split())
    words2 = set(text2.split())
    jaccard_score = ComputeJaccardSimilarity(words1,words2)
    #scores are stored in the iteration order of joinedpairs
    jaccard_scores.append(jaccard_score)