In [802]:
import datetime
import glob
import json
import os
import re
from contextlib import closing

import numpy as np
import pandas as pd
import quandl
from bs4 import BeautifulSoup as Soup
from requests import get
from requests.exceptions import RequestException
# Prefer a key supplied through the QUANDL_API_KEY environment variable so the
# credential does not have to live in the notebook. The literal is kept only
# as a fallback so existing runs keep working.
# NOTE(review): this key has been committed in plain text; it should be
# rotated and the fallback removed.
quandl.ApiConfig.api_key = os.environ.get("QUANDL_API_KEY", "Uyaf1-7qy5o9EJt8z_xc")
    

In [803]:
# CIK strings (zero-padded to 10 digits) that are meant to be excluded.
# NOTE(review): nothing in the visible part of this notebook reads this list —
# confirm whether it is still needed or should be applied when filtering filings.
blacklisted_ciks = ['0001062292']

In [804]:
class HTTParty:
    """Minimal HTTP helper shared by the scraping classes in this notebook."""

    def __init__(self):
        return None

    def simple_get(self, url):
        """
        Fetch `url` with an HTTP GET request.

        Returns the raw response body (`resp.content`) whatever the status
        code or content type is, or None when the request raises a
        `RequestException` (the error is reported through `log_error`).
        """
        try:
            # `closing` makes sure the streamed connection is released.
            with closing(get(url, stream=True)) as resp:
                return resp.content

        except RequestException as e:
            # `log_error` is a method, so it has to be reached through `self`;
            # calling the bare name raised NameError on every failed request.
            self.log_error('Error during requests to {0} : {1}'.format(url, str(e)))
            return None

    def log_error(self, e):
        """
        Report an error message.

        This implementation just prints it; replace the body to route
        errors somewhere else.
        """
        print(e)


In [805]:
class RecentForms:
    """
    One page of the SEC EDGAR "current filings" Atom feed, filtered to 8-K.

    Used by FilingIndex to get recent forms.
    """

    def __init__(self, start=0, count=100):
        """
        Download and parse one feed page.

        start: offset of the first feed entry to request.
        count: number of entries to request.

        `self.entries` holds the parsed `<entry>` elements, or an empty list
        when the download failed.
        """
        # Adjacent string literals keep the URL on one logical line. A
        # triple-quoted string here would embed a newline and the source
        # indentation in the middle of the query string.
        rss = ("https://www.sec.gov/cgi-bin/browse-edgar?action=getcurrent"
               "&CIK=&type=8-K&company=&dateb=&owner=exclude&start=")
        rss = rss + str(start) + "&count="
        rss = rss + str(count) + "&output=atom"
        res = HTTParty().simple_get(rss)
        if res is None:
            # simple_get returns None on a request error; there is nothing to parse.
            self.entries = []
        else:
            soup = Soup(res, 'xml')
            self.entries = soup.find_all('entry')

    def get_all(self):
        """Return a DataFrame with one row per feed entry (see Entry.to_dict)."""
        return pd.DataFrame.from_records([Entry(entry).to_dict() for entry in self.entries])
        
class Entry:
    """Wraps a feed `<entry>` Soup object with accessors for that form's info."""

    # Titles are expected to look like "8-K - COMPANY NAME (0001234567) (Filer)".
    # `form` is lazy so it stops at the first " - "; `name` is greedy so it
    # runs up to the LAST "(digits)" group, which keeps hyphens and
    # parentheses that belong to the company name.
    _TITLE_PATTERN = re.compile(r'^(?P<form>.*?)\s+-\s+(?P<name>.*)\s+\((?P<cik>\d+)\)')

    def __init__(self, soup_entry):
        self.entry = soup_entry
        return None

    def _title_text(self):
        """Return the raw text of the entry's `<title>` element."""
        return self.entry.find('title').get_text()

    def title(self):
        """Return the upper-cased company name taken from the entry title."""
        title_text = self._title_text()
        match = self._TITLE_PATTERN.match(title_text)
        if match:
            return match.group('name').strip().upper()
        # Unrecognised layout: fall back to plain splitting on "(" and "-".
        return title_text.split('(')[0].split('-')[-1].strip().upper()

    def cik(self):
        """Return the CIK from the entry title as a string (leading zeros kept)."""
        title_text = self._title_text()
        match = self._TITLE_PATTERN.match(title_text)
        if match:
            return match.group('cik')
        # Unrecognised layout: fall back to the text inside the first "(...)".
        return str(title_text.split(')')[0].split('(')[-1])

    def link(self):
        """Return the entry's href with the '-index.htm' suffix replaced by '.txt'."""
        return self.entry.find('link').get_attribute_list('href')[0].replace('-index.htm', '.txt')

    def summary(self):
        """
        Return the summary parts after the first '<br>', joined with ' --- '
        and with newlines removed.
        """
        text = self.entry.find('summary').get_text()
        return ' --- '.join(text.split('<br>')[1:]).replace("\n", '')

    def updated(self):
        """Return the text of the entry's `<updated>` element."""
        return self.entry.find('updated').get_text()

    def to_dict(self):
        """Return all accessors as one flat dict (one DataFrame row)."""
        return {
            'title': self.title(),
            'cik': self.cik(),
            'link': self.link(),
            'summary': self.summary(),
            'updated': self.updated()
        }


In [842]:
# Quick check: fetch the feed page starting at offset 0 (default count of 100)
# and display the parsed entries as a DataFrame.
RecentForms(0).get_all()

Unnamed: 0,cik,link,summary,title,updated
0,0000949721,http://www.sec.gov/Archives/edgar/data/949721/...,Item 1.01: Entry into a Material Definitive Ag...,ATTIS INDUSTRIES INC.,2018-06-01T17:30:17-04:00
1,0001604191,http://www.sec.gov/Archives/edgar/data/1604191...,Item 5.02: Departure of Directors or Certain O...,"AZURRX BIOPHARMA, INC.",2018-06-01T17:28:43-04:00
2,0001223389,http://www.sec.gov/Archives/edgar/data/1223389...,Item 5.07: Submission of Matters to a Vote of ...,CONNS INC,2018-06-01T17:28:03-04:00
3,0001536256,http://www.sec.gov/Archives/edgar/data/1536256...,Item 1.01: Entry into a Material Definitive Ag...,FIRST CAPITAL REAL ESTATE TRUST INC,2018-06-01T17:27:25-04:00
4,0001701605,http://www.sec.gov/Archives/edgar/data/1701605...,Item 5.02: Departure of Directors or Certain O...,BAKER HUGHES A GE CO,2018-06-01T17:26:52-04:00
5,0000808362,http://www.sec.gov/Archives/edgar/data/808362/...,Item 5.02: Departure of Directors or Certain O...,BAKER HUGHES A GE CO LLC,2018-06-01T17:26:52-04:00
6,0001033012,http://www.sec.gov/Archives/edgar/data/1033012...,Item 8.01: Other Events --- Item 9.01: Financi...,FLAGSTAR BANCORP INC,2018-06-01T17:26:39-04:00
7,0000796764,http://www.sec.gov/Archives/edgar/data/796764/...,Item 1.01: Entry into a Material Definitive Ag...,"PREMIER EXHIBITIONS, INC.",2018-06-01T17:26:23-04:00
8,0001591890,http://www.sec.gov/Archives/edgar/data/1591890...,Item 8.01: Other Events,"1347 PROPERTY INSURANCE HOLDINGS, INC.",2018-06-01T17:25:50-04:00
9,0000887359,http://www.sec.gov/Archives/edgar/data/887359/...,Item 1.01: Entry into a Material Definitive Ag...,VERICEL CORP,2018-06-01T17:22:12-04:00


In [806]:
class FilingIndex:
    """Maintains `filings_index.csv`, the local index of filings seen so far."""

    def __init__(self, start=0):
        # Feed offset handed to RecentForms when updating.
        self.start = start
        return None

    # Gets recent forms idempotently and saves to filings_index.csv
    def update(self):
        """
        Merge one page of recent filings into `filings_index.csv`.

        Rows are de-duplicated on `link`, with freshly fetched rows placed
        first. Returns the number of rows added to the index; returns 0
        without touching the file when there is nothing to merge.
        """
        filings_index_df = self.get()
        recent_filings_df = RecentForms(self.start).get_all()
        # Empty frames are left out: they carry no rows, and when both are
        # empty there is no 'link' column to de-duplicate on.
        frames = [df for df in (recent_filings_df, filings_index_df) if not df.empty]
        if not frames:
            return 0
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        all_filings = pd.concat(frames)
        deduped_filings = all_filings.drop_duplicates(subset=['link'])
        new_filing_count = len(deduped_filings) - len(filings_index_df)
        deduped_filings.to_csv('filings_index.csv', index=False)
        return new_filing_count

    def get(self):
        """
        Return the stored index as a DataFrame.

        A missing or completely empty `filings_index.csv` yields an empty
        DataFrame; any other read error propagates.
        """
        try:
            return pd.read_csv('filings_index.csv')
        except (FileNotFoundError, pd.errors.EmptyDataError):
            return pd.DataFrame.from_dict({})

In [807]:
class CompanyTickerMapping:
    """
    Lookup table between company CIKs and ticker symbols.

    Backed by the static file 'company_ticker_mapping.csv' unless a
    DataFrame is handed in directly.
    """

    def __init__(self, ticker_mapping=None):
        # Only an exact DataFrame is accepted as-is; anything else
        # (including the default None) triggers a read of the csv.
        frame_supplied = type(ticker_mapping) is pd.core.frame.DataFrame
        if not frame_supplied:
            ticker_mapping = pd.read_csv('company_ticker_mapping.csv')
        self.ticker_mapping = ticker_mapping

    def get(self):
        """Return the underlying mapping DataFrame."""
        return self.ticker_mapping

    def ticker_symbol_from_cik(self, cik):
        """Return the ticker symbol of the first row whose `cik` equals the argument."""
        matching_rows = self.where_equal('cik', cik)
        symbols = matching_rows.ticker_symbol.values
        return symbols[0]

    def where_equal(self, column, value):
        """Return the rows where `column` equals `value`."""
        mapping = self.ticker_mapping
        row_mask = mapping[column] == value
        return mapping.loc[row_mask]

    def where_in(self, column, list_of_values):
        """Return the rows where `column` is one of `list_of_values`."""
        mapping = self.ticker_mapping
        row_mask = mapping[column].isin(list_of_values)
        return mapping.loc[row_mask]

In [808]:
class FilingText:
    """Downloads filing documents into the local `filing_texts` folder."""

    def __init__(self):
        return None

    # Saves the document text of the last n filings to the local filing_texts folder
    def update(self, num=-1):
        """
        Download every filing that is not on disk yet.

        num: how many rows from the top of the filing index to consider;
             a negative value (the default) means all of them.

        Returns the list of filing dicts (see `ticker_symbol_dates`) whose
        text was written during this call.
        """
        updated = []
        for filing in self.ticker_symbol_dates(num):
            filename = filing['filename']
            if os.path.isfile(filename):
                continue
            content = HTTParty().simple_get(filing['link'])
            if content is None:
                # Download failed. Writing a placeholder file would make the
                # filing look "done" and block any later retry.
                continue
            if isinstance(content, bytes):
                # str(bytes) would store the b'...' repr with escaped
                # newlines instead of the document text.
                doc_text = content.decode('utf-8', errors='replace')
            else:
                doc_text = str(content)
            os.makedirs(os.path.dirname(filename) or '.', exist_ok=True)
            # newline='' keeps the document's own line endings untouched.
            with open(filename, 'w', encoding='utf-8', newline='') as f:
                f.write(doc_text)
            updated.append(filing)
        return updated

    def ticker_symbol_dates(self, num=-1):
        """
        Describe the indexed filings of companies we have a ticker symbol for.

        num: how many rows from the top of the filing index to consider;
             a negative value (the default) means all of them.

        Returns a list of dicts with the keys 'ticker_symbol', 'date'
        (the part of `updated` before 'T'), 'filename' and 'link'.
        """
        mapping = CompanyTickerMapping()
        tm = mapping.get()
        # Get a list of ciks that we have stock ticker symbols for
        ciks = tm.cik
        # Find recent filings
        filings = FilingIndex().get()
        if num is not None and num >= 0:
            # Slicing with the -1 default would silently drop the last row.
            filings = filings[:num]
        if 'cik' not in filings.columns:
            # Empty index (no csv yet): nothing to report.
            return []
        fwks = filings.loc[filings['cik'].isin(ciks.values)]
        output = []
        for filing in fwks.itertuples(index=False):
            ticker_symbol = mapping.ticker_symbol_from_cik(filing.cik)
            date = str(filing.updated).split('T')[0]
            filename = f'filing_texts/{ticker_symbol}_{date}'
            output.append({'ticker_symbol': ticker_symbol, 'date': date, 'filename': filename, 'link': filing.link})
        return output
            

In [817]:
class PriceCsv:
    """Maintains one price csv per ticker under `prices/`, covering a window of days around each filing date."""

    def __init__(self):
        # Day offsets relative to the filing date: 10 days before through 1 day after.
        self.range = range(-10, 2)

    def get(self, ticker_symbol):
        """
        Return the stored prices for `ticker_symbol` as a DataFrame.

        A missing or completely empty csv yields an empty DataFrame; any
        other read error propagates.
        """
        try:
            return pd.read_csv(f'prices/{ticker_symbol}.csv')
        except (FileNotFoundError, pd.errors.EmptyDataError):
            return pd.DataFrame.from_dict({})

    def update_all(self):
        """Update prices for every filing returned by FilingText().ticker_symbol_dates()."""
        all_tickers = FilingText().ticker_symbol_dates()
        self.update(all_tickers)

    def update(self, list_of_objects=None):
        """
        Fetch and store prices around each filing date.

        list_of_objects: iterable of dicts with at least the keys 'date'
            ('YYYY-MM-DD') and 'ticker_symbol'. When omitted, all filings
            from FilingText().ticker_symbol_dates() are used.

        Returns the list that was processed.
        """
        # If list_of_objects isn't passed in, try to update all prices
        if list_of_objects is None:
            list_of_objects = FilingText().ticker_symbol_dates()
        for obj in list_of_objects:
            filing_date = obj['date']
            ticker_symbol = obj['ticker_symbol']
            existing_prices = self.get(ticker_symbol)
            date_range = self.__date_range(filing_date)
            # 'Continue' here if any price in the date_range is in the future
            if datetime.datetime.strptime(date_range[-1], '%Y-%m-%d') >= datetime.datetime.now():
                continue
            # 'Continue' here if date_range is filled in for this ticker_symbol in the csv.
            if self.__is_in_range(date_range, ticker_symbol):
                continue
            try:
                recent_prices = self.__av_fetch(date_range, ticker_symbol)
            except Exception:
                # Any Alpha Vantage failure (network, bad payload, missing
                # key in the response) falls back to Quandl. `Exception`
                # rather than a bare except so Ctrl-C still interrupts.
                recent_prices = self.__quandl_fetch(date_range, ticker_symbol)
            if len(recent_prices) == 0:
                continue
            # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
            frames = [recent_prices] if existing_prices.empty else [recent_prices, existing_prices]
            all_prices = pd.concat(frames)
            deduped_prices = all_prices.drop_duplicates(subset=['date'])
            os.makedirs('prices', exist_ok=True)
            deduped_prices.to_csv(f'prices/{ticker_symbol}.csv', index=False)
        return list_of_objects

    def __date_range(self, date):
        """Return the 'YYYY-MM-DD' strings for every offset in `self.range` around `date`."""
        anchor = datetime.datetime.strptime(date, '%Y-%m-%d').date()
        return [str(anchor + datetime.timedelta(days=delta)) for delta in self.range]

    def __quandl_fetch(self, dates, symbol):
        """Query the Quandl 'WIKI/PRICES' table for `symbol` between the first and last entry of `dates`."""
        gte = dates[0]
        lte = dates[-1]
        data = quandl.get_table('WIKI/PRICES', qopts = { 'columns': ['ticker', 'date', 'close', 'open', 'high', 'low'] }, ticker = [symbol], date = { 'gte': gte, 'lte': lte })
        return data

    def __av_fetch(self, dates, symbol):
        """
        Fetch daily prices for `symbol` from Alpha Vantage.

        Returns a DataFrame sorted by date with one row per entry of
        `dates`; days missing from the response get 'N/A' in every price
        column.
        """
        # NOTE(review): the fallback key has been committed in plain text;
        # prefer setting ALPHAVANTAGE_API_KEY and rotating the key.
        api_key = os.environ.get('ALPHAVANTAGE_API_KEY', 'QPL6YTN5VA6V7MP8')
        url = f'https://www.alphavantage.co/query?function=TIME_SERIES_DAILY_ADJUSTED&symbol={symbol}&apikey={api_key}'
        response = json.loads(HTTParty().simple_get(url))
        response = response['Time Series (Daily)']
        records = []
        for date in dates:
            info = response.get(date, {})
            if info:
                prices = {
                    'open': info['1. open'],
                    'high': info['2. high'],
                    'low': info['3. low'],
                    'close': info['4. close']
                }
            else:
                prices = {'open': 'N/A', 'high': 'N/A', 'low': 'N/A', 'close': 'N/A'}
            records.append({'date': date, 'ticker': symbol, **prices})
        df = pd.DataFrame.from_records(records)
        df = df.sort_values(by='date')
        return df

    def __is_in_range(self, date_range, symbol):
        """Return True when the stored csv for `symbol` already has a row for every date in `date_range`."""
        df = self.get(symbol)
        if 'date' not in df.columns:
            # No csv yet (or one without a date column): nothing is covered.
            return False
        dates_included_already = int(df['date'].isin(date_range).sum())
        return dates_included_already == len(date_range)




In [834]:
class Cron:
    """Drives one full refresh: filing index, then filing texts, then prices."""

    def __init__(self, start=0):
        # Feed offset of the first page to process.
        self.start = start
        return None

    def update(self):
        """
        Page through the feed, 100 entries at a time, for at most 100 pages.

        Paging begins at `self.start` (previously the stored offset was
        ignored and every run began at 0). For each page the filing index
        is updated, the newly indexed filings are downloaded, and their
        prices are refreshed. Stops as soon as a page yields no filings to
        process.

        Returns the total number of filings handed to the price update.
        """
        total_updated = 0
        for i in range(0, 100):
            doc_start = self.start + i*100
            n_updated = FilingIndex(doc_start).update()
            objs_updated = FilingText().update(n_updated)
            prices_updated = PriceCsv().update(objs_updated)
            total_updated = total_updated + len(prices_updated)
            if len(prices_updated) == 0:
                return total_updated
        return total_updated
            

In [836]:
# Run one refresh starting from the top of the feed; the displayed value is
# the count returned by Cron.update().
Cron().update()

0

0

In [825]:
# Run a refresh for each successive feed offset and print how many filings
# each run processed. Cron.update() returns an int, so it is printed
# directly; wrapping it in len() raises TypeError.
for i in range(11, 10000):
    print(Cron(i*100).update())

0
0
0
0
0
0
0
0
0
0
0
0
0
0


KeyboardInterrupt: 

In [None]:
# Refresh prices for every indexed filing that has a known ticker symbol.
PriceCsv().update(FilingText().ticker_symbol_dates())

datetime.datetime(2018, 6, 2, 11, 56, 27, 729161)

In [743]:
# rm filing_texts/* && rm filings_index.csv && touch filings_index.csv && rm prices/*

ERROR:root:Line magic function `%touch` not found.


rm: prices/*: No such file or directory
