In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests, json, re, csv
from datetime import datetime as dt

In [2]:
def getREITinfo():
    """Scrape Wikipedia's list of publicly traded US REITs into a DataFrame.

    Each table row after the first ``<tr>`` on the page is read as one REIT
    when it has at least two ``<td>`` cells and contains an element with the
    ``external text`` class: the first cell is the company name, the second
    the REIT type, and the link text the ticker. Reading all three values
    from the same row keeps them aligned even if the page carries extra
    external links elsewhere. The ticker is then substituted into the EDGAR
    company-browse URL template.

    Returns
    -------
    pandas.DataFrame
        Columns ``company``, ``type``, ``ticker`` and ``company_url``.

    Raises
    ------
    requests.HTTPError
        If the Wikipedia request comes back with an error status.
    """
    url = 'https://en.wikipedia.org/wiki/List_of_public_REITs_in_the_United_States'
    edgarurl = 'https://www.sec.gov/cgi-bin/browse-edgar?CIK={}&owner=exclude&action=getcompany'

    # Fail loudly on a hung connection or an error page instead of parsing it.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    records = []
    # The first <tr> is skipped, as before (it is expected to be the header).
    for row in soup.find_all('tr')[1:]:
        cells = row.find_all('td')
        link = row.find(class_='external text')
        # Header/separator rows and rows without a ticker link carry no REIT.
        if len(cells) < 2 or link is None:
            continue
        records.append((
            cells[0].get_text().replace("\n", ''),
            cells[1].get_text().replace("\n", ''),
            # Take the link text directly rather than slicing the tag's HTML.
            link.get_text().strip(),
        ))

    df = pd.DataFrame(records, columns=['company', 'type', 'ticker'])
    df['company_url'] = [edgarurl.format(ticker) for ticker in df['ticker']]
    return df

In [3]:
# Scrape the REIT list and check how many rows and columns came back.
df = getREITinfo()
df.shape

(75, 4)

In [4]:
# Show only the rows whose 'type' column equals 'Office'.
df.loc[df.type=='Office']

Unnamed: 0,company,type,ticker,company_url
38,Alexandria Real Estate Equities,Office,ARE,https://www.sec.gov/cgi-bin/browse-edgar?CIK=A...
39,Boston Properties,Office,BXP,https://www.sec.gov/cgi-bin/browse-edgar?CIK=B...
40,Brandywine Realty Trust,Office,BDN,https://www.sec.gov/cgi-bin/browse-edgar?CIK=B...
41,Corporate Office Properties Trust,Office,OFC,https://www.sec.gov/cgi-bin/browse-edgar?CIK=O...
42,Mack-Cali Realty Corporation,Office,CLI,https://www.sec.gov/cgi-bin/browse-edgar?CIK=C...
43,SL Green Realty,Office,SLG,https://www.sec.gov/cgi-bin/browse-edgar?CIK=S...


In [5]:
def spec10Klinks(x, x_type):
    """Select the rows of a REIT frame that belong to one REIT type.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame with a 'type' column.
    x_type : str
        Value to look for in the 'type' column.

    Returns
    -------
    pandas.DataFrame or str
        The matching rows, taken from a copy of ``x``. If ``x_type`` is not
        one of the values present in the 'type' column, a message string
        listing the available values is returned instead.
    """
    known_types = x['type'].unique()
    if x_type in known_types:
        # Work on a copy so the caller's frame is never the slice's parent.
        snapshot = x.copy()
        return snapshot.loc[snapshot['type'] == x_type]
    return 'Check that "x_type" is in:{}'.format(known_types)

In [6]:
# Slice out the 'Office' rows via the helper and check the resulting shape.
office = spec10Klinks(df, 'Office')
office.shape

(6, 4)

In [7]:
# Preview the first rows of the Office subset.
office.head()

Unnamed: 0,company,type,ticker,company_url
38,Alexandria Real Estate Equities,Office,ARE,https://www.sec.gov/cgi-bin/browse-edgar?CIK=A...
39,Boston Properties,Office,BXP,https://www.sec.gov/cgi-bin/browse-edgar?CIK=B...
40,Brandywine Realty Trust,Office,BDN,https://www.sec.gov/cgi-bin/browse-edgar?CIK=B...
41,Corporate Office Properties Trust,Office,OFC,https://www.sec.gov/cgi-bin/browse-edgar?CIK=O...
42,Mack-Cali Realty Corporation,Office,CLI,https://www.sec.gov/cgi-bin/browse-edgar?CIK=C...


In [8]:
def get10KRecent(urllist, year=None):
    """Collect links to 10-K filing index pages filed in a given year.

    Each URL is fetched and every row of its ``tableFile2`` tables is
    inspected. Rows whose first cell reads '10-K' or 'FORM 10-K' contribute
    the link found in their second cell. A link is kept only when the
    two-digit year embedded in its accession number
    (``...-YY-NNNNNN-index.htm``) matches ``year``.

    Parameters
    ----------
    urllist : iterable of str
        EDGAR company-browse URLs, e.g. the ``company_url`` column.
    year : int or str, optional
        Filing year to keep; only its last two digits are compared.
        Defaults to the current calendar year.

    Returns
    -------
    list of str
        Absolute URLs of the matching filing index pages, in page order.

    Raises
    ------
    requests.HTTPError
        If any request comes back with an error status.
    """
    base10k = 'https://www.sec.gov'
    desc_types = ['10-K', 'FORM 10-K']
    if year is None:
        year = dt.now().year
    year_suffix = '{:02d}'.format(int(year) % 100)
    # Read the year from the accession number instead of a fixed offset so
    # that a different suffix length (e.g. '.html') does not break the match.
    accession_year = re.compile(r'-(\d{2})-\d{6}-index\.html?$')

    recent10k = []
    for url in urllist:
        # NOTE(review): SEC may reject clients that do not declare a
        # User-Agent; confirm whether headers need to be passed here.
        html = requests.get(url, timeout=30)
        # Surface blocked/failed requests instead of returning an empty list.
        html.raise_for_status()
        soup = BeautifulSoup(html.text, "lxml")
        for table in soup.find_all(class_="tableFile2"):
            for row in table.find_all('tr'):
                cells = row.find_all('td')
                # Need both the filing-type cell and the link cell.
                if len(cells) < 2 or cells[0].text not in desc_types:
                    continue
                anchor = cells[1].a
                if anchor is None or not anchor.get('href'):
                    continue
                link = base10k + anchor['href']
                found = accession_year.search(link)
                if found and found.group(1) == year_suffix:
                    recent10k.append(link)
    return recent10k

In [10]:
# Gather the current-year 10-K index links from each Office REIT's EDGAR page.
recent = get10KRecent(office.company_url)
recent

['https://www.sec.gov/Archives/edgar/data/1035443/000103544319000041/0001035443-19-000041-index.htm',
 'https://www.sec.gov/Archives/edgar/data/1037540/000165642319000006/0001656423-19-000006-index.htm',
 'https://www.sec.gov/Archives/edgar/data/790816/000156459019003921/0001564590-19-003921-index.htm',
 'https://www.sec.gov/Archives/edgar/data/860546/000086054619000007/0000860546-19-000007-index.htm',
 'https://www.sec.gov/Archives/edgar/data/924901/000110465919009587/0001104659-19-009587-index.htm',
 'https://www.sec.gov/Archives/edgar/data/1040971/000104097119000007/0001040971-19-000007-index.htm']