# Assignment 7

We will analyse the similarity score for five companies over five years.

##  Downloading the index files

In [1]:
import os
import pandas as pd
import itertools
import sys

In [2]:
# Root of the SEC EDGAR archive tree (no trailing slash).
SEC_GOV_URL = 'http://www.sec.gov/Archives'
# Template for a quarterly form index; format with (year, quarter).
# URLs always use "/", so join explicitly instead of os.path.join, which would
# insert the platform path separator (a backslash on Windows).
FORM_INDEX_URL = '/'.join([SEC_GOV_URL, 'edgar', 'full-index', '{}', 'QTR{}', 'form.idx'])

In [3]:
def download_index(year_start, year_end):
    """Download the EDGAR quarterly form indexes and keep only the 10-K rows.

    Parameters
    ----------
    year_start, year_end : int
        First and last year to fetch. Both ends are included and all four
        quarters of every year are requested.

    Returns
    -------
    pandas.DataFrame
        One row per 10-K entry with the columns ``form_type``,
        ``company_name``, ``cik``, ``date`` and ``file``, re-indexed from 0.
        An empty DataFrame is returned when the year range is empty.
    """
    # Collect the per-quarter frames and concatenate once at the end; calling
    # pd.concat inside the loop re-copies all accumulated rows every iteration.
    quarterly_frames = []

    for year, qtr in itertools.product(range(year_start, year_end + 1), range(1, 5)):

        index_url = FORM_INDEX_URL.format(year, qtr)

        # Printing the progress ("\r" rewrites the same console line)
        sys.stdout.write("\rDownloading " + index_url)
        sys.stdout.flush()

        # The index is a fixed-width text file; the first 10 lines are skipped
        # and the columns are sliced by character position.
        df = pd.read_fwf(index_url, skiprows=10, header=None,
                         colspecs=[(0, 11), (12, 73), (74, 85), (86, 96), (98, 150)],
                         names=['form_type', 'company_name', 'cik', 'date', 'file'],
                         encoding='latin-1')

        quarterly_frames.append(df[df.form_type == '10-K'])

    if not quarterly_frames:
        # pd.concat raises on an empty list; keep the original empty result.
        return pd.DataFrame()

    return pd.concat(quarterly_frames, ignore_index=True)
    

In [4]:
# Inclusive range of filing years to fetch
year_start, year_end = 2011, 2015
# Build the 10-K index for that range (one form.idx download per quarter)
data = download_index(year_start=year_start, year_end=year_end)

Downloading http://www.sec.gov/Archives/edgar/full-index/2015/QTR4/form.idx

In [5]:
# Preview the first rows of the 10-K index built above
data.head()

Unnamed: 0,form_type,company_name,cik,date,file
0,10-K,1ST CONSTITUTION BANCORP,1141807,2011-03-23,edgar/data/1141807/0001214659-11-000973.txt
1,10-K,1ST SOURCE CORP,34782,2011-02-17,edgar/data/34782/0000034782-11-000008.txt
2,10-K,"1st Century Bancshares, Inc.",1420525,2011-03-15,edgar/data/1420525/0001420525-11-000013.txt
3,10-K,1st FRANKLIN FINANCIAL CORP,38723,2011-03-30,edgar/data/38723/0000038723-11-000025.txt
4,10-K,"1st United Bancorp, Inc.",1415277,2011-02-14,edgar/data/1415277/0001171200-11-000120.txt


In [6]:
# Row count, column dtypes and non-null counts of the 10-K index
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41407 entries, 0 to 41406
Data columns (total 5 columns):
form_type       41407 non-null object
company_name    41407 non-null object
cik             41407 non-null int64
date            41407 non-null object
file            41407 non-null object
dtypes: int64(1), object(4)
memory usage: 1.6+ MB


In [7]:
# Companies to analyse. The index is filtered with an exact-match `isin` on
# `company_name`, so each entry must be spelled exactly as it appears there.
companies = [
    'APPLE INC',
    'ORACLE CORP',
    'MICROSOFT CORP',
    'Google Inc.',
    'YAHOO INC',
]

In [8]:
# Keep only the selected companies. Take an explicit copy so that adding the
# `mda` column later writes to an independent frame instead of a slice of the
# full index (which raises SettingWithCopyWarning).
data = data[data.company_name.isin(companies)].copy()

## Downloading MDA

In [9]:
from bs4 import BeautifulSoup
import unicodedata
import re
import requests

from tqdm import tqdm
# Register tqdm with pandas so `progress_apply` (used when fetching the
# filings) shows a progress bar labelled "Processing".
tqdm.pandas(desc="Processing")

In [10]:
def get_mda(url, timeout=30):
    """Download one filing and return the Item 7 (MD&A) section extracted from it.

    Parameters
    ----------
    url : str
        Path of the filing relative to the EDGAR archive root, as stored in
        the ``file`` column of the index (e.g. ``edgar/data/.../....txt``).
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests.get`` can block indefinitely on a stalled connection.

    Returns
    -------
    str
        The text produced by ``parsing_job`` on the cleaned-up filing.

    Raises
    ------
    requests.exceptions.RequestException
        On a timeout, a connection failure or an HTTP error status. Failing
        loudly avoids parsing an error page as if it were a filing.
    """
    # Local name chosen so it does not shadow the module-level SEC_GOV_URL,
    # which has no trailing slash.
    archives_root = 'http://www.sec.gov/Archives/'
    response = requests.get(archives_root + url, timeout=timeout)
    response.raise_for_status()
    cleaned_text = text_process(response.text)
    return parsing_job(cleaned_text)

In [11]:
def text_process(text):
    """Strip markup from a raw filing and normalise it for header matching.

    The input is parsed with BeautifulSoup and reduced to its text content,
    Unicode-normalised (NFKD), upper-cased, and cleaned of the whitespace and
    line-break artefacts that would otherwise split "ITEM n" headers across
    lines.

    Parameters
    ----------
    text : str
        Raw filing content.

    Returns
    -------
    str
        Upper-cased plain text with "\\n" as the only line separator.
    """
    # Remove the html tags; fall back to html5lib if the built-in parser fails.
    # Catch Exception rather than using a bare except so that
    # KeyboardInterrupt / SystemExit still propagate.
    try:
        soup = BeautifulSoup(text, "html.parser")
    except Exception:
        soup = BeautifulSoup(text, "html5lib")
    text = soup.get_text("\n")

    # Normalize Unicode and make "\n" the only line separator
    text = unicodedata.normalize("NFKD", text)
    text = '\n'.join(text.splitlines())

    # Convert to upper so header matching is case-insensitive
    text = text.upper()

    # Take care of breaklines & whitespaces combinations due to beautifulsoup parsing
    text = re.sub(r'[ ]+\n', '\n', text)   # trailing spaces
    text = re.sub(r'\n[ ]+', '\n', text)   # leading spaces
    text = re.sub(r'\n+', '\n', text)      # collapse blank lines

    # Reformat item headers
    text = text.replace('\n.\n', '.\n')  # re-attach a period stranded on its own line

    text = text.replace('\nI\nTEM', '\nITEM')   # "I" split from "TEM"
    text = text.replace('\nITEM\n', '\nITEM ')  # item number on the next line
    text = text.replace('\nITEM  ', '\nITEM ')  # double space after ITEM

    text = text.replace(':\n', '.\n')  # a colon at line end becomes a period

    # Math symbols for clearer looks
    text = text.replace('$\n', '$')
    text = text.replace('\n%', '%')

    return text

In [12]:
def parsing_job(text):
    """Extract the MD&A section, retrying once if the first hit is too short.

    A non-empty first match smaller than 1000 bytes (UTF-8) is treated as an
    index entry rather than the section itself, so the text is parsed again
    starting from where that first match ended.

    Returns the section text from the attempt that was kept ("" if none).
    """
    section, section_end = parse_mda(text)

    looks_like_index = bool(section) and len(section.encode('utf-8')) < 1000
    if not looks_like_index:
        return section

    retried_section, _ignored_end = parse_mda(text, start=section_end)
    return retried_section

def _find_first_marker(haystack, markers, pos=0, debug=False):
    """Return the index of the first marker (in list order) found at or after pos, else -1."""
    index = -1
    for marker in markers:
        index = haystack.find(marker, pos)
        if debug:
            print(marker, index)
        if index != -1:
            break
    return index


def parse_mda(text, start=0, debug=False):
    """Locate the "ITEM 7" section in a pre-processed filing.

    Parameters
    ----------
    text : str
        Filing text in which item headers start on their own line as
        "ITEM n".
    start : int, optional
        Offset at which to begin searching; everything before it is dropped.
        When non-zero, a later "ITEM 7" header is also accepted as the end of
        the section (for filings without an ITEM 7A).
    debug : bool, optional
        Print every marker tried together with the index it was found at.

    Returns
    -------
    (str, int)
        The stripped section text and the offset where it ends. The offset is
        relative to ``text[start:]``, not to the original ``text``.
        ``("", 0)`` is returned when no start or no end marker is found.
    """
    # Define start & end signal for parsing. Markers are tried in list order
    # and the first one that occurs anywhere wins, not the earliest occurrence.
    item7_begins = ['\nITEM 7.', '\nITEM 7 –', '\nITEM 7:', '\nITEM 7 ', '\nITEM 7\n']
    item7_ends = ['\nITEM 7A']
    if start != 0:
        item7_ends.append('\nITEM 7')  # Case: ITEM 7A does not exist
    item8_begins = ['\nITEM 8']

    text = text[start:]

    begin = _find_first_marker(text, item7_begins, debug=debug)
    if begin == -1:
        return "", 0

    # Search strictly after the start marker so it cannot match itself.
    end = _find_first_marker(text, item7_ends, begin + 1, debug=debug)
    if end == -1:  # ITEM 7A does not exist
        end = _find_first_marker(text, item8_begins, begin + 1, debug=debug)
    if end == -1:
        return "", 0

    return text[begin:end].strip(), end

In [13]:
# Download every filing listed in `file` and store its extracted MD&A text.
# Each row triggers one HTTP request via get_mda, so this cell is slow.
data['mda'] = data['file'].progress_apply(get_mda)

Processing: 100%|██████████| 25/25 [04:43<00:00, 14.66s/it]


In [14]:
# Persist the downloaded sections so the analysis can restart without
# re-fetching. Create the output folder first: to_json does not create
# missing directories and would fail on a fresh checkout.
os.makedirs('./data', exist_ok=True)
data.to_json('./data/mda_5year.json')

# Analysing MDA

We need to calculate the similarity score of the MDA sections; just load the file and...

In [17]:
import pandas as pd
# Reload the frame saved by the download step, so the analysis can start
# from the file instead of re-downloading the filings.
data = pd.read_json('./data/mda_5year.json')

In [18]:
# Check that the reloaded frame carries the `mda` column
data.head()

Unnamed: 0,cik,company_name,date,file,form_type,mda
11165,1288776,Google Inc.,2012-01-26,edgar/data/1288776/0001193125-12-025336.txt,10-K,ITEM 7.\nMANAGEMENTS DISCUSSION AND ANALYSIS ...
14355,1011006,YAHOO INC,2012-02-29,edgar/data/1011006/0001193125-12-086972.txt,10-K,ITEM 7.\nMANAGEMENTS DISCUSSION AND ANALYSIS ...
15289,1341439,ORACLE CORP,2012-06-26,edgar/data/1341439/0001193125-12-284007.txt,10-K,ITEM 7. MANAGEMENTS DISCUSSION AND ANALYSI...
16135,789019,MICROSOFT CORP,2012-07-26,edgar/data/789019/0001193125-12-316848.txt,10-K,ITEM 7. MANAGEMENTS DISCUSSION AND ANALYSIS O...
16501,320193,APPLE INC,2012-10-31,edgar/data/320193/0001193125-12-444068.txt,10-K,ITEM 7.\nMANAGEMENTS DISCUSSION AND ANALYSIS ...
