In [3]:
"""
This code extracts the MD&A sections from 10K financial statements.  The list of paths for the respective 10K's
are obtained from the SEC's master files giving paths to all of the public documents that are filed with the SEC
in each quarter.  The repository includes the actual download links (i.e. downloadindex.sas7bdat and downloadlist.txt) 
that we use in our study.  Our links include all filings classified as '10-K','10-K/A','10-K405/A','10-K405','10-KSB',
'10-KSB/A','10KSB','10KSB/A','10KSB40','10KSB40/A' from 2002 to 2016. 
"""

import csv
import requests
import re
import os

In [4]:
######################################################################################
######################################################################################
# This section is for functions that are used throughout the scrape
######################################################################################
######################################################################################

########################## Obtain file Information ###################################
def parse(file1, file2):
    """Append a summary of a filing's SEC header fields to ``file2``.

    ``file1`` is read line by line. For each of the six header labels below,
    the first line that starts with the label (after stripping surrounding
    whitespace) supplies the value: everything after the first ``:`` on that
    line, stripped. For the SIC line only the digit characters are kept.

    The summary is written in a fixed order (company name, CIK, SIC, form
    type, report period end date, file date) regardless of the order in which
    the labels appear in ``file1``. Fields that are not found are omitted.
    The opening ``<HEADER>`` line is emitted only together with the company
    name and the closing ``</HEADER>`` line only together with the file date,
    so a filing missing either field yields an unbalanced block.

    Parameters
    ----------
    file1 : str
        Path of the raw filing text to scan.
    file2 : str
        Path of the output file; opened in append mode.
    """
    labels = (
        'COMPANY CONFORMED NAME:',
        'CENTRAL INDEX KEY:',
        'STANDARD INDUSTRIAL CLASSIFICATION:',
        'CONFORMED SUBMISSION TYPE:',
        'CONFORMED PERIOD OF REPORT:',
        'FILED AS OF DATE:',
    )

    # Single pass over the file; the handle is always closed by the context
    # manager (the file used to be re-opened once per field and never closed).
    values = {}
    with open(file1) as hand:
        for line in hand:
            line = line.strip()
            for label in labels:
                if label not in values and line.startswith(label):
                    values[label] = line[line.find(':') + 1:].strip()
                    break
            if len(values) == len(labels):
                break

    IDENTITY = ""
    if 'COMPANY CONFORMED NAME:' in values:
        IDENTITY = '<HEADER>\nCOMPANY NAME: ' + values['COMPANY CONFORMED NAME:'] + '\n'
    if 'CENTRAL INDEX KEY:' in values:
        IDENTITY = IDENTITY + 'CIK: ' + values['CENTRAL INDEX KEY:'] + '\n'
    if 'STANDARD INDUSTRIAL CLASSIFICATION:' in values:
        # Keep only the digits, e.g. "SERVICES [7372]" -> "7372".
        sic = values['STANDARD INDUSTRIAL CLASSIFICATION:']
        siccode = ''.join(ch for ch in sic if ch.isdigit())
        IDENTITY = IDENTITY + 'SIC: ' + siccode + '\n'
    if 'CONFORMED SUBMISSION TYPE:' in values:
        IDENTITY = IDENTITY + 'FORM TYPE: ' + values['CONFORMED SUBMISSION TYPE:'] + '\n'
    if 'CONFORMED PERIOD OF REPORT:' in values:
        IDENTITY = IDENTITY + 'REPORT PERIOD END DATE: ' + values['CONFORMED PERIOD OF REPORT:'] + '\n'
    if 'FILED AS OF DATE:' in values:
        IDENTITY = IDENTITY + 'FILE DATE: ' + values['FILED AS OF DATE:'] + '\n' + '</HEADER>\n'

    with open(file2, 'a') as f:
        f.write(IDENTITY)

In [5]:
###########################  DELETE HEADER INFORMATION  #######################################

def headerclean(temp, temp1):
    """Append a header-stripped copy of the filing in ``temp`` to ``temp``.

    Steps:

    1. Find the index of the first line of ``temp`` containing
       ``</SEC-HEADER>`` or ``</IMS-HEADER>`` (0 if neither occurs).
    2. Write every line *after* that index to the scratch file ``temp1``.
       Because the index defaults to 0, the first line is dropped even when
       no header marker is present.
    3. Append the scratch lines back onto ``temp``, skipping lines that
       contain ``END PRIVACY-ENHANCED MESSAGE``.

    The scratch file is truncated on every call. It used to be opened in
    append mode, so text from earlier filings accumulated in it and was
    copied into every later filing.

    NOTE(review): ``temp`` itself is still opened in append mode, so after the
    call it holds the original text followed by the header-stripped copy.
    That is the behaviour callers currently see and it is kept unchanged
    here; confirm whether ``temp`` was meant to be overwritten instead.

    Parameters
    ----------
    temp : str
        Path of the filing text; read, then appended to.
    temp1 : str
        Path of a scratch file; overwritten.
    """
    strings1 = ['</SEC-HEADER>', '</IMS-HEADER>']
    mark0 = 0

    with open(temp) as hand:
        for x, line in enumerate(hand):
            if any(s in line for s in strings1):
                mark0 = x
                break
        hand.seek(0)

        # 'w' (not 'a'): the scratch file must only hold this filing's lines.
        with open(temp1, 'w') as newfile:
            for x, line in enumerate(hand):
                if x > mark0:
                    newfile.write(line)

    with open(temp1, 'r') as newfile, open(temp, 'a') as hand:
        for line in newfile:
            if "END PRIVACY-ENHANCED MESSAGE" not in line:
                hand.write(line)

In [6]:
###########################  XBRL Cleaner  ###################################################

def xbrl_clean(cond1, cond2, str0):
    """Return the cut marks for removing ``cond1`` ... ``cond2`` spans.

    Both patterns are regular expressions searched in the lower-cased text.
    The result always starts with 0. For every ``cond1`` match, two offsets
    follow: the match start, and the end of the first ``cond2`` match lying
    strictly between that start and the start of the next ``cond1`` match
    (or the end of the text for the last one). When no such ``cond2`` match
    exists, the start is repeated, which makes the span empty.

    If ``cond1`` never matches, the result is ``[0]``.

    Parameters
    ----------
    cond1 : str
        Regex marking the beginning of a span to remove.
    cond2 : str
        Regex marking the end of a span to remove.
    str0 : str
        Text to scan.

    Returns
    -------
    list of int
        ``[0, start_1, end_1, start_2, end_2, ...]``.
    """
    lowered = str0.lower()
    cuts = [0]

    openings = [hit.start() for hit in re.finditer(cond1, lowered)]
    if not openings:
        return cuts

    closings = [hit.end() for hit in re.finditer(cond2, lowered)]

    # Each opening is bounded above by the next opening; the last one by the
    # end of the text.
    upper_bounds = openings[1:] + [len(lowered)]
    for opening, limit in zip(openings, upper_bounds):
        closing = next((pos for pos in closings if opening < pos < limit), opening)
        cuts.append(opening)
        cuts.append(closing)

    return cuts


In [7]:
###########################  Table Cleaner  ###################################################

def table_clean(cond1, cond2, str1):
    """Remove table spans from ``str1``, keeping only "item 7/8" tables.

    ``cond1`` and ``cond2`` are regular expressions for the opening and
    closing markers; both are searched in the lower-cased text. For every
    opening match, the span runs to the end of the first closing match that
    lies after it.

    * A span is dropped unless its lower-cased text (with ``&nbsp;`` turned
      into a space and ``&#...;`` entities removed) mentions one of
      ``Items0`` and none of ``Items1``; a kept span is emitted in that
      lower-cased, entity-stripped form.
    * An opening marker with no closing marker after it is left untouched.
      This case used to raise ``UnboundLocalError`` when it occurred for the
      first opening marker, because ``end`` was read before being assigned.
    * If ``cond2`` never matches, ``str1`` is returned unchanged.

    Parameters
    ----------
    cond1 : str
        Regex for the start of a table (e.g. ``'<table'``).
    cond2 : str
        Regex for the end of a table (e.g. ``'</table>'``).
    str1 : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    Items0=["item 7", "item7", "item8", "item 8"]
    Items1=["item 1", "item 2","item 3","item 4","item 5","item 6","item 9", "item 10", "item1", "item2","item3","item4","item5","item6","item9", "item10"]

    lowered = str1.lower()
    starts = [m.start() for m in re.finditer(cond1, lowered)]
    n = len(starts)
    starts.append(len(lowered))  # sentinel: upper bound for the last table

    ends = [m.end() for m in re.finditer(cond2, lowered)]
    if not ends:
        return str1

    # Compiled once instead of once per table.
    numeric_entity = re.compile(r'&#\d{1,5};')
    any_entity = re.compile(r'&#.{1,5};')

    pieces = [str1[0:starts[0]]]
    for i in range(n):
        begin = starts[i]
        end = next((j for j in ends if j > begin), None)

        if end is None:
            # Unterminated table: keep the text up to the next opening marker.
            pieces.append(str1[begin:starts[i + 1]])
            continue

        table = str1[begin:end].lower()
        table = table.replace("&nbsp;", " ")
        table = numeric_entity.sub("", table)
        table = any_entity.sub("", table)
        if any(s in table for s in Items0) and not any(s in table for s in Items1):
            pieces.append(table)

        # Text between this table's end and the next opening marker (empty
        # when the closing marker lies beyond the next opening marker).
        pieces.append(str1[end:starts[i + 1]])

    return "".join(pieces)

In [8]:
###############################################################################################
###############################################################################################
# This section is the actual program
###############################################################################################
###############################################################################################
# Working directory: the extracted sections are saved here, and the
# downloadlist.txt file with all of the filing links is read from here.
# Set the MDA_SCRAPE_DIR environment variable to run the notebook on another
# machine without editing this cell; the previous hard-coded location remains
# the default.
filepath=os.environ.get("MDA_SCRAPE_DIR", "C:\\Users\\DIVYANSHU\\Desktop\\risk")

###############################################################################
#This is the master download file that includes all of the links to SEC filings.
###############################################################################
download=os.path.join(filepath,"downloadlist.txt")


#############################################################################################################
#These are just scratch text files that can be ignored.  They are needed to record the data as the program runs.
#############################################################################################################
temp=os.path.join(filepath,"temp.txt")
temp1=os.path.join(filepath,"newfile.txt")

#################################################################################
#This is the file that records the number of sections for each respective filing.
#################################################################################
LOG=os.path.join(filepath,"DOWNLOADLOG.txt")
# 'w' truncates any log left over from a previous run and writes the header row.
with open(LOG,'w') as f:
    f.write("Filer\tSECTIONS\n")

In [18]:
######## Download the filing ############
###############################################################################
# Main scrape: for every row of the download list, fetch the filing, strip
# markup, locate the "Item 1A. Risk Factors" section and save it.
#
# Behaviour fixes relative to the previous version of this cell:
#   * `look` was missing a comma, so "See Part " and " refer to " were fused
#     into one string that never matched; they are now two separate entries.
#   * The context window in front of a heading match used a possibly negative
#     slice start, which wraps around to the end of the text; it is clamped.
#   * Rows of the download list with fewer than two fields (e.g. a blank
#     trailing line) are skipped instead of raising IndexError.
# Everything else produces the same output as before; loop-invariant tables
# are now built once instead of once per filing.
###############################################################################

# <TYPE> markers of embedded documents that are cut out of the filing, in the
# order in which they are applied (xbrl_clean matches on lower-cased text).
ATTACHMENT_TYPES = ["<type>zip", "<type>graphic", "<type>excel", "<type>pdf", "<type>xml", "<type>ex"]

# Opening tags that are deleted: the tag on one line, or with a single line
# break inside it. The tag names are applied in this order.
_OPEN_TAG_TEMPLATE = r'(<{0}.*?>)|(<{0}\n.*?>)|(<{0}\n\r.*?>)|(<{0}\r\n.*?>)|(<{0}.*?\n.*?>)|(<{0}.*?\n\r.*?>)|(<{0}.*?\r\n.*?>)'
_OPEN_TAG_PATTERNS = [
    re.compile(_OPEN_TAG_TEMPLATE.format(tag))
    for tag in ("DIV", "div", "TD", "td", "TR", "tr", "FONT", "font", "P", "p")
]
_CLOSE_TAGS = ["</DIV>", "</div>", "</TR>", "</tr>", "</TD>", "</td>", "</FONT>", "</font>", "</P>", "</p>"]

# Literal replacements, applied in this exact order.
_ENTITY_REPLACEMENTS = [
    ("&nbsp;", " "),
    ("&NBSP;", " "),
    ("&LT;", "LT"),
    ("&#60;", "LT"),
    ("&#160;", " "),
    ("&AMP;", "&"),
    ("&amp;", "&"),
    ("&#38;", "&"),
    ("&APOS;", "'"),
    ("&apos;", "'"),
    ("&#39;", "'"),
    ('&QUOT;', '"'),
    ('&quot;', '"'),
    ('&#34;', '"'),
    ("\t", " "),
    ("\v", ""),
    ("&#149;", " "),
    ("&#224;", ""),
    ("&#145;", ""),
    ("&#146;", ""),
    ("&#147;", ""),
    ("&#148;", ""),
    ("&#151;", " "),
    ("&#153;", ""),
    ("&#111;", ""),
    ("&#153;", ""),
    ("&#253;", ""),
    ("&#8217;", ""),
    ("&#32;", " "),
    ("&#174;", ""),
    ("&#167;", ""),
    ("&#169;", ""),
    ("&#8220;", ""),
    ("&#8221;", ""),
    ("&rsquo;", ""),
    ("&lsquo;", ""),
    ("&sbquo;", ""),
    ("&bdquo;", ""),
    ("&ldquo;", ""),
    ("&rdquo;", ""),
    ("'", ""),
]

_ANY_TAG = re.compile(r'<.*?>')
_NUMERIC_ENTITY = re.compile(r'&#\d{1,5};')
_ANY_ENTITY = re.compile(r'&#.{1,5};')
_SPACED_HYPHEN = re.compile(r'\s*-\s*')
_RULE_CHARS = re.compile(r'(-|=)\s*')
_WHITESPACE_RUN = re.compile(r'\s\s*')
_BLANK_LINE_RUN = re.compile(r'(\n\s*){3,}')

# Thresholds (character offsets into the cleaned text) used when choosing the
# section boundaries.
START_MIN_OFFSET = 15000      # start candidates below this are dropped when there are several
END_MIN_OFFSET = 20000        # end candidates must lie strictly above this ...
END_MAX_OFFSET = 250000       # ... and strictly below this
MAX_SECTION_LENGTH = 180000   # end - start must be strictly below this
START_CONTEXT_WIDTH = 12      # characters inspected in front of a start heading
END_CONTEXT_WIDTH = 15        # characters inspected in front of an end heading
MIN_SECTION_WORDS = 5         # a section must have more words than this to be saved

# Regular expressions (searched with re.S) that mark the START of the section.
item7={}
item7[1]="ITEM 1A. RISK FACTORS "
item7[2]="ITEM 1A.RISK FACTORS "
item7[3]="ITEM1A. RISK FACTORS "
item7[4]="ITEM1A.RISK FACTORS "
item7[5]="ITEM 1A.RISK FACTORS"
item7[6]="ITEM 1A.RISK FACTORS"
item7[7]="ITEM 1A. RISKFACTORS "
item7[8]="ITEM1A. RISKFACTORS "
item7[9]="ITEM1A.RISKFACTORS "
item7[10]="ITEM 1A.  RISK FACTORS"
item7[11]="ITEM 1A.RISK FACTORS"
item7[12]="ITEM1A.RISK FACTORS"
item7[13]="ITEM 1A.RISK FACTORS "
item7[14]="ITEM 1A.RISK FACTORS "
item7[15]="ITEM1A.RISKFACTORS"
item7[16]=" ITEM 1A.RISK FACTORS "
item7[17]=" ITEM 1A. RISKFACTORS "
item7[18]=" ITEM 1A. RISK FACTORS "
item7[19]=" ITEM 1A.RISKFACTORS "
item7[20]="Item 1A.Risk Factors "
item7[21]="Item 1A. Risk Factors "
item7[22]="Item1A. Risk Factors "
item7[23]=" Item1A.Risk Factors "
item7[24]="Item 1A. Risk Factors"
item7[25]="Item1A. Risk Factors "
item7[26]=" Item 1A. Risk Factors"
item7[27]="Item 1A. Risk Factors"
item7[28]="Item1A. RiskFactors"
item7[29]="Item 1A.RiskFactors"
item7[30]=" Item 1A.Risk Factors "
item7[31]=" Item 1A.Risk Factors"
item7[32]="Item 1A.Risk factors."
item7[33]="Item 1A. Risk factors."
item7[34]=" Item 1A.Risk factors. "
item7[35]="Item1A.Risk factors."
item7[36]=" Item 1A. Risk factors."
item7[37]=" Item 1A. Risk factors."
item7[38]="Item 1A. Risk factors."
item7[39]="Item 1A. Risk factors. "
item7[40]="Item 1A.  Risk factors."
item7[41]="Item 1A.  Risk factors. "
item7[42]=" Item 1A.Risk  factors."
item7[43]="Item 1A .Risk factors. "
item7[44]="Item 1A. Risk factors."
item7[45]=" Item 1A. Risk factors. "
item7[46]="Item1A. Risk  factors."
item7[47]=" Item 1A.Risk factors."
item7[48]="Item 1A.Risk  factors."
item7[49]=" ITEM 1A. RISK  FACTORS "
item7[50]=" ITEM 1A.RISK  FACTORS "
item7[51]="ITEM1A. RISK FACTORS "
item7[52]=" ITEM 1A.RISKFACTORS "
item7[53]="ITEM 1A.RISK FACTORS"
item7[54]=" ITEM1A.RISKFACTORS "
item7[55]=" ITEM 1A.RISK  FACTORS "
item7[56]="Item 1A. Risk Factors"
item7[57]="Item 1A. Risk FactorsThe"
item7[58]="Item1A. Risk  Factors "
item7[59]=" Item 1A. Risk Factors"
item7[60]="Item1A. Risk  Factors "
item7[61]=" Item 1A.Risk  Factors "
item7[62]=" The risks and uncertainties described below "
item7[63]="Our businesses routinely encounter and address risks"
item7[64]=" The risks and uncertainties described below"
item7[65]="The risks and uncertainties described below"
item7[66]="The risks and uncertainties described below "
item7[67]="business is subject to certain risks"
item7[68]="Item 1A. Risk FactorsThe Company is subject to a number of"
item7[69]=" The risks described below "
item7[70]="Item 1A. Risk Factors The Company"
item7[71]="Item 1A. Risk Factors The Company "
item7[72]="Item 1A. Risk Factors Various"
item7[73]=" Item 1A. Risk Factors Various "
item7[74]=" ITEM 1A.(.)*RISK FACTORS"
item7[75]="Risk Factors of"
item7[76]="Item 1A.(.)*Risk Factors."
item7[77]="Item 1A. RISK FACTORS"
item7[78]="ITEM 1A.  RISKFACTORS"
item7[79]="RISK FACTORS."
item7[80]="RISKFACTORS."
item7[81]="RISK FACTORS. "
item7[82]=" RISK FACTORS."
item7[83]="ITEM 1a.  risk factors"
item7[84]="ITEM 1a. risk factors"
item7[85]="The following discussion of risk factors"
item7[86]="Item 1A. Risk factors"
item7[87]="Item 1A. Risk factors "
item7[88]=" Item 1A. Risk factors "
item7[89]=" Item 1A. Risk factors"
item7[90]="RiskFactors of"
item7[91]="Risk Factors of "
item7[92]=" RiskFactors of "
item7[93]=" RiskFactors of"
item7[94]="Risk Factors of"
item7[95]="ITEM 1A.  RISK FACTORS "
item7[96]=" ITEM 1A.RISK FACTORS "
item7[97]=" ITEM 1A.RISK FACTORS"
item7[98]="ITEM 1A.RISK FACTORS"
item7[99]="ITEM 1A / RISK FACTORs"
item7[100]="ITEM 1A/ RISK FACTORs "
item7[101]="ITEM 1A/ RISK FACTORs"
item7[102]="ITEM 1A /RISK FACTORs "
item7[103]="ITEM 1A / RISK FACTORs"
item7[104]="Item 1A: Risk Factors"
item7[105]=" Item 1A: Risk Factors "
item7[106]="Item 1A: Risk Factors "
item7[107]=" Item 1A: Risk Factors"
item7[108]="ITEM 1A: Risk Factors"
item7[109]=" ITEM 1A: Risk Factors "
item7[110]=" ITEM 1A: Risk Factors"
item7[111]="ITEM 1A: Risk Factors "
item7[112]=" ITEM 1A: Risk factors "
item7[113]="ITEM 1A: Risk factors"
item7[114]="ITEM 1A: Risk factors "
item7[115]=" ITEM 1A: Risk factors"
item7[116]="Item 1A: Risk Factors "
item7[117]=" Item 1A: Risk Factors "
item7[118]=" Item 1A: Risk Factors"
item7[119]="Item 1A: Risk Factors"
item7[120]="Item 1A/ Risk Factors"
item7[121]="Item 1A / Risk Factors"
item7[122]="Item 1A/ RiskFactors"
item7[123]="Item 1A / RiskFactors"
item7[124]="Item 1A/ Risk factors"
item7[125]="Item 1A/ Risk factors"
item7[126]="Item 1A /Risk factors"
item7[127]="Item 1A/Risk factors"
item7[128]="Item 1A./ Risk factors"
item7[129]="Item 1A./Risk factors"
item7[130]="Item 1A./ Risk Factors"
item7[131]="Item 1A./Risk Factors"
item7[132]="RISK FACTORS"
item7[133]="RISK FACTORS "

# Regular expressions (searched with re.S) that mark the END of the section.
item8={}
item8[1]="ITEM 1B.(.)*Unresolved Staff Comments"
item8[2]="ITEM 1B.(.)*UNRESOLVED STAFF COMMENTS"
item8[3]="ITEM 1B. UNRESOLVED STAFF COMMENTS"
item8[4]="ITEM 1B. Unresolved Staff Comments"
item8[5]="ITEM 2. PROPERTIES"
item8[6]="ITEM 2.PROPERTIES"
item8[7]=" ITEM 1B.(.)*Unresolved Staff Comments"
item8[8]="Item 2.Properties"
item8[9]=" ITEM 2. PROPERTIES "
item8[10]="Unresolved Staff Comments Not Applicable."
item8[11]="Unresolved Staff Comments None"
item8[12]="Unresolved Staff CommentsNone."
item8[13]="ITEM 2. DESCRIPTION OF PROPERTIES."
item8[14]="ITEM 2.(.)*DESCRIPTION OF PROPERTIES. "
item8[15]=" ITEM 2.DESCRIPTION OF PROPERTIES. "
item8[16]=" ITEM 2. DESCRIPTION OF PROPERTIES."
item8[17]="ITEM 1B. Unresolved Staff Comments"
item8[18]="ITEM 1B. Unresolved Staff Comments.Not applicable."
item8[19]=" ITEM 1B. Unresolved Staff Comments."
item8[20]="ITEM 1B. Unresolved Staff Comments Not applicable"
item8[21]="ITEM 1B. Unresolved Staff CommentsNot applicable "
item8[22]="ITEM 1B. Unresolved Staff CommentsNot applicable"
item8[23]="ITEM 1B. Unresolved Staff Comments Not applicable "
item8[24]="ITEM 1B.Unresolved Staff Comments Not applicable "
item8[25]="ITEM 1B.Unresolved Staff Comments Not applicable"
item8[26]="ITEM 1B.Unresolved Staff Comments Not Applicable"
item8[27]="ITEM 2. Properties LocationSquare"
item8[28]="ITEM 2. Properties Location Square"
item8[29]="Item 1B. UNRESOLVED STAFF COMMENTS"
item8[30]="Item 1B. UNRESOLVED STAFF COMMENTS "
item8[31]="Item 1B. UNRESOLVED STAFF COMMENTS.None."
item8[32]="Item 1B. UNRESOLVED STAFF COMMENTS. None."
item8[33]="Item 1B. UNRESOLVED STAFF COMMENTS. None "
item8[34]="Item 1B. UNRESOLVED STAFF COMMENTS. Not applicable"
item8[35]="Item 1B. UNRESOLVED STAFF COMMENTS. Not Applicable"
item8[36]="Item 1B.(.)*Unresolved Staff Comments.None."
item8[37]="Item 1B.(.)*Unresolved Staff Comments. None."
item8[38]="Item 1B. Unresolved Staff Comments. None. "
item8[39]="Item 1B. Unresolved Staff Comments. None."
item8[40]="Item 1B. Unresolved Staff Comments.None. "
item8[41]="Item1B.UNRESOLVED STAFF COMMENTS."
item8[42]="Item1B.UNRESOLVED STAFF COMMENTS. "
item8[43]="Item1B. UNRESOLVED STAFF COMMENTS."
item8[44]="Item 1B.UNRESOLVED STAFF COMMENTS."
item8[45]="ITEM 1B.(.)*UNRESOLVED STAFFCOMMENTS"
item8[46]="ITEM 1B.UNRESOLVED STAFFCOMMENTS"
item8[47]="ITEM 1B. UNRESOLVED STAFFCOMMENTS"
item8[48]="ITEM 1B. UNRESOLVED STAFFCOMMENTS"
item8[49]="ITEM 1B. UNRESOLVED STAFFCOMMENTS"
item8[50]="ITEM 1B.UNRESOLVEDSTAFFCOMMENTS"
item8[51]="ITEM 1B.  UNRESOLVEDSTAFFCOMMENTS"
item8[52]="UNRESOLVEDSTAFFCOMMENTS"
item8[53]=" STAFF COMMENTS."
item8[54]="UNRESOLVED STAFFCOMMENTS "
item8[55]="UNRESOLVED STAFF COMMENTS None."
item8[56]="UNRESOLVED STAFF COMMENTS."
item8[57]=" UNRESOLVED STAFF COMMENTS."
item8[58]=" UNRESOLVED STAFF COMMENTS. "
item8[59]=" UNRESOLVED STAFF COMMENTS"
item8[60]=" UNRESOLVED STAFFCOMMENTS "
item8[61]="Item 1B. Unresolved Staff Comments"
item8[62]=" Item 1B. Unresolved Staff Comments"
item8[63]=" Item 1B. Unresolved Staff Comments "
item8[64]="Item 1B. Unresolved Staff Comments "
item8[65]="ITEM 1B. UNRESOLVED STAFF COMMENTS"
item8[66]=" ITEM 1B. UNRESOLVED STAFF COMMENTS"
item8[67]="ITEM 1B. UNRESOLVED STAFF COMMENTS "
item8[68]=" ITEM 1B. UNRESOLVED STAFF COMMENTS "
item8[69]="ITEM 1B.  UNRESOLVED STAFFCOMMENTS"
item8[70]="ITEM 1B.  UNRESOLVED STAFFCOMMENTS "
item8[71]=" ITEM 1B.  UNRESOLVED STAFFCOMMENTS "
item8[72]=" ITEM 1B.  UNRESOLVED STAFFCOMMENTS"
item8[73]="ITEM 1B.  UNRESOLVED STAFF COMMENTS"
item8[74]=" ITEM 1B.  UNRESOLVED STAFF COMMENTS "
item8[75]="ITEM 1B.  UNRESOLVED STAFF COMMENTS "
item8[76]=" ITEM 1B.  UNRESOLVED STAFF COMMENTS"
item8[77]="Unresolved Staff Comments"
item8[78]=" Unresolved Staff Comments "
item8[79]=" Unresolved Staff Comments"
item8[80]="Unresolved Staff Comments "
item8[81]="ITEM 1B.UNRESOLVED STAFF COMMENTS"
item8[82]=" ITEM 1B.UNRESOLVED STAFF COMMENTS "
item8[83]="ITEM 1B.UNRESOLVED STAFF COMMENTS Not Applicable."
item8[84]=" ITEM 1B.UNRESOLVED STAFF COMMENTS"
item8[85]="ITEM 1B.UNRESOLVED STAFF COMMENTS Not Applicable. "
item8[86]="ITEM 1B / UNRESOLVED STAFF COMMENTS"
item8[87]="ITEM 1B /UNRESOLVED STAFF COMMENTS"
item8[88]="ITEM 1B/ UNRESOLVED STAFF COMMENTS"
item8[89]="ITEM 1B/UNRESOLVED STAFF COMMENTS"
item8[90]=" ITEM 1B / UNRESOLVED STAFF COMMENTS "
item8[91]="Unresolved StaffComments"
item8[92]=" Unresolved StaffComments"
item8[93]=" Unresolved StaffComments "
item8[94]="Unresolved StaffComments "
item8[95]="Unresolved Staff"
item8[96]="ITEM 2.  PROPERTIES"
item8[97]="ITEM 2.  PROPERTIES "
item8[98]="Item 2. Properties"
item8[99]="ITEM 1B. UNRESOLVED"
item8[100]=" ITEM 1B. UNRESOLVED "
item8[101]=" ITEM 1B. UNRESOLVED"
item8[102]="ITEM 1B. UNRESOLVED "
item8[103]="1B.Unresolved Staff"
item8[104]=" UNRESOLVED STAFFCOMMENTS"
item8[105]="UNRESOLVED STAFFCOMMENTS"
item8[106]=" UNRESOLVED STAFFCOMMENTS "
item8[107]="UNRESOLVED STAFF COMMENTS."
item8[108]="UNRESOLVEDSTAFF COMMENTS."
item8[109]="UNRESOLVED STAFFCOMMENTS."
item8[110]="UNRESOLVED STAFF COMMENTS "
item8[111]="UNRESOLVED STAFF COMMENTS. "
item8[112]=" UNRESOLVED STAFF COMMENTS. "
item8[113]="Unresolved Staff"
item8[114]="Unresolved Staff "
item8[115]=" Unresolved Staff"
item8[116]=" Unresolved Staff "
item8[117]="ITEM 1B. UNRESOLVED STAFF"
item8[118]="ITEM 1B. UNRESOLVED STAFF "
item8[119]=" ITEM 1B. UNRESOLVED STAFF"
item8[120]=" ITEM 1B. UNRESOLVED STAFF "
item8[121]="Item 2.  PROPERTIES"
item8[122]="UNRESOLVED STAFF COMMENTS"
item8[123]="PROPERTIES"

# A heading match is ignored when any of these strings occurs in the few
# characters in front of it (it is then treated as a cross-reference rather
# than a section heading).
look={" see ","See Part"," See "," See","See Part "," refer to ", " included in "," contained in ","see","see in", "in part of","RISK","FACTORS","10 K", " 10 K"," 10 K ","part","Part"}

START_PATTERNS = [item7[j] for j in range(1,134)]
# NOTE(review): range(1,123) stops at item8[122], so item8[123] ("PROPERTIES")
# is defined but never searched. Kept as-is because including it would change
# which end offsets are found; confirm whether the exclusion is intentional.
END_PATTERNS = [item8[j] for j in range(1,123)]


def _cut_spans(text, open_pat, close_pat):
    """Return ``text`` with every ``open_pat`` ... ``close_pat`` span removed (offsets from xbrl_clean)."""
    marks = xbrl_clean(open_pat, close_pat, text)
    marks.append(len(text))
    # marks = [0, cut_start_1, cut_end_1, ..., len(text)]; keep the text
    # between consecutive (even, odd) pairs.
    return "".join(text[marks[i]:marks[i+1]] for i in range(0, len(marks)-1, 2))


def _normalize_text(text):
    """Remove remaining tags and entities and collapse whitespace, hyphens and rule characters to single spaces."""
    # Escaped control sequences (a literal backslash followed by n / r / t).
    text = text.replace("\\n"," ")
    text = text.replace("\\r"," ")
    text = text.replace("\\t"," ")
    text = _ANY_TAG.sub("", text)
    for old, new in _ENTITY_REPLACEMENTS:
        text = text.replace(old, new)
    text = _NUMERIC_ENTITY.sub("", text)
    text = _ANY_ENTITY.sub("", text)
    text = text.replace("_"," ")
    text = text.replace("and/or","and or")
    text = text.replace("-\n"," ")
    text = _SPACED_HYPHEN.sub(" ", text)
    text = _RULE_CHARS.sub(" ", text)
    text = _WHITESPACE_RUN.sub(" ", text)
    text = _BLANK_LINE_RUN.sub("\n\n", text)
    text = _ANY_TAG.sub("", text)
    return text


def _heading_offsets(text, patterns, context_width, stop_phrases):
    """Return the sorted start offsets of all pattern matches not preceded by a stop phrase."""
    offsets = []
    for pattern in patterns:
        for m in re.finditer(pattern, text, flags=re.S):
            # Clamp at 0: a negative slice start would wrap around to the end
            # of the text and yield an unrelated (usually empty) context.
            context = text[max(0, m.start() - context_width):m.start()]
            if not any(s in context for s in stop_phrases):
                offsets.append(m.start())
    offsets.sort()
    return offsets


######## Download the filings ############
with open(download, 'r') as txtfile:
    reader = csv.reader(txtfile, delimiter=',')
    for line in reader:
        # Each row is "<file number>,<path below https://www.sec.gov/Archives/>".
        if len(line) < 2:
            continue
        FileNUM=line[0].strip()
        Filer=os.path.join(filepath, FileNUM+".txt")
        url = 'https://www.sec.gov/Archives/' + line[1].strip()
        # NOTE(review): no timeout, status check or User-Agent header is sent;
        # SEC's EDGAR access guidance reportedly expects a declared
        # User-Agent - confirm before running at scale.
        r = requests.get(url)
        # NOTE(review): under Python 3, str() of a bytes object is its repr
        # ("b'...'" with line breaks as literal backslash sequences), which is
        # what the "\\n"/"\\r"/"\\t" replacements in _normalize_text act on.
        # Left unchanged because the offset thresholds below were tuned on
        # this representation.
        string = str(r.content)
        with open(temp, 'w') as f:
            f.write(string)

        ##### Obtain Header Information on Filing ######################
        parse(temp, Filer)
        headerclean(temp, temp1)

        ##### Remove embedded attachments ######################
        with open(temp,'r') as f:
            str1=f.read()
        for doc_type in ATTACHMENT_TYPES:
            str1 = _cut_spans(str1, doc_type, "</document>")

        ###### Remove <DIV>, <TD>, <TR>, <FONT> and <P> ######################
        for pattern in _OPEN_TAG_PATTERNS:
            str1 = pattern.sub("", str1)
        for closing in _CLOSE_TAGS:
            str1 = str1.replace(closing, "")

        ############# Remove XBRL Sections #########################
        str1 = _cut_spans(str1, "<xbrl", "</xbrl.*>")

        ############# Table Sections #########################
        # NOTE(review): the cleaned result is stored in output1 and never
        # used afterwards, so tables are NOT removed from str1. Kept as-is;
        # confirm whether `str1 = table_clean(...)` was intended.
        output1=table_clean('<table','</table>',str1)

        ############# Remove tags, entities and extra whitespace #############
        str1 = _normalize_text(str1)

        ################## Locate the Risk Factors section ###################
        # Start: with several candidates, drop those in the first
        # START_MIN_OFFSET characters, then take the earliest survivor; fall
        # back to offset 0 when nothing is left.
        list1 = _heading_offsets(str1, START_PATTERNS, START_CONTEXT_WIDTH, look)
        if len(list1)>1:
            list1 = [x for x in list1 if x>=START_MIN_OFFSET]
        list1 = list1[:1] or [0]
        print(list1)

        # End: the last candidate inside the allowed window.
        list2 = _heading_offsets(str1, END_PATTERNS, END_CONTEXT_WIDTH, look)
        list2 = [x for x in list2 if x>END_MIN_OFFSET and x<END_MAX_OFFSET and x-list1[0]<MAX_SECTION_LENGTH]
        list2 = list2[-1:]
        print(list2)

        # locations maps a section index to [start, end]; it stays empty when
        # no end was found or the end lies before the start.
        locations={}
        if list2==[]:
            print ("NO risk")
        elif list1[0]<=list2[0]:
            locations[0]=[list1[0], list2[0]]

        ################## Save the section and log the count ################
        sections=0
        for k0 in range(len(locations)):
            substring2=str1[locations[k0][0]:locations[k0][1]]
            if len(substring2.split())>MIN_SECTION_WORDS:
                sections=sections+1
                with open(Filer,'a') as f:
                    f.write("<SECTION>\n")
                    f.write(substring2+"\n")
                    f.write("</SECTION>\n")
        with open(LOG,'a') as f:
            f.write(str(FileNUM)+"\t"+str(sections)+"\n")
        print (FileNUM)

[23033]
[49131]
1
[85669]
[183929]
2
[41653]
[100459]
3
[46770]
[85061]
4
[100138]
[147244]
5
[56225]
[101444]
6
[30891]
[57298]
7
[20928]
[35932]
8
[0]
[102384]
9
[20197]
[41858]
10
[31893]
[69232]
11
[26806]
[46195]
12
[70484]
[152615]
13
[64873]
[65110]
14
[33071]
[59430]
15
[34876]
[52634]
16
[36859]
[86081]
17
[100373]
[197948]
18
[0]
[21573]
19
[59398]
[81169]
20
[29767]
[64130]
21
[37248]
[77049]
22
[16276]
[67292]
23
[29357]
[65558]
24
[109249]
[161518]
25
[20638]
[41853]
26
[38584]
[94348]
27
[50473]
[71201]
28
[85669]
[183929]
29
[31077]
[52125]
30
[33608]
[61302]
31
[118493]
[194938]
32
[67191]
[107377]
33
[133379]
[177035]
34
[182886]
[]
NO risk
35
[24073]
[50574]
36
[58773]
[146229]
37
[31480]
[87431]
38
[92027]
[159545]
39
[29796]
[75814]
40
[31035]
[50808]
41
[26603]
[79003]
42
[27739]
[54152]
43
[102429]
[194954]
44
[28362]
[48378]
45
[37652]
[57608]
46
[30445]
[159842]
47
[20380]
[34260]
48
[28093]
[28179]
49
[46523]
[103659]
50
[36304]
[75245]
51
[31790]
[51458]
52
[2