In [None]:
import bs4 as bs  
from urllib.request import urlopen, Request, urlopen
from urllib.parse import urlparse,ParseResult
import re
import heapq 
import nltk

# Fetch the NLTK data packages used further down in this file. Both calls run
# at import / cell-execution time.
# 'punkt' is the tokenizer data; presumably what nltk.sent_tokenize and
# nltk.word_tokenize (called in ParseWeb.parse_text) load -- TODO confirm
# against the installed NLTK version.
nltk.download('punkt')
# 'stopwords' backs nltk.corpus.stopwords.words('english') in parse_text.
nltk.download('stopwords')

In [None]:
class ParseWeb():
    """Scrape the paragraph text of a website and condense it into a summary.

    Typical use is ``sites, summary = ParseWeb().text('example.org')``.

    Output verbosity is controlled by three flags; each level implies the
    ones below it (``debug`` -> ``verbose`` -> ``printSites``).
    """

    # Only links whose href contains one of these words are followed.
    _INFO_LINK_PATTERN = re.compile(r'(about|mission|description|info)')
    # Leading "scheme://" and "www." prefixes removed before normalising.
    _SCHEME_PREFIX = re.compile(r'^[a-z][a-z0-9+.\-]*://', re.IGNORECASE)
    _WWW_PREFIX = re.compile(r'^www\.', re.IGNORECASE)
    # Seconds to wait on any single HTTP request.
    _TIMEOUT = 4
    # Sentences with this many (space separated) words or more are not scored.
    _MAX_SENTENCE_WORDS = 30
    # Number of top-scoring sentences kept in the summary.
    _SUMMARY_SENTENCES = 7

    def __init__(self, verbose=True, debug=False, printSites=False):
        """Store the output flags and the HTTP headers used for requests.

        Args:
            verbose: print errors and progress messages; implies printSites.
            debug: print everything (requests, raw text, summary); implies
                verbose and printSites.
            printSites: print the list of URLs that were read.
        """
        self.printSites = printSites
        self.verbose = verbose
        self.debug = debug

        if self.verbose:
            self.printSites = True
        if self.debug:
            self.verbose = True
            self.printSites = True

        # Browser-like User-Agent sent with every request.
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.3'}

    def text(self, url):
        """Collect and summarise the text of a site.

        Args:
            url: either a single URL string (first visit: the URL is
                normalised, the page is read and matching "about"-style
                links are followed), or an iterable of already known page
                URLs (update: every page is read again, no links followed).

        Returns:
            A ``(sites, summary)`` tuple. ``sites`` is the list of URLs that
            were used and ``summary`` is the summary string, or ``''`` when
            no text could be read. An invalid URL string yields ``([], '')``
            so the result can always be unpacked.
        """
        chunks = []

        if isinstance(url, str):
            # First time we see this URL: it needs at least one dot.
            if not url or len(url.split('.')) < 2:
                if self.verbose:
                    print('Not valid URL [%s]' % url)
                return [], ''

            sites = self.format_url(url)

            # Iterate over a snapshot because followed links are appended
            # to ``sites`` while we go.
            for site in list(sites):
                page_text, links = self._fetch(site, links=True)
                chunks.append(page_text)

                # Sorted so the visiting order (and thus the result) does
                # not depend on set iteration order.
                for href in sorted(links):
                    if '@' in href:
                        continue  # e-mail address, not a page
                    # The href may be absolute or relative to the site.
                    for candidate in (href, site + href, site + '/' + href):
                        if candidate in sites:
                            break  # already collected
                        if self.test_url(candidate):
                            sites.append(candidate)
                            chunks.append(self._fetch(candidate))
                            break
        else:
            # Update of an existing entry: re-read the pages we already know.
            # Work on a copy so the caller's list is never modified.
            sites = list(url or [])
            chunks = [self._fetch(site) for site in sites]

        # Join with a space so text of different pages does not run together.
        article_text = ' '.join(chunk for chunk in chunks if chunk)

        if self.printSites:
            print(sites)
        if self.debug:
            print(article_text)

        if not article_text:
            if self.verbose:
                print('ERROR: no text came out of website: %s' % sites)
            return sites, ''

        return sites, self.parse_text(article_text)

    def _fetch(self, url, links=False):
        """Call extract_text() but turn any failure into an empty result.

        A page can answer a HEAD request and still fail on GET; one bad page
        should not abort the whole crawl.
        """
        try:
            return self.extract_text(url, links=links)
        except Exception as err:
            if self.verbose:
                print('ERROR: could not read %s (%s)' % (url, err))
            return ('', set()) if links else ''

    def extract_text(self, url, links=False):
        """Download one page and return the text of its ``<p>`` elements.

        Args:
            url: absolute URL of the page.
            links: when true, also return the hrefs of the page's anchors
                that match the about/mission/description/info pattern.

        Returns:
            The paragraph text, or ``(text, set_of_hrefs)`` when ``links``
            is true.

        Raises:
            Whatever ``urlopen`` raises for an unreachable or invalid URL.
        """
        if self.debug:
            print('READ: %s' % url)

        request = Request(url, headers=self.headers)
        # The context manager closes the connection once the body is read.
        with urlopen(request, timeout=self._TIMEOUT) as response:
            article = response.read()

        parsed_article = bs.BeautifulSoup(article, 'lxml')
        # Space separated so the last sentence of one paragraph is not glued
        # to the first sentence of the next one.
        article_text = ' '.join(p.text for p in parsed_article.find_all('p'))

        if self.debug:
            print(article_text)

        if not links:
            return article_text

        anchors = parsed_article.find_all(
            'a', attrs={'href': self._INFO_LINK_PATTERN})
        return article_text, {anchor.get('href') for anchor in anchors}

    def test_url(self, url):
        """Return True when a HEAD request to ``url`` answers with a status < 400.

        Any request failure (bad URL, timeout, HTTP error, ...) gives False.
        """
        if self.debug:
            print('Request HEAD from URL: %s' % url)

        try:
            request = Request(url, method='HEAD', headers=self.headers)
            with urlopen(request, timeout=self._TIMEOUT) as response:
                status = response.getcode()
                if status < 400:
                    if self.debug:
                        print('Found: %s %s' % (url, status))
                    return True
                if self.debug:
                    print('ERROR: %s' % status)
                    print('ERROR: returned %s' % response)
        except Exception as err:
            # Exception (not a bare except) so Ctrl-C still interrupts.
            if self.debug:
                print('Failure on request')
                print('ERROR: %s' % err)

        return False

    def format_url(self, url):
        """Find a reachable absolute URL for a loosely written address.

        A leading scheme and ``www.`` are stripped, then the ``http`` and
        ``https`` variants are probed with test_url(); the ``www.`` host is
        probed as well as long as nothing has been found yet.

        Args:
            url: address as typed by a user, e.g. ``'www.example.org/page'``.

        Returns:
            A list holding the last URL that answered, or an empty list when
            none did.
        """
        urllist = []

        # Strip the prefixes only at the start of the address; the rest
        # (path, query) is kept untouched because it can be case sensitive.
        cleaned = self._SCHEME_PREFIX.sub('', url.strip())
        cleaned = self._WWW_PREFIX.sub('', cleaned)

        # The leading '//' makes urlparse treat the first part as the host.
        p = urlparse('//' + cleaned)

        if self.debug:
            print(p)

        netloc = p.netloc.lower()  # host names are case insensitive
        path = p.path
        extras = p[3:]  # params, query, fragment

        for method in ['http', 'https']:
            test = ParseResult(method, netloc, path, *extras).geturl()

            if self.test_url(test):
                if self.debug:
                    print('Add %s to list' % test)
                urllist.append(test)

            if not urllist and 'www' not in netloc:
                test = ParseResult(method, 'www.' + netloc, path, *extras).geturl()
                if self.test_url(test):
                    if self.debug:
                        print('Add %s to list' % test)
                    urllist.append(test)

        # Keep only the last hit (an empty list stays empty).
        return urllist[-1:]

    def parse_text(self, article_text):
        """Build an extractive summary of ``article_text``.

        Words are weighted by their frequency relative to the most frequent
        non-stopword; each sentence shorter than 30 words is scored with the
        sum of the weights of its words and the 7 best sentences are joined.

        Args:
            article_text: raw text to summarise.

        Returns:
            The summary, or ``''`` when the text holds no usable words.
        """
        # Removing square-bracket references and extra spaces
        article_text = re.sub(r'\[[0-9]*\]', ' ', article_text)
        article_text = re.sub(r'\s+', ' ', article_text)

        # Removing special characters and digits
        formatted_article_text = re.sub('[^a-zA-Z]', ' ', article_text)
        formatted_article_text = re.sub(r'\s+', ' ', formatted_article_text)

        sentence_list = nltk.sent_tokenize(article_text)

        # A set gives constant-time membership tests.
        stopwords = set(nltk.corpus.stopwords.words('english'))

        # Count in lower case: the stopword list is lower case and sentences
        # are scored on lower-cased tokens below.
        word_frequencies = {}
        for word in nltk.word_tokenize(formatted_article_text.lower()):
            if word not in stopwords:
                word_frequencies[word] = word_frequencies.get(word, 0) + 1

        if not word_frequencies:
            # Nothing left to score (e.g. only digits or stopwords).
            return ''

        maximum_frequency = max(word_frequencies.values())
        for word in word_frequencies:
            word_frequencies[word] = word_frequencies[word] / maximum_frequency

        sentence_scores = {}
        for sent in sentence_list:
            if len(sent.split(' ')) >= self._MAX_SENTENCE_WORDS:
                continue
            weights = [word_frequencies[word]
                       for word in nltk.word_tokenize(sent.lower())
                       if word in word_frequencies]
            # Sentences without any weighted word are not candidates.
            if weights:
                sentence_scores[sent] = sentence_scores.get(sent, 0) + sum(weights)

        summary_sentences = heapq.nlargest(
            self._SUMMARY_SENTENCES, sentence_scores, key=sentence_scores.get)

        summary = ' '.join(summary_sentences)
        if self.debug:
            print(summary)

        return summary

In [None]:


#------------------------------------------------------------------------------#
# Test function
#------------------------------------------------------------------------------#
if False:
    """
    Tests class to parse website data
    """

    # Manual smoke test, disabled by the constant condition above.
    # Alternative sites that can be swapped in for the one below:
    #   'www.noahcdc.org'
    #   'releafmichigan.org'
    url = 'www.ymca.net'

    # Run silently and show only the final result.
    websites, text = ParseWeb( verbose=False, debug=False, printSites=False ).text( url )
    for result in ( websites, text ):
        print( result )
    
    
#------------------------------------------------------------------------------#
