<a href="https://colab.research.google.com/github/patelami3431/Web-Crawler/blob/master/HLT_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
import urllib
from urllib import request
import requests
import bs4
from bs4 import BeautifulSoup
import pandas as pd
import re
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from urllib.error import HTTPError
from nltk import word_tokenize
import math

In [2]:
def prepare_soup(url):
    try:
        p = request.urlopen(url)
        if(p.getcode() == 403):
            return False
        data = p.read()
        soup = BeautifulSoup(data, 'html.parser')
        return soup
    except HTTPError as e:
        return False


In [3]:
def print_paras(soup):
    text1 = ""
    for p in soup.select("p"):
        text1 += p.get_text()
    #     print(p)
    text1 = re.sub(r"(?<=\.)(?<=\+)(?<=\\)\(https?:\/\/.+\)"," ", text1)
#     print(text1)
    return text1

In [4]:
def visible(element):
    if element.parent.name in ['style', 'script', '[document]', 'head', 'title']:
        return False
    elif re.match('<!--.*-->', str(element.encode('utf-8'))):
        return False
    return True

In [5]:
def find_links(soup):
    s = []
    for link in soup.find_all('a'):
#             print(link.get('href'))
            s.append(link.get('href'))
    return s

In [6]:
def all_text(soup):
    # kill all script and style elements
    for script in soup(['style', 'script', '[document]', 'head', 'title']):
        script.extract()    # rip it out

    # get text
    text = soup.get_text()

    # break into lines and remove leading and trailing space on each
    lines = text.splitlines()
    # break multi-headlines into a line each
    chunks = (phrase for line in lines for phrase in line.split(" "))
    # drop blank lines
    text = ' '.join(chunk for chunk in chunks if chunk)
    return text

In [7]:
def sent_tokenizer(filename):
    file = open(filename,"r")
    content = file.read()
    content = content.lower()
    content = ''.join([i for i in content if not i.isdigit()])
    punctuations = '''!()-[]{};:'"\,<>/?@#$%^&*_~'''
    for x in content: 
        if x in punctuations: 
            content = content.replace(x, " ") 
    content = re.sub(r'[^a-zA-Z .]+', ' ',content)
    sent = sent_tokenize(content)
    with open('tokenzied_'+filename, 'w+') as f:
        for s in sent:
            f.write(s+'\n')
    f.close()

In [23]:
def create_tf_idf_dict(text, num_docs):
    tf_dict = {}
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if w.isalpha()and w not in stop_words]
    for t in tokens:
        if t in tf_dict:
            tf_dict[t] += 1
        else:
            tf_dict[t] = 1
    
    for t in tf_dict.keys():
        tf_dict[t] = tf_dict[t] / len(tokens)
        
    
    vocab = set(tokens)
    idf_dict = {}
    for term in vocab:
        temp = ['x' for voc in vocab if term in voc]
        idf_dict[term] = math.log((1+num_docs) / (1+len(temp)))
        
    tf_idf = {}
    for t in tf_dict.keys():
        tf_idf[t] = tf_dict[t] * idf_dict[t]
        
    return tf_idf

In [27]:
def web_crawler():
    soup = prepare_soup("https://en.wikipedia.org/wiki/List_of_diets")
    s = find_links(soup)
    counter = 0
    final = []
    crawl = True
    i = 0
    while (crawl):
        for link in s:
            if link and ('diet' in link or 'Diet' in link)  and link.startswith('http'):
                final.append(link)
                counter += 1
        if (counter > 15):
            crawl = False
            break
        else:
            soup1 = prepare_soup(final[i])
            while soup1 is False:
                i += 1
                soup1 = prepare_soup(final[i])
                s = find_links(soup1)
                continue
            i += 1
            s = find_links(soup1)                
            

#     print(counter)
    
    file = "file_"
    j = 0
    files = []
    for url in final:
        soup2 = prepare_soup(url)
        if soup2 is False:
            continue
        text = all_text(soup2)
        filename = file + str(j) + ".txt"
        if text:
            with open(filename, "w+") as f:
                f.write(text)
            files.append(filename)
            f.close()
            j += 1
    print("The total number of files:",j) 
    vocab = []
    text = ''
    for f in files:
        sent_tokenizer(f)
        with open(filename, "r") as f:
            text += f.read() + " "
    
    tf_idf_dict = create_tf_idf_dict(text, len(files))
    
    l = 0
    for k in sorted(tf_idf_dict, key=lambda k:tf_idf_dict[k], reverse= True):
        if l < 35:
            print (k,"->",tf_idf_dict[k])
            l += 1
    

In [28]:
web_crawler()

The total number of files: 19
diet -> 0.06544162413850448
Retrieved -> 0.04318261935367092
diets -> 0.03618156360240079
March -> 0.022689172880742348
February -> 0.020493446472928573
foods -> 0.0175658112625102
weight -> 0.013906267249487245
food -> 0.013348024469150383
The -> 0.012815254991246925
dietary -> 0.012442449644278063
BBC -> 0.011710540841673469
News -> 0.011710540841673469
consumed -> 0.010978632039068877
carbohydrates -> 0.009514814433859695
meat -> 0.009045390900600197
eating -> 0.0087829056312551
based -> 0.0087829056312551
also -> 0.00805099682865051
consumption -> 0.00805099682865051
vegetarian -> 0.00783933878052017
Vitamin -> 0.00783933878052017
Diet -> 0.006609795110234697
October -> 0.0065871792234413265
Weight -> 0.0065871792234413265
April -> 0.0065871792234413265
calorie -> 0.0060302606004001315
people -> 0.0058552704208367345
habits -> 0.0058552704208367345
medical -> 0.0058552704208367345
Atkins -> 0.0058552704208367345
gluten -> 0.0058552704208367345
PMID -> 