<a href="https://colab.research.google.com/github/patelami3431/Web-Crawler/blob/master/HLT_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [67]:
import urllib
from urllib import request
import requests
import bs4
from bs4 import BeautifulSoup
import pandas as pd
import re
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from urllib.error import HTTPError
from nltk import word_tokenize
import math

In [2]:
def prepare_soup(url):
    try:
        p = request.urlopen(url)
        if(p.getcode() == 403):
            return False
        data = p.read()
        soup = BeautifulSoup(data, 'html.parser')
        return soup
    except HTTPError as e:
        return False


In [4]:
def visible(element):
    if element.parent.name in ['style', 'script', '[document]', 'head', 'title']:
        return False
    elif re.match('<!--.*-->', str(element.encode('utf-8'))):
        return False
    return True

In [5]:
def find_links(soup):
    s = []
    for link in soup.find_all('a'):
#             print(link.get('href'))
            s.append(link.get('href'))
    return s

In [6]:
def all_text(soup):
    # kill all script and style elements
    for script in soup(['style', 'script', '[document]', 'head', 'title']):
        script.extract()    # rip it out

    # get text
    text = soup.get_text()

    # break into lines and remove leading and trailing space on each
    lines = text.splitlines()
    # break multi-headlines into a line each
    chunks = (phrase for line in lines for phrase in line.split(" "))
    # drop blank lines
    text = ' '.join(chunk for chunk in chunks if chunk)
    return text

In [62]:
def sent_tokenizer(filename):
    file = open(filename,"r")
    content = file.read()
    content = content.lower()
    content = ''.join([i for i in content if not i.isdigit()])
    punctuations = '''!()-[]{};:'"\,<>/?@#$%^&*_~'''
    for x in content: 
        if x in punctuations: 
            content = content.replace(x, " ") 
    content = re.sub(r'[^a-zA-Z .]+', ' ',content)
    sents = sent_tokenize(content)
    with open('tokenized_'+filename, 'w+') as f:
        for s in sents:
            if "retrieved" in s or "register" in s or "reply" in s:
                continue
            else:
                f.write(s+'\n')
    f.close()

In [64]:
def create_tf_idf_dict(text, num_docs):
    tf_dict = {}
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if w.isalpha()and w not in stop_words]
    wnl = WordNetLemmatizer()
    tokens = [wnl.lemmatize(t) for t in tokens]
    for t in tokens:
        if t in tf_dict:
            tf_dict[t] += 1
        else:
            tf_dict[t] = 1
    
    for t in tf_dict.keys():
        tf_dict[t] = tf_dict[t] / len(tokens)
        
    
    vocab = set(tokens)
    idf_dict = {}
    for term in vocab:
        temp = ['x' for voc in vocab if term in voc]
        idf_dict[term] = math.log((1+num_docs) / (1+len(temp)))
        
    tf_idf = {}
    for t in tf_dict.keys():
        tf_idf[t] = tf_dict[t] * idf_dict[t]
        
    return tf_idf

In [65]:
def web_crawler():
    soup = prepare_soup("https://en.wikipedia.org/wiki/List_of_diets")
    s = find_links(soup)
    counter = 0
    final = []
    crawl = True
    i = 0
    while (crawl):
        for link in s:
            if link and ('diet' in link or 'Diet' in link)  and link.startswith('http'):
                final.append(link)
                counter += 1
        if (counter > 15):
            crawl = False
            break
        else:
            soup1 = prepare_soup(final[i])
            while soup1 is False:
                i += 1
                soup1 = prepare_soup(final[i])
                s = find_links(soup1)
                continue
            i += 1
            s = find_links(soup1)                
            

#     print(counter)
    
    file = "file_"
    j = 0
    files = []
    for url in final:
        soup2 = prepare_soup(url)
        if soup2 is False:
            continue
        text = all_text(soup2)
        filename = file + str(j) + ".txt"
        if text:
            with open(filename, "w+") as f:
                f.write(text)
            files.append(filename)
            f.close()
            j += 1
    print("The total number of files:",j) 
    vocab = []
    text = ''
    for f in files:
        sent_tokenizer(f)
    
    for f in files:
        with open("tokenized_" + f, "r") as file:
            text += file.read() + " "
        file.close()
            
    text = text.lower()
    text = re.sub(r'[.?!,:;()\-]', ' ', text)
    
    tf_idf_dict = create_tf_idf_dict(text, len(files))
    
    l = 0
    for k in sorted(tf_idf_dict, key=lambda k:tf_idf_dict[k], reverse= True):
        if l < 35:
            print (k,"->",tf_idf_dict[k])
            l += 1
    

In [68]:
web_crawler()

The total number of files: 19
calorie -> 0.011977634655698533
weight -> 0.0098717830943706
jpg -> 0.007831530351802887
loss -> 0.006279005510421639
food -> 0.005748800347489017
people -> 0.005693352628077471
kidney -> 0.0055545048746037575
protein -> 0.005412969507863761
also -> 0.005297799943866659
eating -> 0.005232504592018032
kb -> 0.004991004380078739
may -> 0.004910504309432307
fasting -> 0.004910504309432307
much -> 0.004837121687878254
carbohydrate -> 0.004837121687878254
atkins -> 0.00471505109569112
atkin -> 0.004335819632444725
body -> 0.004186003673614426
disease -> 0.004175125260590145
need -> 0.004080236050122188
blood -> 0.004030934739898545
cancer -> 0.004030934739898545
medicine -> 0.004030934739898545
type -> 0.0039157651759014435
vegetarian -> 0.0035109007873144406
based -> 0.0035109007873144406
week -> 0.00332827136166432
meal -> 0.0033005028965036815
many -> 0.003226233155910567
drink -> 0.003226233155910567
patient -> 0.003131343945442609
woman -> 0.00310957822792