<a href="https://colab.research.google.com/github/patelami3431/Web-Crawler/blob/master/HLT_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import urllib
from urllib import request
import requests
import bs4
from bs4 import BeautifulSoup
import pandas as pd
import re
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from urllib.error import HTTPError
from nltk import word_tokenize
import math

In [0]:
def prepare_soup(url):
    """Download ``url`` and parse the response body into a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address handed to ``urllib.request.urlopen``.

    Returns
    -------
    BeautifulSoup or bool
        The parsed document, or ``False`` when the page could not be fetched
        (HTTP error status, unreachable host, refused connection, missing
        local file, ...). Callers test the result with ``is False``.
    """
    try:
        # The context manager releases the connection even if read() fails.
        with request.urlopen(url) as response:
            if response.getcode() == 403:
                return False
            data = response.read()
    except urllib.error.URLError:
        # URLError is the base class of HTTPError, so this covers HTTP error
        # statuses as well as lower-level failures that previously escaped
        # the HTTPError-only handler and aborted the crawl.
        return False
    return BeautifulSoup(data, 'html.parser')


In [0]:
def visible(element):
    """Tell whether a parsed text node should count as visible page text.

    Parameters
    ----------
    element
        A node from a BeautifulSoup tree; it must expose ``parent.name``.

    Returns
    -------
    bool
        ``False`` for nodes nested directly under style/script/head/title
        (or the document root) and for HTML comments, ``True`` otherwise.
    """
    if element.parent.name in ['style', 'script', '[document]', 'head', 'title']:
        return False
    # The previous check ran a '<!--.*-->' regex against str(bytes), whose
    # text always starts with the bytes-literal prefix, so it could never
    # match. bs4 represents HTML comments as Comment nodes; test for that.
    if isinstance(element, bs4.Comment):
        return False
    return True

In [0]:
def find_links(soup):
    """Return the ``href`` value of every ``<a>`` tag found in ``soup``.

    The values come straight from ``.get('href')`` with no filtering, so the
    list has one entry per anchor, in document order, including whatever
    ``get`` yields for anchors that lack the attribute.
    """
    return [anchor.get('href') for anchor in soup.find_all('a')]

In [0]:
def all_text(soup):
    """Extract the text of ``soup`` as one space-separated string.

    Nodes matched by the style/script/[document]/head/title lookup are
    removed from ``soup`` in place before the text is read. The text is then
    split into lines, each line is split on single space characters, empty
    pieces are discarded and the remaining pieces are joined with one space.
    Whitespace other than spaces and line breaks (e.g. tabs) is left inside
    the pieces untouched.
    """
    non_content = ['style', 'script', '[document]', 'head', 'title']
    for node in soup(non_content):
        node.extract()

    raw_text = soup.get_text()

    pieces = []
    for line in raw_text.splitlines():
        pieces.extend(piece for piece in line.split(" ") if piece)
    return ' '.join(pieces)

In [0]:
def sent_tokenizer(filename):
    """Clean a text file and write its sentences to ``'tokenized_' + filename``.

    The file is read as UTF-8, lower-cased, stripped of digits, and every
    listed punctuation character is turned into a space. Any remaining run
    of characters other than ASCII letters, spaces and periods collapses to
    a single space. The result is split with ``sent_tokenize`` and written
    one sentence per line, skipping sentences that contain "retrieved",
    "register" or "reply".

    Parameters
    ----------
    filename : str
        Path of the input file; the output name is built by prefixing it
        with ``tokenized_``.

    Returns
    -------
    None
    """
    # Read inside a context manager so the handle is always released.
    with open(filename, "r", encoding="utf-8") as source:
        content = source.read()

    content = content.lower()
    content = ''.join(ch for ch in content if not ch.isdigit())

    # Same character set as before, with the backslash written explicitly
    # instead of relying on an invalid escape sequence.
    punctuations = '!()-[]{};:\'"\\,<>/?@#$%^&*_~'
    # One translate pass replaces the per-character replace() loop.
    content = content.translate(str.maketrans(punctuations, ' ' * len(punctuations)))
    content = re.sub(r'[^a-zA-Z .]+', ' ', content)

    sents = sent_tokenize(content)
    skip_markers = ("retrieved", "register", "reply")
    # After the regex above the text is pure ASCII, so the explicit encoding
    # only removes the dependence on the platform default.
    with open('tokenized_' + filename, 'w', encoding="utf-8") as out:
        for s in sents:
            if any(marker in s for marker in skip_markers):
                continue
            out.write(s + '\n')

In [0]:
def create_tf_idf_dict(text, num_docs):
    """Score every lemmatized, non-stopword alphabetic token of ``text``.

    Parameters
    ----------
    text : str
        Text to tokenize with ``word_tokenize``.
    num_docs : int
        Document count used in the numerator of the log term.

    Returns
    -------
    dict
        Maps each term to ``tf * log((1 + num_docs) / (1 + n))`` where ``tf``
        is the term's share of all kept tokens.

    NOTE(review): ``n`` is the number of vocabulary entries that contain the
    term as a substring (the term itself included), not the number of
    documents containing it - this is not a conventional document frequency.
    """
    raw_tokens = word_tokenize(text)
    english_stops = set(stopwords.words('english'))
    kept = [word for word in raw_tokens if word.isalpha() and word not in english_stops]
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(word) for word in kept]
    total = len(lemmas)

    # Raw occurrence counts, in first-seen order.
    counts = {}
    for lemma in lemmas:
        counts[lemma] = counts.get(lemma, 0) + 1

    scores = {}
    for term, occurrences in counts.items():
        term_frequency = occurrences / total
        # Vocabulary entries in which the term appears as a substring.
        containing = sum(1 for candidate in counts if term in candidate)
        inverse_frequency = math.log((1 + num_docs) / (1 + containing))
        scores[term] = term_frequency * inverse_frequency
    return scores

In [0]:
def Know_base(filename, max_length=200):
    """Return the cleaned sentences of a text file that are short enough.

    The file is read as UTF-8, lower-cased, stripped of digits, and every
    listed punctuation character is turned into a space. A space is inserted
    after any period directly followed by a non-whitespace character, and
    runs of characters other than ASCII letters, spaces and periods collapse
    to one space. The result is split with ``sent_tokenize``.

    Parameters
    ----------
    filename : str
        Path of the input file.
    max_length : int, optional
        Sentences longer than this many characters are dropped (default 200).

    Returns
    -------
    list of str
        Sentences of at most ``max_length`` characters, in original order.
    """
    # Read inside a context manager so the handle is always released.
    with open(filename, "r", encoding="utf-8") as source:
        content = source.read()

    content = content.lower()
    content = ''.join(ch for ch in content if not ch.isdigit())

    # Same character set as before, with the backslash written explicitly
    # instead of relying on an invalid escape sequence.
    punctuations = '!()-[]{};:\'"\\,<>/?@#$%^&*_~'
    content = content.translate(str.maketrans(punctuations, ' ' * len(punctuations)))
    content = re.sub(r'(?<=[.])(?=[^\s])', r' ', content)
    content = re.sub(r'[^a-zA-Z .]+', ' ', content)

    # Build a new list rather than calling remove() while iterating: removing
    # during iteration skips the element after each removal, which let
    # consecutive over-long sentences slip through.
    return [s for s in sent_tokenize(content) if len(s) <= max_length]

In [0]:
def _collect_diet_links(seed_url, link_threshold):
    """Gather absolute diet-related links starting from ``seed_url``.

    Links found on the current page are appended to the result when they
    start with 'http' and contain 'diet' or 'Diet'. Until more than
    ``link_threshold`` links are collected, the next page to scan is taken
    from the collected links in the order they were found; links that cannot
    be fetched are skipped. Duplicates are kept.
    """
    seed_soup = prepare_soup(seed_url)
    if seed_soup is False:
        # Previously this surfaced as an AttributeError inside find_links.
        raise RuntimeError("Could not fetch the seed page: " + seed_url)

    collected = []
    page_links = find_links(seed_soup)
    next_index = 0
    while True:
        collected.extend(
            link for link in page_links
            if link and ('diet' in link or 'Diet' in link) and link.startswith('http')
        )
        if len(collected) > link_threshold:
            break

        # Advance to the next collected page that can actually be fetched.
        # The bounds check stops cleanly when the frontier is exhausted, and
        # find_links is only ever called on a successfully parsed page.
        next_soup = False
        while next_soup is False and next_index < len(collected):
            next_soup = prepare_soup(collected[next_index])
            next_index += 1
        if next_soup is False:
            break
        page_links = find_links(next_soup)
    return collected


def web_crawler(seed_url="https://en.wikipedia.org/wiki/List_of_diets", link_threshold=15):
    """Crawl diet pages, rank their terms and build a small knowledge base.

    Steps: collect diet-related links from ``seed_url``; save the text of
    each reachable page to ``file_<n>.txt``; write a sentence-tokenized copy
    of each file via ``sent_tokenizer`` and gather sentences via
    ``Know_base``; score the combined tokenized text with
    ``create_tf_idf_dict`` and print the 35 best-scoring terms; finally group
    the gathered sentences under each of ten fixed terms.

    Parameters
    ----------
    seed_url : str, optional
        Page whose links start the crawl.
    link_threshold : int, optional
        Link collection stops once more than this many links were gathered.

    Returns
    -------
    dict
        Maps each fixed term that occurs in at least one sentence to the list
        of sentences containing it (substring match).

    Raises
    ------
    RuntimeError
        If the seed page cannot be fetched.
    """
    final = _collect_diet_links(seed_url, link_threshold)

    # Save the text of every reachable page; files are numbered consecutively
    # over the pages that produced non-empty text.
    files = []
    for url in final:
        page = prepare_soup(url)
        if page is False:
            continue
        text = all_text(page)
        if text:
            filename = "file_" + str(len(files)) + ".txt"
            with open(filename, "w", encoding="utf-8") as out:
                out.write(text)
            files.append(filename)
    print("The total number of files:", len(files))

    # Tokenized copies feed the term scores; Know_base sentences feed the
    # knowledge base.
    terms = []
    for name in files:
        sent_tokenizer(name)
        terms.extend(Know_base(name))

    pieces = []
    for name in files:
        with open("tokenized_" + name, "r") as handle:
            pieces.append(handle.read() + " ")
    text = "".join(pieces)

    text = text.lower()
    text = re.sub(r'[.?!,:;()\-]', ' ', text)

    tf_idf_dict = create_tf_idf_dict(text, len(files))

    for term in sorted(tf_idf_dict, key=tf_idf_dict.get, reverse=True)[:35]:
        print(term, "->", tf_idf_dict[term])

    Term_list = ['calorie', 'weight', 'food', 'eating', 'people', 'fasting', 'disease', 'body','protein','meal']
    Term_dict = {}
    for t in Term_list:
        for s in terms:
            if t in s:
                Term_dict.setdefault(t, []).append(s)
    print('\nKnowledge Base for top 10 terms:')
    return Term_dict

In [0]:
# Run the whole pipeline; the returned term -> sentences dict is the cell's displayed value.
web_crawler()

The total number of files: 19
calorie -> 0.012004654585491315
weight -> 0.009894052506805265
jpg -> 0.007849197228975091
loss -> 0.006293170100754955
food -> 0.005710553143163021
kidney -> 0.005567035089129382
people -> 0.005515989528944311
protein -> 0.00542518043767396
also -> 0.005309751066659621
eating -> 0.005244308417295795
type -> 0.005078892324630941
may -> 0.004921581745462208
fasting -> 0.004921581745462208
much -> 0.004848033582602262
carbohydrate -> 0.004848033582602262
atkins -> 0.0047256876156082095
atkin -> 0.004345600657282304
kb -> 0.004308715178936897
body -> 0.004276128401795033
disease -> 0.004184543780578443
blood -> 0.004040027985501886
cancer -> 0.004040027985501886
medicine -> 0.004040027985501886
need -> 0.0038041307096167657
vegetarian -> 0.0035188209063955088
based -> 0.0035188209063955088
week -> 0.003405274899482386
meal -> 0.0033079483862942707
many -> 0.0032335111031742514
drink -> 0.0032335111031742514
patient -> 0.003138407835433832
woman -> 0.003116593

{'calorie': ['people tend to lose weight quickly on low carb diets because they restrict their calories to about   a day.',
  'it s called the   diet because five days of the week are normal eating days  while the other two restrict calories to   per day.',
  'many people find this way of eating to be easier to stick to than a traditional calorie restricted diet   .',
  'for five days per week  you eat normally and don t have to think about restricting calories.',
  'then  on the other two days  you reduce your calorie intake to a quarter of your daily needs.',
  'this is about  calories per day for women  and  for men.',
  'summary the   diet involves eating normally for five days per week  then restricting your calorie intake to   calories on the other two days.',
  'one important benefit is that intermittent fasting seems to be easier to follow than continuous calorie restriction  at least for some people     .',
  'one study showed that the   diet caused weight loss similar to regu

In [0]:
# Demonstrates the period-spacing regex: the pattern matches the empty
# position right after a '.' that is followed by a non-whitespace character
# and inserts a space there, so "text.This" becomes "text. This" while the
# comma in "text,it" is left as is.
line = "This is a text.This is another text,it has no space after the comma."
re.sub(r'(?<=[.])(?=[^\s])', r' ', line)

'This is a text. This is another text,it has no space after the comma.'