# 1. Crawler

## 1.0. Related example

This code shows `wget`-like tool written in python. Run it from console (`python wget.py`), make it work. Check the code, reuse, and modify for your needs.

In [1]:
import argparse
import os
import re
import requests


def wget(url, filename):
    # allow redirects - in case file is relocated
    resp = requests.get(url, allow_redirects=True)
    # this can also be 2xx, but for simplicity now we stick to 200
    # you can also check for `resp.ok`
    if resp.status_code != 200:
        print(resp.status_code, resp.reason, 'for', url)
        return

    # just to be cool and print something
    print(*[f"{key}: {value}" for key, value in resp.headers.items()], sep='\n')
    print()

    # try to extract filename from url
    if filename is None:
        # start with http*, ends if ? or # appears (or none of)
        m = re.search("^http.*/([^/\?#]*)[\?#]?", url)
        filename = m.group(1)
        if not filename:
            raise NameError(f"Filename neither given, nor found for {url}")

    # what will you do in case 2 websites store file with the same name?
    if os.path.exists(filename):
        raise OSError(f"File {filename} already exists")

    with open(filename, 'wb') as f:
        f.write(resp.content)
        print(f"File saved as {filename}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='download file.')
    parser.add_argument("-O", type=str, default=None, dest='filename', help="output file name. Default -- taken from resource")
    parser.add_argument("url", type=str, default=None, help="http://sprotasov.ru/data/iu.txt")
    args = parser.parse_args()
    wget(args.url, args.filename)

### 1.0.1. How to parse a page?

If you build a crawler, you might follow one of the approaches:
1. search for URLs in the page, assuming this is just a text.
2. search for URLs in the places where URLs should appear: `<a href=..`, `<img src=...`, `<iframe src=...` and so on.

To follow the first approach you can rely on some good regular expression. [Like this](https://stackoverflow.com/a/3809435).

To follow the second approach just read one of these: [short answer](https://stackoverflow.com/questions/1080411/retrieve-links-from-web-page-using-python-and-beautifulsoup) or [exhaustive explanation](https://hackersandslackers.com/scraping-urls-with-beautifulsoup/).

## 1.1. [15] Download and persist #
Please complete a code for `load()`, `download()` and `persist()` methods of `Document` class. What they do:
- for a given URL `download()` method downloads binary data and stores in `self.content`. It returns `True` for success, else `False`.
- `persist()` method saves `self.content` somewhere in file system. We do it to avoid multiple downloads (for caching in other words).
- `load()` method loads data from hard drive. Returns `True` for success.

Tests checks that your code somehow works.

**NB Passing the test doesn't mean you correctly completed the task.** These are **criteria, which have to be fullfilled**:
1. URL is a unique identifier (as it is a subset of URI). Thus, documents with different URLs should be stored in different files. Typical errors: documents from the same domain are overwritten to the same file, URLs with similar endings are downloaded to the same file, etc.
2. The document can be not only a text file, but also a binary. Pay attention that if you download `mp3` file, it still can be played. Hint: don't hurry to convert everything to text.

In [2]:
# based on https://stackoverflow.com/questions/53101597/how-to-download-binary-file-using-requests
#based on https://stackoverflow.com/questions/12474406/python-how-to-get-the-content-type-of-an-url

import requests
from urllib.parse import quote
import hashlib
import pathlib





class Document:


    def __init__(self, url):
        self.content = None
        self.url = url
        self.response = None
        self.headers = None

    def get_filename(self):
        filename = hashlib.sha256(self.url.encode()).hexdigest()
        #print(self.headers)
        content_type = "html"
        if "content-type" in self.headers:
            content_type = self.headers["content-type"]

        extension = ""
        if "html" in content_type:
            extension = "html"
        elif "text" in content_type:
            extension = "txt"
        else:
            extension = content_type.split('/')[-1]
        return filename+'.'+extension

    def get(self):
        if not self.load():
            if not self.download():
                print(f"This url : {self.url} can't be obtained due to {self.response.status_code}")
                #raise FileNotFoundError(self.url)
            else:
                self.persist()

    def download(self):
        # download self.url content, store it in self.content and return True in case of success
            self.response = requests.get(self.url, allow_redirects=True)
            if self.response.status_code == 200:
                self.content = self.response.content
                r = requests.head(self.url)
                self.headers = r.headers
                return True
            return False

    def persist(self):
        # write document content to hard drive
        try:
            open(self.get_filename(), "wb").write(self.content)
        except Exception as e:
            print(f"Error writing content from {self.url} with error {e}")


    def load(self):
        # load content from hard drive, store it in self.content and return True in case of success
        try :
            if self.headers is None:
                return False

            name = self.get_filename()
            with open(name, "rb") as file:
                self.content = file.readlines()
        except Exception as e:
            print(self.get_filename())
            print(f"Error loading content from hard drive from {self.url} due to error {e}")
            return False
        return True



### 1.1.1. Tests ###

In [4]:
doc = Document("http://sprotasov.ru/data/iu.txt")

doc.get()
assert doc.content, "Document download failed"
assert "Code snippets, demos and labs for the course" in str(doc.content), "Document content error"

doc.get()
assert doc.load(), "Load should return true for saved document"
assert "Code snippets, demos and labs for the course" in str(doc.content), "Document load from disk error"

## 1.2. [10] Parse HTML
`BeautifulSoap` library is a de facto standard to parse XML and HTML documents in python. Use it to complete `parse()` method that extracts document contents. You should initialize:
1. `self.anchors` list of tuples `('text', 'url')` met in a document. Be aware, there exist relative links (e.g. `../content/pic.jpg`). Use `urllib.parse.urljoin()` to fix this issue.
2. `self.images` list of images met in a document. Again, links can be relative to current page.
3. `self.text` should keep plain text of the document without scripts, tags, comments and so on. You can refer to [this stackoverflow answer](https://stackoverflow.com/a/1983219) for details.

**NB All these 3 criteria must be fulfilled to get full point for the task.**

In [5]:
# Based on https://stackoverflow.com/questions/1080411/retrieve-links-from-web-page-using-python-and-beautifulsoup
# Based on https://www.projectpro.io/recipes/download-image-from-webpage-beautiful-soup

from bs4 import BeautifulSoup, SoupStrainer
from bs4.element import Comment
from urllib.parse import urljoin
import validators

def is_valid(link):
    return type(link) is str and validators.url(link)


class HtmlDocument(Document):

    def __init__(self, url):
        super().__init__(url)

    def parse(self):
        self.anchors = []

        for link in BeautifulSoup(self.response.text, parse_only=SoupStrainer('a')):
            if link.has_attr('href'):
                if "http" not in link['href'] and is_valid(urljoin(self.url,link['href'])):
                    self.anchors.append((link.text,urljoin(self.url,link['href'])))
                elif is_valid(link['href']):
                    self.anchors.append((link.text,link['href']))

        self.images = []

        for link in BeautifulSoup(self.response.text, parse_only=SoupStrainer('img')):
            if link.has_attr('src'):
                if "http" not in link['src']:
                 self.images.append(urljoin(self.url ,link['src']))
                else:
                    self.images.append(link['src'])

        self.text = BeautifulSoup(self.response.text, 'html.parser').text

### 1.2.1. Tests

In [6]:
doc = HtmlDocument("http://sprotasov.ru")
doc.get()
doc.parse()

assert "just few links" in doc.text, "Error parsing text"
assert "http://sprotasov.ru/images/gb.svg" in doc.images, "Error parsing images"
assert any(p[1] == "https://twitter.com/07C3" for p in doc.anchors), "Error parsing links"


## 1.3. [10] Document analysis ##
Complete the code for `HtmlDocumentTextData` class. Implement word and sentence splitting (use any method you can propose). 

**Criteria to succeed in the task**: 
1. Your `get_word_stats()` method should return `Counter` object.
2. Don't forget to lowercase your words for counting.
3. Sentences should be obtained from inside `<body>` tag only.

In [9]:
# based on https://www.geeksforgeeks.org/how-to-scrape-all-the-text-from-body-tag-using-beautifulsoup-in-python/

# based on https://www.reddit.com/r/learnpython/comments/1928vp/errors_in_my_downloader_script_giving_me_trouble/

from collections import Counter
import string
import re

class HtmlDocumentTextData:
    
    def __init__(self, url):
        self.doc = HtmlDocument(url)
        self.doc.get()
        self.doc.parse()

    
    def get_sentences(self):

        soup = BeautifulSoup(self.doc.response.content, "html.parser")

        # Get the whole body tag

        tag = soup.body
        result =[]

        for s in tag.strings:

           # new_string = s.translate(str.maketrans('','',string.punctuation))
            new_string = re.sub(r'[^\w\s]', '', s)
            result.append(new_string)

        return result
    
    def get_word_stats(self):
        #TODO return Counter object of the document, containing mapping {`word` -> count_in_doc}
        sentences = self.get_sentences()
        words = []
        for sentence in sentences:
            for word in sentence.split():
                words.append(word.lower())

        #print(words)
        words_counter = Counter(words)
        return words_counter

### 1.3.1. Tests ###

In [10]:
doc = HtmlDocumentTextData("https://innopolis.university/")

doc.get_sentences()
print(doc.get_word_stats().most_common(10))
assert [x for x in doc.get_word_stats().most_common(10) if x[0] == 'иннополис'], 'иннополис should be among most common'

[('и', 44), ('в', 22), ('иннополис', 20), ('с', 13), ('на', 12), ('университет', 11), ('университета', 11), ('центр', 10), ('для', 9), ('образование', 8)]


## 1.4. [15] Crawling ##

Method `crawl_generator()` is given starting url (`source`) and max depth of search. It should return a **generator** of `HtmlDocumentTextData` objects (return a document as soon as it is downloaded and parsed). You can benefit from `yield obj_name` python construction. Use `HtmlDocumentTextData.anchors` field to go deeper.

In [17]:
# based on https://www.geeksforgeeks.org/web-crawling-using-breadth-first-search-at-a-specified-depth/

from queue import Queue

class Crawler:

    def __init__(self):
        self.intern_extern_links = set()
        self.visited = {}


    def crawl_generator(self, source, depth=1):
        if depth == 0:
            src = HtmlDocumentTextData(source)
            if src.doc.response.status_code != 200:
                print(f"skipping {source} due to {src.doc.response.status_code}")
                return
            elif src.doc.get_filename()[-4:] in ('.pdf', '.mp3', '.avi', '.mp4', '.txt'):
                return
            yield src

        elif depth == 1:
            src = HtmlDocumentTextData(source)
            links = src.doc.anchors
            for link in links:
                current_doc = HtmlDocumentTextData(link)
                if  current_doc.doc.response.status_code != 200:
                        print(f"skipping {current_doc.doc.url} due to {current_doc.doc.response.status_code}")
                        continue
                elif current_doc.doc.get_filename()[-4:] in ('.pdf', '.mp3', '.avi', '.mp4', '.txt'):
                        print(f"skipping {current_doc.doc.url} don't like its extension")
                        continue
                yield  current_doc

        else:
            queue = [source]
            for i in range(depth):
                for j in range(len(queue)):
                    link = queue.pop(0)
                    current_doc = HtmlDocumentTextData(link)
                    if  current_doc.doc.response.status_code != 200:
                        print(f"skipping {current_doc.doc.url} due to {current_doc.doc.response.status_code}")
                        continue
                    elif current_doc.doc.get_filename()[-4:] in ('.pdf', '.mp3', '.avi', '.mp4', '.txt'):
                        print(f"skipping {current_doc.doc.url} don't like its extension")
                        continue
                    urls = current_doc.doc.anchors
                    for u in urls:
                        if not (u in self.visited):
                            queue.append(u)
                            self.visited[u] = True
                    yield current_doc


### 1.4.1. Tests ###

In [18]:




crawler = Crawler()
counter = Counter()

for c in crawler.crawl_generator("https://innopolis.university/", 2):
    print(c.doc.url)

    if c.doc.url[-4:] in ('.pdf', '.mp3', '.avi', '.mp4', '.txt'):
        print("Skipping", c.doc.url)
        continue
    counter.update(c.get_word_stats())
    print(len(counter), "distinct word(s) so far")
    
print("Done")

print(counter.most_common(20))
assert [x for x in counter.most_common(20) if x[0] == 'innopolis'], 'innopolis sould be among most common'

https://innopolis.university/
575 distinct word(s) so far


InvalidSchema: No connection adapters were found for "('\\n\\n', 'https://innopolis.university/')"

In [7]:
#Installing libraries
!pip install validators
!pip install httplib2

Collecting validators
  Using cached validators-0.20.0.tar.gz (30 kB)
Building wheels for collected packages: validators
  Building wheel for validators (setup.py): started
  Building wheel for validators (setup.py): finished with status 'done'
  Created wheel for validators: filename=validators-0.20.0-py3-none-any.whl size=19582 sha256=f9975a88f9a17f32714268afb4945d427a221b539e35a9d5774ad52173f6039a
  Stored in directory: c:\users\maria\appdata\local\pip\cache\wheels\f2\ed\dd\d3a556ad245ef9dc570c6bcd2f22886d17b0b408dd3bbb9ac3
Successfully built validators
Installing collected packages: validators
Successfully installed validators-0.20.0


You should consider upgrading via the 'C:\Python310\python.exe -m pip install --upgrade pip' command.


Collecting httplib2
  Using cached httplib2-0.21.0-py3-none-any.whl (96 kB)
Installing collected packages: httplib2
Successfully installed httplib2-0.21.0


You should consider upgrading via the 'C:\Python310\python.exe -m pip install --upgrade pip' command.
