### Python Crawler to receive a notification when an update occurs - Script Module

Requirements:
* Python 3 (install it from python.org or your OS package manager; it is not installed with pip)
* pip install sendgrid
* pip install requests==2.22.0 beautifulsoup4==4.8.1

This script expects the following files in the project directory:

* emailInformation.txt: put the From email address on the first line and the To email address on the second line
* sendgrid.env: this file must be listed in your .gitignore, and your API key must be stored in the `SENDGRID_API_KEY` environment variable

Sources:
* https://www.twilio.com/blog/web-scraping-and-parsing-html-in-python-with-beautiful-soup
* https://www.geeksforgeeks.org/scheduling-python-scripts-on-linux/
* https://app.sendgrid.com/guide/integrate/langs/python

In [6]:
import re

import requests
import filecmp    
import os

#if your environment is not recognizing the correct library folder use this
import sys
sys.path.append("/Users/pauloalves/workspace/crawler/crawler/lib/python3.9/site-packages")

from bs4 import BeautifulSoup

# using SendGrid's Python Library
# https://github.com/sendgrid/sendgrid-python
from sendgrid import SendGridAPIClient
from sendgrid.helpers.mail import Mail

from os.path import exists
from datetime import date 

import shutil

# params: 
#  _url: website address that you want to check
#  _id: HTML tag that you want to explore
def getContent(_url, _id, _timeout=30):
    """Download a web page and return the text of the element with a given id.

    Params:
      _url: website address that you want to check
      _id: value of the HTML ``id`` attribute of the element to extract
      _timeout: seconds to wait for the server before giving up

    Returns:
      The text content of the matching element.

    Raises:
      requests.RequestException: on a network failure, a timeout, or an
        HTTP error status (4xx/5xx).
      ValueError: if the page contains no element with the requested id.
    """
    # requests waits indefinitely when no timeout is given, which would hang
    # an unattended (scheduled) run forever.
    response = requests.get(_url, timeout=_timeout)

    # Fail loudly on an error status instead of parsing an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    element = soup.find(id=_id)
    if element is None:
        # find() returns None when nothing matches; report that explicitly
        # rather than failing with an AttributeError on ``None.text``.
        raise ValueError(
            "No element with id " + repr(_id) + " found at " + str(_url))
    return element.text


# params: 
#  _fileName: File name related to the file that you want to store the content
#  _content: Content extracted from the website mentioned before 
def saveContent(_fileName, _content):
    """Write the ``repr()`` of the content to a file, replacing its contents.

    Params:
      _fileName: path of the file in which the content is stored
      _content: content extracted from the website

    Returns:
      None.
    """
    # The context manager closes the file even if write() raises.
    # The encoding is pinned because the snapshots are later compared
    # byte-for-byte: a locale-dependent default could make two runs in
    # different environments write different bytes for the same text.
    with open(_fileName, "w", encoding="utf-8") as file:
        # repr() keeps the whole content on a single, escaped line
        file.write(repr(_content))

    
# params: 
#  _file1: File that you want to compare
#  _file2: File that you want to compare
def compareContent(_file1, _file2):
    """Tell whether two files have identical contents.

    Params:
      _file1: path of the first file to compare
      _file2: path of the second file to compare

    Returns:
      True if the files' contents are equal, False otherwise.
    """
    # shallow=False forces a real content comparison. The default
    # (shallow=True) declares the files equal as soon as their type, size
    # and modification time match, without reading them, which could hide
    # a same-length change.
    return filecmp.cmp(_file1, _file2, shallow=False)


# params: 
#  _subject: String related to the subject that this crawler will verify
def sendMail(_subject):
    """Send the update-notification email through SendGrid.

    The sender address is read from the first line of
    ``emailInformation.txt`` and the recipient address from the second line.
    The API key is read from the ``SENDGRID_API_KEY`` environment variable.

    Params:
      _subject: string appended to the email subject line

    Returns:
      None. Errors raised while sending are printed, not propagated.

    Raises:
      OSError: if ``emailInformation.txt`` cannot be opened.
    """
    with open("emailInformation.txt", "r") as f:
        # readline() keeps the trailing newline, which must not end up
        # inside the email addresses.
        _emailFrom = f.readline().strip()
        _emailTo = f.readline().strip()

    message = Mail(
        from_email=_emailFrom,
        to_emails=_emailTo,
        subject='Crawler Notification - '+_subject,
        html_content='<strong>You have an update in the site that you are monitoring.</strong><br>https://www.marinha.mil.br/com1dn/smv/smv-principal<br>##webCrawler##')
    try:
        sg = SendGridAPIClient(os.environ.get('SENDGRID_API_KEY'))
        sg.send(message)
    except Exception as e:
        # Best-effort notification: report the failure and carry on.
        # Python 3 exceptions have no ``.message`` attribute, so print the
        # exception itself instead of raising AttributeError in the handler.
        print(e)

## MAIN ##
# Label used for the snapshot file names, the console log and the email subject
subject = "marinha"

newFile = f'{subject}New.txt'
oldFile = f'{subject}Old.txt'

# Current state of the monitored page element
content = getContent('https://www.marinha.mil.br/com1dn/smv/smv-principal', "node-652")

# First run: no baseline yet, so the current state becomes the baseline
if not exists(oldFile):
    saveContent(oldFile, content)

# Always record the current state, then compare it with the baseline
saveContent(newFile, content)

# True when baseline and current snapshot are equal, False otherwise
result = compareContent(oldFile, newFile)

if result:
    print(f'{date.today()}-[{subject}]-No changes')
else:
    # The page changed: log it, notify the stakeholders by email, archive the
    # previous baseline under a date-prefixed name and promote the new
    # snapshot to baseline.
    print(f'{date.today()}-[{subject}]-Do something, there is an update in your site')
    sendMail(subject)
    shutil.copyfile(oldFile, f'{date.today()}-{oldFile}')
    shutil.copyfile(newFile, oldFile)

2022-07-27-[marinha]-No changes
