## Q1 - Finding web server

In [None]:
# find web server
import requests

def web_server(url, timeout=10):
    """Print the server software that `url` advertises in its ``Server`` header.

    Sends a GET request to `url`, prints a one-line status message and, when
    the request succeeds, the value of the ``Server`` response header.

    Args:
        url: Absolute URL to probe, e.g. ``'https://github.com/'``.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            ``requests.get`` applies no timeout of its own, so without one a
            silent server would block the cell indefinitely.

    Raises:
        requests.exceptions.RequestException: If the request itself fails
            (DNS failure, refused connection, timeout, ...).
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code == 200:
        print('Success!')
        # Servers may omit the header entirely, so avoid indexing it directly.
        print(response.headers.get('server', 'Server header not present.'))
    elif response.status_code == 404:
        print('Not Found.')
    else:
        # Any other status used to be dropped without a single line of output.
        print(f'Unexpected status code: {response.status_code}')


In [None]:
# Example: report the Server header returned by Coursera's front page.
web_server('https://www.coursera.org/')

Success!
envoy


In [None]:
# Example: report the Server header returned by GitHub's front page.
web_server('https://github.com/')

Success!
GitHub.com


In [None]:
# Example: report the Server header returned by hackr.io.
web_server('https://hackr.io')

Success!
Apache/2.4.57 () OpenSSL/1.0.2k-fips


In [None]:
# Example: report the Server header returned by aideadlin.es.
web_server('https://aideadlin.es/')

Success!
GitHub.com


## Q1 - Finding Location

In [None]:
import json
import urllib.request

def location(url, timeout=10):
    """Print the ``country/city`` that ip-api.com reports for a host.

    Args:
        url: Domain name or IP address to look up (e.g. ``'github.com'``).
            A full URL such as ``'https://github.com/about'`` is accepted as
            well; only its host part is sent to the API.
        timeout: Seconds to wait for the API before giving up.

    Raises:
        urllib.error.URLError: If the API cannot be reached.
    """
    # Local import: the surrounding cell only imports json and urllib.request.
    from urllib.parse import quote, urlsplit

    GEO_IP_API_URL = 'http://ip-api.com/json/'

    # The API expects a bare host or IP. urlsplit() yields a hostname only when
    # a scheme/netloc is present, so plain 'github.com' falls through unchanged.
    target = url.strip()
    host = urlsplit(target).hostname or target

    # Creating request object to GeoLocation API (':' kept for IPv6 literals)
    req = urllib.request.Request(GEO_IP_API_URL + quote(host, safe=':'))

    # Getting the JSON response; the context manager closes the connection.
    with urllib.request.urlopen(req, timeout=timeout) as response:
        json_response = json.loads(response.read())

    # On failure the API answers with status == 'fail' and a 'message' field
    # instead of 'country'/'city', which used to surface as a bare KeyError.
    if json_response.get('status') == 'fail':
        reason = json_response.get('message', 'unknown error')
        print(f"Lookup failed for {host}: {reason}")
        return

    print(json_response['country'] + "/" + json_response['city'])


In [None]:
# Example: look up the country/city reported for coursera.org.
location('coursera.org')

United States/Washington


In [None]:
# Example: look up the country/city reported for github.com.
location('github.com')

United States/San Francisco


In [None]:
# Example: look up the country/city reported for hackr.io.
location('hackr.io')

United States/Dublin


## Q1-Finding open ports

### very slow and limited version

In [None]:
import socket # for connecting

def is_port_open(host, port, timeout=None):
    """
    Determine whether `host` accepts a TCP connection on `port`.

    Args:
        host: Host name or IP address to probe.
        port: TCP port number.
        timeout: Optional number of seconds to wait for the connection.
            ``None`` (the default) keeps the socket's default blocking
            behaviour; a small value makes a sweep faster at the cost of
            reporting slow hosts as closed.

    Returns:
        True if the connection was established, False otherwise.
    """
    # The context manager closes the socket on every path; previously each
    # probe left its socket open, leaking one descriptor per scanned port.
    with socket.socket() as s:
        # The timeout must be set *before* connect() to have any effect.
        if timeout is not None:
            s.settimeout(timeout)
        try:
            # tries to connect to host using that port
            s.connect((host, port))
        except Exception:
            # Refused, timed out, unresolvable host, out-of-range port, ...:
            # all count as "not open". Catching Exception rather than using a
            # bare `except:` lets Ctrl-C (KeyboardInterrupt) stop a long sweep.
            return False
        # the connection was established, port is open!
        return True

# Sweep ports 1-1024 on a single target, one connection attempt at a time,
# and report only the ports that accepted the connection.
host = 'coursera.org'
for port in range(1, 1025):
    if not is_port_open(host, port):
        continue
    print(f"{host}:{port} is open")

### Faster version using threads

In [None]:
import argparse
import socket # for connecting
from threading import Thread, Lock
from queue import Queue
# Number of worker threads that main() starts.
N_THREADS = 200
# Queue of port numbers waiting to be scanned; shared by every worker thread.
q = Queue()
# Held around print() calls so that only one thread prints at a time.
print_lock = Lock()

def port_scan(port, timeout=None):
    """
    Scan a port on the global variable `host` and print the result.

    Open ports get a line of their own; closed ports are printed with a
    trailing carriage return so the next message overwrites them.

    Args:
        port: TCP port number to probe.
        timeout: Optional number of seconds to wait for the connection.
            ``None`` (the default) keeps the socket's default blocking
            behaviour.
    """
    try:
        # Creating the socket inside `with` guarantees it is closed and, unlike
        # a `finally: s.close()`, cannot fail with UnboundLocalError when
        # socket creation itself raises (e.g. too many open files).
        with socket.socket() as s:
            if timeout is not None:
                s.settimeout(timeout)
            s.connect((host, port))
    except Exception:
        # Exception rather than a bare `except:` so KeyboardInterrupt and
        # SystemExit are not swallowed.
        with print_lock:
            print(f"{host:15}:{port:5} is closed ", end='\r')
    else:
        with print_lock:
            print(f"{host:15}:{port:5} is open ")

In [None]:
def scan_thread():
    """Worker loop: take port numbers from the queue `q` and scan each one.

    Runs forever; it is meant to be the target of a daemon thread. Every item
    taken from the queue is acknowledged with ``task_done()`` even if scanning
    it fails, because a missing acknowledgement would leave ``q.join()``
    waiting forever.
    """
    while True:
        # get the port number from the queue
        port = q.get()
        try:
            # scan that port number
            port_scan(port)
        except Exception as exc:
            # Keep the worker alive and report the problem instead of letting
            # the thread die with items still unacknowledged.
            with print_lock:
                print(f"error while scanning port {port}: {exc!r}")
        finally:
            # tells the queue that the scanning for that port is done
            q.task_done()


def main(host, ports):
    """Scan `ports` on `host` using N_THREADS daemon worker threads.

    Args:
        host: Host name or IP address to scan.
        ports: Iterable of port numbers to scan.

    Blocks until every queued port has been processed.
    """
    # port_scan() reads the module-level `host`, not this parameter. Publish
    # the requested target there so that main('other.host', ...) scans the
    # host it was given rather than whatever the global last held.
    globals()['host'] = host

    # Daemon threads end together with the main thread.
    workers = [Thread(target=scan_thread, daemon=True) for _ in range(N_THREADS)]
    for worker in workers:
        worker.start()

    # Hand every port to the workers ...
    for port in ports:
        q.put(port)
    # ... and wait until each one has been acknowledged with task_done().
    q.join()

In [None]:
if __name__ == "__main__":
    # Target plus port window; range() excludes end_port, so ports
    # start_port .. end_port - 1 are scanned.
    host = 'coursera.org'
    start_port, end_port = 1, 800
    ports = list(range(start_port, end_port))

    main(host, ports)

coursera.org   :   80 is open 
coursera.org   :  443 is open 


## Finding emails

In [None]:
import re
import requests
import requests.exceptions
from urllib.parse import urlsplit
from collections import deque
from bs4 import BeautifulSoup

def emails(url, max_pages=None, timeout=10):
    """Crawl outward from `url` and collect e-mail addresses found in pages.

    Pages are visited breadth-first. Every ``<a href>`` on a fetched page is
    resolved against that page's URL and queued if it is an http(s) link that
    has not been queued before. Links are not restricted to the starting site.
    Newly discovered addresses are appended to ``emails.txt`` in the current
    working directory, and progress is printed as the crawl runs.

    Args:
        url: URL at which the crawl starts.
        max_pages: Optional cap on the number of successfully fetched pages.
            ``None`` (the default) crawls until no unvisited links remain.
        timeout: Seconds to wait for each HTTP response; ``requests`` applies
            no timeout of its own.

    Returns:
        The set of all e-mail addresses found.
    """
    # Local import: the notebook's import cell only pulls in urlsplit.
    from urllib.parse import urldefrag, urljoin

    email_pattern = re.compile(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+", re.I)

    # a queue of urls to be crawled
    unprocessed_urls = deque([url])
    # Every URL ever queued. A set gives O(1) membership tests, where scanning
    # the deque for each link was linear in the queue length.
    seen_urls = {url}
    # a set of fetched emails (named so it does not shadow this function)
    found_emails = set()
    pages_fetched = 0

    while unprocessed_urls and (max_pages is None or pages_fetched < max_pages):
        current_url = unprocessed_urls.popleft()

        # get url's content
        print("Crawling URL %s" % current_url)
        try:
            response = requests.get(current_url, timeout=timeout)
        except requests.exceptions.RequestException:
            # Base class of MissingSchema, ConnectionError, InvalidURL,
            # Timeout, ...: skip the page instead of aborting the whole crawl.
            continue
        pages_fetched += 1

        # Extract the addresses on this page; only those not seen on an earlier
        # page are written out, so the file no longer accumulates duplicates.
        new_emails = set(email_pattern.findall(response.text)) - found_emails
        found_emails.update(new_emails)
        print(found_emails)
        with open('emails.txt', 'a', encoding='utf-8') as f:
            f.writelines(email + "\n" for email in new_emails)

        # create a beautiful soup for the html document
        soup = BeautifulSoup(response.text, 'lxml')

        for anchor in soup.find_all("a"):
            href = anchor.attrs.get("href")
            if not href:
                continue
            try:
                # urljoin handles absolute, root-relative, protocol-relative
                # and document-relative links; plain string concatenation
                # turned 'https://a.com' + 'page' into 'https://a.compage'.
                link = urldefrag(urljoin(current_url, href.strip())).url
            except ValueError:
                # malformed href (e.g. a broken IPv6 literal)
                continue
            # Skip mailto:, javascript:, tel: and other non-web schemes.
            if urlsplit(link).scheme not in ("http", "https"):
                continue
            if link not in seen_urls:
                seen_urls.add(link)
                unprocessed_urls.append(link)

    return found_emails


In [None]:
# Start the crawl at GitHub's front page. The crawler queues every link it
# finds, including links to other sites, so this call can run for a very long time.
emails('https://github.com/')