# Exercise 1: refactoring your own code

In [1]:
import json
import linecache as lc

In [2]:
class CsvConverter:
    def __init__(self,file_name, header):
        self.file_name = file_name
        self.header = header
        self.keys = self.header.split(',')

    
    def convert_csv_to_json(self,lines):
        json_list=[]
        for line in lines:
            try:
                assert len(line) == len(self.keys)

                json_dict = {key: value for key, value in zip(self.keys, line)}
                json_list.append(json_dict)
            except AssertionError:
                print ("Number of items in line does not match the number of keys in header.")
            
        return json.dumps(json_list)
        

In [3]:
# header = 'first,second,third'
# c = CsvConverter('dSST.csv',header)
# lines =[[1,2,3],[4,5,6]]
# c.convert_csv_to_json(lines)

In [4]:
class Reader:
    
    def __init__(self, location, csvconverter):
        self.location = location
        self.csvconverter = csvconverter
        self.starter =2
        
    def get_lines(self): 
        lines_list= [lc.getline(self.csvconverter.file_name,i).strip().split(',') for i in range(self.starter, self.starter + 4)]
        self.starter = self.starter + 4
        return (self.csvconverter.convert_csv_to_json(lines_list))

In [5]:
header = lc.getline('dSST.csv',1).strip()
r = Reader('dSST.csv', CsvConverter('dSST.csv',header))

In [8]:
r.get_lines()

'[{"Year": "1889", "Jan": "-.08", "Feb": ".17", "Mar": ".07", "Apr": ".10", "May": ".00", "Jun": "-.09", "Jul": "-.07", "Aug": "-.20", "Sep": "-.23", "Oct": "-.25", "Nov": "-.33", "Dec": "-.28", "J-D": "-.10", "D-N": "-.08", "DJF": ".02", "MAM": ".06", "JJA": "-.12", "SON": "-.27"}, {"Year": "1890", "Jan": "-.41", "Feb": "-.44", "Mar": "-.39", "Apr": "-.29", "May": "-.38", "Jun": "-.24", "Jul": "-.27", "Aug": "-.38", "Sep": "-.36", "Oct": "-.24", "Nov": "-.43", "Dec": "-.30", "J-D": "-.34", "D-N": "-.34", "DJF": "-.38", "MAM": "-.35", "JJA": "-.30", "SON": "-.34"}, {"Year": "1891", "Jan": "-.33", "Feb": "-.46", "Mar": "-.18", "Apr": "-.27", "May": "-.16", "Jun": "-.20", "Jul": "-.17", "Aug": "-.17", "Sep": "-.15", "Oct": "-.21", "Nov": "-.31", "Dec": "-.04", "J-D": "-.22", "D-N": "-.24", "DJF": "-.37", "MAM": "-.20", "JJA": "-.18", "SON": "-.22"}, {"Year": "1892", "Jan": "-.28", "Feb": "-.10", "Mar": "-.40", "Apr": "-.33", "May": "-.23", "Jun": "-.22", "Jul": "-.31", "Aug": "-.27", "Se

# Exercise 2: functions with data

In [13]:
def convertion (data, function):
    
    list_output= [function(i) for i in data]
    return list_output
        

In [14]:
convertion([1,2,3,4], lambda x: x**2)

[1, 4, 9, 16]

In [15]:
def improved_convention (data, *function):
    list_output = []
    for fn in function:
        list_output.append([fn(i) for i in data])
    return list_output

In [16]:
improved_convention([1,2,3,4], lambda x : x**2, lambda y: y**3)

[[1, 4, 9, 16], [1, 8, 27, 64]]

In [22]:
def improved_convention_v2 (data, *function):
    return [[fn(i) for i in data] for fn in function]


In [23]:
improved_convention_v2([1,2,3,4], lambda x : x**2, lambda y: y**3)

[[1, 4, 9, 16], [1, 8, 27, 64]]

# Exercise 3: refactoring other people's code

In [24]:
""" 
This is a crawler program using beautifulsoup.
It crawls the website "https://sport050.nl/sportaanbieders/alle-aanbieders/"
and fetches all the sport suppliers in the city of Groningen. It outputs 
a csv-file with the url;phone-number;email-address of all the suppliers it can find.
"""

import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl
import re


def hack_ssl():
    """ ignores the certificate errors"""
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE
    return ctx


def open_url(url):
    """ reads url file as a big string and cleans the html file to make it
        more readable. input: url, output: soup object
    """
    ctx = hack_ssl()
    html = urllib.request.urlopen(url, context=ctx).read()
    soup = BeautifulSoup(html, 'html.parser')
    return soup


def read_hrefs(soup):
    """ get from soup object a list of anchor tags,
        get the href keys and and prints them. Input: soup object
    """
    reflist = []
    tags = soup('a')
    for tag in tags:
        reflist.append(tag)
    return reflist

def read_li(soup):
    lilist = []
    tags = soup('li')
    for tag in tags:
        lilist.append(tag)
    return lilist

def get_phone(info):
    reg = r"(?:(?:00|\+)?[0-9]{4})?(?:[ .-][0-9]{3}){1,5}"
    phone = [s for s in filter(lambda x: 'Telefoon' in str(x), info)]
    try:
        phone = str(phone[0])
    except:
        phone = [s for s in filter(lambda x: re.findall(reg, str(x)), info)]
        try:
            phone = str(phone[0])
        except:
            phone = ""   
    return phone.replace('Facebook', '').replace('Telefoon:', '')

def get_email(soup):
    try: 
        email = [s for s in filter(lambda x: '@' in str(x), soup)]
        email = str(email[0])[4:-5]
        bs = BeautifulSoup(email, features="html.parser")
        email = bs.find('a').attrs['href'].replace('mailto:', '')
    except:
        email = ""
    return email

def remove_html_tags(text):
    """Remove html tags from a string"""
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

def fetch_sidebar(soup):
    """ reads html file as a big string and cleans the html file to make it
        more readable. input: html, output: tables
    """
    sidebar = soup.findAll(attrs={'class': 'sidebar'})
    return sidebar[0]

def extract(url):
    text = str(url)
    text = text[26:].split('"')[0] + "/"
    return text


# print ('fetch urls')
# url = "https://sport050.nl/sportaanbieders/alle-aanbieders/"
# s = open_url(url)
# reflist = read_hrefs(s)

# print ('getting sub-urls')
# sub_urls = [s for s in filter(lambda x: '<a href="/sportaanbieders' in str(x), reflist)]
# sub_urls = sub_urls[3:]

# print ('extracting the data')
# print (f'{len(sub_urls)} sub-urls')

# for sub in sub_urls:
#     try:
#         sub = extract(sub)
#         site = url[:-16] + sub
#         soup = open_url(site)    
#         info = fetch_sidebar(soup)
#         info = read_li(info)
#         phone = get_phone(info)
#         phone = remove_html_tags(phone).strip()
#         email = get_email(info)
#         email = remove_html_tags(email).replace("/","")
#         print (f'{site} ; {phone} ; {email}')
#     except Exception as e:
#         print (e)
#         exit()

    

fetch urls
getting sub-urls
extracting the data
415 sub-urls
https://sport050.nl/sportaanbieders/3x3-unites/ ; 06-23929416 ; a.einolhagh@live.nl
https://sport050.nl/sportaanbieders/40up-fitness/ ; 06-81484254 ; info@40upfitness.nl
https://sport050.nl/sportaanbieders/5ersport/ ;  ; 5ersporten@gmail.com?subject=Bericht%20via%20Sport050.nl%20
https://sport050.nl/sportaanbieders/s-exstudiantes/ ;  ; 
https://sport050.nl/sportaanbieders/agsr-gyas/ ; + 050 526 7445 ; 
https://sport050.nl/sportaanbieders/aihato/ ; 06-40559702 ; info@aihato.nl
https://sport050.nl/sportaanbieders/atc75/ ;  ; 
https://sport050.nl/sportaanbieders/active-movement/ ;  ; http:info@activemovement.nl
https://sport050.nl/sportaanbieders/adonis-sportacrobatiek/ ;  ; bestuuradonissportacrobatiek@gmail.com
https://sport050.nl/sportaanbieders/akwaak/ ;  ; 
https://sport050.nl/sportaanbieders/allround-crossfit-groningen/ ;  ; info@allroundcrossfit.nl
https://sport050.nl/sportaanbieders/Alumni-Sportclub-Exstudiantes/ ;  ; 
h

https://sport050.nl/sportaanbieders/gsrv-de-graancirkel/ ; 06-34268127 ; secretaris@degraancirkel.nl
https://sport050.nl/sportaanbieders/gsvb-de-groene-uilen-moestasj/ ;  ; info@groeneuilenmoestasj.nl
https://sport050.nl/sportaanbieders/gsvv-donitas/ ;  ; info@donitas.nl
https://sport050.nl/sportaanbieders/gsvv-tjas/ ; 06-13562447 ; 
https://sport050.nl/sportaanbieders/gswc-bares/ ;  ; 
https://sport050.nl/sportaanbieders/gszc-de-walvisch/ ;  ; 
https://sport050.nl/sportaanbieders/gv-groen-geel/ ;  ; 
https://sport050.nl/sportaanbieders/gdvv-martinistad/ ;  ; 
https://sport050.nl/sportaanbieders/gltb/ ; 050 - 526 23 68 ; 
https://sport050.nl/sportaanbieders/go180-personal-trainers/ ; 050-2110600 ; 
https://sport050.nl/sportaanbieders/grc-groningen/ ; 050 - 52 60 649 ; 
https://sport050.nl/sportaanbieders/gsav-vitalis/ ; .com ; 
https://sport050.nl/sportaanbieders/gsbv-tweeslag/ ;  ; bestuur@tweeslag.nl
https://sport050.nl/sportaanbieders/gsr-aegir/ ; 050-3124666 ; 
https://sport050.nl/

https://sport050.nl/sportaanbieders/la-danse/ ;  ; 
https://sport050.nl/sportaanbieders/lacrosse-groningen-gladiators/ ; 06-45367278 ; info@lacrossegroningen.nl
https://sport050.nl/sportaanbieders/leefstijl050/ ;  ; info@leefstijl050.nl
https://sport050.nl/sportaanbieders/life-beat/ ; 06 23376388 ; info@life-beat.nl
https://sport050.nl/sportaanbieders/loopgroep-astrea/ ;  ; 
https://sport050.nl/sportaanbieders/loopgroep-groningen/ ;  ; secretaris@loopgroepgroningen.nl
https://sport050.nl/sportaanbieders/loopgroep-grunn/ ; 06-22564473 ; info@loopgroepgrunn.nl
https://sport050.nl/sportaanbieders/loopplezier/ ;  ; info@loopplezier.nl
https://sport050.nl/sportaanbieders/louisa-martial-arts/ ;  ; 
https://sport050.nl/sportaanbieders/lycurgus-volleybal-vereniging/ ;  ; 
https://sport050.nl/sportaanbieders/mars-vechtschool-voor-middeleeuwse-krijgskunsten-noord-nederland/ ; Telefoonnummer: +31633748759 ; info@mars-swordfighting.com
https://sport050.nl/sportaanbieders/mars-vechtschool/ ;  ; inf

https://sport050.nl/sportaanbieders/sergeij-barthelt/ ;  ; 
https://sport050.nl/sportaanbieders/shorttrack-groningen/ ;  ; http:shorttrack.groningen@gmail.com
https://sport050.nl/sportaanbieders/shotokan-karate-noord-nederland/ ;  ; info@shotokankaratenoordnederland.nl
https://sport050.nl/sportaanbieders/simply-bmx/ ;  ; 
https://sport050.nl/sportaanbieders/slim-leven-vinkhuizen/ ;  ; slimleven@wij.groningen.nl
https://sport050.nl/sportaanbieders/sloeproeiers-groningen/ ; 06-13127149 ; bestuur@sloeproeiers.nl
https://sport050.nl/sportaanbieders/smash-block-volleybal-vereniging/ ;  ; smashandblocktc@gmail.com
https://sport050.nl/sportaanbieders/smashing/ ;  ; 
https://sport050.nl/sportaanbieders/snooker-academie-groningen/ ;  ; 
https://sport050.nl/sportaanbieders/snookervereniging-groningen/ ;  ; info@snookergroningen.nl
https://sport050.nl/sportaanbieders/snowlimits-ski-outdoorcentrum/ ; 050 - 5892039 ; info@snowlimits.nl
https://sport050.nl/sportaanbieders/sound-of-walking/ ; 050-314

https://sport050.nl/sportaanbieders/zwemvereniging-ten-boer/ ;  ; secretariaat@zwemverenigingtenboer.nl
https://sport050.nl/sportaanbieders/bij-spruit/ ;  ; info@bijspruit.nl
https://sport050.nl/sportaanbieders/slandsweerbaarheid/ ;  ; 
https://sport050.nl/sportaanbieders/sc-stadspark/ ; 06-54635327 ; 
https://sport050.nl/sportaanbieders/sv-blauw-geel-1915/ ; 050-54 22 718 ; 
https://sport050.nl/sportaanbieders/vv-oosterparkers/ ; Tel: 050 - 314 04 07 ; 
https://sport050.nl/sportaanbieders/vv-potetos/ ;  ; 
https://sport050.nl/sportaanbieders/vv-groningen/ ; 050 364 6643 ; 
https://sport050.nl/sportaanbieders/vv-groninger-boys/ ;  ; 


In [None]:
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl
import re

In [2]:
class Crawler:
    """ 
    This is a crawler program using beautifulsoup.
    It crawls the website "https://sport050.nl/sportaanbieders/alle-aanbieders/"
    and fetches all the sport suppliers in the city of Groningen. It outputs 
    a csv-file with the url;phone-number;email-address of all the suppliers it can find.
    """

    def __init__(self):
        pass
    
    def hack_ssl(self):
        """ ignores the certificate errors"""
        ctx = ssl.create_default_context()
        ctx.check_hostname = False
        ctx.verify_mode = ssl.CERT_NONE
        return ctx


    def open_url(self, url):
        """ reads url file as a big string and cleans the html file to make it
            more readable. input: url, output: soup object
        """
        ctx = hack_ssl()
        html = urllib.request.urlopen(url, context=ctx).read()
        soup = BeautifulSoup(html, 'html.parser')
        return soup


    def read_hrefs(self,soup):
        """ get from soup object a list of anchor tags,
            get the href keys and and prints them. Input: soup object
        """
        reflist = []
        tags = soup('a')
        for tag in tags:
            reflist.append(tag)
        return reflist

    def read_li(self,soup):
        lilist = []
        tags = soup('li')
        for tag in tags:
            lilist.append(tag)
        return lilist

    def get_phone(self,info):
        reg = r"(?:(?:00|\+)?[0-9]{4})?(?:[ .-][0-9]{3}){1,5}"
        phone = [s for s in filter(lambda x: 'Telefoon' in str(x), info)]
        try:
            phone = str(phone[0])
        except:
            phone = [s for s in filter(lambda x: re.findall(reg, str(x)), info)]
            try:
                phone = str(phone[0])
            except:
                phone = ""   
        return phone.replace('Facebook', '').replace('Telefoon:', '')

    def get_email(self,soup):
        try: 
            email = [s for s in filter(lambda x: '@' in str(x), soup)]
            email = str(email[0])[4:-5]
            bs = BeautifulSoup(email, features="html.parser")
            email = bs.find('a').attrs['href'].replace('mailto:', '')
        except:
            email = ""
        return email

    def remove_html_tags(self,text):
        """Remove html tags from a string"""
        clean = re.compile('<.*?>')
        return re.sub(clean, '', text)

    def fetch_sidebar(self,soup):
        """ reads html file as a big string and cleans the html file to make it
            more readable. input: html, output: tables
        """
        sidebar = soup.findAll(attrs={'class': 'sidebar'})
        return sidebar[0]

    def extract(self,url):
        text = str(url)
        text = text[26:].split('"')[0] + "/"
        return text   
    
    sub_urls = [s for s in filter(lambda x: '<a href="/sportaanbieders' in str(x), reflist)]
    sub_urls = sub_urls[3:]
    def crawl_site(self, sub_urls):
        for sub in sub_urls:
            try:
                sub = self.extract(sub)
                site = url[:-16] + sub
                soup = self.open_url(site)    
                info = self.fetch_sidebar(soup)
                info = self.read_li(info)
                phone = self.get_phone(info)
                phone = self.remove_html_tags(phone).strip()
                email = self.get_email(info)
                email = self.remove_html_tags(email).replace("/","")
                return f'{site} ; {phone} ; {email}'
            except Exception as e:
                return e
                exit()


NameError: name 'reflist' is not defined