# Abfrage einer [Wohnungsseite](https://inberlinwohnen.de/wohnungsfinder)

Date: 2023-01-16

In [None]:
import requests
import re

# URL of the apartment-finder page that is queried.
myurl='https://inberlinwohnen.de/wohnungsfinder/'

# requests applies no timeout unless one is given, so an unresponsive server
# would block this cell forever; bound the wait explicitly.
response = requests.get(myurl, timeout=30)
# Fail loudly on 4xx/5xx answers instead of scraping an error page.
response.raise_for_status()

In [None]:
# One match per listing: group(2) is the text between `<li ` and the class
# attribute (the `id="flat_..."` attribute in the sample below), group(4) is
# the heading markup up to the closing `</span></span></h3>`.
html_regex = r"(<li\s)+(.+?)(\sclass=\"tb-merkflat ipg\">)([\S\s]+?)(\s?<\/span><\/span><\/h3>)"

# Sample of the markup the expressions in this cell were written against:
#test_str = "blala<li id=\"flat_1245972\" class=\"tb-merkflat ipg\"><h3 id=\"mfh1245972\"><span><span class=\"_tb_left\"><strong>4</strong> Zimmer, <strong>87,31</strong>  m², <strong>556,64</strong>  &euro; | Lichtenberger Straße 6, Mitte</span><span class=\"_tb_right\"><a title=\"Balkon/Terasse\" class=\"icon nohand2 balkon_loggia_terrasse\"></a>bla</a>"

matches = re.finditer(html_regex, response.text)
# Captures whatever follows `flat_` inside the quoted id attribute.
cleanid_regex = r"\"flat_(.*)\""

# (pattern, replacement) pairs applied in order to turn the heading markup
# into a flat, `; `-separated string that parse_regex can read.
cleanspecs_regex = [
    (r"\n", ''),                                            # drop line breaks
    (r"<h3\sid=.+?><span><span class=\"_tb_left\">", ''),  # strip the heading opener
    (r"<\/span><span class=\"_tb_right\">", '-#'),         # marks the end of the location
    (r"(<\/strong>|<strong>)", ''),
    (r"class=\"icon\s?(.*?)\snohand2\s?(.*?)\"><\/a>", ''),  # drop icon classes, keep titles
    (r"<a title=", '; '),
    (r",\s", '; '),
    (r"\s;\s", '; '),
    (r"\s\|\s", ' #-'),                                     # marks the start of the location
    (r"&euro", '€'),
    (r"\"", ''),
    (r"\s{2}", ' '),
    (r"\s$", '')
]

# Groups 1-3: rooms, area, price; group 4: the text between the `#-` and `-#`
# markers; groups 5-10: optional feature titles, accepted only in this order.
# NOTE(review): the title spellings ('Balkon/Terasse' appears in the sample
# above, 'Barierrefrei') are presumably copied from the site - verify against
# the live markup before "correcting" them.
parse_regex = r"(.+?)\sZimmer;\s(.+?)\sm²;\s(.+?)\s€;\s#-(.+?)-#;?\s?(Balkon/Terasse)?;?\s?(Aufzug)?;?\s?(Keller)?;?\s?(Barierrefrei)?;?\s?(Wohnberechtigungsschein)?;?\s?(Gäste-WC)?"

# Maps flat id -> dict of parsed properties.
new_dict = {}
for matchNum, match in enumerate(matches, start=1):
    # search() rather than match(): the id may sit anywhere inside group(2).
    id_match = re.search(cleanid_regex, match.group(2))
    if id_match is None:
        # Skip instead of crashing with AttributeError on `None.group`.
        print(f"{matchNum}: no flat id found, entry skipped")
        continue
    clean_id = id_match.group(1)

    clean_specs = match.group(4)
    for (sub, rep) in cleanspecs_regex:
        clean_specs = re.sub(sub, rep, clean_specs)

    parsing = re.match(parse_regex, clean_specs)
    if parsing is None:
        # A listing in an unexpected format must not abort the whole run.
        print(f"{matchNum}: {clean_id} -- could not parse: {clean_specs}")
        continue

    parse_dict = {
        # Numbers use a decimal comma; the price additionally drops '.'
        # (thousands separator) before converting.
        'Zimmer': float(parsing.group(1).replace(',', '.')),
        'QM': float(parsing.group(2).replace(',', '.')),
        'Preis': float(parsing.group(3).replace('.', '').replace(',', '.')),
        'Ort': parsing.group(4),
        # An optional group is None when the feature title was not present.
        'Balkon': parsing.group(5) is not None,
        'Aufzug': parsing.group(6) is not None,
        'Keller': parsing.group(7) is not None,
        'Barrierefrei': parsing.group(8) is not None,
        'WBS': parsing.group(9) is not None,
        'Gäste-WC': parsing.group(10) is not None
    }
    new_dict[clean_id] = parse_dict

In [None]:
# Reference snapshot of the listings; starts out empty.
ref_dict = dict()
print(f"Reference Dictionary entries: {len(ref_dict)}")
# Differing key sets mean listings appeared or vanished: adopt the new result.
if ref_dict.keys() != new_dict.keys():
    ref_dict = new_dict
    print(f"Reference Dictionary entries after update: {len(ref_dict)}")
    # Keep only the flats flagged 'WBS' that are larger than 60 m².
    ref_query = {
        flat_id: flat
        for flat_id, flat in ref_dict.items()
        if flat['WBS']
        if flat['QM'] > 60.0
    }
    print(f"Reference Dictionary entries, queried: {len(ref_query)}")

In [None]:
import time
import datetime

# Print a numbered timestamp, then pause in one-second steps before the next
# round. The whole loop sits inside the try so Ctrl-C is handled no matter
# which statement is running when it arrives.
try:
    for counter in range(1,600):
        now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        print(f"{counter:03} - {now}")
        # Separate name so the pause loop no longer overwrites `counter`.
        for _tick in range(1,60):
            time.sleep(1)
except KeyboardInterrupt as err:
    print(f"System halted: {err}")
    # exit() is a helper the site module provides for interactive sessions;
    # raising SystemExit is the equivalent that does not depend on it.
    raise SystemExit


### Mengenoperationen

In [None]:
# Baseline mapping with three entries.
dict_1 = dict(name1='dog', name2='elephant', name3='cat')

# Baseline plus one extra entry ('name4').
dict_2 = dict(name1='dog', name2='elephant', name3='cat', name4='dog')


# Gains 'name4' and loses 'name3' relative to the baseline.
dict_3 = dict(name1='dog', name2='elephant', name4='dog')

# Merge dict_1 and dict_3 into a new dict; on shared keys dict_3's value wins.
union_dict = {**dict_1, **dict_3}
union_dict

# set operations (these also work on dict.keys() views)
# s1 | s2: union of s1 and s2, returns the elements of both s1 and s2 without repetition.
# s1 & s2: intersection of s1 and s2, returns the elements that are present in both sets.
# s1 - s2: set difference, returns the elements that are in s1 but not in s2.
# s1 ^ s2: symmetric difference, returns the elements that are in exactly one of the two sets.

def print_ops(dict_A, dict_B):
    """Print both dicts and the results of the set operations on their keys.

    Values are looked up in a merge of the two arguments (on shared keys the
    value from ``dict_B`` wins), so the function works for any pair of dicts
    and no longer depends on a module-level helper dict.

    Args:
        dict_A: first mapping.
        dict_B: second mapping.
    """
    # Same semantics as dict_A.copy() followed by .update(dict_B).
    merged = {**dict_A, **dict_B}
    keys_a = dict_A.keys()
    keys_b = dict_B.keys()

    def pick(keys):
        # Iterate `merged` (not `keys`) so the output order is deterministic.
        return {k: v for k, v in merged.items() if k in keys}

    print('Dict A: ', dict(dict_A))
    print('Dict B: ', dict(dict_B))
    print('#'*40)
    print('Union: ', pick(keys_a | keys_b))
    print('Intersection: ', pick(keys_a & keys_b))
    print('Difference A: ', pick(keys_a - keys_b))
    print('Difference B: ', pick(keys_b - keys_a))
    print('Symmetric Difference: ', pick(keys_a ^ keys_b))

# Demo run: dict_2 holds every entry of dict_1 plus one additional key.
print_ops(dict_1, dict_2)


### Sending emails
source: [link](https://betterdatascience.com/send-emails-with-python/)

I had issues with conformity to email standards when using TLS as described in this [hint](https://realpython.com/python-send-email/).

In [None]:

from email.message import EmailMessage
import smtplib, ssl
import getpass

EMAIL_ADDRESS = "trismegistus@die-optimisten.net"
# getpass prompts without echoing the typed password.
EMAIL_PASSWORD = getpass.getpass("Type your password and press enter:")

port = 465
smtp_server = "mail.gmx.net"

# Build a test message addressed from the account to itself.
msg = EmailMessage()
msg['Subject'] = 'This is my first Python email'
msg['From'] = EMAIL_ADDRESS
msg['To'] = EMAIL_ADDRESS
msg.set_content('And it actually works')

# Pass an explicit default context so the server certificate and host name
# are verified before the credentials are sent.
tls_context = ssl.create_default_context()
with smtplib.SMTP_SSL(smtp_server, port, context=tls_context) as smtp:
    smtp.login(EMAIL_ADDRESS, EMAIL_PASSWORD)
    smtp.send_message(msg)
# Tip: msg.as_string() shows the raw message for debugging.