# Neubaukompass DF creation & updater

### This script is intended to run once a day as a scheduled (cron) job

In [1]:
import os
import requests
import pandas as pd
import numpy as np
from IPython.display import clear_output
pd.options.display.max_rows = 999
import bs4 as bs
from datetime import datetime as date

# Date stamp used in the CSV file name. `date` is the `datetime` class
# (imported under that alias), so strftime is available. Zero-padded
# YYYYMMDD keeps the stamp unambiguous: without padding, 2019-01-11 and
# 2019-11-01 would both become "2019111" and share one file.
today_string = date.today().strftime('%Y%m%d')

# Cities to scrape; each value is interpolated into the listing URL.
cities = ['duesseldorf','frankfurt', 'koeln', 'muenchen', 'hamburg', 'berlin', 'stuttgart']

# Load today's CSV if a previous run already wrote one, otherwise start from
# an empty frame with the expected columns.
# NOTE(review): when the file exists, a second run on the same day appends
# every scraped listing again on top of the loaded rows - confirm whether
# de-duplication is wanted.
csv_name = f'neubaukompass_{today_string}.csv'
exists = os.path.isfile(csv_name)

if exists:
    df_complete = pd.read_csv(csv_name, index_col=0)
else:
    df_complete = pd.DataFrame(columns = ['name', 'address', 'subtitle', 'price', 'size', 'completion', 'developer', 'city', 'image_location'])

#scrape cities, wrangle, clean, save    

def _clean_address(raw):
    """Flatten the multi-line address markup into one comma-separated line.

    Args:
        raw: Text content of the address paragraph, including the newlines
            and indentation of the page markup.

    Returns:
        The address with newlines turned into commas, indentation runs
        removed, and leading separators / trailing commas stripped.
    """
    text = raw.replace('\n', ',')
    text = text.replace('                    ', ' ')
    text = text.replace('      /', '')
    # A single lstrip over spaces, commas and non-breaking spaces removes the
    # whole leading run of separators in one pass.
    return text.lstrip('\xa0, ').rstrip(',')


def _parse_article(article, city):
    """Extract one listing from an <article> tag as a column -> value dict.

    Args:
        article: The bs4 tag of a single listing card.
        city: City slug the listing was found under.

    Returns:
        A dict keyed by the data-frame column names.

    Raises:
        AttributeError, IndexError: If an expected element is missing from
            the card (the page layout changed).
    """
    paragraphs = article.find_all('p', {'class': 'mb-2'})
    heading = article.find('h3', {'class':'mb-3 font-size-larger font-middle text-truncate w-100'})
    price = article.find_all('span', {'class':'d-block price ml-auto'})[0]
    area = article.find_all('span', {'class':'d-block area ml-auto'})[0]
    ready = article.find_all('span', {'class':'d-block ready ml-auto'})[0]
    developer = article.find_all('p', {'class': 'mb-0 text-truncate pl-3'})[0]
    images = article.find_all('img', {'class': 'img-fluid image-center'})

    return {
        'name': heading.text.strip(),
        # The second "mb-2" paragraph holds the address, the first the subtitle.
        'address': _clean_address(paragraphs[1].text),
        'subtitle': paragraphs[0].text.strip(),
        'price': price.text.strip(),
        'size': area.text,
        'completion': ready.text,
        'developer': developer.text,
        # Keep exactly one image per listing so a card with several images
        # still yields a single row; None when the card has no image.
        'image_location': 'https://neubaukompass.de/' + images[0]['src'] if images else None,
        'city': f'{city}',
    }


# Scrape listing pages 1-19 of every city and collect one dict per listing.
scraped_rows = []

for city in cities:
    for i in range(1,20):
        clear_output(wait=True)
        progress = (i/20)*100
        print(f'{city} is {progress}% done')
        print(f'getting page{i}')

        # The timeout keeps an unattended run from hanging forever on a
        # stalled connection; requests raises a Timeout exception instead.
        res = requests.get(f'https://www.neubaukompass.de/neubau-immobilien/{city}-region/{i}', timeout=30)
        # NOTE(review): no parser is named, so bs4 picks whichever is
        # installed - pin one once the parsed output has been compared.
        soup = bs.BeautifulSoup(res.text)

        for div in soup.find_all('div', {'class':'col-12 col-md-6 col-lg-4'}):
            article = div.find('article', {'class':'my-3 background-color-white'})
            if article is None:
                # Grid cell without a listing card - nothing to extract.
                continue
            scraped_rows.append(_parse_article(article, city))

# Build the frame once and concatenate once: DataFrame.append no longer exists
# in pandas 2.x, and appending row by row copied the whole frame every time.
if scraped_rows:
    df_scraped = pd.DataFrame(
        scraped_rows,
        columns=['name', 'address', 'subtitle', 'price', 'size', 'completion', 'developer', 'image_location', 'city'],
    )
    df_complete = pd.concat([df_complete, df_scraped], ignore_index=True)
            

# Developer names appear with varying suffixes; map every name that starts
# with a known prefix onto one canonical label.
# Each rule: (canonical label, prefix to look for, match on lower-cased name?)
_DEVELOPER_RULES = (
    ('AREAL', 'AREAL', False),
    ('Baustolz', 'Baustolz', False),
    ('Bonava', 'Bonava', False),
    ('BPD', 'BPD', False),
    ('Dussmann', 'Dussmann', False),
    ('Engel & Voelkers', 'engel &', True),
    ('FRANKONIA', 'FRANKONIA', False),
    ('Grossmann & Berger', 'Grossmann & Berger', False),
    ('HFH Immobilien', 'HFH Immobilien', False),
    ('LBS', 'LBS', False),
    ('Lüthen & Co.', 'Lüthen & ', False),
    ('PROJECT Immobilien', 'PROJECT Immobilien', False),
    ('Sparda Immobilien', 'Sparda Immobilien', False),
)


def _canonical_developer(name):
    """Return the canonical label for *name*, or *name* itself if no rule matches."""
    # Rules are applied one after another, in table order, to the running value.
    for label, prefix, lowercase_first in _DEVELOPER_RULES:
        candidate = name.lower() if lowercase_first else name
        if candidate.startswith(prefix):
            name = label
    return name


df_complete['developer'] = df_complete['developer'].apply(_canonical_developer)



import locale


def _range_minimum(text):
    """Return the lower-bound token of a range text.

    'ab <value> ...' yields the token after 'ab'; a text starting with a
    digit yields its first token; anything else is returned unchanged.
    Empty or single-token texts are returned as-is instead of raising
    IndexError.
    """
    tokens = text.split()
    if text.startswith('ab') and len(tokens) > 1:
        return tokens[1]
    if text[:1].isdigit():
        return tokens[0]
    return text


def _range_maximum(text, min_length):
    """Return the fourth whitespace token of *text* as the upper bound.

    The text must be at least *min_length* characters long and actually
    contain a fourth token; otherwise 'no_information' is returned. The
    token check prevents an IndexError on long texts with fewer than four
    tokens.
    """
    tokens = text.split()
    if len(text) >= min_length and len(tokens) > 3:
        return tokens[3]
    return 'no_information'


def _parse_if_numeric(text, parse):
    """Apply *parse* to texts that start with a digit; pass others through."""
    return parse(text) if text[:1].isdigit() else text


# Completion entries starting with "auf Anfrage" become NaN so that the
# fillna below turns them, like every other missing cell, into 'no_information'.
df_complete['completion'] = df_complete['completion'].apply(lambda x: np.nan if x.startswith('auf Anfrage') else x)
df_complete.fillna('no_information', inplace = True)

df_complete['min_price'] = df_complete['price'].apply(_range_minimum)
df_complete['max_price'] = df_complete['price'].apply(lambda x: _range_maximum(x, 15))
df_complete['size'] = df_complete['size'].map(lambda x: x.strip())
df_complete['min_size'] = df_complete['size'].apply(_range_minimum)
df_complete['max_size'] = df_complete['size'].apply(lambda x: _range_maximum(x, 13))

# The numbers are parsed with German locale conventions via locale.atoi/atof.
# setlocale raises locale.Error if de_DE.UTF-8 is not installed on the host.
locale.setlocale( locale.LC_ALL, 'de_DE.UTF-8' )
df_complete['min_price'] = df_complete['min_price'].map(lambda x: _parse_if_numeric(x, locale.atoi))
df_complete['max_price'] = df_complete['max_price'].map(lambda x: _parse_if_numeric(x, locale.atoi))
df_complete['min_size'] = df_complete['min_size'].map(lambda x: _parse_if_numeric(x, locale.atof))
df_complete['max_size'] = df_complete['max_size'].map(lambda x: _parse_if_numeric(x, locale.atof))
def year_extractor(cell):
    """Return the first run of four digits found in *cell*.

    Args:
        cell: Completion text of one listing. Non-string values (NaN, None,
            numbers) are converted with str() before searching instead of
            raising TypeError.

    Returns:
        The first four consecutive digits as a string, or 'no_information'
        when the cell contains none.
    """
    import re

    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    match = re.search(r'\d{4}', str(cell))
    if match is None:
        return 'no_information'
    return match.group(0)
# Derive the completion year from the free-text completion column.
completion_years = df_complete['completion'].map(year_extractor)
df_complete['completion_year'] = completion_years

# Write the full frame to today's CSV; utf-8-sig prepends a byte-order mark.
output_name = f'neubaukompass_{today_string}.csv'
df_complete.to_csv(output_name, encoding='utf-8-sig')

stuttgart is 95.0% done
getting page19
