<a href="https://colab.research.google.com/github/miriammazzeo95/BigData_and_Timeseries_in_Pyspark/blob/main/API_BBR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# **Set Up Pyspark, Imports and Functions**

In [7]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Optional: uncomment the line below to change the working directory to the project folder on Drive
# %cd /content/drive/MyDrive/Colab\ Notebooks/Big\ Data\ with\ Pyspark

/content/drive/MyDrive/Colab Notebooks/Big Data with Pyspark


Import the PySpark configuration notebook from Colab as a Python script

In [37]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# change directory to content
%cd /
%cd content

# get sharinglink from Colab notebook to be imported
# https://colab.research.google.com/drive/1YGZCoCGw632dFe_yxAViQYIsXnFKqEdA?usp=sharing
colb_script_id = '1YGZCoCGw632dFe_yxAViQYIsXnFKqEdA'
link_to_file_in_drive = drive.CreateFile({'id':colb_script_id})

link_to_file_in_drive.GetContentFile('Pyspark_Configuration_Imports_Functions.ipynb') # creates a local copy in the local content folder
!rm Pyspark_Configuration_Imports_Functions.py # deletes previouse version copy if any
!jupyter nbconvert --to python 'Pyspark_Configuration_Imports_Functions.ipynb' # converts the local copy from notebook to .py
!rm Pyspark_Configuration_Imports_Functions.ipynb # deletes the local copy

/
/content
[NbConvertApp] Converting notebook Pyspark_Configuration_Imports_Functions.ipynb to python
[NbConvertApp] Writing 13117 bytes to Pyspark_Configuration_Imports_Functions.py


In [38]:
ls 

adc.json                                    [0m[01;34msample_data[0m/
[01;34mdrive[0m/                                      [01;34mspark-3.0.0-bin-hadoop3.2[0m/
[01;34m__pycache__[0m/                                spark-3.0.0-bin-hadoop3.2.tgz
Pyspark_Configuration_Imports_Functions.py  spark-3.0.0-bin-hadoop3.2.tgz.1
Pyspark_ConfigurationImportsFunctions.py    spark-3.0.0-bin-hadoop3.2.tgz.2


In [39]:
# Import the converted script as a module and list the names it defines.
import Pyspark_Configuration_Imports_Functions as pyspark_config # binds the module to an alias; names are reached as pyspark_config.<name>
dir(pyspark_config)

/


['ArrayType',
 'BinaryType',
 'BooleanType',
 'ByteType',
 'DataFrame',
 'DataType',
 'DateType',
 'DecimalType',
 'DoubleType',
 'FloatType',
 'IntegerType',
 'LongType',
 'MAXYEAR',
 'MINYEAR',
 'MapType',
 'NullType',
 'PandasUDFType',
 'ShortType',
 'SparkSession',
 'StringType',
 'StructField',
 'StructType',
 'TimestampType',
 'UserDefinedFunction',
 'WRAPPER_ASSIGNMENTS',
 'WRAPPER_UPDATES',
 'Window',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 'abs',
 'acos',
 'add_index_column',
 'add_months',
 'approxCountDistinct',
 'approx_count_distinct',
 'array',
 'array_contains',
 'array_distinct',
 'array_except',
 'array_intersect',
 'array_join',
 'array_max',
 'array_min',
 'array_position',
 'array_remove',
 'array_repeat',
 'array_sort',
 'array_union',
 'arrays_overlap',
 'arrays_zip',
 'asc',
 'asc_nulls_first',
 'asc_nulls_last',
 'ascii',
 'asin',
 'atan',
 'atan2',
 'avg',
 'base64',
 'basestring',
 'bin

# **Building Register**

# **Define BBR Class**

In [40]:
from collections import defaultdict
import pandas as pd
import requests
import json
import numpy as np


### === BBR === ###
class BBR:
    '''
    Class to collect information about addresses from BBR (Bygnings- og Boligregistret)

    Args:
        addresses: iterable of address strings to look up.
        timeout: seconds to wait for each HTTP response (default 30), so a
            stalled connection cannot hang the notebook indefinitely.

    Functions:
        grab_id(address):
            Returns the id of the first address match, or '' when there is none.
        collect_coords(address):
            Returns [lon, lat] of the first address match, or ('', '') when
            the coordinates cannot be read.
        collect_items():
            Looks up every address and creates a pandas DataFrame with one row
            per address: 'Address', 'Lat', 'Lon' and one column per BBR item.

    '''

    def __init__(self, addresses, timeout=30):
        # Address URL
        self.ad_url = "https://dawa.aws.dk/adresser?q="
        # ID URL
        self.id_url = "https://dawa.aws.dk/bbrlight/enheder?adresseid="
        # BBR Items
        self.items = ['BYG_ANVEND_KODE', 'BYG_ARL_SAML', 'BYG_BOLIG_ARL_SAML', 'BYG_BEBYG_ARL', 'ERHV_ARL_SAML', 'GARAGE_INDB_ARL', 'CARPORT_INDB_ARL', 'UDHUS_INDB_ARL', 'UDESTUE_ARL',
                      'OPFOERELSE_AAR', 'OMBYG_AAR', 'VARMEINSTAL_KODE', 'OPVARMNING_KODE', 'VARME_SUPPL_KODE', 'BYG_VANDFORSY_KODE', 'ETAGER_ANT', 'ETAGER_AFVIG_KODE', 'YDERVAEG_KODE', 'TAG_KODE']
        # Addresses parsed
        self.adds = addresses
        # Per-request timeout in seconds
        self.timeout = timeout

    # single place where the network is touched

    def _fetch_json(self, url):
        '''GET `url` and return the decoded JSON body.'''
        res = requests.get(url, timeout=self.timeout)
        return json.loads(res.content)

    # pull the id out of an already-fetched address response

    @staticmethod
    def _extract_id(raw_addr):
        '''Return the id of the first match in `raw_addr`, or '' if unavailable.'''
        try:
            return raw_addr[0]['id']
        # KeyError/TypeError cover responses that are not a list of matches
        # (e.g. a JSON error object) or matches with missing fields.
        except (IndexError, KeyError, TypeError):
            return ''

    # pull the coordinates out of an already-fetched address response

    @staticmethod
    def _extract_coords(raw_addr):
        '''Return [lon, lat] of the first match in `raw_addr`, or ('', '') if unavailable.'''
        try:
            coords = raw_addr[0]['adgangsadresse']['adgangspunkt']['koordinater']
            return [coords[0], coords[1]]
        except (IndexError, KeyError, TypeError):
            return '', ''

    # get the address id

    def grab_id(self, address):
        '''Query the address endpoint and return the first match's id ('' if none).'''
        return self._extract_id(self._fetch_json(self.ad_url + address))

    # get the coordinates

    def collect_coords(self, address):
        '''Query the address endpoint and return [lon, lat] (('', '') if unavailable).'''
        return self._extract_coords(self._fetch_json(self.ad_url + address))

    # Collect items specified from BBR

    def collect_items(self):
        ''' Uses the list of addresses and the list of items to be pulled from
        BBR and creates a dataframe with these.

        Returns:
            pandas DataFrame with columns 'Address', 'Lat', 'Lon' followed by
            one column per entry of self.items, one row per address. Items
            that cannot be read are NaN; coordinates that cannot be read are ''.
        '''

        root = 'bygning'
        columns = ['Address', 'Lat', 'Lon'] + list(self.items)
        # Pre-create every column so an empty address list still yields a
        # frame with the expected columns.
        data = {column: [] for column in columns}

        for address in self.adds:
            # One address lookup serves both the id and the coordinates.
            raw_addr = self._fetch_json(self.ad_url + address)
            lon, lat = self._extract_coords(raw_addr)
            address_id = self._extract_id(raw_addr)

            building = None
            # Skip the unit lookup when the address was not resolved: querying
            # with an empty adresseid cannot return data for this address.
            if address_id:
                raw = self._fetch_json(self.id_url + address_id)
                try:
                    building = raw[0][root]
                except (IndexError, KeyError, TypeError):
                    building = None
            if not isinstance(building, dict):
                building = {}

            # Append exactly one value to every column per address, so the
            # columns can never end up with different lengths.
            data['Address'].append(address)
            data['Lat'].append(lat)
            data['Lon'].append(lon)
            for item in self.items:
                data[item].append(building.get(item, np.nan))

        return pd.DataFrame(data, columns=columns)
        

In [None]:
# FOR CONVERTING PANDAS-DF IN OUTPUT FROM BBR FUNCTION TO SPARK DF        
def BBR_fix_dtypes(df):
    """Normalise the dtypes of a BBR pandas DataFrame.

    Missing values are replaced with '', object columns are re-inferred,
    'Address' is cast to the pandas string dtype, and the code columns are
    cast to their target dtypes. Code columns that cannot be cast are left
    unchanged (errors='ignore').

    Args:
        df: pandas DataFrame containing 'Address' and the BBR code columns.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    code_dtypes = {
        'VARMEINSTAL_KODE': 'int64',
        'OPVARMNING_KODE': 'string',
        'VARME_SUPPL_KODE': 'string',
        'YDERVAEG_KODE': 'int64',
        'TAG_KODE': 'int64',
    }
    filled = df.fillna('').infer_objects()
    with_address = filled.astype({'Address': 'string'})
    return with_address.astype(code_dtypes, errors='ignore')