In [1]:
from pyspark.sql.types import StringType, IntegerType, TimestampType, DateType, DoubleType, StructType, StructField
# ---------------------------------------------------------------------------
# Spark schemas, one per feed. Each schema is handed to spark.createDataFrame
# together with the pandas frame fetched for that feed, so field order here is
# the column order of the resulting DataFrame. The third argument of every
# .add(...) call is the field's nullable flag.
# ---------------------------------------------------------------------------

# Landlord feed
landlord_schema = (
    StructType()
    .add("Landlord_id", IntegerType(), nullable=False)
    .add("Password", StringType(), nullable=True)
    .add("Landlord_name", StringType(), nullable=False)
    .add("Address_line_1", StringType(), nullable=False)
    .add("City", StringType(), nullable=False)
    .add("Post_code", StringType(), nullable=True)
    .add("Region", StringType(), nullable=True)
    .add("event_time", StringType(), nullable=True)
)

# Building feed
building_schema = (
    StructType()
    .add("Building_id", IntegerType(), nullable=False)
    .add("Building_name", StringType(), nullable=True)
    .add("Landlord_id", IntegerType(), nullable=False)
    .add("Address_line_1", StringType(), nullable=False)
    .add("City", StringType(), nullable=False)
    .add("Post_code", StringType(), nullable=True)
    .add("Region", StringType(), nullable=True)
    .add("event_time", StringType(), nullable=True)
)

# Apartment feed
apartment_schema = (
    StructType()
    .add("Building_id", IntegerType(), nullable=True)
    .add("Apartment_number", IntegerType(), nullable=True)
    .add("Type", StringType(), nullable=True)
    .add("Rent_fee", StringType(), nullable=True)
    .add("Building_name", StringType(), nullable=True)
    .add("Appt_details", StringType(), nullable=True)
    .add("event_time", StringType(), nullable=True)
)

# Contractor feed
contractor_schema = (
    StructType()
    .add("Contract_id", IntegerType(), nullable=False)
    .add("Name", StringType(), nullable=True)
    .add("Address_line_1", StringType(), nullable=False)
    .add("City", StringType(), nullable=False)
    .add("Post_code", StringType(), nullable=True)
    .add("Region", StringType(), nullable=True)
)

# Tenant feed
tenant_schema = (
    StructType()
    .add("Tenant_id", IntegerType(), nullable=False)
    .add("First_name", StringType(), nullable=True)
    .add("Last_name", StringType(), nullable=False)
    .add("Ssn", StringType(), nullable=True)
    .add("Phone", StringType(), nullable=True)
    .add("Email", StringType(), nullable=True)
    .add("Mobile", StringType(), nullable=True)
)

# Lease feed
lease_schema = (
    StructType()
    .add("Lease_id", IntegerType(), nullable=False)
    .add("Start", StringType(), nullable=True)
    .add("End", StringType(), nullable=False)
    .add("Deposit", StringType(), nullable=True)
    .add("Tenant_id", IntegerType(), nullable=True)
    .add("Apartment_id", IntegerType(), nullable=True)
)

# Rent feed
rent_schema = (
    StructType()
    .add("Rent_id", IntegerType(), nullable=False)
    .add("Rent_fee", StringType(), nullable=True)
    .add("Late_fee", StringType(), nullable=False)
    .add("Due_date", StringType(), nullable=True)
    .add("Lease_id", IntegerType(), nullable=True)
    .add("Pay_id", IntegerType(), nullable=True)
)

# Payment feed
payment_schema = (
    StructType()
    .add("Payment_id", IntegerType(), nullable=False)
    .add("Pay_date", StringType(), nullable=True)
    .add("Pay_amount", StringType(), nullable=False)
    .add("Method", StringType(), nullable=True)
    .add("Rent_id", IntegerType(), nullable=True)
)

# Apartment maintenance feed
apt_maintenance_schema = (
    StructType()
    .add("Maintenance_id", IntegerType(), nullable=False)
    .add("Apartment_number", IntegerType(), nullable=True)
    .add("Mdate", StringType(), nullable=False)
    .add("Issue_reported", StringType(), nullable=True)
    .add("Contractor_id", IntegerType(), nullable=True)
    .add("Resolution", StringType(), nullable=True)
    .add("Status", StringType(), nullable=True)
    .add("Charges_incurred", StringType(), nullable=True)
)

# Building maintenance feed
building_maintenance_schema = (
    StructType()
    .add("Maintenance_id", IntegerType(), nullable=False)
    .add("Building_name", StringType(), nullable=True)
    .add("Ndate", StringType(), nullable=False)
    .add("Issue_reported", StringType(), nullable=True)
    .add("Contractor_id", IntegerType(), nullable=True)
    .add("Resolution", StringType(), nullable=True)
    .add("Status", StringType(), nullable=True)
)


In [2]:
import os

# Root DBFS folder of the project. The setup cell deletes this folder
# recursively before recreating the landing folders, so every *_Path below is
# derived from it to guarantee they all live underneath it.
project_Path = "/FileStore/apartment/"

# Mockaroo API key. Credentials should not live in the notebook: set the
# MOCKAROO_API_KEY environment variable on the cluster to override it.
# The literal fallback is kept only so existing runs keep working.
# TODO: rotate this key and drop the fallback once the variable is configured.
api_key = os.environ.get("MOCKAROO_API_KEY", "ebd69250")

_MOCKAROO_BASE_URL = "https://my.api.mockaroo.com/"


def _feed_url(endpoint):
    """Return the Mockaroo URL for `endpoint`, with the API key appended."""
    return _MOCKAROO_BASE_URL + endpoint + ".json?key=" + api_key


def _feed_path(feed):
    """Return the 'inprogress' landing folder for `feed` under project_Path."""
    return project_Path + feed + "/inprogress/"


landlord_url = _feed_url("landlord")
landlord_Path = _feed_path("landlord")

building_url = _feed_url("building")
building_Path = _feed_path("building")

apartment_url = _feed_url("apartment")
apartment_Path = _feed_path("apartment")

# The contractor endpoint is named "contractor_table"; its folder is "contractor".
contractor_url = _feed_url("contractor_table")
contractor_Path = _feed_path("contractor")

tenant_url = _feed_url("tenant")
tenant_Path = _feed_path("tenant")

lease_url = _feed_url("lease")
lease_Path = _feed_path("lease")

rent_url = _feed_url("rent")
rent_Path = _feed_path("rent")

payment_url = _feed_url("payment")
payment_Path = _feed_path("payment")

apartment_maintenance_url = _feed_url("apartment_maintenance")
apartment_maintenance_Path = _feed_path("apartment_maintenance")

building_maintenance_url = _feed_url("building_maintenance")
building_maintenance_Path = _feed_path("building_maintenance")


In [3]:
# Spark streaming reads from Kafka, Event Hubs, sockets (for testing) and
# files. Data coming from an API therefore has to be landed in one of those
# first; this notebook lands it as files.
#
# Reset the project folder, then create one landing folder per feed for the
# CSV files.
dbutils.fs.rm(project_Path, recurse=True)

for landing_dir in (
    landlord_Path,
    building_Path,
    apartment_Path,
    contractor_Path,
    tenant_Path,
    lease_Path,
    rent_Path,
    payment_Path,
    apartment_maintenance_Path,
    building_maintenance_Path,
):
  dbutils.fs.mkdirs(landing_dir)

In [4]:
# Save csv file from api url
def getCSV_FromUrl(url, schema, path_to_save):
  """Fetch `url`, stamp the rows with a fetch time and land them as one CSV file.

  The data is read with pandas, converted to a Spark DataFrame using `schema`,
  extended with a `fetch_time` column and written (single partition, with
  header) to a temporary `<path_to_save><timestamp>.tmp` folder. The CSV part
  file is then copied into `path_to_save` and the temporary folder is removed.

  Args:
    url: location passed to `pd.read_csv`.
    schema: Spark schema applied to the fetched rows.
    path_to_save: destination folder; expected to end with "/" because file
      names are appended to it directly.

  Returns:
    The path of the copied CSV file, or '' if the write produced no CSV file.
  """
  # NOTE(review): the URLs configured for this notebook end in ".json" but are
  # parsed with read_csv here - confirm the endpoints really return CSV.
  df = spark.createDataFrame(pd.read_csv(url, lineterminator='\n'), schema)

  fetched_at = datetime.datetime.fromtimestamp(time.time())
  df_with_batch = df.withColumn("fetch_time", lit(fetched_at.strftime('%Y_%m_%d_%H_%M_%S')))
  tmp_dir = path_to_save + fetched_at.strftime('%Y_%m_%d_%H_%M') + '.tmp'

  csvFileLocation = ''
  try:
    df_with_batch.coalesce(1).write.format("com.databricks.spark.csv") \
      .option("header", True) \
      .option('quote', '"')  \
      .save(tmp_dir)  #saved to the FileStore

    for fileInfo in dbutils.fs.ls(tmp_dir):
      # Match on the file-name suffix, not a substring of the whole path, so
      # side files such as "*.csv.crc" are never picked up.
      if fileInfo.name.endswith(".csv"):
        print("this file is csv file.." )
        print(fileInfo.path)
        csvFileLocation = path_to_save + fileInfo.name
        # Copy to the explicit file path rather than to the folder.
        dbutils.fs.cp(fileInfo.path, csvFileLocation)
        print(csvFileLocation)
  finally:
    # Always drop the temporary folder - also when no CSV was found or a step
    # failed - so a leftover folder cannot break a later write to the same
    # minute-stamped path.
    dbutils.fs.rm(tmp_dir, recurse=True)

  return csvFileLocation

In [5]:
import schedule
import time
import requests
import datetime
import pandas as pd
from pyspark.sql.functions import lit
 
def job():
  """Fetch every feed once and land each one as a CSV file in its folder.

  Feeds are processed in the order listed below; an exception raised by any
  fetch propagates to the caller and the remaining feeds are skipped.
  """
  print("calling CSV load function")
  feeds = (
    (landlord_url, landlord_schema, landlord_Path),
    (building_url, building_schema, building_Path),
    (apartment_url, apartment_schema, apartment_Path),
    (contractor_url, contractor_schema, contractor_Path),
    (tenant_url, tenant_schema, tenant_Path),
    (lease_url, lease_schema, lease_Path),
    # These two used to reference rent_pre_schema / payment_pre_schema, which
    # are not among the schemas declared for this notebook (NameError).
    (rent_url, rent_schema, rent_Path),
    (payment_url, payment_schema, payment_Path),
    (apartment_maintenance_url, apt_maintenance_schema, apartment_maintenance_Path),
    (building_maintenance_url, building_maintenance_schema, building_maintenance_Path),
  )
  for url, schema, path_to_save in feeds:
    getCSV_FromUrl(url, schema, path_to_save)


In [6]:
# Entry point: register job() to run every 30 seconds, then keep polling the
# scheduler. This cell never returns; interrupt or cancel it to stop ingestion.
ingest_interval = schedule.every(30).seconds
ingest_interval.do(job)

while True:
    # run_pending() executes whichever registered jobs are due right now.
    schedule.run_pending()
    # Poll once per second.
    time.sleep(1)