In [1]:
from pyspark.sql import functions as F
from pyspark.sql.types import StructType,StructField, StringType, DoubleType, DateType,TimestampType
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql import Row
from pyspark import SQLContext
import json 
from datetime import datetime, timedelta
import math
import os
from IPython.core.display import display, HTML
from datetime import datetime
import pyspark.conf as conf
from IPython.display import clear_output

# NOTE(review): `conf` is the imported `pyspark.conf` module, so this line only sets an
# attribute on that module object; no Spark configuration API is called here. Presumably
# the intent was to disable broadcast joins (spark.sql.autoBroadcastJoinThreshold = -1) -
# confirm and set it on the SparkSession instead.
conf.autoBroadcastJoinThreshold = -1
# Inject CSS so the notebook container uses the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
# Create (or reuse) the Spark session used by every cell below.
# Broadcast joins are disabled through the session configuration: the
# `conf.autoBroadcastJoinThreshold = -1` assignment in the first cell only sets
# an attribute on the `pyspark.conf` module and never reaches Spark.
spark = (
    SparkSession.builder
    .appName("TFG")
    .config("spark.sql.autoBroadcastJoinThreshold", "-1")
    .getOrCreate()
)

# SCHEMAS

In [3]:
# Explicit schema applied when reading the fleet log CSV files; every column is nullable.
schema_vehicle = StructType([
    StructField("DEVICE_ID", StringType(), True),
    StructField("LATITUDE", DoubleType(), True),
    StructField("LONGITUDE", DoubleType(), True),
    StructField("DATETIME", TimestampType(), True),
    StructField("SPEED", DoubleType(), True),
])

# First look at the data

In [4]:
# Load a single day of records with the explicit schema and inspect the first rows.
first_look_path = 'data/fleet_original/logrecords_20210201_20210201_0_anon.csv'

first_look_csv = spark.read.csv(
    first_look_path,
    header=True,
    inferSchema=True,
    sep='|',
    schema=schema_vehicle,
)

first_look_csv.printSchema()
first_look_csv.show(3, truncate=False)

root
 |-- DEVICE_ID: string (nullable = true)
 |-- LATITUDE: double (nullable = true)
 |-- LONGITUDE: double (nullable = true)
 |-- DATETIME: timestamp (nullable = true)
 |-- SPEED: double (nullable = true)

+---------+----------+-----------+-----------------------+-----+
|DEVICE_ID|LATITUDE  |LONGITUDE  |DATETIME               |SPEED|
+---------+----------+-----------+-----------------------+-----+
|HQ3OZP1U |39.8995209|3.07118082 |2021-02-01 01:00:00.063|0.0  |
|GT42SJ4M |40.408371 |-3.66694403|2021-02-01 01:00:03    |42.0 |
|IILVL5RI |36.5030899|-6.26657295|2021-02-01 01:00:08.063|0.0  |
+---------+----------+-----------+-----------------------+-----+
only showing top 3 rows



# DATA PROCESSING

In [5]:
# Preview the SPEED column renamed to include its unit. The renamed DataFrame is only
# displayed, not assigned, so first_look_csv keeps the original column name.
first_look_csv.withColumnRenamed('SPEED', 'SPEED (km/h)').show(3, truncate=False)

+---------+----------+-----------+-----------------------+------------+
|DEVICE_ID|LATITUDE  |LONGITUDE  |DATETIME               |SPEED (km/h)|
+---------+----------+-----------+-----------------------+------------+
|HQ3OZP1U |39.8995209|3.07118082 |2021-02-01 01:00:00.063|0.0         |
|GT42SJ4M |40.408371 |-3.66694403|2021-02-01 01:00:03    |42.0        |
|IILVL5RI |36.5030899|-6.26657295|2021-02-01 01:00:08.063|0.0         |
+---------+----------+-----------+-----------------------+------------+
only showing top 3 rows



# FUNCTIONS

In [6]:
def get_distance(lat1deg, lon1deg, lat2deg, lon2deg):
    '''
    Great-circle (haversine) distance between two locations.

    param lat1deg, lon1deg: latitude / longitude of the first point, in degrees
    param lat2deg, lon2deg: latitude / longitude of the second point, in degrees
    return: distance in metres; 0.0 when any coordinate is None
    '''
    # Only a missing value means "no position". A coordinate of exactly 0.0
    # (equator or Greenwich meridian) is a valid location, so a truthiness
    # check would wrongly discard it.
    if any(value is None for value in (lat1deg, lon1deg, lat2deg, lon2deg)):
        return 0.0

    # approximate radius of earth in m
    R = 6373000.0

    lat1 = math.radians(lat1deg)
    lon1 = math.radians(lon1deg)
    lat2 = math.radians(lat2deg)
    lon2 = math.radians(lon2deg)

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = math.sin(dlat / 2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2)**2
    # Clamp to [0, 1] so floating-point drift can never make sqrt(1 - a) fail.
    a = min(1.0, max(0.0, a))
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    return R * c

# Wrap get_distance as a Spark UDF returning a double so it can be applied to DataFrame columns.
get_distance_udf = F.udf(get_distance, DoubleType())

In [7]:
def get_delta_time(timestr):
    '''
    Convert a clock-style duration string into seconds.

    param timestr: text formatted as HH:MM:SS.mmm
    return: the duration as a float number of seconds
    '''
    parts = timestr.split(':')
    hours = float(parts[0])
    minutes = float(parts[1])
    seconds = float(parts[2])
    # Let timedelta do the unit arithmetic, then flatten it to seconds.
    return timedelta(hours=hours, minutes=minutes, seconds=seconds).total_seconds()

# Wrap get_delta_time as a Spark UDF returning a double so it can be applied to DataFrame columns.
get_delta_time_udf = F.udf(get_delta_time, DoubleType())

In [8]:
def abs_time_delta(y,x):
    '''
    Absolute time elapsed between two datetimes.

    param y, x: the two datetimes to compare (order does not matter)
    return: duration in seconds as a float; 0.0 when either value is missing
    '''
    if not x or not y:
        return 0.0
    return math.fabs((x - y).total_seconds())

# Wrap abs_time_delta as a Spark UDF returning a double so it can be applied to DataFrame columns.
abs_time_delta_udf = F.udf(abs_time_delta, DoubleType())

add dupe column

In [9]:
def get_dupe_column(w, df_vehicles, device_id):
    '''
    Select the records of one device and drop repeated stationary entries.

    A row is flagged in a temporary "dupe" column when its speed and the
    previous row's speed are both 0 and its position equals the previous
    row's position; flagged rows are removed and the helper column dropped.

    param w: window used to look at the previous row
    param df_vehicles: DataFrame with the records of all devices
    param device_id: row-like object exposing the DEVICE_ID to keep
    return: DataFrame with the de-duplicated records of that device
    '''
    previous_speed = F.lag("SPEED").over(w)
    previous_longitude = F.lag("LONGITUDE").over(w)
    previous_latitude = F.lag("LATITUDE").over(w)

    # stationary now, stationary before, and the position did not change
    repeated_stop = (F.col("SPEED") == 0) & \
                    (previous_speed == 0) & \
                    (F.col("LONGITUDE") == previous_longitude) & \
                    (F.col("LATITUDE") == previous_latitude)

    single_device = df_vehicles.where(F.col('DEVICE_ID') == device_id.DEVICE_ID)
    ordered = single_device.sort(F.col('DATETIME'))
    flagged = ordered.withColumn("dupe", F.when(repeated_stop, 1).otherwise(0))
    kept = flagged.filter((F.col("dupe") == False) | (F.col("dupe").isNull()))
    return kept.drop("dupe")

duration and distance

In [10]:
def get_duration_and_distance(w, df_vehicle):
    '''
    Add per-record distance, duration and tunnel flags relative to the previous record.

    Rows whose distance to the previous record is 0 are removed. A remaining
    row with DISTANCE above 1000 (1 km) is marked as TUNNEL, and PREV_TUNNEL
    holds the TUNNEL flag of the preceding remaining row.

    param w: window used to look at the previous row
    param df_vehicle: DataFrame with the records of one vehicle
    return: DataFrame with PREV_LATITUDE, PREV_LONGITUDE, DISTANCE, DURATION,
            TUNNEL and PREV_TUNNEL columns added
    '''
    with_previous = (
        df_vehicle
        .withColumn('PREV_LATITUDE', F.lag('LATITUDE').over(w))
        .withColumn('PREV_LONGITUDE', F.lag('LONGITUDE').over(w))
        .withColumn('PREV_DATETIME', F.lag('DATETIME').over(w))
    )

    measured = (
        with_previous
        .withColumn('DISTANCE', get_distance_udf(F.col('LATITUDE'), F.col('LONGITUDE'), F.col('PREV_LATITUDE'), F.col('PREV_LONGITUDE')))
        .withColumn('DURATION', abs_time_delta_udf(F.col('PREV_DATETIME'), F.col('DATETIME')))
    )

    # keep only the records where the vehicle actually moved
    moving = measured.where(F.col('DISTANCE') != 0)

    # PREV_TUNNEL is computed after the filter, so it refers to the previous *remaining* row
    flagged = (
        moving
        .withColumn('TUNNEL', F.when(F.col('DISTANCE') > 1000, True).otherwise(False))
        .withColumn('PREV_TUNNEL', F.lag('TUNNEL').over(w))
    )
    return flagged.drop('PREV_DATETIME')

average speed and total distance

In [11]:
def get_distance_and_driving_time(df_vehicle_dur_dist, first_line_vehicle, output_file):
    '''
    Compute the average speed and the total distance of one vehicle.

    param df_vehicle_dur_dist: DataFrame with DURATION and DISTANCE columns
    param first_line_vehicle: unused; kept so existing callers keep working
    param output_file: unused; kept so existing callers keep working
    return: tuple (speed, distance), both rounded to 2 decimals, where speed is
            sum(DISTANCE) * 3.6 / sum(DURATION) (km/h when the inputs are metres
            and seconds) and distance is sum(DISTANCE) / 1000 (km).
            Returns (0.0, 0.0) when there is nothing to sum, and a speed of 0.0
            when the summed duration is 0.
    '''
    # A global aggregation always yields exactly one row; its sums are None for an empty input.
    totals = df_vehicle_dur_dist.select('DURATION', 'DISTANCE').groupBy().sum().collect()[0]
    duration = totals['sum(DURATION)']
    distance = totals['sum(DISTANCE)']

    if duration is None or distance is None:
        return 0.0, 0.0

    distance_km = round(distance / 1000.0, 2)

    # Guard against records that all share one timestamp: no elapsed time means
    # the average speed is undefined, so report 0.0 instead of dividing by zero.
    if duration == 0:
        return 0.0, distance_km

    speed = round(distance * 3.6 / duration, 2) # makes it km/h
    return speed, distance_km


vehicles geometry for geojson

In [12]:
def get_vehicle_geometry(headerVehicle, device_id, speed, distance, df_block_vehicle, output_file):
    '''
    Write the opening part of one Feature and return an iterator over its points.

    The text written ends right after the opening bracket of the LineString
    "coordinates" array; the caller is expected to write the coordinates and
    the closing brackets.

    param headerVehicle: non-empty list whose first row provides TUNNEL and DATETIME
    param device_id: row-like object exposing DEVICE_ID
    param speed: value written as the feature "speed"
    param distance: value written as the feature "distance"
    param df_block_vehicle: DataFrame holding the rows of this feature
    param output_file: writable text file receiving the JSON text
    return: local iterator over rows with LONGITUDE, LATITUDE, PREV_LONGITUDE,
            PREV_LATITUDE, TUNNEL and PREV_TUNNEL
    '''
    header = headerVehicle[0]
    tunnel = "1" if header['TUNNEL'] is True else "0"
    date = header['DATETIME'].strftime("%d/%m/%Y")

    # json.dumps quotes and escapes the id, so unusual characters cannot break the JSON.
    output_file.write('{"type": "Feature", "id": ' + json.dumps(device_id.DEVICE_ID) +
                      ', "date": "' + date +
                      '", "speed": ' + str(speed) +
                      ', "distance": ' + str(distance) +
                      ', "tunnel": ' + tunnel +
                      ', "geometry": {"type": "LineString", "coordinates": [')

    # No cache()/unpersist() here: the local iterator is consumed by the caller
    # after this function returns, so a cache released before returning could
    # never be reused and only added overhead.
    df_vehicle_geometry = df_block_vehicle.select('LONGITUDE', 'LATITUDE', 'PREV_LONGITUDE', 'PREV_LATITUDE', 'TUNNEL', 'PREV_TUNNEL')

    return df_vehicle_geometry.rdd.toLocalIterator()

# VEHICLES TREATMENT

In [13]:
def vehicles_treatment(df_vehicles, output_file):
    '''
    Write every vehicle of df_vehicles to output_file as comma-separated Features.

    For each distinct DEVICE_ID the records are de-duplicated, enriched with
    distance / duration / tunnel flags, and split into blocks of consecutive
    rows that share the same TUNNEL value. Each block becomes one Feature
    whose LineString holds the block's coordinates.

    param df_vehicles: DataFrame with the records of all vehicles
    param output_file: writable text file; the caller writes the surrounding
                       FeatureCollection opening and closing text
    return: None
    '''
    devices_id = df_vehicles.select('DEVICE_ID').distinct().collect()
    total_vehicles = len(devices_id)

    # One flag for the whole file: a ', ' separator is needed before every
    # feature except the very first one, including across different vehicles.
    first_feature = True

    # NOTE(review): these windows have no partitionBy; each per-vehicle frame is
    # already filtered to a single device, so all of its rows are ordered together.
    w = Window().orderBy(F.col("DATETIME"))
    windowSpec = Window().orderBy(F.col("DATETIME")).rowsBetween(Window.unboundedPreceding, Window.currentRow)

    for vehicle_number, device_id in enumerate(devices_id, start=1):
        df_vehicle = get_dupe_column(w, df_vehicles, device_id).cache()
        # Keep a handle on the cached frame itself: unpersist() must be called on
        # it, not on a DataFrame derived from it, or the cache is never released.
        df_vehicle_cached = get_duration_and_distance(w, df_vehicle).cache()

        try:
            # relevantCount = number of TUNNEL changes seen so far, i.e. the block index of the row
            df_vehicle_blocks = df_vehicle_cached.withColumn(
                'relevantCount',
                F.count(F.when(F.col('TUNNEL') != F.col('PREV_TUNNEL'), F.lit(1))).over(windowSpec))

            print("    Vehicle {} ({}/{})".format(device_id.DEVICE_ID, vehicle_number, total_vehicles))

            speed, distance = get_distance_and_driving_time(df_vehicle_cached, first_feature, output_file)

            vehicleBlockIndex = 0
            while True:
                df_block_vehicle = df_vehicle_blocks.where(F.col('relevantCount') == vehicleBlockIndex)
                vehicleBlockIndex = vehicleBlockIndex + 1
                headerVehicle = df_block_vehicle.head(1)

                # block indexes are consecutive, so the first empty block ends the vehicle
                if len(headerVehicle) == 0:
                    break

                if first_feature:
                    first_feature = False
                else:
                    output_file.write(', ')

                data_vehicle = get_vehicle_geometry(headerVehicle, device_id, speed, distance, df_block_vehicle, output_file)

                first_line_geometry = True
                for row in data_vehicle:
                    if first_line_geometry:
                        first_line_geometry = False
                        # a block that starts on a tunnel change begins at the previous position
                        if row["TUNNEL"] != row["PREV_TUNNEL"]:
                            output_file.write('[' + str(row['PREV_LONGITUDE']) + ', ' + str(row['PREV_LATITUDE']) + '], ')
                    else:
                        output_file.write(', ')
                    output_file.write('[' + str(row['LONGITUDE']) + ', ' + str(row['LATITUDE']) + ']')
                output_file.write(']}}')
        finally:
            # release both caches even when a vehicle fails, so memory does not
            # accumulate over the many vehicles of a file
            df_vehicle_cached.unpersist()
            df_vehicle.unpersist()

read all vehicles

In [14]:
def get_vehicless_initial(file):
    '''
    Read one '|'-separated CSV file with a header row into a DataFrame.

    param file: path of the CSV file to read
    return: DataFrame loaded with schema_vehicle
    '''
    return spark.read.csv(
        file,
        header=True,
        inferSchema=True,
        sep='|',
        schema=schema_vehicle,
    )

In [15]:
# Convert every CSV file found in `directory` into one FeatureCollection JSON file.
starting_time = datetime.now()
directory = "data/fleet_original/"
output_directory = "json_data/fleet"

# open() does not create missing folders, so make sure the target exists up front
os.makedirs(output_directory, exist_ok=True)

# sorted() makes the processing order reproducible; os.listdir returns names in arbitrary order
for filename in sorted(os.listdir(directory)):
    # only the CSV files are read
    if not filename.endswith(".csv"):
        continue

    print(directory + filename)

    # the second '_'-separated token of the file name is sliced as year (4), month (2), day (rest)
    # NOTE(review): two input files with the same date token would write the same output file - confirm
    date_token = filename.split('_')[1]
    ano = date_token[:4]
    mes = date_token[4:6]
    dia = date_token[6:]

    df_vehicle = get_vehicless_initial(directory + filename)
    print('df_vehicle loaded at {}'.format(datetime.now()))

    with open("{}/vehicles_{}{}{}.json".format(output_directory, ano, mes, dia), 'wt') as output_file:
        output_file.write('{"type": "FeatureCollection", "features": [')

        vehicles_treatment(df_vehicle, output_file)

        output_file.write(']}')

    print('save vehicles from {} done at {}'.format(filename, datetime.now()))

data/fleet_original/logrecords_20210201_20210201_0_anon.csv
df_vehicle loaded at 2022-02-13 21:19:44.182980
    Vehicle 73TO3F88 (1/1260)
    Vehicle 46CBRBHN (2/1260)
    Vehicle 4XUR6E8S (3/1260)
    Vehicle EIRCXU5I (4/1260)
    Vehicle 9UHI5H8P (5/1260)
    Vehicle 285VUDZJ (6/1260)
    Vehicle CFHZ38OO (7/1260)
    Vehicle YHI8RFLK (8/1260)
    Vehicle JI2SVIBH (9/1260)
    Vehicle IFEMQIQA (10/1260)
    Vehicle SLMLB1US (11/1260)
    Vehicle 69YB6C4G (12/1260)
    Vehicle MQFPIMWU (13/1260)
    Vehicle GHKWPH6E (14/1260)
    Vehicle 6OTUZKR5 (15/1260)
    Vehicle 4CE3R75N (16/1260)
    Vehicle KVPAS4IR (17/1260)
    Vehicle 8VU78QAP (18/1260)
    Vehicle WWZT8WBU (19/1260)
    Vehicle M9JLR1EP (20/1260)
    Vehicle 9VYLN0F6 (21/1260)
    Vehicle H5GG70Q5 (22/1260)
    Vehicle 6VA93QSM (23/1260)
    Vehicle ZTVL8T8X (24/1260)
    Vehicle OB34P62L (25/1260)
    Vehicle OUQY9YQG (26/1260)
    Vehicle SR8RK7LC (27/1260)
    Vehicle IV31HQW6 (28/1260)
    Vehicle 0MBQDNMY (29/1260)
  

    Vehicle ZZAJ8BTV (258/1260)
    Vehicle YXJVBZEJ (259/1260)
    Vehicle MOLKW4PF (260/1260)
    Vehicle TGQL10Y0 (261/1260)
    Vehicle YJBTQJUQ (262/1260)
    Vehicle BCA86XZE (263/1260)
    Vehicle MH8CPNEQ (264/1260)
    Vehicle ASKQB67I (265/1260)
    Vehicle ZY8CI7HI (266/1260)
    Vehicle 8IY0BP1G (267/1260)
    Vehicle 54B53X2F (268/1260)
    Vehicle 94N3CTLK (269/1260)
    Vehicle 2QVJIO7G (270/1260)
    Vehicle 7CH5D8IF (271/1260)
    Vehicle 35WO91X5 (272/1260)
    Vehicle 00LXDZCW (273/1260)
    Vehicle 2UWNKIV5 (274/1260)
    Vehicle 7WJSLWAK (275/1260)
    Vehicle X9YVBBH0 (276/1260)
    Vehicle LITY30QJ (277/1260)
    Vehicle 2X1V8LFB (278/1260)
    Vehicle 7XUW89WI (279/1260)
    Vehicle 6G7TEFKA (280/1260)
    Vehicle WSU9K8HX (281/1260)
    Vehicle 10XPA79L (282/1260)
    Vehicle GT42SJ4M (283/1260)
    Vehicle ZNBJM21J (284/1260)
    Vehicle 5OBUCFTP (285/1260)
    Vehicle 9O658CMC (286/1260)
    Vehicle OX8AUPYO (287/1260)
    Vehicle 02X0FJBH (288/1260)
    Vehi

    Vehicle 4T39NWLH (515/1260)
    Vehicle BSICFUCU (516/1260)
    Vehicle OUA4C57C (517/1260)
    Vehicle 58V9ESUY (518/1260)
    Vehicle 4JJ8HTIB (519/1260)
    Vehicle JK8I9FAM (520/1260)
    Vehicle OPMN8KZT (521/1260)
    Vehicle CG1M0YO9 (522/1260)
    Vehicle W6LOFVMN (523/1260)
    Vehicle 4L9WL9FV (524/1260)
    Vehicle 39MVP3DA (525/1260)
    Vehicle 7LI2TGG8 (526/1260)
    Vehicle UFP86HFE (527/1260)
    Vehicle RMMGE252 (528/1260)
    Vehicle M3ISI50M (529/1260)
    Vehicle HE6O8RO3 (530/1260)
    Vehicle 3STDHEOB (531/1260)
    Vehicle JJ7C6Z77 (532/1260)
    Vehicle WIXRHY64 (533/1260)
    Vehicle GIMQ11HF (534/1260)
    Vehicle JD655SOM (535/1260)
    Vehicle G6TLEL8Y (536/1260)
    Vehicle EYN2DYWC (537/1260)
    Vehicle Y4ERYKI5 (538/1260)
    Vehicle YGFES4OC (539/1260)
    Vehicle QAV6UU4A (540/1260)
    Vehicle 2HP8H3UR (541/1260)
    Vehicle L755YV68 (542/1260)
    Vehicle HA4WNKQA (543/1260)
    Vehicle P4NC962X (544/1260)
    Vehicle MZQZOS92 (545/1260)
    Vehi

    Vehicle XQF7NXPQ (772/1260)
    Vehicle 5FRIQ7CO (773/1260)
    Vehicle NGA8ZJYV (774/1260)
    Vehicle WQ5Z9C5J (775/1260)
    Vehicle 1NDYBIT7 (776/1260)
    Vehicle ZLGK0299 (777/1260)
    Vehicle G2JELJ0F (778/1260)
    Vehicle 4AZQ2BQX (779/1260)
    Vehicle DME75AB8 (780/1260)
    Vehicle T85JMZYJ (781/1260)
    Vehicle EL06PEEK (782/1260)
    Vehicle DW9DDBKL (783/1260)
    Vehicle B0E6U0OB (784/1260)
    Vehicle S5KVQMB4 (785/1260)
    Vehicle CMK865KA (786/1260)
    Vehicle 0MCOS47D (787/1260)
    Vehicle DJAZ3PO7 (788/1260)
    Vehicle TRBM4G71 (789/1260)
    Vehicle 1WHP3QMW (790/1260)
    Vehicle 32XWTSYD (791/1260)
    Vehicle WCAE4GE7 (792/1260)
    Vehicle 7ALYFLFA (793/1260)
    Vehicle ZQ1EN8JZ (794/1260)
    Vehicle KEOMNM0H (795/1260)
    Vehicle H6WS111D (796/1260)
    Vehicle 65E24T4E (797/1260)
    Vehicle RBAYAL5U (798/1260)
    Vehicle DRO7DY0H (799/1260)
    Vehicle YQHOVUA3 (800/1260)
    Vehicle 9CNJ022G (801/1260)
    Vehicle Q1T50S84 (802/1260)
    Vehi

    Vehicle 4SAGDEDL (1028/1260)
    Vehicle 3Z0PKR61 (1029/1260)
    Vehicle KYDL1DJA (1030/1260)
    Vehicle 3EMHXP32 (1031/1260)
    Vehicle 4GR51YXD (1032/1260)
    Vehicle WUSWN9VV (1033/1260)
    Vehicle OGQ0H69N (1034/1260)
    Vehicle GKOFE346 (1035/1260)
    Vehicle T8578PPN (1036/1260)
    Vehicle M6LJG8FS (1037/1260)
    Vehicle 4B1V22W3 (1038/1260)
    Vehicle X8SIOBH5 (1039/1260)
    Vehicle 06JS22XS (1040/1260)
    Vehicle WO43RSAV (1041/1260)
    Vehicle 3YVQ1MKX (1042/1260)
    Vehicle TZFSUQ6B (1043/1260)
    Vehicle GCBISVR2 (1044/1260)
    Vehicle H4JWR703 (1045/1260)
    Vehicle URGNNWT9 (1046/1260)
    Vehicle E6BJ18CE (1047/1260)
    Vehicle X875LT55 (1048/1260)
    Vehicle ER0CJMZM (1049/1260)
    Vehicle 73E4FZOI (1050/1260)
    Vehicle NIZX8XJO (1051/1260)
    Vehicle 8QLHM2ZV (1052/1260)
    Vehicle AD4GQRYI (1053/1260)
    Vehicle BSDLO85L (1054/1260)
    Vehicle OK8641BF (1055/1260)
    Vehicle H9W788A1 (1056/1260)
    Vehicle LWB8RK9D (1057/1260)
    Vehicl

    Vehicle JI2SVIBH (12/1262)
    Vehicle 6OTUZKR5 (13/1262)
    Vehicle MQFPIMWU (14/1262)
    Vehicle GHKWPH6E (15/1262)
    Vehicle KVPAS4IR (16/1262)
    Vehicle 8VU78QAP (17/1262)
    Vehicle 4CE3R75N (18/1262)
    Vehicle M9JLR1EP (19/1262)
    Vehicle WWZT8WBU (20/1262)
    Vehicle 6VA93QSM (21/1262)
    Vehicle 9VYLN0F6 (22/1262)
    Vehicle H5GG70Q5 (23/1262)
    Vehicle ZTVL8T8X (24/1262)
    Vehicle OUQY9YQG (25/1262)
    Vehicle QM5QC31Z (26/1262)
    Vehicle OB34P62L (27/1262)
    Vehicle K7H3XGTE (28/1262)
    Vehicle SR8RK7LC (29/1262)
    Vehicle 0MBQDNMY (30/1262)
    Vehicle IV31HQW6 (31/1262)
    Vehicle OP8F2BOU (32/1262)
    Vehicle FBN00C4B (33/1262)
    Vehicle EK1T8UL2 (34/1262)
    Vehicle 61NAP4CB (35/1262)
    Vehicle B7BRD3YK (36/1262)
    Vehicle STXKDCGN (37/1262)
    Vehicle 0R76S8C0 (38/1262)
    Vehicle SVIRBPJH (39/1262)
    Vehicle 7ADFF6LI (40/1262)
    Vehicle A5UF9RUG (41/1262)
    Vehicle FT7LKMSK (42/1262)
    Vehicle E2E462LD (43/1262)
    Vehi

    Vehicle 00LXDZCW (271/1262)
    Vehicle 2UWNKIV5 (272/1262)
    Vehicle 2X1V8LFB (273/1262)
    Vehicle X9YVBBH0 (274/1262)
    Vehicle 10XPA79L (275/1262)
    Vehicle 7WJSLWAK (276/1262)
    Vehicle LITY30QJ (277/1262)
    Vehicle 7XUW89WI (278/1262)
    Vehicle WSU9K8HX (279/1262)
    Vehicle 6G7TEFKA (280/1262)
    Vehicle 5OBUCFTP (281/1262)
    Vehicle MIZU2V7B (282/1262)
    Vehicle 02X0FJBH (283/1262)
    Vehicle GT42SJ4M (284/1262)
    Vehicle OX8AUPYO (285/1262)
    Vehicle ZNBJM21J (286/1262)
    Vehicle 9O658CMC (287/1262)
    Vehicle W528MENL (288/1262)
    Vehicle SJJHCMPM (289/1262)
    Vehicle AIHL8GKZ (290/1262)
    Vehicle 00YOI3WI (291/1262)
    Vehicle XRIV2MUF (292/1262)
    Vehicle 08OWVC1O (293/1262)
    Vehicle 5FPHLCH3 (294/1262)
    Vehicle IBZT08CC (295/1262)
    Vehicle WJSZ6W3K (296/1262)
    Vehicle Z0AVUQX5 (297/1262)
    Vehicle ATQNN7O0 (298/1262)
    Vehicle A820CSG7 (299/1262)
    Vehicle 41XGM7P0 (300/1262)
    Vehicle JKST1NWS (301/1262)
    Vehi

    Vehicle M3ISI50M (528/1262)
    Vehicle RMMGE252 (529/1262)
    Vehicle 7LI2TGG8 (530/1262)
    Vehicle 3STDHEOB (531/1262)
    Vehicle WIXRHY64 (532/1262)
    Vehicle QAV6UU4A (533/1262)
    Vehicle G6TLEL8Y (534/1262)
    Vehicle GIMQ11HF (535/1262)
    Vehicle YGFES4OC (536/1262)
    Vehicle Y4ERYKI5 (537/1262)
    Vehicle JD655SOM (538/1262)
    Vehicle EYN2DYWC (539/1262)
    Vehicle 2HP8H3UR (540/1262)
    Vehicle P4NC962X (541/1262)
    Vehicle 2R805RLS (542/1262)
    Vehicle L755YV68 (543/1262)
    Vehicle HA4WNKQA (544/1262)
    Vehicle F8WDJH6H (545/1262)
    Vehicle MZQZOS92 (546/1262)
    Vehicle T7US804A (547/1262)
    Vehicle MAAM6NKW (548/1262)
    Vehicle P2Y9QT3T (549/1262)
    Vehicle 6162D50R (550/1262)
    Vehicle UJHJ5MND (551/1262)
    Vehicle 27FF8SVP (552/1262)
    Vehicle SEGTRUJL (553/1262)
    Vehicle 9WDJP8J8 (554/1262)
    Vehicle 0TTNG0YU (555/1262)
    Vehicle 7SO0R6IL (556/1262)
    Vehicle MABAYCC2 (557/1262)
    Vehicle XOG8OEUF (558/1262)
    Vehi

Py4JJavaError: An error occurred while calling o453139.collectToPython.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 112680.0 failed 1 times, most recent failure: Lost task 0.0 in stage 112680.0 (TID 339453, 192.168.1.146, executor driver): java.lang.OutOfMemoryError: Java heap space

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2059)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2008)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2007)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2007)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:973)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:973)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:973)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2239)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2188)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2177)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:775)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2099)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2120)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2139)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2164)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1004)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:388)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1003)
	at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:385)
	at org.apache.spark.sql.Dataset.$anonfun$collectToPython$1(Dataset.scala:3450)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3618)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:100)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:160)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:87)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:764)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3616)
	at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:3447)
	at jdk.internal.reflect.GeneratedMethodAccessor123.invoke(Unknown Source)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:564)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.base/java.lang.Thread.run(Thread.java:832)
Caused by: java.lang.OutOfMemoryError: Java heap space


In [None]:
# Report the wall-clock duration of the whole run. The start time is stored in
# `starting_time` by the processing cell; MAIN_START_TIME is never defined.
# total_seconds() is used because timedelta.seconds discards whole days.
elapsed_seconds = int((datetime.now() - starting_time).total_seconds())
print('vehicles_treatment process finished (duration = {} hours, {} minutes)'.format(
    elapsed_seconds // 3600,
    (elapsed_seconds // 60) % 60))

----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 10182)
Traceback (most recent call last):
  File "C:\Users\Maria\anaconda3\lib\socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "C:\Users\Maria\anaconda3\lib\socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "C:\Users\Maria\anaconda3\lib\socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "C:\Users\Maria\anaconda3\lib\socketserver.py", line 720, in __init__
    self.handle()
  File "C:\Users\Maria\anaconda3\lib\site-packages\pyspark\accumulators.py", line 268, in handle
    poll(accum_updates)
  File "C:\Users\Maria\anaconda3\lib\site-packages\pyspark\accumulators.py", line 241, in poll
    if func():
  File "C:\Users\Maria\anaconda3\lib\site-packages\pyspark\accumulators.py", line 245, in accum_updates
  