In [1]:
from pyspark.sql import functions as F

from pyspark.sql.types import StructType,StructField, StringType, DoubleType, DateType,TimestampType

from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql import Row
from pyspark import SQLContext

import json 

from datetime import datetime, timedelta
import math
import os
from IPython.core.display import display, HTML
from datetime import datetime
# Stretch the notebook container to the full page width so wide outputs are not clipped.
full_width_style = HTML("<style>.container { width:100% !important; }</style>")
display(full_width_style)

In [2]:
# Spark session shared by every cell below (application name "TFG").
spark = (
    SparkSession.builder
    .appName("TFG")
    .getOrCreate()
)

In [3]:
# SCHEMAS

In [4]:
# Columns of the vehicle position CSV files; every column is nullable.
schema_vehicle = StructType([
    StructField(name, dtype, True)
    for name, dtype in [
        ("DEVICE_ID", StringType()),
        ("LATITUDE", DoubleType()),
        ("LONGITUDE", DoubleType()),
        ("DATETIME", TimestampType()),
        ("SPEED", DoubleType()),
    ]
])

# Columns of the exception event CSV files; every column is nullable.
# DURATION is read as text (e.g. 00:00:08.6600000).
schema_excep = StructType([
    StructField(name, dtype, True)
    for name, dtype in [
        ("DEVICE_ID", StringType()),
        ("RULE_ID", StringType()),
        ("ACTIVE_FROM", TimestampType()),
        ("ACTIVE_TO", TimestampType()),
        ("DURATION", StringType()),
    ]
])

In [5]:
# First look at the data

In [6]:
# The path has its own name so that first_look_csv only ever holds the DataFrame.
first_look_path = 'data/exception_original/exception_events_20210201_20210201_0_anon.csv'

# schema_excep already fixes every column type, so no schema inference is requested.
first_look_csv = spark.read.csv(first_look_path, header=True, sep='|', schema=schema_excep)

first_look_csv.printSchema()
first_look_csv.show(3, truncate=False)

root
 |-- DEVICE_ID: string (nullable = true)
 |-- RULE_ID: string (nullable = true)
 |-- ACTIVE_FROM: timestamp (nullable = true)
 |-- ACTIVE_TO: timestamp (nullable = true)
 |-- DURATION: string (nullable = true)

+---------+--------------+-----------------------+-----------------------+----------------+
|DEVICE_ID|RULE_ID       |ACTIVE_FROM            |ACTIVE_TO              |DURATION        |
+---------+--------------+-----------------------+-----------------------+----------------+
|NU5CJSDX |RuleSeatbeltId|2021-02-01 01:17:30    |2021-02-01 01:17:38.66 |00:00:08.6600000|
|4BF9LOXT |RuleSeatbeltId|2021-02-01 02:13:29.317|2021-02-01 02:14:07.103|00:00:37.7860000|
|GT42SJ4M |RuleSeatbeltId|2021-02-01 02:22:07.183|2021-02-01 02:22:11    |00:00:03.8170000|
+---------+--------------+-----------------------+-----------------------+----------------+
only showing top 3 rows



In [7]:
# COMMON FUNCTIONS

In [8]:
def get_day_month_year(filename):
    '''
    Extract the date parts from a file name.

    The date is the second '_'-separated token of the name, read as YYYYMMDD.

    return: (day, month, year) as strings
    '''
    date_token = filename.split('_')[1]
    year, month, day = date_token[:4], date_token[4:6], date_token[6:]
    return day, month, year

In [9]:
def get_distance(lat1deg, lon1deg, lat2deg, lon2deg):
    '''
    Great-circle distance between two locations using the haversine formula.

    lat1deg, lon1deg: latitude and longitude of the first location, in degrees
    lat2deg, lon2deg: latitude and longitude of the second location, in degrees
    return: distance in metres, or 0.0 when any coordinate is None
    '''
    # Only None means "missing": 0.0 is a valid latitude (equator) and
    # longitude (prime meridian), so a truthiness check must not be used here.
    if any(coord is None for coord in (lat1deg, lon1deg, lat2deg, lon2deg)):
        return 0.0

    #approximate radius of earth in m
    R = 6373000.0

    lat1 = math.radians(lat1deg)
    lon1 = math.radians(lon1deg)
    lat2 = math.radians(lat2deg)
    lon2 = math.radians(lon2deg)

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = math.sin(dlat / 2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2)**2
    # Rounding can push a marginally outside [0, 1] (e.g. near-antipodal points),
    # which would make sqrt(1 - a) fail with a math domain error.
    a = min(1.0, max(0.0, a))
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    return R * c

# Spark UDF wrapper around get_distance; the result column is a double.
get_distance_udf = F.udf(get_distance, returnType=DoubleType())

In [10]:
def get_delta_time(timestr): 
    '''
    Convert a duration written as HH:MM:SS.mmm into seconds.

    timestr: duration text, e.g. '00:00:08.6600000'
    return: duration as float in seconds, or None when timestr is None or blank
    raises: IndexError / ValueError when the text is not HH:MM:SS[.fraction]
    '''
    # A missing duration yields None (a null in Spark) instead of raising
    # AttributeError, which would abort the whole job when used as a UDF.
    if timestr is None or not timestr.strip():
        return None

    # NOTE(review): a leading day component (e.g. '1.02:03:04') is not handled;
    # confirm whether the source data can contain durations of a day or more.
    values = timestr.split(':')
    #Generate a timedelta
    delta = timedelta(hours=float(values[0]), minutes=float(values[1]), seconds=float(values[2]))
    #Represent in Seconds
    return delta.total_seconds()

# Spark UDF wrapper around get_delta_time; the result column is a double.
get_delta_time_udf = F.udf(get_delta_time, returnType=DoubleType())

In [11]:
def abs_time_delta(y,x): 
    '''
    Absolute gap between two datetimes.

    return: |x - y| in seconds as a float, or 0.0 when either value is missing
    '''
    if x and y:
        return abs((x - y).total_seconds())
    return 0.0

# Spark UDF wrapper around abs_time_delta; the result column is a double.
abs_time_delta_udf = F.udf(abs_time_delta, returnType=DoubleType())

In [12]:
# Both UDFs pass None through: the timestamp columns are nullable, and
# subtracting/adding a timedelta to None would raise TypeError and abort the job.
# return time - 10 minutes
mintimeudf = F.udf(lambda x: None if x is None else (x - timedelta(seconds=600)), TimestampType())
# return time + 10 minutes
maxtimeudf = F.udf(lambda x: None if x is None else (x + timedelta(seconds=600)), TimestampType())

In [13]:
# EXCEPTIONS TREATMENT

In [14]:
def get_exceptions(file, df_vehicle):
    '''
    Attach a vehicle position to every exception event of a CSV file.

    Each exception is matched with the positions of the same device whose
    DATETIME_V lies between starttime (ACTIVE_FROM through mintimeudf) and
    endtime (ACTIVE_TO through maxtimeudf); the position closest in time to
    ACTIVE_FROM is the one kept.

    file: path of the '|'-separated exceptions CSV, read with schema_excep
    df_vehicle: vehicles DataFrame; DEVICE_ID, DATETIME_V and SPEED are used
                here, the remaining columns are carried into the result
    return: DataFrame with DEVICE_ID, RULE_ID, DATE, starttime and endtime plus
            the remaining df_vehicle columns of the chosen position; exceptions
            with no position inside their time range are not returned
    '''
    # schema_excep fixes the column types, so inferSchema is not needed.
    # DURATION is not part of the result, so it is dropped instead of being
    # converted with a Python UDF and thrown away afterwards.
    df_excep = spark.read.csv(file, header=True, sep='|', schema=schema_excep) \
                    .drop('DURATION') \
                    .withColumn('DATE', F.date_format(F.col('ACTIVE_FROM'), "yyyy/MM/dd")) \
                    .withColumn('starttime', mintimeudf(F.col('ACTIVE_FROM'))) \
                    .withColumn('endtime', maxtimeudf(F.col('ACTIVE_TO')))

    # Inner join: exceptions without a matching position are discarded anyway.
    in_time_range = (df_vehicle.DEVICE_ID == df_excep.DEVICE_ID) & \
                    (df_vehicle.DATETIME_V.between(df_excep.starttime, df_excep.endtime))
    df_located = df_excep.join(df_vehicle, on=in_time_range, how='inner') \
                         .drop(df_vehicle.DEVICE_ID)

    # NOTE(review): exceptions of the same device sharing ACTIVE_FROM fall in the
    # same partition, so only one of them survives - confirm this is intended.
    closest_first = Window.partitionBy(["DEVICE_ID", "ACTIVE_FROM"]).orderBy(F.col('diff_date').asc())

    # row_number() keeps exactly one position per partition (the closest one),
    # however far from ACTIVE_FROM it is inside the joined time range.
    df_closest = df_located.withColumn('diff_date', abs_time_delta_udf(F.col('ACTIVE_FROM'), F.col('DATETIME_V'))) \
                           .withColumn('closest_rank', F.row_number().over(closest_first)) \
                           .filter(F.col('closest_rank') == 1) \
                           .drop('diff_date', 'closest_rank', 'DATETIME_V', 'ACTIVE_FROM', 'ACTIVE_TO', 'SPEED')

    return df_closest


In [15]:
# SAVE EXCEPTIONS

In [16]:
def save_exceptions(df_exceptions, output_file):
    '''
    Write every exception as a JSON object to output_file, separated by ', '.

    Each object has the keys device_id, date, type_exception and coordinates
    ([longitude, latitude]). Only the objects are written; whatever must
    surround them is left to the caller.

    df_exceptions: DataFrame with DEVICE_ID, DATE, RULE_ID, LONGITUDE and LATITUDE
    output_file: writable text file object
    '''
    # json.dumps escapes quotes/backslashes in the text values and writes missing
    # values as null, so the output is valid JSON whatever the data contains.
    # ensure_ascii=False keeps non-ASCII text as is instead of \uXXXX escapes.
    features = (
        json.dumps({
            "device_id": row['DEVICE_ID'],
            "date": row['DATE'],
            "type_exception": row['RULE_ID'],
            "coordinates": [row['LONGITUDE'], row['LATITUDE']],
        }, ensure_ascii=False)
        for row in df_exceptions.collect()
    )
    output_file.write(', '.join(features))
        

In [17]:
MAIN_START_TIME = datetime.now()
directory = "data/fleet_original/"
directory_exceptions = "data/exception_original/"
output_directory = "json_data/exceptions/"

# Make sure the destination exists before the first output file is opened.
os.makedirs(output_directory, exist_ok=True)

# os.listdir returns names in arbitrary order; sorting makes the run deterministic.
for filename in sorted(os.listdir(directory)):
    # only the vehicle csv files are processed
    if not filename.endswith(".csv"):
        continue

    dia, mes, ano = get_day_month_year(filename)

    #---------------------
    # read vehicles csv and rename the datetime column so we know it corresponds to the vehicles
    # (schema_vehicle fixes the column types, so inferSchema is not needed)
    df_vehicles = spark.read.csv(os.path.join(directory, filename), header=True, sep='|', schema=schema_vehicle) \
                       .withColumnRenamed('DATETIME', 'DATETIME_V')
    print('{} read at {}'.format(filename, datetime.now()))

    # the exceptions file of the same day as the vehicles file
    exception_filename = 'exception_events_{}{}{}_{}{}{}_0_anon.csv'.format(ano,mes,dia,ano,mes,dia)
    df_exceptions = get_exceptions(os.path.join(directory_exceptions, exception_filename), df_vehicles)
    print('df_exceptions done at {}'.format(datetime.now()))

    # The output file is opened only once the inputs have been read, and the
    # with-block closes it even if saving the exceptions fails.
    output_path = os.path.join(output_directory, "exceptions_{}{}{}.json".format(ano,mes,dia))
    with open(output_path, "wt") as output_file:
        output_file.write('{"type": "FeatureCollection", "features": [') # every file starts like this
        save_exceptions(df_exceptions, output_file)
        output_file.write(']}')
    print('save exceptions done at {}'.format(datetime.now()))

logrecords_20210201_20210201_0_anon.csv read at 2022-02-15 19:54:51.107084
df_exceptions done at 2022-02-15 19:54:52.368220
save exceptions done at 2022-02-15 19:55:39.763834
logrecords_20210202_20210202_0_anon.csv read at 2022-02-15 19:55:39.960784
df_exceptions done at 2022-02-15 19:55:41.982997
save exceptions done at 2022-02-15 19:56:13.850699
logrecords_20210203_20210203_0_anon.csv read at 2022-02-15 19:56:13.881597
df_exceptions done at 2022-02-15 19:56:14.088630
save exceptions done at 2022-02-15 19:56:43.749325
logrecords_20210204_20210204_0_anon.csv read at 2022-02-15 19:56:43.773228
df_exceptions done at 2022-02-15 19:56:43.960995
save exceptions done at 2022-02-15 19:57:08.320266
logrecords_20210205_20210205_0_anon.csv read at 2022-02-15 19:57:08.337050
df_exceptions done at 2022-02-15 19:57:08.520410
save exceptions done at 2022-02-15 19:57:34.135430
logrecords_20210206_20210206_0_anon.csv read at 2022-02-15 19:57:34.162623
df_exceptions done at 2022-02-15 19:57:34.365173
s

In [18]:
# The elapsed time is measured once so hours and minutes come from the same instant.
# total_seconds() also counts whole days, which timedelta.seconds leaves out
# for runs longer than 24 hours.
elapsed_seconds = int((datetime.now() - MAIN_START_TIME).total_seconds())
print('exception_treatment process finished (duration = {} hours, {} minutes)'.format(
    elapsed_seconds // 3600,
    (elapsed_seconds // 60) % 60))

exception_treatment process finished (duration = 0 hours, 11 minutes)
