# Validations Max 

### Load Trimet JSON into dataframe: 
| BreadCrumb      | Trip |
| ----------- | ----------- |
| tstamp      | trip_id       |
| latitude   | route_id        |
| longitude   | vehicle_id        |
| speed   | service_key        |
| trip_id   | direction        |


In [40]:
import pandas as pd
df = pd.read_json('2023-05-10.json')
df.head(5)


Unnamed: 0,EVENT_NO_TRIP,EVENT_NO_STOP,OPD_DATE,VEHICLE_ID,METERS,ACT_TIME,GPS_LONGITUDE,GPS_LATITUDE,GPS_SATELLITES,GPS_HDOP
0,229985012,229985125,01JAN2023:00:00:00,3058,45415,33102,-122.568135,45.43627,10.0,0.8
1,229985012,229985125,01JAN2023:00:00:00,3058,45442,33106,-122.56826,45.43651,10.0,0.8
2,229985012,229985125,01JAN2023:00:00:00,3058,45461,33112,-122.568512,45.436545,11.0,0.8
3,229985127,229985128,01JAN2023:00:00:00,3058,45635,34432,-122.569083,45.43591,9.0,1.1
4,229985127,229985128,01JAN2023:00:00:00,3058,45673,34437,-122.569107,45.43626,10.0,0.9


In [25]:
df.describe()


Unnamed: 0,EVENT_NO_TRIP,EVENT_NO_STOP,VEHICLE_ID,METERS,ACT_TIME,GPS_LONGITUDE,GPS_LATITUDE,GPS_SATELLITES,GPS_HDOP
count,216463.0,216463.0,216463.0,216463.0,216463.0,216355.0,216355.0,216462.0,216462.0
mean,230132700.0,230132800.0,3591.081982,111297.653927,51441.488864,-122.646437,45.514654,11.436894,0.836358
std,220427.4,220428.5,385.386362,75309.03347,17722.068811,0.109406,0.045828,1.129011,0.350909
min,229810800.0,229810800.0,3015.0,7.0,15142.0,-123.115858,45.356927,0.0,0.6
25%,230015600.0,230015700.0,3216.0,52008.0,37327.0,-122.68305,45.496533,11.0,0.7
50%,230095700.0,230095700.0,3566.0,99199.0,49984.0,-122.64841,45.515575,12.0,0.8
75%,230174400.0,230174400.0,3945.0,154249.5,64046.0,-122.579147,45.544608,12.0,0.9
max,231115500.0,231115600.0,4303.0,372781.0,94349.0,-122.398347,45.614322,12.0,24.6


In [26]:
# 1. There are no 0/NaN values (each row has all values)

notna = df.notna()
if False in notna: 
    print ('False')
else: 
    print ("No NaN/empty values found")

No NaN/empty values found


In [47]:
# 2. Date should be one day: once timestamp configed check that date is same day


In [None]:
# 3. A trip should have a trip number as well as a vehicle number. 
# The code for #1 should cover this 

In [30]:
# 4. Every same trip number should have the same vehicle number 
# trips = df.groupby(['EVENT_NO_TRIP']).mean()
trips = df['EVENT_NO_TRIP'].unique()
trips.size


5751

In [27]:
# 5. There are a few millions records daily but not more (>5mil = alarm)
df['EVENT_NO_TRIP'].size

216463

In [29]:
# 6. The date should always be the same in every row 
date = df['OPD_DATE'].unique()
date.size

1

In [None]:
# 7. The velocity should conform to a (normal?) distribution - max/min/mean speeds 
# Can't do without transformation

# Transformation -> SQL tables

In [42]:
# Should just load table with columns needed to save memory 
df.head(5)

Unnamed: 0,EVENT_NO_TRIP,EVENT_NO_STOP,OPD_DATE,VEHICLE_ID,METERS,ACT_TIME,GPS_LONGITUDE,GPS_LATITUDE,GPS_SATELLITES,GPS_HDOP
0,229985012,229985125,01JAN2023:00:00:00,3058,45415,33102,-122.568135,45.43627,10.0,0.8
1,229985012,229985125,01JAN2023:00:00:00,3058,45442,33106,-122.56826,45.43651,10.0,0.8
2,229985012,229985125,01JAN2023:00:00:00,3058,45461,33112,-122.568512,45.436545,11.0,0.8
3,229985127,229985128,01JAN2023:00:00:00,3058,45635,34432,-122.569083,45.43591,9.0,1.1
4,229985127,229985128,01JAN2023:00:00:00,3058,45673,34437,-122.569107,45.43626,10.0,0.9


In [43]:
df = df.drop(columns=['EVENT_NO_STOP', 'GPS_SATELLITES', 'GPS_HDOP'])
df.head(3)

Unnamed: 0,EVENT_NO_TRIP,OPD_DATE,VEHICLE_ID,METERS,ACT_TIME,GPS_LONGITUDE,GPS_LATITUDE
0,229985012,01JAN2023:00:00:00,3058,45415,33102,-122.568135,45.43627
1,229985012,01JAN2023:00:00:00,3058,45442,33106,-122.56826,45.43651
2,229985012,01JAN2023:00:00:00,3058,45461,33112,-122.568512,45.436545


In [44]:
from datetime import datetime, timedelta

def create_timestamp(i):
    date = datetime.strptime(i['OPD_DATE'],'%d%b%Y:%H:%M:%S')
    time = timedelta(seconds=i['ACT_TIME'])
    return date + time 
    
df['TIMESTAMP'] = df.apply(create_timestamp,axis=1)

In [45]:
df.head(5)

Unnamed: 0,EVENT_NO_TRIP,OPD_DATE,VEHICLE_ID,METERS,ACT_TIME,GPS_LONGITUDE,GPS_LATITUDE,TIMESTAMP
0,229985012,01JAN2023:00:00:00,3058,45415,33102,-122.568135,45.43627,2023-01-01 09:11:42
1,229985012,01JAN2023:00:00:00,3058,45442,33106,-122.56826,45.43651,2023-01-01 09:11:46
2,229985012,01JAN2023:00:00:00,3058,45461,33112,-122.568512,45.436545,2023-01-01 09:11:52
3,229985127,01JAN2023:00:00:00,3058,45635,34432,-122.569083,45.43591,2023-01-01 09:33:52
4,229985127,01JAN2023:00:00:00,3058,45673,34437,-122.569107,45.43626,2023-01-01 09:33:57


In [46]:
df = df.drop(columns=['OPD_DATE', 'ACT_TIME'])
df

Unnamed: 0,EVENT_NO_TRIP,VEHICLE_ID,METERS,GPS_LONGITUDE,GPS_LATITUDE,TIMESTAMP
0,229985012,3058,45415,-122.568135,45.436270,2023-01-01 09:11:42
1,229985012,3058,45442,-122.568260,45.436510,2023-01-01 09:11:46
2,229985012,3058,45461,-122.568512,45.436545,2023-01-01 09:11:52
3,229985127,3058,45635,-122.569083,45.435910,2023-01-01 09:33:52
4,229985127,3058,45673,-122.569107,45.436260,2023-01-01 09:33:57
...,...,...,...,...,...,...
216458,231115345,4057,309944,-122.750707,45.589312,2023-01-02 00:23:47
216459,231115345,4057,309971,-122.751035,45.589187,2023-01-02 00:23:52
216460,231115345,4057,309989,-122.751198,45.589072,2023-01-02 00:23:57
216461,231115345,4057,310012,-122.751483,45.589113,2023-01-02 00:24:02


In [None]:
# df['c']=df['b'].apply(lambda x: 0 if x ==0 else math.log(x))

In [48]:
df['dMETERS'] = df['METERS'].diff()
df['dTIMESTAMP'] = df['TIMESTAMP'].diff()

df

Unnamed: 0,EVENT_NO_TRIP,VEHICLE_ID,METERS,GPS_LONGITUDE,GPS_LATITUDE,TIMESTAMP,dMETERS,dTIMESTAMP
0,229985012,3058,45415,-122.568135,45.436270,2023-01-01 09:11:42,,NaT
1,229985012,3058,45442,-122.568260,45.436510,2023-01-01 09:11:46,27.0,0 days 00:00:04
2,229985012,3058,45461,-122.568512,45.436545,2023-01-01 09:11:52,19.0,0 days 00:00:06
3,229985127,3058,45635,-122.569083,45.435910,2023-01-01 09:33:52,174.0,0 days 00:22:00
4,229985127,3058,45673,-122.569107,45.436260,2023-01-01 09:33:57,38.0,0 days 00:00:05
...,...,...,...,...,...,...,...,...
216458,231115345,4057,309944,-122.750707,45.589312,2023-01-02 00:23:47,34.0,0 days 00:00:05
216459,231115345,4057,309971,-122.751035,45.589187,2023-01-02 00:23:52,27.0,0 days 00:00:05
216460,231115345,4057,309989,-122.751198,45.589072,2023-01-02 00:23:57,18.0,0 days 00:00:05
216461,231115345,4057,310012,-122.751483,45.589113,2023-01-02 00:24:02,23.0,0 days 00:00:05


In [49]:
# This needs to be fixed: For each trip, calculate speed (don't calculate dif between separate trips)
def speed(meters, time):
    speed = meters/time
    return speed

df['SPEED'] = df.apply(lambda row: speed(row['dMETERS'],row['dTIMESTAMP'].total_seconds()), axis=1)
df

Unnamed: 0,EVENT_NO_TRIP,VEHICLE_ID,METERS,GPS_LONGITUDE,GPS_LATITUDE,TIMESTAMP,dMETERS,dTIMESTAMP,SPEED
0,229985012,3058,45415,-122.568135,45.436270,2023-01-01 09:11:42,,NaT,
1,229985012,3058,45442,-122.568260,45.436510,2023-01-01 09:11:46,27.0,0 days 00:00:04,6.750000
2,229985012,3058,45461,-122.568512,45.436545,2023-01-01 09:11:52,19.0,0 days 00:00:06,3.166667
3,229985127,3058,45635,-122.569083,45.435910,2023-01-01 09:33:52,174.0,0 days 00:22:00,0.131818
4,229985127,3058,45673,-122.569107,45.436260,2023-01-01 09:33:57,38.0,0 days 00:00:05,7.600000
...,...,...,...,...,...,...,...,...,...
216458,231115345,4057,309944,-122.750707,45.589312,2023-01-02 00:23:47,34.0,0 days 00:00:05,6.800000
216459,231115345,4057,309971,-122.751035,45.589187,2023-01-02 00:23:52,27.0,0 days 00:00:05,5.400000
216460,231115345,4057,309989,-122.751198,45.589072,2023-01-02 00:23:57,18.0,0 days 00:00:05,3.600000
216461,231115345,4057,310012,-122.751483,45.589113,2023-01-02 00:24:02,23.0,0 days 00:00:05,4.600000


In [50]:
df = df.drop(columns=['dMETERS','dTIMESTAMP'])
df

Unnamed: 0,EVENT_NO_TRIP,VEHICLE_ID,METERS,GPS_LONGITUDE,GPS_LATITUDE,TIMESTAMP,SPEED
0,229985012,3058,45415,-122.568135,45.436270,2023-01-01 09:11:42,
1,229985012,3058,45442,-122.568260,45.436510,2023-01-01 09:11:46,6.750000
2,229985012,3058,45461,-122.568512,45.436545,2023-01-01 09:11:52,3.166667
3,229985127,3058,45635,-122.569083,45.435910,2023-01-01 09:33:52,0.131818
4,229985127,3058,45673,-122.569107,45.436260,2023-01-01 09:33:57,7.600000
...,...,...,...,...,...,...,...
216458,231115345,4057,309944,-122.750707,45.589312,2023-01-02 00:23:47,6.800000
216459,231115345,4057,309971,-122.751035,45.589187,2023-01-02 00:23:52,5.400000
216460,231115345,4057,309989,-122.751198,45.589072,2023-01-02 00:23:57,3.600000
216461,231115345,4057,310012,-122.751483,45.589113,2023-01-02 00:24:02,4.600000


In [53]:
# df.sort_values(by=['EVENT_NO_TRIP', 'TIMESTAMP'])
df.sort_values(by='EVENT_NO_TRIP', na_position='first')
df


Unnamed: 0,EVENT_NO_TRIP,VEHICLE_ID,METERS,GPS_LONGITUDE,GPS_LATITUDE,TIMESTAMP,SPEED
0,229985012,3058,45415,-122.568135,45.436270,2023-01-01 09:11:42,6.750000
1,229985012,3058,45442,-122.568260,45.436510,2023-01-01 09:11:46,6.750000
2,229985012,3058,45461,-122.568512,45.436545,2023-01-01 09:11:52,3.166667
3,229985127,3058,45635,-122.569083,45.435910,2023-01-01 09:33:52,0.131818
4,229985127,3058,45673,-122.569107,45.436260,2023-01-01 09:33:57,7.600000
...,...,...,...,...,...,...,...
216458,231115345,4057,309944,-122.750707,45.589312,2023-01-02 00:23:47,6.800000
216459,231115345,4057,309971,-122.751035,45.589187,2023-01-02 00:23:52,5.400000
216460,231115345,4057,309989,-122.751198,45.589072,2023-01-02 00:23:57,3.600000
216461,231115345,4057,310012,-122.751483,45.589113,2023-01-02 00:24:02,4.600000


In [52]:
df.iloc[0,6] = df.iloc[1,6]
df

Unnamed: 0,EVENT_NO_TRIP,VEHICLE_ID,METERS,GPS_LONGITUDE,GPS_LATITUDE,TIMESTAMP,SPEED
0,229985012,3058,45415,-122.568135,45.436270,2023-01-01 09:11:42,6.750000
1,229985012,3058,45442,-122.568260,45.436510,2023-01-01 09:11:46,6.750000
2,229985012,3058,45461,-122.568512,45.436545,2023-01-01 09:11:52,3.166667
3,229985127,3058,45635,-122.569083,45.435910,2023-01-01 09:33:52,0.131818
4,229985127,3058,45673,-122.569107,45.436260,2023-01-01 09:33:57,7.600000
...,...,...,...,...,...,...,...
216458,231115345,4057,309944,-122.750707,45.589312,2023-01-02 00:23:47,6.800000
216459,231115345,4057,309971,-122.751035,45.589187,2023-01-02 00:23:52,5.400000
216460,231115345,4057,309989,-122.751198,45.589072,2023-01-02 00:23:57,3.600000
216461,231115345,4057,310012,-122.751483,45.589113,2023-01-02 00:24:02,4.600000


In [84]:
import pandas as pd
df = pd.read_json('2023-05-10.json')
df = df.drop(columns=['EVENT_NO_STOP', 'GPS_SATELLITES', 'GPS_HDOP'])
from datetime import datetime, timedelta

def create_timestamp(i):
    date = datetime.strptime(i['OPD_DATE'],'%d%b%Y:%H:%M:%S')
    time = timedelta(seconds=i['ACT_TIME'])
    return date + time 
    
df['TIMESTAMP'] = df.apply(create_timestamp,axis=1)
df = df.drop(columns=['OPD_DATE', 'ACT_TIME'])

def speed(meters, time):
    speed = meters/time
    return speed

df['dMETERS'] = df['METERS'].diff()
df['dTIMESTAMP'] = df['TIMESTAMP'].diff()
df['SPEED'] = df.apply(lambda row: speed(row['dMETERS'],row['dTIMESTAMP'].total_seconds()), axis=1)
df = df.drop(columns=['dMETERS','dTIMESTAMP'])

# Code that sets first breadcrumb value to second (not currently for all trips)
# df.sort_values(by='EVENT_NO_TRIP', na_position='first')
# df.iloc[0,6] = df.iloc[1,6]
df


Unnamed: 0,EVENT_NO_TRIP,VEHICLE_ID,METERS,GPS_LONGITUDE,GPS_LATITUDE,TIMESTAMP,SPEED
0,229985012,3058,45415,-122.568135,45.436270,2023-01-01 09:11:42,
1,229985012,3058,45442,-122.568260,45.436510,2023-01-01 09:11:46,6.750000
2,229985012,3058,45461,-122.568512,45.436545,2023-01-01 09:11:52,3.166667
3,229985127,3058,45635,-122.569083,45.435910,2023-01-01 09:33:52,0.131818
4,229985127,3058,45673,-122.569107,45.436260,2023-01-01 09:33:57,7.600000
...,...,...,...,...,...,...,...
216458,231115345,4057,309944,-122.750707,45.589312,2023-01-02 00:23:47,6.800000
216459,231115345,4057,309971,-122.751035,45.589187,2023-01-02 00:23:52,5.400000
216460,231115345,4057,309989,-122.751198,45.589072,2023-01-02 00:23:57,3.600000
216461,231115345,4057,310012,-122.751483,45.589113,2023-01-02 00:24:02,4.600000


| BreadCrumb      | Trip |
| ----------- | ----------- |
| tstamp      | trip_id       |
| latitude   | route_id        |
| longitude   | vehicle_id        |
| speed   | service_key        |
| trip_id   | direction        |


In [89]:
# df.rename({'Courses': 'Course_ Name','Fee': 'CourseFee', 'Duration': 'CourseDuration'}, 
#           axis = "columns", inplace = True) 
BreadCrumb = df[['TIMESTAMP','GPS_LATITUDE', 'GPS_LONGITUDE', 'SPEED', 'EVENT_NO_TRIP']].copy()
BreadCrumb.rename({'TIMESTAMP': 'tstamp', 'GPS_LATITUDE': 'latitude', 'GPS_LONGITUDE':'longitude', 'SPEED':'speed', 'EVENT_NO_TRIP':'trip_id'}, axis='columns', inplace=True)
BreadCrumb.head(5)

Unnamed: 0,tstamp,latitude,longitude,speed,trip_id
0,2023-01-01 09:11:42,45.43627,-122.568135,,229985012
1,2023-01-01 09:11:46,45.43651,-122.56826,6.75,229985012
2,2023-01-01 09:11:52,45.436545,-122.568512,3.166667,229985012
3,2023-01-01 09:33:52,45.43591,-122.569083,0.131818,229985127
4,2023-01-01 09:33:57,45.43626,-122.569107,7.6,229985127


In [93]:
Trip = df[['EVENT_NO_TRIP', 'VEHICLE_ID']].copy()
Trip.rename({'EVENT_NO_TRIP':'trip_id', 'VEHICLE_ID':'vehicle_id'},axis='columns',inplace=True)
Trip.insert(loc=1, column='route_id', value=-1)
Trip.insert(loc=3, column='service_key', value=-1)
Trip.insert(loc=4, column='direction', value=-1)

Trip.head(5)

Unnamed: 0,trip_id,route_id,vehicle_id,service_key,direction
0,229985012,-1,3058,-1,-1
1,229985012,-1,3058,-1,-1
2,229985012,-1,3058,-1,-1
3,229985127,-1,3058,-1,-1
4,229985127,-1,3058,-1,-1
