## BIG EXPRESS EDA v1 - 4/5/2022
### ROLL ON EIGHTEEN WHEELERS

In [24]:
# IMPORT LIBRARIES
# import geopandas as gpd
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
# import sqlite3    # ONLY IMPORT THIS ONCE FOR A SET OF FILES
from tqdm.notebook import tqdm

### INITIAL DATA IMPORT

In [2]:
jfaults = pd.read_csv('../data/J1939Faults.csv', 
                      dtype={'EquipmentID': str})

In [3]:
jfaults.head()

Unnamed: 0,RecordID,ESS_Id,EventTimeStamp,eventDescription,actionDescription,ecuSoftwareVersion,ecuSerialNumber,ecuModel,ecuMake,ecuSource,spn,fmi,active,activeTransitionCount,faultValue,EquipmentID,MCTNumber,Latitude,Longitude,LocationTimeStamp
0,1,990349,2015-02-21 10:47:13.000,Low (Severity Low) Engine Coolant Level,,unknown,unknown,unknown,unknown,0,111,17,True,2,,1439,105354361,38.857638,-84.626851,2015-02-21 11:34:25.000
1,2,990360,2015-02-21 11:34:34.000,,,unknown,unknown,unknown,unknown,11,629,12,True,127,,1439,105354361,38.857638,-84.626851,2015-02-21 11:35:10.000
2,3,990364,2015-02-21 11:35:31.000,Incorrect Data Steering Wheel Angle,,unknown,unknown,unknown,unknown,11,1807,2,False,127,,1369,105336226,41.42125,-87.767361,2015-02-21 11:35:26.000
3,4,990370,2015-02-21 11:35:33.000,Incorrect Data Steering Wheel Angle,,unknown,unknown,unknown,unknown,11,1807,2,True,127,,1369,105336226,41.421018,-87.767361,2015-02-21 11:36:08.000
4,5,990416,2015-02-21 11:39:41.000,,,22281684P01*22357957P01*22362082P01*,13063430,0USA13_13_0415_2238A,VOLVO,0,4364,17,False,2,,1674,105427130,38.416481,-89.442638,2015-02-21 11:39:37.000


In [4]:
jfaults.shape

(1187335, 20)

In [5]:
jfaults.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1187335 entries, 0 to 1187334
Data columns (total 20 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   RecordID               1187335 non-null  int64  
 1   ESS_Id                 1187335 non-null  int64  
 2   EventTimeStamp         1187335 non-null  object 
 3   eventDescription       1126490 non-null  object 
 4   actionDescription      0 non-null        float64
 5   ecuSoftwareVersion     891285 non-null   object 
 6   ecuSerialNumber        844318 non-null   object 
 7   ecuModel               1122577 non-null  object 
 8   ecuMake                1122577 non-null  object 
 9   ecuSource              1187335 non-null  int64  
 10  spn                    1187335 non-null  int64  
 11  fmi                    1187335 non-null  int64  
 12  active                 1187335 non-null  bool   
 13  activeTransitionCount  1187335 non-null  int64  
 14  faultValue        

### DATA CLEAN-UP

In [6]:
jfaults['EquipmentID'] = jfaults['EquipmentID']\
                            .map(lambda x: x.lstrip('R')\
                                 .rstrip('aAbBcC'))

In [7]:
jfaults['EquipmentID'] = pd.to_numeric(jfaults['EquipmentID'])

In [8]:
# 2169 ROWS WITH EquipmentID OF LEN(6 +) 

# np.where(len(jfaults['EquipmentID']) > 5)
jfaults[jfaults['EquipmentID'] >= 100000]

Unnamed: 0,RecordID,ESS_Id,EventTimeStamp,eventDescription,actionDescription,ecuSoftwareVersion,ecuSerialNumber,ecuModel,ecuMake,ecuSource,spn,fmi,active,activeTransitionCount,faultValue,EquipmentID,MCTNumber,Latitude,Longitude,LocationTimeStamp
7069,7070,1157485,2015-03-03 09:18:42.000,,,unknown,unknown,unknown,unknown,11,0,0,True,127,,105438416,105438416,35.588148,-86.444027,2015-02-24 14:11:01.000
7070,7071,1157507,2015-03-03 09:19:43.000,Abnormal Update Rate Tire Location,,unknown,unknown,unknown,unknown,49,929,9,True,126,,105438416,105438416,35.588148,-86.444027,2015-02-24 14:11:01.000
59121,61191,2545755,2015-05-21 08:16:17.000,,,04993120*00027785*040213150018*07700044*I0*BBZ*,79464671,6X1u10D1500000000,CMMNS,0,4364,18,True,1,,105393153,105393153,33.340046,-87.019212,2015-05-21 08:19:05.000
59587,61657,2553312,2015-05-21 13:27:44.000,,,04993120*00027785*040213150018*07700044*I0*BBZ*,79464671,6X1u10D1500000000,CMMNS,0,4364,18,True,1,,105393153,105393153,33.339675,-86.925833,2015-05-21 13:27:40.000
60728,62798,2570947,2015-05-22 10:44:37.000,High (Severity Medium) Aftertreatment 1 Intake...,,04993120*00027785*040213150018*07700044*I0*BBZ*,79464671,6X1u10D1500000000,CMMNS,0,3216,16,True,1,,105393153,105393153,33.897731,-87.624629,2015-05-22 10:44:33.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1151754,1208978,107277367,2019-10-23 09:48:05.000,Abnormal Update Rate Engine Total Idle Fuel Used,,,,,,49,236,9,False,1,,105455678,105455678,35.588379,-86.443842,2019-10-23 09:48:00.000
1151759,1208983,107278317,2019-10-23 09:53:52.000,,,5516018*202.56.0*5516502*E003.e003*5539540*25....,Z0082771,EEO-xxF112C,EATON,3,8484,9,True,1,,105455678,105455678,35.588379,-86.443842,2019-10-23 09:54:29.000
1151763,1208987,107278881,2019-10-23 09:57:38.000,,,5516018*202.56.0*5516502*E003.e003*5539540*25....,Z0082771,EEO-xxF112C,EATON,3,8484,9,False,1,,105455678,105455678,35.588379,-86.443842,2019-10-23 09:57:33.000
1176114,1236255,118205869,2020-01-23 16:53:26.000,Low (Severity Medium) Engine Coolant Level,,04384413*22380869*082218154102*60701732*G1*BGT*,80156146,6X1u17D1500000000,CMMNS,0,111,18,False,4,,105411909,105411909,36.135000,-83.290138,2020-01-23 17:00:23.000


In [9]:
jfaults = jfaults[jfaults['EquipmentID'] < 100000]

In [10]:
jfaults.drop('actionDescription', 
             axis=1, 
             inplace=True)

In [11]:
jfaults.drop('faultValue', 
             axis=1, 
             inplace=True)

In [12]:
jfaults.head()

Unnamed: 0,RecordID,ESS_Id,EventTimeStamp,eventDescription,ecuSoftwareVersion,ecuSerialNumber,ecuModel,ecuMake,ecuSource,spn,fmi,active,activeTransitionCount,EquipmentID,MCTNumber,Latitude,Longitude,LocationTimeStamp
0,1,990349,2015-02-21 10:47:13.000,Low (Severity Low) Engine Coolant Level,unknown,unknown,unknown,unknown,0,111,17,True,2,1439,105354361,38.857638,-84.626851,2015-02-21 11:34:25.000
1,2,990360,2015-02-21 11:34:34.000,,unknown,unknown,unknown,unknown,11,629,12,True,127,1439,105354361,38.857638,-84.626851,2015-02-21 11:35:10.000
2,3,990364,2015-02-21 11:35:31.000,Incorrect Data Steering Wheel Angle,unknown,unknown,unknown,unknown,11,1807,2,False,127,1369,105336226,41.42125,-87.767361,2015-02-21 11:35:26.000
3,4,990370,2015-02-21 11:35:33.000,Incorrect Data Steering Wheel Angle,unknown,unknown,unknown,unknown,11,1807,2,True,127,1369,105336226,41.421018,-87.767361,2015-02-21 11:36:08.000
4,5,990416,2015-02-21 11:39:41.000,,22281684P01*22357957P01*22362082P01*,13063430,0USA13_13_0415_2238A,VOLVO,0,4364,17,False,2,1674,105427130,38.416481,-89.442638,2015-02-21 11:39:37.000


### SQLITE DATABASE CREATION AND QUERYING

In [17]:
# DATABASE CREATION ONLY NEEDS TO BE RUN ONE TIME

#jfaults_db = sqlite3.connect('../data/Big_G.sqlite')

#for chunk in tqdm(pd.read_csv('../data/J1939Faults.csv', 
 #                             chunksize = 10000)):
    # chunk.columns = [x.lower().replace(' ', '_') for x in chunk.columns]   
  #  chunk.to_sql('big_g', 
   #              jfaults_db, 
    #             if_exists = 'append', 
     #            index = False)

0it [00:00, ?it/s]

In [None]:
# db.execute('CREATE INDEX average_day_wait ON jfaults(...))')

In [13]:
# TO DO

jfaults_db = sqlite3.connect('../data/Big_G.sqlite')

jfaults_query = "SELECT * FROM big_g WHERE EquipmentID <= 5 LIMIT 100"

jfaults_sqlite = pd.read_sql(jfaults_query, jfaults_db)

jfaults_sqlite.head()

Unnamed: 0,RecordID,ESS_Id,EventTimeStamp,eventDescription,actionDescription,ecuSoftwareVersion,ecuSerialNumber,ecuModel,ecuMake,ecuSource,spn,fmi,active,activeTransitionCount,faultValue,EquipmentID,MCTNumber,Latitude,Longitude,LocationTimeStamp
0,1,990349,2015-02-21 10:47:13.000,Low (Severity Low) Engine Coolant Level,,unknown,unknown,unknown,unknown,0,111,17,1,2,,1439,105354361,38.857638,-84.626851,2015-02-21 11:34:25.000
1,2,990360,2015-02-21 11:34:34.000,,,unknown,unknown,unknown,unknown,11,629,12,1,127,,1439,105354361,38.857638,-84.626851,2015-02-21 11:35:10.000
2,3,990364,2015-02-21 11:35:31.000,Incorrect Data Steering Wheel Angle,,unknown,unknown,unknown,unknown,11,1807,2,0,127,,1369,105336226,41.42125,-87.767361,2015-02-21 11:35:26.000
3,4,990370,2015-02-21 11:35:33.000,Incorrect Data Steering Wheel Angle,,unknown,unknown,unknown,unknown,11,1807,2,1,127,,1369,105336226,41.421018,-87.767361,2015-02-21 11:36:08.000
4,5,990416,2015-02-21 11:39:41.000,,,22281684P01*22357957P01*22362082P01*,13063430,0USA13_13_0415_2238A,VOLVO,0,4364,17,0,2,,1674,105427130,38.416481,-89.442638,2015-02-21 11:39:37.000


In [23]:
# 603 ROWS active = TRUE FOR fault 5246 vs 1189 ROWS TOTAL
faults_5246_lat_long_query = """

    SELECT COUNT(*) 
    FROM big_g
    WHERE spn = '5246' 
        AND active = TRUE
        AND LENGTH(EquipmentID) <= 5 AND 
        (
            (
                latitude NOT BETWEEN (36.0666667 + 0.016667) AND (36.0666667 - 0.016667)
                AND
                longitude NOT BETWEEN (-86.4347222 + 0.016667) AND (-86.4347222 - 0.016667) 
            )
            AND
            (
                latitude NOT BETWEEN (35.5883333 + 0.016667) AND (35.5883333 - 0.016667)
                AND 
                longitude NOT BETWEEN (-86.4438888 + 0.016667) AND (-86.4438888 - 0.016667)
            )
            AND
            (
                latitude NOT BETWEEN (36.1950 + 0.016667) AND (36.1950 - 0.016667)
                AND 
                longitude NOT BETWEEN (-83.174722 + 0.016667) AND (-83.174722 - 0.016667)
            )
        )
    ;
    
"""

faults_5246_lat_long_sqlite = pd.read_sql(faults_5246_lat_long_query, jfaults_db)

faults_5246_lat_long_sqlite

Unnamed: 0,COUNT(*)
0,603


In [26]:
# 607,336 ROWS FOR active = TRUE vs 1,185,166 TOTAL
all_faults_lat_long_query = """

    SELECT * 
    FROM big_g
    WHERE LENGTH(EquipmentID) <= 5 
        AND active = TRUE
        AND 
        (
            (
                latitude NOT BETWEEN (36.0666667 + 0.016667) AND (36.0666667 - 0.016667)
                AND
                longitude NOT BETWEEN (-86.4347222 + 0.016667) AND (-86.4347222 - 0.016667) 
            )
            AND
            (
                latitude NOT BETWEEN (35.5883333 + 0.016667) AND (35.5883333 - 0.016667)
                AND 
                longitude NOT BETWEEN (-86.4438888 + 0.016667) AND (-86.4438888 - 0.016667)
            )
            AND
            (
                latitude NOT BETWEEN (36.1950 + 0.016667) AND (36.1950 - 0.016667)
                AND 
                longitude NOT BETWEEN (-83.174722 + 0.016667) AND (-83.174722 - 0.016667)
            )
        )
    ;
    
"""

all_faults_lat_long_sqlite = pd.read_sql(all_faults_lat_long_query, jfaults_db)

# all_faults_lat_long_sqlite

In [27]:
# TO DO - WE CAN OUTPUT A DATA SET TO A CSV ONCE WE DECIDE WHICH DATA SET

all_faults_lat_long_sqlite.to_csv('../data/all_faults_lat_long_sqlite.csv', sep = ',')

In [30]:
jfaults_db.close()

#### TO DO - CORRELATIONS, etc. (maybe separate notebook)

In [28]:
all_faults = pd.read_csv('../data/all_faults_lat_long_sqlite.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [29]:
all_faults.head()

Unnamed: 0.1,Unnamed: 0,RecordID,ESS_Id,EventTimeStamp,eventDescription,actionDescription,ecuSoftwareVersion,ecuSerialNumber,ecuModel,ecuMake,...,spn,fmi,active,activeTransitionCount,faultValue,EquipmentID,MCTNumber,Latitude,Longitude,LocationTimeStamp
0,0,1,990349,2015-02-21 10:47:13.000,Low (Severity Low) Engine Coolant Level,,unknown,unknown,unknown,unknown,...,111,17,1,2,,1439,105354361,38.857638,-84.626851,2015-02-21 11:34:25.000
1,1,2,990360,2015-02-21 11:34:34.000,,,unknown,unknown,unknown,unknown,...,629,12,1,127,,1439,105354361,38.857638,-84.626851,2015-02-21 11:35:10.000
2,2,4,990370,2015-02-21 11:35:33.000,Incorrect Data Steering Wheel Angle,,unknown,unknown,unknown,unknown,...,1807,2,1,127,,1369,105336226,41.421018,-87.767361,2015-02-21 11:36:08.000
3,3,6,990431,2015-02-21 11:40:22.000,Low (Severity Low) Engine Coolant Level,,04993120*00025921*082113134117*07700053*I0*BBZ*,79466580,6X1u10D1500000000,CMMNS,...,111,17,1,1,,1417,105438630,33.043564,-96.179722,2015-02-21 11:40:59.000
4,4,7,990439,2015-02-21 11:40:52.000,Low (Severity Low) Engine Coolant Level,,unknown,unknown,unknown,unknown,...,111,17,1,2,,1597,105344243,36.902916,-86.436481,2015-02-21 11:41:29.000


#### TO DO - GEOSPATIAL (maybe separate notebook)