In [1]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Print the kernel's working directory; the relative "Data/..." paths used to load the CSVs resolve against it.
!pwd

/Users/manya/QuadSci


In [2]:
### DATA DESCRIPTION

# Telemetry Time Series Data (PdM_telemetry.csv): Hourly averages of voltage, rotation, pressure, and vibration
# collected from 100 machines during 2015.

# Errors (PdM_errors.csv): Errors encountered by the machines while in operation.
# Since these errors do not shut the machines down, they are not considered failures.
# Error dates and times are rounded to the closest hour because the telemetry data is collected at an hourly rate.

# Maintenance (PdM_maint.csv): Each record captures the replacement of a machine component.
# Components are replaced in two situations: 1. During a regular scheduled visit, the technician replaces the component (Proactive Maintenance).
# 2. A component breaks down and the technician performs unscheduled maintenance to replace it (Reactive Maintenance).
# The second case is considered a failure, and the corresponding data is also captured under Failures. Maintenance data contains both 2014 and 2015 records.
# Timestamps are rounded to the closest hour because the telemetry data is collected at an hourly rate.

# Failures (PdM_failures.csv): Each record represents the replacement of a component due to failure.
# This data is a subset of the Maintenance data. Timestamps are rounded to the closest hour because the telemetry data is collected at an hourly rate.

# Metadata of Machines (PdM_machines.csv): Model type and age of each machine.

In [32]:
# Load the five predictive-maintenance CSVs from the local "Data" folder.
# The mapping keeps one short label per source file; reading through it builds
# `datasets` in the same order as the files are listed.
csv_paths = {
    "telemetry": "Data/PdM_telemetry.csv",
    "errors": "Data/PdM_errors.csv",
    "maintenance": "Data/PdM_maint.csv",
    "failures": "Data/PdM_failures.csv",
    "metadata": "Data/PdM_machines.csv",
}
datasets = {label: pd.read_csv(path) for label, path in csv_paths.items()}

# Individual handles onto the same (unmodified) frames for use in later cells.
telemetry_raw = datasets["telemetry"]
errors_raw = datasets["errors"]
maintenance_raw = datasets["maintenance"]
failures_raw = datasets["failures"]
metadata_raw = datasets["metadata"]

Exploratory Data Analysis

In [39]:
# Report the (rows, columns) shape of every loaded dataset.
for label in datasets:
    print(f"Size of {label} dataset:", datasets[label].shape)

Size of telemetry dataset: (876100, 6)
Size of errors dataset: (3919, 3)
Size of maintenance dataset: (3286, 3)
Size of failures dataset: (761, 3)
Size of metadata dataset: (100, 3)


In [5]:
# Preview the first five telemetry rows (hourly sensor readings per machine).
telemetry_raw.head(n=5)

Unnamed: 0,datetime,machineID,volt,rotate,pressure,vibration
0,2015-01-01 06:00:00,1,176.217853,418.504078,113.077935,45.087686
1,2015-01-01 07:00:00,1,162.879223,402.74749,95.460525,43.413973
2,2015-01-01 08:00:00,1,170.989902,527.349825,75.237905,34.178847
3,2015-01-01 09:00:00,1,162.462833,346.149335,109.248561,41.122144
4,2015-01-01 10:00:00,1,157.610021,435.376873,111.886648,25.990511


In [41]:
# Preview the first five machine-metadata rows (model and age per machineID).
metadata_raw.head(n=5)

Unnamed: 0,machineID,model,age
0,1,model3,18
1,2,model4,7
2,3,model3,8
3,4,model3,7
4,5,model3,2


In [50]:
# Attach each machine's static metadata (model, age) to its telemetry rows.
# A left join keeps exactly the telemetry rows, in their original order; an
# outer join would also append all-NaN telemetry rows for any machine that
# appears only in the metadata. validate="many_to_one" raises MergeError if a
# machineID is duplicated in the metadata, which would otherwise silently
# multiply telemetry rows.
feature_df = pd.merge(
    telemetry_raw,
    metadata_raw,
    on="machineID",
    how="left",
    validate="many_to_one",
)
# The join must not change the number of telemetry rows.
print(feature_df.shape[0] == telemetry_raw.shape[0])
feature_df.head()

True


Unnamed: 0,datetime,machineID,volt,rotate,pressure,vibration,model,age
0,2015-01-01 06:00:00,1,176.217853,418.504078,113.077935,45.087686,model3,18
1,2015-01-01 07:00:00,1,162.879223,402.74749,95.460525,43.413973,model3,18
2,2015-01-01 08:00:00,1,170.989902,527.349825,75.237905,34.178847,model3,18
3,2015-01-01 09:00:00,1,162.462833,346.149335,109.248561,41.122144,model3,18
4,2015-01-01 10:00:00,1,157.610021,435.376873,111.886648,25.990511,model3,18


In [51]:
# Preview the first five error records (timestamp, machine, error code).
errors_raw.head(n=5)

Unnamed: 0,datetime,machineID,errorID
0,2015-01-03 07:00:00,1,error1
1,2015-01-03 20:00:00,1,error3
2,2015-01-04 06:00:00,1,error5
3,2015-01-10 15:00:00,1,error4
4,2015-01-22 10:00:00,1,error4


In [21]:
# Preview the first five failure records (timestamp, machine, failed component).
failures_raw.head(n=5)

Unnamed: 0,datetime,machineID,failure
0,2015-01-05 06:00:00,1,comp4
1,2015-03-06 06:00:00,1,comp1
2,2015-04-20 06:00:00,1,comp2
3,2015-06-19 06:00:00,1,comp4
4,2015-09-02 06:00:00,1,comp4


In [22]:
# Extract the time-of-day part of each failure timestamp ("YYYY-MM-DD HH:MM:SS").
# The vectorised .str accessor replaces the per-row lambda and yields NaN,
# rather than raising, for a missing or malformed timestamp.
failures_raw["time"] = failures_raw["datetime"].str.split(" ").str[1]
failures_raw["time"].unique()

# In the recorded run, every failure is logged at either 06:00 or 03:00.

array(['06:00:00', '03:00:00'], dtype=object)

In [52]:
# Preview the first five maintenance records (timestamp, machine, replaced component).
maintenance_raw.head(n=5)

Unnamed: 0,datetime,machineID,comp
0,2014-06-01 06:00:00,1,comp2
1,2014-07-16 06:00:00,1,comp4
2,2014-07-31 06:00:00,1,comp3
3,2014-12-13 06:00:00,1,comp1
4,2015-01-05 06:00:00,1,comp4


In [53]:
# Extract the time-of-day part of each maintenance timestamp ("YYYY-MM-DD HH:MM:SS").
# The vectorised .str accessor replaces the per-row lambda and yields NaN,
# rather than raising, for a missing or malformed timestamp.
maintenance_raw["time"] = maintenance_raw["datetime"].str.split(" ").str[1]
maintenance_raw["time"].unique()

# In the recorded run, every maintenance record is stamped 06:00 (no other time appears).

array(['06:00:00'], dtype=object)

In [57]:
# Check whether every failure record has a matching maintenance record
# (same timestamp, machine and component).
#
# DataFrame.isin(other_frame) compares cell-by-cell on matching index AND
# column labels, so it is not a row-membership test. The component column is
# named "failure" here and "comp" in the maintenance log, so that comparison
# could never come out all True.
failure_keys = (
    failures_raw[["datetime", "machineID", "failure"]]
    .rename(columns={"failure": "comp"})
)
maintenance_keys = maintenance_raw[["datetime", "machineID", "comp"]].drop_duplicates()

# Left join with an indicator column: "both" marks failures that were found in
# the maintenance log, "left_only" marks failures with no counterpart.
failure_match = failure_keys.merge(
    maintenance_keys,
    on=["datetime", "machineID", "comp"],
    how="left",
    indicator=True,
)
unmatched_failures = failure_match[failure_match["_merge"] == "left_only"]

# NOTE(review): the recorded outputs show some failures stamped 03:00:00 while
# every maintenance record is stamped 06:00:00, so an exact-timestamp match may
# legitimately report False; inspect `unmatched_failures` to see which rows differ.
is_subset = unmatched_failures.empty
print(is_subset)

False
