In [1]:
!pip install py4j


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
# from multiprocessing.reduction import duplicate
from babel.util import distinct
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, isnan, when, isnull, sum as spark_sum, countDistinct
from pyspark.sql.types import NumericType, TimestampType, DateType
from pyspark.sql.functions import col
from pyspark.sql.functions import col, min, max, count, lag, datediff
from pyspark.sql.window import Window
import pyspark.sql.functions as F

In [3]:
# Initialize Spark Session
spark = SparkSession.builder.appName("Predictive Maintenance System").getOrCreate()

# Directory holding the generated CSV files. It is defined once so the notebook
# can be pointed at another checkout by editing a single line.
DATA_DIR = "/home/beboy/Documents/projects/scale/Predictive-Maintenance-System-using-Apache-Spark/Data Processing & Analysis/Data Generator"


def _read_csv(file_name):
    """Read ``file_name`` from DATA_DIR using its header row and an inferred schema."""
    return spark.read.csv(f"{DATA_DIR}/{file_name}", header=True, inferSchema=True)


# Read datasets with inferred schema
sensor_df = _read_csv("sensor_data.csv")
maintenance_df = _read_csv("maintenance_logs.csv")
equipment_df = _read_csv("equipment_specs.csv")
operational_df = _read_csv("operational_data.csv")

# Display the inferred schema and first few rows of each dataset
datasets = (
    ("Sensor Data", sensor_df),
    ("Maintenance Logs", maintenance_df),
    ("Equipment Specifications", equipment_df),
    ("Operational Data", operational_df),
)
for position, (label, frame) in enumerate(datasets):
    # Every report after the first one is preceded by a blank line.
    separator = "\n" if position else ""
    print(f"{separator}{label} Schema:")
    frame.printSchema()
    print(f"{label} Sample:")
    frame.show(5)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/21 15:12:54 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Sensor Data Schema:
root
 |-- equipment_id: integer (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- temperature: double (nullable = true)
 |-- vibration: double (nullable = true)
 |-- pressure: double (nullable = true)
 |-- rotational_speed: double (nullable = true)
 |-- power_output: double (nullable = true)
 |-- noise_level: double (nullable = true)
 |-- voltage: double (nullable = true)
 |-- current: double (nullable = true)
 |-- oil_viscosity: double (nullable = true)

Sensor Data Sample:
+------------+--------------------+-----------+----------+---------+----------------+------------+-----------+---------+---------+-------------+
|equipment_id|           timestamp|temperature| vibration| pressure|rotational_speed|power_output|noise_level|  voltage|  current|oil_viscosity|
+------------+--------------------+-----------+----------+---------+----------------+------------+-----------+---------+---------+-------------+
|           1|2024-05-21 14:54:...|   62.29391|0

# Data Validation & Quality Assessment

In [4]:
"""General steps apply to all datasets
1. Check for null values in each column
2. Identify duplicate rows
3. Validate data types for each column
4. Check for consistency in equipment_id across all datasets"""

def general_data_validation(df, dataset_name):
    """Print a data-quality report for one dataset and return it de-duplicated.

    The report covers per-column missing-value counts, the number of fully
    duplicated rows, the schema, and the number of distinct ``equipment_id``
    values (when that column exists).

    Args:
        df: Spark DataFrame to inspect.
        dataset_name: Human-readable label used in the printed report.

    Returns:
        ``df.distinct()``, i.e. the input with exact duplicate rows removed, so
        that callers binding the result (e.g. ``sensor_df_clean = ...``) get a
        DataFrame rather than ``None``.
    """
    print(f"General Data Validation for {dataset_name}")

    # 1. Check for null values in each column
    print("1. Null values count for each column:")
    null_counts = []
    for column in df.columns:
        missing = col(column).isNull()
        # NaN is only tested on numeric columns; timestamp, date, string and
        # any other type are checked for NULL only.
        if isinstance(df.schema[column].dataType, NumericType):
            missing = missing | isnan(col(column))
        # when() without otherwise() yields NULL for non-matching rows and
        # count() skips NULLs, so this counts exactly the missing rows.
        null_counts.append(count(when(missing, column)).alias(column))

    # Use select to apply the counting operation
    null_df = df.select(null_counts)

    print("Null value counts for each column:")
    null_df.show()

    # 2. Identify duplicate rows; the de-duplicated frame is also the return value
    deduplicated_df = df.distinct()
    total_rows = df.count()
    distinct_rows = deduplicated_df.count()
    duplicate_rows = total_rows - distinct_rows
    print("2. Duplicate rows:")
    print(f"   Total rows: {total_rows}")
    print(f"   Distinct rows: {distinct_rows}")
    print(f"   Duplicate rows: {duplicate_rows}")

    # 3. Validate data types for each column
    print("3. Data types for each column:")
    df.printSchema()

    # 4. Count the distinct equipment_id values of this dataset
    if 'equipment_id' in df.columns:
        unique_equipment_ids = df.select('equipment_id').distinct().count()
        print(f"4. Number of unique equipment ids in {dataset_name}: {unique_equipment_ids}")
    else:
        print(f"4. 'equipment_id' column is missing in {dataset_name}")
    print(f"End of General Data Validation for {dataset_name} \n")

    return deduplicated_df
    
def check_equipment_id_consistency(sensor_df, maintenance_df, equipment_df, operational_df):
    """Report the equipment_id values that are absent from at least one dataset.

    Collects the distinct ids of each of the four inputs, prints the size of
    their union, shows the ids that are missing from one or more inputs and
    prints how many such ids there are. Nothing is returned.
    """
    print("Cross-dataset equipment_id consistency check")

    # Distinct ids per dataset, in the order: sensor, maintenance, equipment, operational
    ids_per_dataset = [
        frame.select('equipment_id').distinct()
        for frame in (sensor_df, maintenance_df, equipment_df, operational_df)
    ]

    # Every id seen in any dataset
    combined = ids_per_dataset[0]
    for ids in ids_per_dataset[1:]:
        combined = combined.union(ids)
    every_id = combined.distinct()

    print(f"Total unique equipment_ids across all datasets: {every_id.count()}")

    print("equipment_ids not present in all datasets:")
    # A left_anti join keeps the rows of every_id that have no match in ids,
    # i.e. the ids that this particular dataset lacks.
    absent = None
    for ids in ids_per_dataset:
        gap = every_id.join(ids, on='equipment_id', how='left_anti')
        absent = gap if absent is None else absent.union(gap)
    absent = absent.distinct()

    absent.show()
    print(f"Number of equipment_ids not consistently present: {absent.count()}")
    print("End of Cross-dataset equipment_id Consistency Check")
    

In [5]:
# Print the validation report for each of the four datasets.
# NOTE(review): each *_clean name is bound to whatever general_data_validation
# returns -- confirm that it is a DataFrame before relying on these downstream.
sensor_df_clean = general_data_validation(sensor_df, "Sensor Data")
maintenance_df_clean = general_data_validation(maintenance_df, "Maintenance Logs")
equipment_df_clean = general_data_validation(equipment_df, "Equipment Specifications")
operational_df_clean = general_data_validation(operational_df, "Operational Data")
# Compare equipment_id coverage across the four raw datasets.
check_equipment_id_consistency(sensor_df, maintenance_df, equipment_df, operational_df)

General Data Validation for Sensor Data
1. Null values count for each column:
Null value counts for each column:
+------------+---------+-----------+---------+--------+----------------+------------+-----------+-------+-------+-------------+
|equipment_id|timestamp|temperature|vibration|pressure|rotational_speed|power_output|noise_level|voltage|current|oil_viscosity|
+------------+---------+-----------+---------+--------+----------------+------------+-----------+-------+-------+-------------+
|           0|        0|          0|        0|       0|               0|           0|          0|      0|      0|            0|
+------------+---------+-----------+---------+--------+----------------+------------+-----------+-------+-------+-------------+

2. Duplicate rows:
   Total rows: 73000
   Distinct rows: 73000
   Duplicate rows: 0
3. Data types for each column:
root
 |-- equipment_id: integer (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- temperature: double (nullable =

25/05/21 15:13:08 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
