# Table of Contents

### [Read Json From HDFS](#1.-Read-Json-From-HDFS)
### [Sensors DataFrame](#2.-Sensors-DataFrame)
### [Fails DataFrame](#3.-Fails-DataFrame)
### [Merge DataFrame Sensors and Fails](#4.-Merge-DataFrame-Sensors-and-Fails)
### [Parse time by minute](#5.-Parse-time-by-minute)
### [Parse time by hour](#6.-Parse-time-by-hour)
### [Create Hive Table](#7.-Create-Hive-Table)
### [H2O](#8.-H2O)
### [Auto-Arima](#9.-Auto-Arima)
### [Prophet](#10.-Prophet)

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import explode, min, max, col, round, hour, minute, count, dayofmonth, month, substring, avg
from pyspark.sql.types import IntegerType
from pyspark.sql.types import TimestampType
import pandas as pd
import numpy as np
import seaborn as sns

warehouse_location = 'hdfs://namenode:8020/warehouse'
spark = SparkSession \
    .builder \
    .appName("Python Spark SQL Hive integration example") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .enableHiveSupport() \
    .getOrCreate()

21/12/07 12:49:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


# 1. Read Json From HDFS
         hdfs://namenode:8020//raw/fails
         hdfs://namenode:8020//raw/sensors

In [3]:
df_falhas = spark.read.json('hdfs://namenode:8020//raw/fails/1692abe5-8bd0-4a3e-acad-af18231c4114', multiLine = True)

df_sensores = spark.read.json('hdfs://namenode:8020//raw/sensors/8eebb26d-b551-4088-8ba0-9a63adc343de', multiLine = True)


                                                                                

In [4]:
df_sensores = df_sensores.withColumn("results", explode("results")) \
                    .withColumn("sensor_created_at", col("results")["sensor_created_at"].cast("timestamp")) \
                    .withColumn("sensor_event_type", col("results")["sensor_event_type"].cast("string")) \
                    .withColumn("sensor_id_cycle", col("results")["sensor_id_cycle"].cast("integer")) \
                    .withColumn("sensor_ip", col("results")["sensor_ip"].cast("string")) \
                    .withColumn("sensor_unique_id", col("results")["sensor_unique_id"].cast("string")) \
                    .withColumn("sensor_value_humidity", col("results")["sensor_value_humidity"].cast("integer")) \
                    .withColumn("sensor_value_temperature_motor1", col("results")["sensor_value_temperature_motor1"].cast("integer")) \
                    .withColumn("sensor_value_temperature_motor2", col("results")["sensor_value_temperature_motor2"].cast("integer")) \
                    .withColumn("sensor_value_temperature_motor3", col("results")["sensor_value_temperature_motor3"].cast("integer")) \
                    .withColumn("sensor_value_vibrationhz_x", col("results")["sensor_value_vibrationhz_x"].cast("integer")) \
                    .withColumn("sensor_value_vibrationhz_y", col("results")["sensor_value_vibrationhz_y"].cast("integer")) \
                    .withColumn("sensor_value_vibrationhz_z", col("results")["sensor_value_vibrationhz_z"].cast("integer")) \
                    .withColumn("value_noise_dba_motor1", col("results")["value_noise_dba_motor1"].cast("integer")) \
                    .withColumn("value_noise_dba_motor2", col("results")["value_noise_dba_motor2"].cast("integer")) \
                    .withColumn("value_noise_dba_motor3", col("results")["value_noise_dba_motor3"].cast("integer")) \
                    .drop("results") \
                    .orderBy("sensor_created_at")
df_sensores2 = df_sensores.toPandas()

                                                                                

In [5]:
df_falhas = df_falhas.withColumn("results", explode("results")) \
                .withColumn("falha_created_at", col("results")["falha_created_at"].cast("timestamp")) \
                .withColumn("falha_id_falha", col("results")["falha_id_falha"].cast("string")) \
                .withColumn("falha_ip", col("results")["falha_ip"].cast("string")) \
                .withColumn("falha_hostname", col("results")["falha_hostname"].cast("string")) \
                .withColumn("falha_event_type", col("results")["falha_event_type"].cast("string")) \
                .withColumn("falha_tipo_falha", col("results")["falha_tipo_falha"].cast("string")) \
                .withColumn("falha_error_code", col("results")["falha_error_code"].cast("string")) \
                .withColumn("falha_error_description", col("results")["falha_error_description"].cast("string")) \
                .drop("results") \
                .orderBy("falha_created_at")
df_falhas2 = df_falhas.toPandas()

## 2. Sensors DataFrame

In [6]:
df_sensores2

Unnamed: 0,sensor_created_at,sensor_event_type,sensor_id_cycle,sensor_ip,sensor_unique_id,sensor_value_humidity,sensor_value_temperature_motor1,sensor_value_temperature_motor2,sensor_value_temperature_motor3,sensor_value_vibrationhz_x,sensor_value_vibrationhz_y,sensor_value_vibrationhz_z,value_noise_dba_motor1,value_noise_dba_motor2,value_noise_dba_motor3
0,2021-11-25 12:25:35.015,sensor,4,172.18.0.2,e269fcc6-7737-410a-9bad-938f4d926247,23,19,28,8,11,14,20,23,47,23
1,2021-11-25 12:25:40.016,sensor,1,172.18.0.2,e461209a-82fe-4ce9-9fb2-d7cebd3b2fa5,22,26,21,27,19,16,20,39,58,46
2,2021-11-25 12:25:45.017,sensor,1,172.18.0.2,bf19c9e6-638a-47b0-a499-33878176943b,81,7,30,29,10,16,15,34,63,39
3,2021-11-25 12:25:50.019,sensor,4,172.18.0.2,26e31171-15f2-4005-b654-54a2e12dc4dc,41,18,23,11,18,11,21,26,20,44
4,2021-11-25 12:25:55.020,sensor,4,172.18.0.2,027c718e-a87e-453a-b0fb-e884f280db4a,42,11,32,17,12,15,14,36,28,40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72235,2021-11-29 16:47:01.342,sensor,2,172.18.0.2,f087c835-37bc-4343-b647-118ab4849d0a,54,22,9,29,11,10,16,56,25,54
72236,2021-11-29 16:47:06.344,sensor,4,172.18.0.2,dc932a86-5e2b-49d3-a6bf-cd05b294216f,25,19,7,18,13,16,21,17,63,16
72237,2021-11-29 16:47:11.344,sensor,2,172.18.0.2,65f78734-bea7-465b-9523-99afab9dabd2,57,8,12,34,21,13,13,39,32,35
72238,2021-11-29 16:47:16.345,sensor,4,172.18.0.2,aadb443e-69fb-446e-8b4f-b3930f8bd762,46,6,31,18,19,19,12,59,51,64


## 3. Fails DataFrame

In [7]:
df_falhas2

Unnamed: 0,falha_created_at,falha_id_falha,falha_ip,falha_hostname,falha_event_type,falha_tipo_falha,falha_error_code,falha_error_description
0,2021-11-25 16:44:19.978,0c57eba9-1fb8-461a-af31-d0b33c77ee99,172.18.0.2,32e102a5111a,fail,falha_1,erro_500,Pequena falha
1,2021-11-25 16:44:25.021,65bd9df3-2310-4299-bba5-9185da1075a6,172.18.0.2,32e102a5111a,fail,falha_2,erro_500,Falha Geral
2,2021-11-25 16:44:34.911,1c691588-3c43-4e28-84fc-f2edaf98e85f,172.18.0.2,32e102a5111a,fail,falha_3,erro_500,Falha Crítica
3,2021-11-25 17:44:19.980,422e18aa-1b4a-4186-aabb-a022ac854caf,172.18.0.2,32e102a5111a,fail,falha_1,erro_500,Pequena falha
4,2021-11-25 18:44:19.980,8bf65258-ec7a-4b65-b552-a9cc396e17c3,172.18.0.2,32e102a5111a,fail,falha_1,erro_500,Pequena falha
...,...,...,...,...,...,...,...,...
157,2021-11-29 14:44:20.062,7a80527e-2893-4058-bcd3-a4de3fa64b98,172.18.0.2,32e102a5111a,fail,falha_1,erro_500,Pequena falha
158,2021-11-29 15:11:14.926,f685ab8a-9248-4d9c-8d39-3ceb78076448,172.18.0.2,32e102a5111a,fail,falha_3,erro_500,Falha Crítica
159,2021-11-29 15:44:20.063,e7fc28d7-9fd2-42d7-98b9-217bcfb63026,172.18.0.2,32e102a5111a,fail,falha_1,erro_500,Pequena falha
160,2021-11-29 16:34:25.062,d86c0489-e17e-47ba-beda-084e57df1c40,172.18.0.2,32e102a5111a,fail,falha_2,erro_500,Falha Geral


## Rename Columns

In [8]:
df_falhas2 = df_falhas2[["falha_created_at", "falha_tipo_falha", "falha_error_description"]]
df_falhas2.rename(columns = {'falha_created_at':'sensor_created_at'}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


## 4. Merge DataFrame Sensors and Fails

In [9]:
import pandas as pd
df_merged = pd.merge_asof(df_sensores2, df_falhas2, on="sensor_created_at", tolerance=pd.Timedelta("4s"))

In [10]:
df_merged["falha_error_description"].value_counts()

Pequena falha    75
Falha Geral      35
Falha Crítica    13
Name: falha_error_description, dtype: int64

   ## Replace Nulls by "sem_falha"

In [11]:
df_merged = df_merged.copy()
df_merged = df_merged.fillna('sem_falha')

## Merged Dataframe

In [12]:
df_merged

Unnamed: 0,sensor_created_at,sensor_event_type,sensor_id_cycle,sensor_ip,sensor_unique_id,sensor_value_humidity,sensor_value_temperature_motor1,sensor_value_temperature_motor2,sensor_value_temperature_motor3,sensor_value_vibrationhz_x,sensor_value_vibrationhz_y,sensor_value_vibrationhz_z,value_noise_dba_motor1,value_noise_dba_motor2,value_noise_dba_motor3,falha_tipo_falha,falha_error_description
0,2021-11-25 12:25:35.015,sensor,4,172.18.0.2,e269fcc6-7737-410a-9bad-938f4d926247,23,19,28,8,11,14,20,23,47,23,sem_falha,sem_falha
1,2021-11-25 12:25:40.016,sensor,1,172.18.0.2,e461209a-82fe-4ce9-9fb2-d7cebd3b2fa5,22,26,21,27,19,16,20,39,58,46,sem_falha,sem_falha
2,2021-11-25 12:25:45.017,sensor,1,172.18.0.2,bf19c9e6-638a-47b0-a499-33878176943b,81,7,30,29,10,16,15,34,63,39,sem_falha,sem_falha
3,2021-11-25 12:25:50.019,sensor,4,172.18.0.2,26e31171-15f2-4005-b654-54a2e12dc4dc,41,18,23,11,18,11,21,26,20,44,sem_falha,sem_falha
4,2021-11-25 12:25:55.020,sensor,4,172.18.0.2,027c718e-a87e-453a-b0fb-e884f280db4a,42,11,32,17,12,15,14,36,28,40,sem_falha,sem_falha
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72235,2021-11-29 16:47:01.342,sensor,2,172.18.0.2,f087c835-37bc-4343-b647-118ab4849d0a,54,22,9,29,11,10,16,56,25,54,sem_falha,sem_falha
72236,2021-11-29 16:47:06.344,sensor,4,172.18.0.2,dc932a86-5e2b-49d3-a6bf-cd05b294216f,25,19,7,18,13,16,21,17,63,16,sem_falha,sem_falha
72237,2021-11-29 16:47:11.344,sensor,2,172.18.0.2,65f78734-bea7-465b-9523-99afab9dabd2,57,8,12,34,21,13,13,39,32,35,sem_falha,sem_falha
72238,2021-11-29 16:47:16.345,sensor,4,172.18.0.2,aadb443e-69fb-446e-8b4f-b3930f8bd762,46,6,31,18,19,19,12,59,51,64,sem_falha,sem_falha


In [32]:
df_merged.to_csv('./provaconceito/csv/df_merged.csv', index=False)

## 5. Parse time by minute

In [13]:
dfsensoresmin = df_sensores.withColumn("sensor_dayOfMonth", dayofmonth("sensor_created_at")) \
             .withColumn("sensor_month", month("sensor_created_at")) \
             .withColumn("sensor_hour", substring("sensor_created_at", 11, 13)) \
             .withColumn("sensor_date", substring("sensor_created_at", 0, 10)) \
             .withColumn("sensor_datehhmm", substring("sensor_created_at", 0, 16).cast(TimestampType())) \
              .orderBy("sensor_datehhmm")

## Dataframe with sensor_datehhmm by minute

In [14]:
dfsensoresmin.toPandas()

                                                                                

Unnamed: 0,sensor_created_at,sensor_event_type,sensor_id_cycle,sensor_ip,sensor_unique_id,sensor_value_humidity,sensor_value_temperature_motor1,sensor_value_temperature_motor2,sensor_value_temperature_motor3,sensor_value_vibrationhz_x,sensor_value_vibrationhz_y,sensor_value_vibrationhz_z,value_noise_dba_motor1,value_noise_dba_motor2,value_noise_dba_motor3,sensor_dayOfMonth,sensor_month,sensor_hour,sensor_date,sensor_datehhmm
0,2021-11-25 12:25:40.016,sensor,1,172.18.0.2,e461209a-82fe-4ce9-9fb2-d7cebd3b2fa5,22,26,21,27,19,16,20,39,58,46,25,11,12:25:40.016,2021-11-25,2021-11-25 12:25:00
1,2021-11-25 12:25:35.015,sensor,4,172.18.0.2,e269fcc6-7737-410a-9bad-938f4d926247,23,19,28,8,11,14,20,23,47,23,25,11,12:25:35.015,2021-11-25,2021-11-25 12:25:00
2,2021-11-25 12:25:50.019,sensor,4,172.18.0.2,26e31171-15f2-4005-b654-54a2e12dc4dc,41,18,23,11,18,11,21,26,20,44,25,11,12:25:50.019,2021-11-25,2021-11-25 12:25:00
3,2021-11-25 12:25:55.020,sensor,4,172.18.0.2,027c718e-a87e-453a-b0fb-e884f280db4a,42,11,32,17,12,15,14,36,28,40,25,11,12:25:55.02,2021-11-25,2021-11-25 12:25:00
4,2021-11-25 12:25:45.017,sensor,1,172.18.0.2,bf19c9e6-638a-47b0-a499-33878176943b,81,7,30,29,10,16,15,34,63,39,25,11,12:25:45.017,2021-11-25,2021-11-25 12:25:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72235,2021-11-29 16:47:21.346,sensor,1,172.18.0.2,cb00939a-3bfa-4b00-8be4-1bd80f0611cc,42,23,22,21,10,20,17,28,42,20,29,11,16:47:21.346,2021-11-29,2021-11-29 16:47:00
72236,2021-11-29 16:47:01.342,sensor,2,172.18.0.2,f087c835-37bc-4343-b647-118ab4849d0a,54,22,9,29,11,10,16,56,25,54,29,11,16:47:01.342,2021-11-29,2021-11-29 16:47:00
72237,2021-11-29 16:47:11.344,sensor,2,172.18.0.2,65f78734-bea7-465b-9523-99afab9dabd2,57,8,12,34,21,13,13,39,32,35,29,11,16:47:11.344,2021-11-29,2021-11-29 16:47:00
72238,2021-11-29 16:47:06.344,sensor,4,172.18.0.2,dc932a86-5e2b-49d3-a6bf-cd05b294216f,25,19,7,18,13,16,21,17,63,16,29,11,16:47:06.344,2021-11-29,2021-11-29 16:47:00


In [15]:
dfsensoresmin

DataFrame[sensor_created_at: timestamp, sensor_event_type: string, sensor_id_cycle: int, sensor_ip: string, sensor_unique_id: string, sensor_value_humidity: int, sensor_value_temperature_motor1: int, sensor_value_temperature_motor2: int, sensor_value_temperature_motor3: int, sensor_value_vibrationhz_x: int, sensor_value_vibrationhz_y: int, sensor_value_vibrationhz_z: int, value_noise_dba_motor1: int, value_noise_dba_motor2: int, value_noise_dba_motor3: int, sensor_dayOfMonth: int, sensor_month: int, sensor_hour: string, sensor_date: string, sensor_datehhmm: timestamp]

In [16]:
dfsensoresmin.describe()

                                                                                

DataFrame[summary: string, sensor_event_type: string, sensor_id_cycle: string, sensor_ip: string, sensor_unique_id: string, sensor_value_humidity: string, sensor_value_temperature_motor1: string, sensor_value_temperature_motor2: string, sensor_value_temperature_motor3: string, sensor_value_vibrationhz_x: string, sensor_value_vibrationhz_y: string, sensor_value_vibrationhz_z: string, value_noise_dba_motor1: string, value_noise_dba_motor2: string, value_noise_dba_motor3: string, sensor_dayOfMonth: string, sensor_month: string, sensor_hour: string, sensor_date: string]

In [17]:
dfsensoresmin.dtypes

[('sensor_created_at', 'timestamp'),
 ('sensor_event_type', 'string'),
 ('sensor_id_cycle', 'int'),
 ('sensor_ip', 'string'),
 ('sensor_unique_id', 'string'),
 ('sensor_value_humidity', 'int'),
 ('sensor_value_temperature_motor1', 'int'),
 ('sensor_value_temperature_motor2', 'int'),
 ('sensor_value_temperature_motor3', 'int'),
 ('sensor_value_vibrationhz_x', 'int'),
 ('sensor_value_vibrationhz_y', 'int'),
 ('sensor_value_vibrationhz_z', 'int'),
 ('value_noise_dba_motor1', 'int'),
 ('value_noise_dba_motor2', 'int'),
 ('value_noise_dba_motor3', 'int'),
 ('sensor_dayOfMonth', 'int'),
 ('sensor_month', 'int'),
 ('sensor_hour', 'string'),
 ('sensor_date', 'string'),
 ('sensor_datehhmm', 'timestamp')]

## Create Agg DataFrame with Avg, Max, Min

In [18]:
colName = "sensor_value_humidity"

dfsensoresminagg = dfsensoresmin.groupBy("sensor_datehhmm", "sensor_ip") \
     .agg( \
     min("sensor_value_humidity").alias('min_humidity'), \
     max("sensor_value_humidity").alias('max_humidity'), \
     round(avg("sensor_value_humidity"), 2).alias('avg_humidity'), \
     min("sensor_value_temperature_motor1").alias('min_temp_motor1'), \
     max("sensor_value_temperature_motor1").alias('max_temp_motor1'), \
     round(avg("sensor_value_temperature_motor1"), 2).alias('avg_temp_motor1'), \
     min("sensor_value_temperature_motor2").alias('min_temp_motor2'), \
     max("sensor_value_temperature_motor2").alias('max_temp_motor2'), \
     round(avg("sensor_value_temperature_motor2"), 2).alias('avg_temp_motor2'), \
     min("sensor_value_temperature_motor3").alias('min_temp_motor3'), \
     max("sensor_value_temperature_motor3").alias('max_temp_motor3'), \
     round(avg("sensor_value_temperature_motor3"), 2).alias('avg_temp_motor3'), \
     min("sensor_value_vibrationhz_x").alias('min_vibrationhz_x'), \
     max("sensor_value_vibrationhz_x").alias('max_vibrationhz_x'), \
     round(avg("sensor_value_vibrationhz_x"), 2).alias('avg_vibrationhz_x'), \
     min("sensor_value_vibrationhz_y").alias('min_vibrationhz_y'), \
     max("sensor_value_vibrationhz_y").alias('max_vibrationhz_y'), \
     round(avg("sensor_value_vibrationhz_y"), 2).alias('avg_vibrationhz_y'), \
     min("sensor_value_vibrationhz_z").alias('min_vibrationhz_z'), \
     max("sensor_value_vibrationhz_z").alias('max_vibrationhz_z'), \
     round(avg("sensor_value_vibrationhz_z"), 2).alias('avg_vibrationhz_z'), \
     min("value_noise_dba_motor1").alias('min_noise_dba_motor1'), \
     max("value_noise_dba_motor1").alias('max_noise_dba_motor1'), \
     round(avg("value_noise_dba_motor1"), 2).alias('avg_noise_dba_motor1'), \
     min("value_noise_dba_motor2").alias('min_noise_dba_motor2'), \
     max("value_noise_dba_motor2").alias('max_noise_dba_motor2'), \
     round(avg("value_noise_dba_motor2"), 2).alias('avg_noise_dba_motor2'), \
     min("value_noise_dba_motor3").alias('min_noise_dba_motor3'), \
     max("value_noise_dba_motor3").alias('max_noise_dba_motor3'), \
     round(avg("value_noise_dba_motor3"), 2).alias('avg_noise_dba_motor3'), \
     count(colName).alias("temp_count_hour")) \
     .withColumn('temp_count_hour', col('temp_count_hour').cast('int'))  \
     .orderBy("sensor_datehhmm")               


In [19]:
dfsensoresminagg.toPandas()

21/12/07 12:50:51 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

Unnamed: 0,sensor_datehhmm,sensor_ip,min_humidity,max_humidity,avg_humidity,min_temp_motor1,max_temp_motor1,avg_temp_motor1,min_temp_motor2,max_temp_motor2,...,min_noise_dba_motor1,max_noise_dba_motor1,avg_noise_dba_motor1,min_noise_dba_motor2,max_noise_dba_motor2,avg_noise_dba_motor2,min_noise_dba_motor3,max_noise_dba_motor3,avg_noise_dba_motor3,temp_count_hour
0,2021-11-25 12:25:00,172.18.0.2,22,81,41.80,7,26,16.20,21,32,...,23,39,31.60,20,63,43.20,23,46,38.40,5
1,2021-11-25 12:26:00,172.18.0.2,18,84,56.58,7,28,15.75,6,32,...,19,57,39.67,19,63,41.42,16,48,32.58,12
2,2021-11-25 12:27:00,172.18.0.2,16,84,51.67,9,28,17.33,6,29,...,16,55,31.25,15,62,41.42,22,64,49.00,12
3,2021-11-25 12:28:00,172.18.0.2,15,82,42.92,5,28,20.25,8,31,...,18,64,45.17,21,52,39.42,21,64,45.17,12
4,2021-11-25 12:29:00,172.18.0.2,15,82,45.83,5,29,17.00,7,32,...,27,62,46.42,16,64,43.50,16,64,41.25,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6018,2021-11-29 16:43:00,172.18.0.2,15,83,52.67,5,29,19.42,10,30,...,18,63,43.50,20,57,33.17,19,62,36.08,12
6019,2021-11-29 16:44:00,172.18.0.2,18,63,40.67,7,29,20.17,5,32,...,15,64,39.67,15,60,28.75,25,59,43.92,12
6020,2021-11-29 16:45:00,172.18.0.2,17,76,40.17,6,27,20.25,6,32,...,17,52,33.75,24,62,43.33,16,64,37.58,12
6021,2021-11-29 16:46:00,172.18.0.2,15,78,52.75,7,28,19.83,5,30,...,23,57,41.83,20,54,40.67,15,63,36.83,12


In [20]:
dfsensoresminagg.printSchema()

root
 |-- sensor_datehhmm: timestamp (nullable = true)
 |-- sensor_ip: string (nullable = true)
 |-- min_humidity: integer (nullable = true)
 |-- max_humidity: integer (nullable = true)
 |-- avg_humidity: double (nullable = true)
 |-- min_temp_motor1: integer (nullable = true)
 |-- max_temp_motor1: integer (nullable = true)
 |-- avg_temp_motor1: double (nullable = true)
 |-- min_temp_motor2: integer (nullable = true)
 |-- max_temp_motor2: integer (nullable = true)
 |-- avg_temp_motor2: double (nullable = true)
 |-- min_temp_motor3: integer (nullable = true)
 |-- max_temp_motor3: integer (nullable = true)
 |-- avg_temp_motor3: double (nullable = true)
 |-- min_vibrationhz_x: integer (nullable = true)
 |-- max_vibrationhz_x: integer (nullable = true)
 |-- avg_vibrationhz_x: double (nullable = true)
 |-- min_vibrationhz_y: integer (nullable = true)
 |-- max_vibrationhz_y: integer (nullable = true)
 |-- avg_vibrationhz_y: double (nullable = true)
 |-- min_vibrationhz_z: integer (nullable =

# 6. Parse time by hour

In [21]:
dfsensoreshour = df_sensores.withColumn("sensor_dayOfMonth", dayofmonth("sensor_created_at")) \
             .withColumn("sensor_month", month("sensor_created_at")) \
             .withColumn("sensor_hour", substring("sensor_created_at", 11, 13)) \
             .withColumn("sensor_date", substring("sensor_created_at", 0, 10)) \
             .withColumn("sensor_datehhmm", substring("sensor_created_at", 0, 13).cast(TimestampType())) \
              .orderBy("sensor_datehhmm")

## Create Agg Dataframe with Avg, Max, Min by hour

In [22]:
colName = "sensor_value_humidity"
dfsensoreshouragg = dfsensoreshour.groupBy("sensor_datehhmm", "sensor_ip") \
     .agg( \
     min("sensor_value_humidity").alias('min_humidity'), \
     max("sensor_value_humidity").alias('max_humidity'), \
     round(avg("sensor_value_humidity"), 2).alias('avg_humidity'), \
     min("sensor_value_temperature_motor1").alias('min_temp_motor1'), \
     max("sensor_value_temperature_motor1").alias('max_temp_motor1'), \
     round(avg("sensor_value_temperature_motor1"), 2).alias('avg_temp_motor1'), \
     min("sensor_value_temperature_motor2").alias('min_temp_motor2'), \
     max("sensor_value_temperature_motor2").alias('max_temp_motor2'), \
     round(avg("sensor_value_temperature_motor2"), 2).alias('avg_temp_motor2'), \
     min("sensor_value_temperature_motor3").alias('min_temp_motor3'), \
     max("sensor_value_temperature_motor3").alias('max_temp_motor3'), \
     round(avg("sensor_value_temperature_motor3"), 2).alias('avg_temp_motor3'), \
     min("sensor_value_vibrationhz_x").alias('min_vibrationhz_x'), \
     max("sensor_value_vibrationhz_x").alias('max_vibrationhz_x'), \
     round(avg("sensor_value_vibrationhz_x"), 2).alias('avg_vibrationhz_x'), \
     min("sensor_value_vibrationhz_y").alias('min_vibrationhz_y'), \
     max("sensor_value_vibrationhz_y").alias('max_vibrationhz_y'), \
     round(avg("sensor_value_vibrationhz_y"), 2).alias('avg_vibrationhz_y'), \
     min("sensor_value_vibrationhz_z").alias('min_vibrationhz_z'), \
     max("sensor_value_vibrationhz_z").alias('max_vibrationhz_z'), \
     round(avg("sensor_value_vibrationhz_z"), 2).alias('avg_vibrationhz_z'), \
     min("value_noise_dba_motor1").alias('min_noise_dba_motor1'), \
     max("value_noise_dba_motor1").alias('max_noise_dba_motor1'), \
     round(avg("value_noise_dba_motor1"), 2).alias('avg_noise_dba_motor1'), \
     min("value_noise_dba_motor2").alias('min_noise_dba_motor2'), \
     max("value_noise_dba_motor2").alias('max_noise_dba_motor2'), \
     round(avg("value_noise_dba_motor2"), 2).alias('avg_noise_dba_motor2'), \
     min("value_noise_dba_motor3").alias('min_noise_dba_motor3'), \
     max("value_noise_dba_motor3").alias('max_noise_dba_motor3'), \
     round(avg("value_noise_dba_motor3"), 2).alias('avg_noise_dba_motor3'), \
     count(colName).alias("temp_count_hour")) \
     .orderBy("sensor_datehhmm")             

In [27]:
dfhouragg = dfsensoreshouragg.toPandas()

In [30]:
dfhouragg.to_csv('./provaconceito/csv/dfhouragg.csv', index=False)

In [24]:
dfsensoreshouragg.dtypes

[('sensor_datehhmm', 'timestamp'),
 ('sensor_ip', 'string'),
 ('min_humidity', 'int'),
 ('max_humidity', 'int'),
 ('avg_humidity', 'double'),
 ('min_temp_motor1', 'int'),
 ('max_temp_motor1', 'int'),
 ('avg_temp_motor1', 'double'),
 ('min_temp_motor2', 'int'),
 ('max_temp_motor2', 'int'),
 ('avg_temp_motor2', 'double'),
 ('min_temp_motor3', 'int'),
 ('max_temp_motor3', 'int'),
 ('avg_temp_motor3', 'double'),
 ('min_vibrationhz_x', 'int'),
 ('max_vibrationhz_x', 'int'),
 ('avg_vibrationhz_x', 'double'),
 ('min_vibrationhz_y', 'int'),
 ('max_vibrationhz_y', 'int'),
 ('avg_vibrationhz_y', 'double'),
 ('min_vibrationhz_z', 'int'),
 ('max_vibrationhz_z', 'int'),
 ('avg_vibrationhz_z', 'double'),
 ('min_noise_dba_motor1', 'int'),
 ('max_noise_dba_motor1', 'int'),
 ('avg_noise_dba_motor1', 'double'),
 ('min_noise_dba_motor2', 'int'),
 ('max_noise_dba_motor2', 'int'),
 ('avg_noise_dba_motor2', 'double'),
 ('min_noise_dba_motor3', 'int'),
 ('max_noise_dba_motor3', 'int'),
 ('avg_noise_dba_motor

# 7. Create Hive Table

In [None]:
spark.sql(
    """
    SHOW DATABASES
    """
).show()

In [None]:
spark.sql(
    """
    SHOW TABLES FROM sensores
    """
).show() 

In [None]:
spark.sql(
    """
    DROP TABLE IF EXISTS sensores.sensors
    """
)

In [None]:
spark.sql(
    """
    CREATE TABLE sensores.sensors (
        sensor_datehhmm TIMESTAMP,
        sensor_ip VARCHAR(255),
        min_humidity INT,
        max_humidity INT,
        avg_humidity DOUBLE,
        min_temp_motor1 INT,
        max_temp_motor1 INT,
        avg_temp_motor1 DOUBLE,
        min_temp_motor2 INT,
        max_temp_motor2 INT,
        avg_temp_motor2 DOUBLE,
        min_temp_motor3 INT,
        max_temp_motor3 INT,
        avg_temp_motor3 DOUBLE,
        min_vibrationhz_x INT,
        max_vibrationhz_x INT,
        avg_vibrationhz_x DOUBLE,
        min_vibrationhz_y INT,
        max_vibrationhz_y INT,
        avg_vibrationhz_y DOUBLE,
        min_vibrationhz_z INT,
        max_vibrationhz_z INT,
        avg_vibrationhz_z DOUBLE,
        min_noise_dba_motor1 INT,
        max_noise_dba_motor1 INT,
        avg_noise_dba_motor1 DOUBLE,
        min_noise_dba_motor2 INT,
        max_noise_dba_motor2 INT,
        avg_noise_dba_motor2 DOUBLE,
        min_noise_dba_motor3 INT,
        max_noise_dba_motor3 INT,
        avg_noise_dba_motor3 DOUBLE
     
    )
    STORED AS PARQUET
    PARTITIONED BY (
        temp_count_hour INT
    )
    LOCATION 'hdfs://namenode:8020/warehouse/sensores.db/sensors/'
    """
)

# tblproperties('skip.header.line.count'='1')
# can be used for csvs with header
# but spark sql cannot understand that at the moment, while reading the data using sql queries
# so just when creating hive tables backed up by csvs, avoid headers
# in the project we wont use hive text tables, so all good.

In [None]:
spark.sql(
    """
    SHOW TABLES FROM sensores
    """
).show()

In [None]:
spark.sql(
    """
    SELECT *
    FROM sensores.sensors
    """
).show()

In [None]:
spark.sql(
    """
    DESCRIBE FORMATTED sensores.sensors
    """
).toPandas()

In [None]:
dfsensoresminagg.repartition(1) \
    .write \
    .partitionBy("temp_count_hour") \
    .format("parquet") \
    .mode("overwrite") \
    .save("hdfs://namenode:8020/warehouse/sensores.db/sensors/")

In [None]:
spark.catalog.recoverPartitions("sensores.sensors")

spark.sql(
    """
    SELECT *
    FROM sensores.sensors
    """
).toPandas()

In [None]:
dfsensoresminagg.toPandas().to_csv('./df3.csv', index=False)

In [None]:
spark.sql(
    """
    SELECT *
    FROM sensores.sensors
    """
).toPandas()

# 8. H2O

![h2o](./images/h2o.png)

##### H2O is an in-memory platform for distributed, scalable machine learning. H2O uses familiar interfaces like R, Python, Scala, Java, JSON and the Flow notebook/web interface, and works seamlessly with big data technologies like Hadoop and Spark. H2O provides implementations of many popular algorithms such as Generalized Linear Models (GLM), Gradient Boosting Machines (including XGBoost), Random Forests, Deep Neural Networks, Stacked Ensembles, Naive Bayes, Generalized Additive Models (GAM), Cox Proportional Hazards, K-Means, PCA, Word2Vec, as well as a fully automatic machine learning algorithm (H2O AutoML).

In [None]:
pip install h2o==3.32.1.7

In [None]:
import h2o

In [None]:
h2o.init(ip="172.16.131.140", port="54321")

In [None]:
pip install cane

In [None]:
pip install mlflow

In [None]:
from funcs import functions as fc
import h2o
from h2o.automl import H2OAutoML, get_leaderboard
import mlflow
import mlflow.h2o
from mlflow.tracking import MlflowClient
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.metrics import mean_absolute_error as MAE

In [25]:
dfminagg = dfsensoresminagg.toPandas()

In [31]:
dfminagg.to_csv('./provaconceito/csv/dfminagg.csv', index=False)

In [None]:
import pandas as pd
from pandas.api.types import is_string_dtype, is_numeric_dtype
from typing import List
from statsmodels.stats.outliers_influence import variance_inflation_factor
import cane

def toInt(x):
    return int(x)

def calc_vif(X):
    # Calculating VIF
    vif = pd.DataFrame()
    vif["variables"] = X.columns
    vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    vif = vif.sort_values(["VIF"], ascending=False)
    return(vif)

def tratamento(dataframe: pd.DataFrame)-> pd.DataFrame: 

    for i in dataframe:
        if is_string_dtype(dataframe[i]):
            dataframe[i].fillna('Desconhecido', inplace=True)
        elif is_numeric_dtype(dataframe[i]): 
            dataframe[i].fillna(0, inplace=True)
        else: print(i)
    return dataframe

def object(dataframe: pd.DataFrame) -> List:

    col_obj = []
    for i in dataframe:
        if is_string_dtype(dataframe[i]):
            col_obj.append(i)
        else: print(i)
    return col_obj


def not_object(dataframe: pd.DataFrame) -> List:

    col_not_obj = []
    for i in dataframe:
        if is_string_dtype(dataframe[i]):
            print(i)
        else: col_not_obj.append(i)
    return col_not_obj

def idf_encode(train: pd.DataFrame, test: pd.DataFrame, list: List, target):

    _train = train.copy()
    _test = test.copy()

    dataIDF = cane.idf_multicolumn(_train, columns_use = list)  # aplication of specific multicolumn setting IDF

    idfDicionary = cane.idfDictionary(Original = _train, Transformed = dataIDF, columns_use = list
                                , targetColumn=target) #following the example above of the 2 columns
    for col in list:
        _test[col] = (
        _test[col]
        .map(idfDicionary[col])
        .fillna(max(idfDicionary[col].values()))
    )
        _train[col] = (
        _train[col]
        .map(idfDicionary[col])
        .fillna(max(idfDicionary[col].values()))
    )
    
    return _train, _test, idfDicionary

def NMAE(mae, y):
    ymax = y.max()
    ymin = y.min()
    dif = ymax-ymin
    nmae = mae/dif
    return nmae



In [None]:

target = 'avg_humidity'
#train, test = train_test_split(dfconceito, test_size=0.2, random_state=25)
train = dfminagg[0:5500]
test = dfminagg[5500:6022]
col_obj=object(dfminagg)

col_not_obj = not_object(dfminagg)
col_not_obj.remove('avg_humidity')

In [None]:
pip install boto3

In [None]:
import boto3
import os

In [None]:
os.environ["MLFLOW_TRACKING_URI"] = "http://mlflow:5000"
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "http://minio:9000"
os.environ["AWS_ACCESS_KEY_ID"] = "minio"
os.environ["AWS_SECRET_ACCESS_KEY"] = "minio123"

In [None]:
train_h2o=h2o.H2OFrame(train)
test_h2o=h2o.H2OFrame(test)

x = train_h2o.columns
y = target

experiment_name = 'provaconceito3'

client = MlflowClient()

try:
    experiment = mlflow.create_experiment(experiment_name)
except:
    experiment = client.get_experiment_by_name(experiment_name)
mlflow.set_experiment(experiment_name)


with mlflow.start_run():
    model = H2OAutoML(max_runtime_secs = 10, seed = 1, project_name = "provaconceito3", nfolds=10)
    model.train(x=x, y = y, training_frame = train_h2o)
   
    mlflow.log_param("runtime_sec", model.max_runtime_secs)
    mlflow.log_param("seed", model.seed)
    mlflow.log_param("nfolds", model.nfolds)
    mlflow.h2o.log_model(model.leader, "model")
    
    lb = model.leaderboard
    lb = get_leaderboard(model, extra_columns='ALL')
    print('')
    print(lb.head(rows=lb.nrows))
    model.leader.varimp_plot()
    pred = model.predict(test_h2o)
    print(pred.head())
    pred_df = h2o.as_list(pred)

    perf = model.leader.model_performance(test_h2o)
    print(perf)
    mae = MAE(test[y], pred_df)
    nmae = NMAE(mae, test[y])
    print('--------MAE-------', mae)
    print('--------NMAE-------', nmae)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("nmae", nmae)
    #exm = model.leader.explain_row(test_h2o, row_index=200)

#all_mlflow_runs = client.list_run_infos(experiment.experiment_id)
#if len(all_mlflow_runs) > 0:
#    run_info = all_mlflow_runs[-1]
#    model = mlflow.h2o.load_model("mlruns/{exp_id}/{run_id}/artifacts/model/".format(exp_id=experiment.experiment_id,run_id=run_info.run_uuid))
#    result = model.predict(test_h2o)
#else:
#    raise Exception('Run the training first')

In [None]:
pred_df = pred_df.reset_index(drop=True)

In [None]:
pred_df.index += 5500

In [None]:
pred_df

In [None]:
test

In [None]:
pip install plotly

In [None]:
import plotly.express as px
import matplotlib.pyplot as plt

In [None]:
plt.figure(figsize=(8,5))
plt.plot(train['avg_humidity'],label="Training")
plt.plot(test['avg_humidity'],label="Test")
plt.plot(pred_df,label="Predicted")
plt.xlim([5700,5900])
plt.show()


# 9. Auto-Arima

In [26]:
dfarima = dfminagg

In [27]:
dfarima

Unnamed: 0,sensor_datehhmm,sensor_ip,min_humidity,max_humidity,avg_humidity,min_temp_motor1,max_temp_motor1,avg_temp_motor1,min_temp_motor2,max_temp_motor2,...,min_noise_dba_motor1,max_noise_dba_motor1,avg_noise_dba_motor1,min_noise_dba_motor2,max_noise_dba_motor2,avg_noise_dba_motor2,min_noise_dba_motor3,max_noise_dba_motor3,avg_noise_dba_motor3,temp_count_hour
0,2021-11-25 12:25:00,172.18.0.2,22,81,41.80,7,26,16.20,21,32,...,23,39,31.60,20,63,43.20,23,46,38.40,5
1,2021-11-25 12:26:00,172.18.0.2,18,84,56.58,7,28,15.75,6,32,...,19,57,39.67,19,63,41.42,16,48,32.58,12
2,2021-11-25 12:27:00,172.18.0.2,16,84,51.67,9,28,17.33,6,29,...,16,55,31.25,15,62,41.42,22,64,49.00,12
3,2021-11-25 12:28:00,172.18.0.2,15,82,42.92,5,28,20.25,8,31,...,18,64,45.17,21,52,39.42,21,64,45.17,12
4,2021-11-25 12:29:00,172.18.0.2,15,82,45.83,5,29,17.00,7,32,...,27,62,46.42,16,64,43.50,16,64,41.25,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6018,2021-11-29 16:43:00,172.18.0.2,15,83,52.67,5,29,19.42,10,30,...,18,63,43.50,20,57,33.17,19,62,36.08,12
6019,2021-11-29 16:44:00,172.18.0.2,18,63,40.67,7,29,20.17,5,32,...,15,64,39.67,15,60,28.75,25,59,43.92,12
6020,2021-11-29 16:45:00,172.18.0.2,17,76,40.17,6,27,20.25,6,32,...,17,52,33.75,24,62,43.33,16,64,37.58,12
6021,2021-11-29 16:46:00,172.18.0.2,15,78,52.75,7,28,19.83,5,30,...,23,57,41.83,20,54,40.67,15,63,36.83,12


In [28]:
dfarimadrop = dfarima.drop(columns=['sensor_datehhmm', 'sensor_ip', 'min_humidity', 'avg_humidity', 'max_humidity', 'min_temp_motor1', 'max_temp_motor1', 'min_temp_motor2', 'max_temp_motor2', 'avg_temp_motor2', 'min_temp_motor3', 'max_temp_motor3', 'avg_temp_motor3', 'min_noise_dba_motor1', 'avg_noise_dba_motor1', 'max_noise_dba_motor1', 'min_noise_dba_motor2', 'avg_noise_dba_motor2', 'max_noise_dba_motor2', 'min_noise_dba_motor3', 'avg_noise_dba_motor3', 'max_noise_dba_motor3', 'temp_count_hour', 'min_vibrationhz_x', 'avg_vibrationhz_x', 'max_vibrationhz_x', 'min_vibrationhz_y', 'avg_vibrationhz_y', 'max_vibrationhz_y', 'min_vibrationhz_z', 'avg_vibrationhz_z', 'max_vibrationhz_z'])


In [29]:
dfarimadrop

Unnamed: 0,avg_temp_motor1
0,16.20
1,15.75
2,17.33
3,20.25
4,17.00
...,...
6018,19.42
6019,20.17
6020,20.25
6021,19.83


In [30]:
dfarimadrop.to_csv('./provaconceito/csv/dfarimadrop.csv', index=False)

In [None]:
pip install cane

# 10. Prophet


![prophet2](./images/prophet2.png)

#### Prophet is a procedure for forecasting time series data based on an additive model where non-linear trends are fit with yearly, weekly, and daily seasonality, plus holiday effects. It works best with time series that have strong seasonal effects and several seasons of historical data. Prophet is robust to missing data and shifts in the trend, and typically handles outliers well.

In [None]:
pip install pystan==2.19.1.1

In [None]:
pip install fbprophet

In [None]:
pip install prophet

In [31]:
dfprophet = dfminagg.drop(columns=['sensor_ip', 'min_humidity', 'avg_humidity', 'max_humidity', 'min_temp_motor1', 'max_temp_motor1', 'min_temp_motor2', 'max_temp_motor2', 'avg_temp_motor2', 'min_temp_motor3', 'max_temp_motor3', 'avg_temp_motor3', 'min_noise_dba_motor1', 'avg_noise_dba_motor1', 'max_noise_dba_motor1', 'min_noise_dba_motor2', 'avg_noise_dba_motor2', 'max_noise_dba_motor2', 'min_noise_dba_motor3', 'avg_noise_dba_motor3', 'max_noise_dba_motor3', 'temp_count_hour', 'min_vibrationhz_x', 'avg_vibrationhz_x', 'max_vibrationhz_x', 'min_vibrationhz_y', 'avg_vibrationhz_y', 'max_vibrationhz_y', 'min_vibrationhz_z', 'avg_vibrationhz_z', 'max_vibrationhz_z'])

In [32]:
dfprophet

Unnamed: 0,sensor_datehhmm,avg_temp_motor1
0,2021-11-25 12:25:00,16.20
1,2021-11-25 12:26:00,15.75
2,2021-11-25 12:27:00,17.33
3,2021-11-25 12:28:00,20.25
4,2021-11-25 12:29:00,17.00
...,...,...
6018,2021-11-29 16:43:00,19.42
6019,2021-11-29 16:44:00,20.17
6020,2021-11-29 16:45:00,20.25
6021,2021-11-29 16:46:00,19.83


In [33]:
dfprophet.to_csv('./provaconceito/csv/dfprophet.csv', index=False)

In [None]:
dfprophet2.info()

In [None]:
dfResults = runprophet(dfprophet2,iterations)