In [42]:
# List the streaming queries that are currently active on this Spark session.
spark.streams.active

[<pyspark.sql.streaming.StreamingQuery at 0x7fe8ea7a3d68>]

In [1]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [2]:
# Schema of the parquet input: a time window struct, a subject and a count.
# Every field is nullable (the StructField default).
userSchema = StructType([
    StructField("window", StructType([
        StructField("start", TimestampType()),
        StructField("end", TimestampType()),
    ])),
    StructField("subject", StringType()),
    StructField("count", LongType()),
])

In [11]:
# Open the parquet directory as a streaming source using the explicit schema
# (streaming file sources need the schema up front).
raw_data = (
    spark.readStream
    .format("parquet")
    .schema(userSchema)
    .load("/cms/users/carizapo/ming/groupdata_cmsweb_logs")
)
raw_data.printSchema()

root
 |-- window: struct (nullable = true)
 |    |-- start: timestamp (nullable = true)
 |    |-- end: timestamp (nullable = true)
 |-- subject: string (nullable = true)
 |-- count: long (nullable = true)



In [181]:
# Streaming view shaped for the later joins on ["system", "date", "count"]:
#   * `subject` is renamed to `system`;
#   * null systems become 'unknown', mirroring the fillna applied to the static
#     `alerts` frame -- an inner equi-join never matches null keys, so without
#     this the stream side could not match any row labelled 'unknown';
#   * `date` is the start of the aggregation window, and the `window` struct
#     itself is dropped.
# The former `.drop("subject")` was a no-op (the column had just been renamed)
# and the dangling trailing backslash has been removed.
raw_data_match = (
    raw_data
    .withColumnRenamed("subject", "system")
    .fillna({"system": "unknown"})
    .withColumn("date", col("window.start"))
    .drop(col("window"))
)

In [None]:
# Most recent progress report of the streaming query.
# `lastProgress` is a property on PySpark's StreamingQuery (it yields a dict, or
# None before the first update), so it must not be called like a method.
raw_data_flow.lastProgress

In [14]:
# Start a streaming query that appends the raw stream into the in-memory
# table "hdfs", which is queried later through spark.sql.
raw_data_flow = (
    raw_data.writeStream
    .queryName("hdfs")
    .outputMode("Append")
    .format("memory")
    .start()
)

In [12]:
# Stop the streaming query that feeds the in-memory "hdfs" table.
raw_data_flow.stop()

In [157]:
# Read whatever the memory sink has accumulated so far in the "hdfs" table
# as a regular (non-streaming) DataFrame and preview it.
alerts = spark.sql("select * from hdfs")
alerts.show()

+--------------------+-------+-----+
|              window|subject|count|
+--------------------+-------+-----+
|[2019-06-28 16:14...|   null|  101|
|[2019-06-29 19:59...|   null|    2|
|[2019-06-28 12:54...|   null|    1|
|[2019-07-01 07:45...|   null|    2|
|[2019-06-28 12:08...|   null| 4292|
|[2019-07-01 03:43...|   null| 8435|
|[2019-06-28 12:57...|   null|    1|
|[2019-06-30 21:38...|   null|    1|
|[2019-06-29 13:42...|   null|    1|
|[2019-07-01 04:39...|   null|  662|
|[2019-06-29 20:53...|   null|    1|
|[2019-06-29 17:29...|   null|   25|
|[2019-06-30 16:13...|   null|    1|
|[2019-06-29 09:23...|   null|    4|
|[2019-06-28 12:58...|   null|    1|
|[2019-06-30 17:13...|   null|    9|
|[2019-06-29 05:14...|   null|    1|
|[2019-06-30 00:59...|   null|   37|
|[2019-06-30 10:38...|   null|    1|
|[2019-06-29 08:14...|   null|    5|
+--------------------+-------+-----+
only showing top 20 rows



In [158]:
from pyspark.sql.window import Window as W

# Replace missing subjects with the placeholder label 'unknown'.
alerts = alerts.fillna({'subject': 'unknown'})

# Lookup table: one row per distinct subject, numbered 1..N in alphabetical
# order, with the subject column exposed under the name `system`.
windowSpec = W.orderBy("subject")
raw_data_id = (
    alerts.select("subject")
    .distinct()
    .withColumn("id", row_number().over(windowSpec))
    .withColumnRenamed("subject", "system")
)

In [159]:
# Preview the null-filled alerts and the system -> id lookup table.
alerts.show()
raw_data_id.show()

+--------------------+-------+-----+
|              window|subject|count|
+--------------------+-------+-----+
|[2019-06-28 16:14...|unknown|  101|
|[2019-06-29 19:59...|unknown|    2|
|[2019-06-28 12:54...|unknown|    1|
|[2019-07-01 07:45...|unknown|    2|
|[2019-06-28 12:08...|unknown| 4292|
|[2019-07-01 03:43...|unknown| 8435|
|[2019-06-28 12:57...|unknown|    1|
|[2019-06-30 21:38...|unknown|    1|
|[2019-06-29 13:42...|unknown|    1|
|[2019-07-01 04:39...|unknown|  662|
|[2019-06-29 20:53...|unknown|    1|
|[2019-06-29 17:29...|unknown|   25|
|[2019-06-30 16:13...|unknown|    1|
|[2019-06-29 09:23...|unknown|    4|
|[2019-06-28 12:58...|unknown|    1|
|[2019-06-30 17:13...|unknown|    9|
|[2019-06-29 05:14...|unknown|    1|
|[2019-06-30 00:59...|unknown|   37|
|[2019-06-30 10:38...|unknown|    1|
|[2019-06-29 08:14...|unknown|    5|
+--------------------+-------+-----+
only showing top 20 rows

+-------+---+
| system| id|
+-------+---+
|unknown|  1|
+-------+---+



In [164]:
# Attach the numeric system id to every alert, expose the window start as
# `date`, and drop the now redundant `window` and `subject` columns.
same_system = raw_data_id.system == alerts.subject
alerts_df = (
    alerts.join(raw_data_id, same_system)
    .withColumn('date', col("window.start"))
    .drop(col("window"))
    .drop(alerts.subject)
)

In [165]:
# Preview the joined frame (count, system, id, date).
alerts_df.show()

+-----+-------+---+-------------------+
|count| system| id|               date|
+-----+-------+---+-------------------+
|  101|unknown|  1|2019-06-28 16:14:00|
|    2|unknown|  1|2019-06-29 19:59:00|
|    1|unknown|  1|2019-06-28 12:54:00|
|    2|unknown|  1|2019-07-01 07:45:00|
| 4292|unknown|  1|2019-06-28 12:08:00|
| 8435|unknown|  1|2019-07-01 03:43:00|
|    1|unknown|  1|2019-06-28 12:57:00|
|    1|unknown|  1|2019-06-30 21:38:00|
|    1|unknown|  1|2019-06-29 13:42:00|
|  662|unknown|  1|2019-07-01 04:39:00|
|    1|unknown|  1|2019-06-29 20:53:00|
|   25|unknown|  1|2019-06-29 17:29:00|
|    1|unknown|  1|2019-06-30 16:13:00|
|    4|unknown|  1|2019-06-29 09:23:00|
|    1|unknown|  1|2019-06-28 12:58:00|
|    9|unknown|  1|2019-06-30 17:13:00|
|    1|unknown|  1|2019-06-29 05:14:00|
|   37|unknown|  1|2019-06-30 00:59:00|
|    1|unknown|  1|2019-06-30 10:38:00|
|    5|unknown|  1|2019-06-29 08:14:00|
+-----+-------+---+-------------------+
only showing top 20 rows



In [None]:
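# One-time setup, kept commented out: clone the helper notebooks that the
# `%run StoreItemDemand/...` cells in this notebook execute.
# !git clone https://github.com/dimitreOliveira/StoreItemDemand/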

In [None]:
# Install/upgrade dist-keras into the user site-packages.
# NOTE(review): the version is not pinned, and no visible cell imports
# dist-keras -- confirm it is still needed and pin it for reproducibility.
!pip install --user --upgrade dist-keras

In [18]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import MinMaxScaler

In [2]:
# Execute the helper notebook in this kernel's namespace.
# Presumably this is where DateConverter, DayExtractor, MonthExtractor,
# YearExtractor, WeekDayExtractor and SerieMaker come from -- TODO confirm.
%run StoreItemDemand/custom_transformers.ipynb

In [166]:
# Random 80/20 train/test split with a fixed seed.
train_data, test_data = alerts_df.randomSplit(weights=[0.8, 0.2], seed=1234)

In [167]:
# Tag each split with a `set` marker (0 = train, 1 = test) and overwrite the
# train `id` and the test `count` with the placeholder -1.
# NOTE(review): this replaces the real ids of the train rows and the real
# counts of the test rows -- confirm that is intended.
df_train = train_data.withColumn('set', lit(0)).withColumn('id', lit(-1))
df_test = test_data.withColumn('set', lit(1)).withColumn('count', lit(-1))

# union() matches columns by position, so the train columns are first put in
# the same order as the test columns.
train_aligned = df_train.select(*df_test.columns)
joined = df_test.union(train_aligned)

# Split the combined frame back into its two parts using the marker column.
train_data = joined.where('set == 0')
test_data = joined.where('set == 1')

In [168]:
# Carve a validation set out of the training data (80/20, fixed seed).
train, validation = train_data.randomSplit(weights=[0.8, 0.2], seed=1234)

In [169]:
# Preview the frame the train/validation/test splits were drawn from.
alerts_df.show()

+-----+-------+---+-------------------+
|count| system| id|               date|
+-----+-------+---+-------------------+
|  101|unknown|  1|2019-06-28 16:14:00|
|    2|unknown|  1|2019-06-29 19:59:00|
|    1|unknown|  1|2019-06-28 12:54:00|
|    2|unknown|  1|2019-07-01 07:45:00|
| 4292|unknown|  1|2019-06-28 12:08:00|
| 8435|unknown|  1|2019-07-01 03:43:00|
|    1|unknown|  1|2019-06-28 12:57:00|
|    1|unknown|  1|2019-06-30 21:38:00|
|    1|unknown|  1|2019-06-29 13:42:00|
|  662|unknown|  1|2019-07-01 04:39:00|
|    1|unknown|  1|2019-06-29 20:53:00|
|   25|unknown|  1|2019-06-29 17:29:00|
|    1|unknown|  1|2019-06-30 16:13:00|
|    4|unknown|  1|2019-06-29 09:23:00|
|    1|unknown|  1|2019-06-28 12:58:00|
|    9|unknown|  1|2019-06-30 17:13:00|
|    1|unknown|  1|2019-06-29 05:14:00|
|   37|unknown|  1|2019-06-30 00:59:00|
|    1|unknown|  1|2019-06-30 10:38:00|
|    5|unknown|  1|2019-06-29 08:14:00|
+-----+-------+---+-------------------+
only showing top 20 rows



In [171]:
# Feature extraction
# The transformers below are not defined in any visible cell; presumably they
# come from the %run of StoreItemDemand/custom_transformers.ipynb -- TODO confirm.
dc = DateConverter(inputCol='date', outputCol='dateFormated')
dex = DayExtractor(inputCol='dateFormated')
mex = MonthExtractor(inputCol='dateFormated')
yex = YearExtractor(inputCol='dateFormated')
wdex = WeekDayExtractor(inputCol='dateFormated')
# Data process
# TODO: make the derived columns ('day', 'month', 'year', 'weekday', 'weekend') dynamic by using the output column of each respective transformer instead of hard-coded names
va = VectorAssembler(inputCols=['id','count', 'day', 'month', 'year', 'weekday'], outputCol="features")
# scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=True, withMean=True)
scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")

# Serialize data
sm = SerieMaker(inputCol='scaledFeatures', dateCol='date', idCol=['system'], serieSize=30)

# NOTE(review): `Pipeline` is not imported in any visible cell; presumably it is
# brought into scope by the %run helper notebook -- confirm, or import it from pyspark.ml.
pipeline = Pipeline(stages=[dc, dex, mex, yex, wdex, va, scaler, sm])

In [172]:
# Fit the pipeline on the `train` split only; the fitted model is then applied
# to every split.
pipiline_model = pipeline.fit(train)

In [173]:
# Run the fitted pipeline over each split, in train / validation / test order.
train_transformed, validation_transformed, test_transformed = (
    pipiline_model.transform(split) for split in (train, validation, test_data)
)

In [182]:
# Inner-join the live stream against each transformed (static) split on the
# shared key columns.
join_keys = ["system","date","count"]
validation_stream = raw_data_match.join(validation_transformed, on=join_keys, how="inner")
test_stream = raw_data_match.join(test_transformed, on=join_keys, how="inner")
train_stream = raw_data_match.join(train_transformed, on=join_keys, how="inner")

In [183]:
def _start_memory_sink(stream_df, table_name):
    """Start an append-mode streaming query writing `stream_df` to the in-memory table `table_name`."""
    writer = stream_df.writeStream.queryName(table_name)
    return writer.outputMode("Append").format("memory").start()


# One in-memory table per joined stream, started in the same order as before.
validation_stream_flow = _start_memory_sink(validation_stream, "validation_transformed")
test_stream_flow = _start_memory_sink(test_stream, "test_transformed")
train_stream_flow = _start_memory_sink(train_stream, "train_transformed")


In [None]:
# Stop the three memory-sink queries (validation, then train, then test).
for running_query in (validation_stream_flow, train_stream_flow, test_stream_flow):
    running_query.stop()

In [None]:
# Check which streaming queries are still active on this Spark session.
spark.streams.active

In [178]:
# Row counts of the untransformed splits.
for message, split_df in (
    ('Train raw: %s', train),
    ('Validation raw: %s', validation),
    ('Test raw: %s', test_data),
):
    print(message % split_df.count())

Train raw: 156591
Validation raw: 39171
Test raw: 48908


In [None]:
# Row counts after the feature pipeline has been applied.
for message, split_df in (
    ('Train transformed: %s', train_transformed),
    ('Validation transformed: %s', validation_transformed),
    ('Test transformed: %s', test_transformed),
):
    print(message % split_df.count())

In [None]:
# Distinct feature vectors per system.
# The original `subject` column was renamed to `system` (and the original
# dropped) before the splits were built, so selecting "subject" here fails
# with an AnalysisException; the column to select is "system".
train_transformed.select("features", "system").distinct().show()

In [184]:
from keras import optimizers
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, GRU
from pyspark.ml.evaluation import RegressionEvaluator

Using TensorFlow backend.


In [1]:
# Execute the helper notebook in this kernel's namespace.
# Presumably this is where prepare_collected_data comes from -- TODO confirm.
%run StoreItemDemand/utils.ipynb

Using TensorFlow backend.


In [186]:
# Re-read the train/validation data from the in-memory tables filled by the
# streaming queries of the same names.
# NOTE(review): this rebinds `train_transformed` / `validation_transformed`,
# which previously held the direct pipeline outputs -- the earlier frames are
# no longer reachable under these names after this cell runs.
train_transformed = spark.sql("select * from train_transformed")
validation_transformed = spark.sql("select * from validation_transformed")

In [None]:
# Pull the (serie, count) pairs to the driver and convert them into model
# inputs and labels with the prepare_collected_data helper.
train_rows = train_transformed.select('serie', 'count').collect()
train_x, train_y = prepare_collected_data(train_rows)

validation_rows = validation_transformed.select('serie', 'count').collect()
validation_x, validation_y = prepare_collected_data(validation_rows)

In [None]:
# Network dimensions derived from the first training sample:
# number of steps per serie and number of features per step.
n_label = 1
first_serie = train_x[0]
serie_size, n_features = len(first_serie), len(first_serie[0])

In [None]:
# Sanity-check the shape of the inputs by printing only the first few series;
# printing every element of train_x would flood the cell output and bloat the
# saved notebook.
for sample in train_x[:5]:
    print(sample)

In [None]:
from keras.layers import Input, Flatten

In [None]:
# hyperparameters
epochs = 80
batch = 512
lr = 0.001

# design network: a single GRU layer over the (serie_size, n_features) input,
# a small ReLU hidden layer, and a linear output with n_label units.
model = Sequential()
model.add(GRU(40, input_shape=(serie_size, n_features)))
model.add(Dense(10, kernel_initializer='glorot_normal', activation='relu'))
model.add(Dense(n_label))
# No Flatten layer here: the Dense output is already 2-D (batch, n_label), so
# flattening is at best a no-op, and standalone Keras 2.x rejects Flatten on
# inputs with fewer than 3 dimensions.
model.summary()

# Optimise mean absolute error; also track MSE and MSLE as metrics.
adam = optimizers.Adam(lr)
model.compile(loss='mae', optimizer=adam, metrics=['mse', 'msle'])

# shuffle=False keeps the samples in their original order during training.
history = model.fit(
    train_x,
    train_y,
    epochs=epochs,
    batch_size=batch,
    validation_data=(validation_x, validation_y),
    verbose=2,
    shuffle=False,
)