### Import library

In [39]:
# handle data
import pyspark.sql as ps
from pyspark.sql.functions import from_unixtime,date_format,from_utc_timestamp
from pyspark.sql.types import DateType,FloatType
from pyspark.ml.feature import VectorAssembler,MinMaxScaler

# train data|
import pandas as pd
import numpy as np
from datetime import datetime,timedelta

In [16]:
spark = ps.SparkSession.\
        builder.\
        appName("pyspark-notebook").\
        master("spark://spark-master:7077").\
        config("spark.executor.memory", "4096m").\
        getOrCreate()
spark.sparkContext.setLogLevel("WARN")

### Read data from hdfs

**Declare period, symbol**

In [17]:
symbol = "BTCUSDT"
PERIOD = "1d"

In [29]:
df = spark.read.parquet(f"hdfs://hadoop-namenode:9000/crypto/{symbol}/{PERIOD}.parquet")

### Processing

In [30]:
# sort df by time
df = df.sort(df["open_time"].desc())

In [31]:
# format open price to float type
df = df.select("open").withColumn("open", df["open"].cast(FloatType()))

In [34]:
# bring "open" column to vector assembler
assember = VectorAssembler(inputCols=["open"],outputCol="feature_vector")
df_transformed = assember.transform(df)

In [41]:
# sac
scaler = MinMaxScaler(min=0,max=1,inputCol="feature_vector",outputCol="scaled_feature")
scaler = scaler.fit(df_transformed)
df_transformed = scaler.transform(df_transformed)

                                                                                

In [44]:
df_transformed.show()

                                                                                

+--------+------------------+--------------------+
|    open|    feature_vector|      scaled_feature|
+--------+------------------+--------------------+
|16661.61|    [16661.609375]|[0.2091282821225519]|
|16900.57|   [16900.5703125]|[0.2128438110507103]|
|16617.72| [16617.720703125]|[0.20844587088130...|
|16331.78|[16331.7802734375]|[0.20399987251291...|
|16813.16|  [16813.16015625]|[0.21148469785187...|
|17069.98|  [17069.98046875]|[0.21547791661982...|
|17602.45|  [17602.44921875]|[0.2237571067864873]|
|15922.68|   [15922.6796875]|[0.19763889536961...|
|18545.38| [18545.380859375]|[0.2384184561462253]|
|20590.67| [20590.669921875]|[0.2702200165086367]|
|20905.58| [20905.580078125]|[0.27511645610064...|
|21299.37| [21299.369140625]|[0.2812393591517365]|
|21148.52|  [21148.51953125]|[0.2788938457247388]|
|20207.12| [20207.119140625]|[0.26425630529264...|
|20151.84|  [20151.83984375]|[0.26339678478349...|
|20482.81| [20482.810546875]|[0.2685429447990164]|
|20490.74| [20490.740234375]|[0

In [45]:
TRAIN_DATA_LENGTH = df_transformed.count() * 0.6

                                                                                