In [1]:
from pyspark.sql import SparkSession, functions as f
from pyspark.sql.window import Window

In [2]:
# Create (or reuse) a local Spark session that uses every available core.
# Executor memory is passed to the builder because Spark reads this property
# when the session starts; calling spark.conf.set() for it afterwards is
# either ignored or rejected ("Cannot modify the value of a Spark config").
# NOTE(review): getOrCreate() returns an already-running session if there is
# one, in which case start-up options like this one do not take effect
# until the kernel is restarted.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('HTF_Vol')
    .config('spark.executor.memory', '2g')
    .getOrCreate()
)

# Load the trade file. No schema is supplied, so every column is read as a
# string (Spark's CSV reader does not infer types unless asked to).
df = spark.read.csv('newaa.csv', header=True)
# Re-render TIME with the zero-padded 'HH:mm:ss' pattern.
df = df.withColumn('TIME', f.date_format(df.TIME, 'HH:mm:ss'))

In [3]:
# Preview the first 20 rows of the loaded trades (long cell values are truncated).
df.show(n=20, truncate=True)

+------+--------+--------+-----+-----+
|SYMBOL|    DATE|    TIME|PRICE| SIZE|
+------+--------+--------+-----+-----+
|     A|20021202|09:30:20|19.75|30200|
|     A|20021202|09:30:22|19.75|  100|
|     A|20021202|09:30:22|19.75|  300|
|     A|20021202|09:30:22|19.75|  100|
|     A|20021202|09:30:23|19.75|  100|
|     A|20021202|09:30:23|19.75|  100|
|     A|20021202|09:30:23|19.75|  100|
|     A|20021202|09:30:23|19.75|  100|
|     A|20021202|09:30:23|19.75|  100|
|     A|20021202|09:30:23|19.75|  100|
|     A|20021202|09:30:23|19.75| 1100|
|     A|20021202|09:30:23|19.75| 1000|
|     A|20021202|09:30:23|19.75|  400|
|     A|20021202|09:30:23|19.75|  100|
|     A|20021202|09:30:23|19.75|  100|
|     A|20021202|09:30:23|19.75|  200|
|     A|20021202|09:30:23|19.75|  100|
|     A|20021202|09:30:23|19.75| 1000|
|     A|20021202|09:30:23|19.75| 3000|
|     A|20021202|09:30:23|19.75|  100|
+------+--------+--------+-----+-----+
only showing top 20 rows



In [4]:
# Order the trades of each symbol by DATE, then TIME. Both columns are strings
# here, so this relies on their fixed-width, zero-padded layout to sort
# chronologically.
# Many trades share the same second, so (DATE, TIME) alone leaves ties whose
# relative order Spark is free to change between runs; that would make the
# "next price" picked by lead() non-deterministic. A row id is added purely as
# a tie-breaker and is dropped again at the end of the cell.
# NOTE(review): this assumes monotonically_increasing_id() follows the row
# order of the source file -- confirm if the input is split across several
# files or is repartitioned before this cell.
df = df.withColumn('_ROW_ID', f.monotonically_increasing_id())
w = Window.partitionBy('SYMBOL').orderBy('DATE', 'TIME', '_ROW_ID')

# Log return from each trade to the next trade of the same symbol.
# PRICE_1 is the next trade's price; it is null on the last trade of a symbol,
# which makes LOG_RETURN null there as well.
df = df.withColumn('PRICE_1', f.lead('PRICE').over(w))
df = df.withColumn(
    'LOG_RETURN',
    # Cast explicitly instead of relying on implicit string -> double coercion.
    f.log(f.col('PRICE_1').cast('double') / f.col('PRICE').cast('double')),
)

# Moving sample standard deviation of the log returns over a forward-looking
# window that starts at the current trade.
window_period = 100
# NOTE(review): rowsBetween() is inclusive at both ends, so this frame spans
# window_period + 1 rows (the current row plus the next 100). Use
# window_period - 1 as the upper bound if exactly 100 rows were intended.
w2 = w.rowsBetween(Window.currentRow, window_period)
df = df.withColumn('VOLATILITY', f.stddev('LOG_RETURN').over(w2)).drop('_ROW_ID')

In [5]:
# Save the enriched trades as CSV.
# Spark writes a *directory* called 'result0527.csv' that holds one part file
# per partition, not a single file.
# header=True keeps the column names (including the derived PRICE_1,
# LOG_RETURN and VOLATILITY columns) in the output.
# mode='overwrite' lets the notebook be re-run from a fresh kernel; the
# default mode raises an error when the output path already exists.
df.write.csv('result0527.csv', header=True, mode='overwrite')