In [44]:
from pyspark.sql import SparkSession, functions as f
from pyspark.sql.window import Window

In [45]:
spark = SparkSession.builder.master('local[*]').appName('HTF_Vol').getOrCreate()
spark.conf.set("spark.executor.memory", "2g")



df = spark.read.csv('newaa.csv', header=True)
df = df.withColumn('TIME', f.date_format(df.TIME, 'HH:mm:ss'))

In [46]:
df.show()

+------+--------+--------+-----+-----+
|SYMBOL|    DATE|    TIME|PRICE| SIZE|
+------+--------+--------+-----+-----+
|     A|20021202|09:30:20|19.75|30200|
|     A|20021202|09:30:22|19.75|  100|
|     A|20021202|09:30:22|19.75|  300|
|     A|20021202|09:30:22|19.75|  100|
|     A|20021202|09:30:23|19.75|  100|
|     A|20021202|09:30:23|19.75|  100|
|     A|20021202|09:30:23|19.75|  100|
|     A|20021202|09:30:23|19.75|  100|
|     A|20021202|09:30:23|19.75|  100|
|     A|20021202|09:30:23|19.75|  100|
|     A|20021202|09:30:23|19.75| 1100|
|     A|20021202|09:30:23|19.75| 1000|
|     A|20021202|09:30:23|19.75|  400|
|     A|20021202|09:30:23|19.75|  100|
|     A|20021202|09:30:23|19.75|  100|
|     A|20021202|09:30:23|19.75|  200|
|     A|20021202|09:30:23|19.75|  100|
|     A|20021202|09:30:23|19.75| 1000|
|     A|20021202|09:30:23|19.75| 3000|
|     A|20021202|09:30:23|19.75|  100|
+------+--------+--------+-----+-----+
only showing top 20 rows



In [47]:
# patition a window to calculcate the u_sequence
w = Window().partitionBy('SYMBOL', 'DATE').orderBy('TIME')

# calculate the u_sequence
df = df.withColumn('PRICE_1', f.lead('PRICE').over(w))
df = df.withColumn('LOG_RETURN', f.log(df.PRICE_1/df.PRICE))

# patition a window to calculate the moving standard deviation
window_period = 100
w2 = Window().partitionBy('SYMBOL', 'DATE').orderBy('TIME').rowsBetween(0, window_period)
df = df.withColumn('VOLATILITY', f.stddev(df['LOG_RETURN']).over(w2))

In [48]:
df_A = df.where(df['SYMBOL'] == 'A').select('*').sample(False, 0.2)
df_A.toPandas

<bound method DataFrame.toPandas of DataFrame[SYMBOL: string, DATE: string, TIME: string, PRICE: string, SIZE: string, PRICE_1: string, LOG_RETURN: double, VOLATILITY: double]>

In [36]:
test = df.sampleBy('SYMBOL', fractions={'A': 0.005, 'AA': 0.005, 'AAP':0.005, 'AAI': 0.005})

In [37]:
df_pd = test.toPandas()

In [38]:
df_pd.head()

Unnamed: 0,SYMBOL,DATE,TIME,PRICE,SIZE,PRICE_1,LOG_RETURN,VOLATILITY
0,AAI,20030121,10:25:04,4.98,100.0,4.95,-0.006042,0.004705
1,AAI,20030321,10:21:14,6.22,1000.0,6.22,0.0,0.001575
2,AAI,20030321,11:40:36,6.25,100.0,6.25,0.0,0.00954
3,AAI,20030321,12:42:04,6.55,200.0,6.55,0.0,0.00228
4,AAI,20030321,14:26:21,6.49,1500.0,6.48,-0.001542,0.003299


In [43]:
%matplotlib
from matplotlib import pyplot as plt
import pandas as pd

fig, axes = plt.subplots(nrows=4, ncols=1)
i = 1
for symbol in ['A', 'AA', 'AAI', 'AAP']:
    plt.subplot(4, 1, i)
    plt.plot(df_pd.loc[df_pd.SYMBOL == symbol,  'VOLATILITY'], label=symbol)
    i+=1
    #plt.ylim(0, 0.005)

    plt.legend(loc='upper right')

plt.suptitle('Realized Volatility')

fig.savefig('Realized_Vol.png')

Using matplotlib backend: MacOSX


IndexingError: Too many indexers

In [42]:
df_pd.to_csv('sampledVol.csv')

In [5]:
df.write.csv('result0527.csv')

# df.toPandas().to_csv('newaa_result.csv')
# df.write.format('com.databricks.spark.csv').save('newaa_result.csv')