In [1]:
from pyspark.sql import SparkSession, functions as f
from pyspark.sql.window import Window
import numpy as np
import pandas as pd
import pylab as py

def RealizeVol(r):
    return np.dot(r, r)


def TwoScaledRealizedVol(r, K=300):
    array = np.array(r)
    N = int(array.size)
    slow = np.zeros(K)
    for i in range(K):
        try:
            inr = np.arange(start=i, stop=N, step=K)
            slow[i] = RealizeVol(array[inr])
        except IndexError:
            continue
    return slow.mean()

# set up the spark and initialize the variables
spark = SparkSession.builder.master('local[*]').appName('HTF_Vol').config('spark.executor.memory', '2g').getOrCreate()

# use spark to read the local csv file
df = spark.read.csv('newaa.csv', header=True)

# create a window in order to shift columns
w = Window().partitionBy(df['SYMBOL']).orderBy(df['PRICE'])

# shift one position for the stock price in order to calculate log return
df = df.select('*', f.lead('PRICE').over(w).alias('PRICE_1')).na.drop()
df = df.select('*', f.log(df.PRICE_1/df.PRICE).alias('LOG_RETURN'))
#df = df.select('*').where(df['LOG_RETURN'] != 0)

# transfer data into pandas dataframe to handle the data by each stock
pd_df = df.toPandas()

In [2]:
# set up the index for pandas dataframe
tuples = list(zip(pd_df.iloc[:, 0], pd_df.iloc[:, 1]))
index = pd.MultiIndex.from_tuples(tuples, names=['SYMBOL', 'DATE'])
pd_df = pd_df.set_index(index)
result = pd.DataFrame(index=index)
indexList = list(set(pd_df.index.values))

In [3]:
k = 300
j = 1

for i in indexList:
    array = np.array(pd_df.ix[i,6].astype(float))
    rv = RealizeVol(array)
    tsrv_avg = TwoScaledRealizedVol(array)
    N = int(array.size)
    K = (N - k + 1) / k
    J = (N - j + 1) / j

    tsrv = (1.0 / (1.0 - K/J)) * (tsrv_avg - (K / J) * rv)
    noise = rv / (2 * N)
    result.loc[i, 'RV']  = rv
    result.loc[i, 'TSRV_AVG'] = tsrv_avg
    result.loc[i, 'TSRV'] = tsrv

  return self._getitem_tuple(key)
  handler(stream, idents, msg)
  shell.run_cell(code, store_history=store_history, silent=silent)


In [4]:
q = float(result['TSRV'].quantile(0.95))
result = result[result.TSRV > q]

symbolList = result.index.values

In [7]:
result.to_csv('result0523.csv')

In [6]:
pd_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,SYMBOL,DATE,TIME,PRICE,SIZE,PRICE_1,LOG_RETURN
SYMBOL,DATE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AA,20030312,AA,20030312,10:42:05,18.4,300,18.45,0.002714
AA,20030312,AA,20030312,12:01:36,18.45,5900,18.45,0.0
AA,20030312,AA,20030312,12:02:05,18.45,600,18.45,0.0
AA,20030312,AA,20030312,12:02:27,18.45,400,18.45,0.0
AA,20030312,AA,20030312,12:04:37,18.45,400,18.45,0.0
