In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.conf import SparkConf
config = SparkConf()
config.setMaster("local").setAppName("DataFrameIntradayAsg03")

from pyspark.sql import SparkSession
spark = SparkSession.builder.config(conf=config).getOrCreate()

sc= spark.sparkContext

22/03/12 04:28:28 WARN Utils: Your hostname, ubuntu-virtual-machine resolves to a loopback address: 127.0.1.1; using 192.168.80.128 instead (on interface ens33)
22/03/12 04:28:28 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/03/12 04:28:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/03/12 04:28:30 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/03/12 04:28:30 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [3]:
from pyspark.sql.types import StructType, LongType, StringType, IntegerType, DoubleType, DateType, TimestampType
# // TimestampType with , yyyyMMdd

IntradaySchema = StructType()\
                      .add("Symbol", StringType(), True)\
                      .add("Date", StringType(), True)\
                      .add("Time", StringType(), True)\
                      .add("Open", DoubleType(), True)\
                      .add("High", DoubleType(), True)\
                      .add("Low", DoubleType(), True)\
                      .add("Close", DoubleType(), True)\
                      .add("Volume", LongType(), True)\
                      .add("OI", LongType(), True)

In [5]:
intradayDf= spark.read.format("parquet").option("header", False).schema(IntradaySchema).load("hdfs://localhost:9000/silver/")
intradayDf.printSchema()

intradayDf.show(5)

root
 |-- Symbol: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Time: string (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: long (nullable = true)
 |-- OI: long (nullable = true)

+------------+--------+-----+-------+-------+-------+-------+------+-----+
|      Symbol|    Date| Time|   Open|   High|    Low|  Close|Volume|   OI|
+------------+--------+-----+-------+-------+-------+-------+------+-----+
|SILVERMIC_F1|20211201|09:01|62588.0|62707.0|62550.0|62659.0|   101|98752|
|SILVERMIC_F1|20211201|09:02|62662.0|62679.0|62658.0|62670.0|    98|98818|
|SILVERMIC_F1|20211201|09:03|62670.0|62670.0|62662.0|62669.0|    73|98852|
|SILVERMIC_F1|20211201|09:04|62667.0|62675.0|62650.0|62670.0|    71|98884|
|SILVERMIC_F1|20211201|09:05|62675.0|62678.0|62660.0|62670.0|    63|98895|
+------------+--------+-----+-------+-------+-------+-------+------+----

In [42]:
from pyspark.sql.functions import col, concat, lit, to_timestamp, date_format
#.withColumn("DateTimeStr", concat( col("Date"), lit(" "), col("Time")))
dateDf= intradayDf.withColumn("DateTimeStr", concat( col("Date"), lit(" "), col("Time")))\
                  .withColumn("DATEFORMAT", to_timestamp(col("DateTimeStr"), "yyyyMMdd HH:mm" ))\
                  .withColumn("Min",date_format(("DATEFORMAT"),"mm"))\
                  .withColumn("DATETO",date_format(("DATEFORMAT"),"MMM dd, yyyy hh:mm a"))
dateDf.show(10)
dateDf.printSchema()

+------------+--------+-----+-------+-------+-------+-------+------+-----+--------------+-------------------+---+--------------------+
|      Symbol|    Date| Time|   Open|   High|    Low|  Close|Volume|   OI|   DateTimeStr|         DATEFORMAT|Min|              DATETO|
+------------+--------+-----+-------+-------+-------+-------+------+-----+--------------+-------------------+---+--------------------+
|SILVERMIC_F1|20211201|09:01|62588.0|62707.0|62550.0|62659.0|   101|98752|20211201 09:01|2021-12-01 09:01:00| 01|Dec 01, 2021 09:0...|
|SILVERMIC_F1|20211201|09:02|62662.0|62679.0|62658.0|62670.0|    98|98818|20211201 09:02|2021-12-01 09:02:00| 02|Dec 01, 2021 09:0...|
|SILVERMIC_F1|20211201|09:03|62670.0|62670.0|62662.0|62669.0|    73|98852|20211201 09:03|2021-12-01 09:03:00| 03|Dec 01, 2021 09:0...|
|SILVERMIC_F1|20211201|09:04|62667.0|62675.0|62650.0|62670.0|    71|98884|20211201 09:04|2021-12-01 09:04:00| 04|Dec 01, 2021 09:0...|
|SILVERMIC_F1|20211201|09:05|62675.0|62678.0|62660.0|62

In [43]:
hourlyDf= dateDf.filter(col("Min")=="00")
hourlyDf.show(6)

+------------+--------+-----+-------+-------+-------+-------+------+------+--------------+-------------------+---+--------------------+
|      Symbol|    Date| Time|   Open|   High|    Low|  Close|Volume|    OI|   DateTimeStr|         DATEFORMAT|Min|              DATETO|
+------------+--------+-----+-------+-------+-------+-------+------+------+--------------+-------------------+---+--------------------+
|SILVERMIC_F1|20211201|10:00|62605.0|62605.0|62589.0|62598.0|    24| 99182|20211201 10:00|2021-12-01 10:00:00| 00|Dec 01, 2021 10:0...|
|SILVERMIC_F1|20211201|11:00|62640.0|62640.0|62628.0|62633.0|    11| 99589|20211201 11:00|2021-12-01 11:00:00| 00|Dec 01, 2021 11:0...|
|SILVERMIC_F1|20211201|12:00|62766.0|62766.0|62747.0|62764.0|    16| 98843|20211201 12:00|2021-12-01 12:00:00| 00|Dec 01, 2021 12:0...|
|SILVERMIC_F1|20211201|13:00|62768.0|62768.0|62748.0|62755.0|     8| 98762|20211201 13:00|2021-12-01 13:00:00| 00|Dec 01, 2021 01:0...|
|SILVERMIC_F1|20211201|14:00|62636.0|62643.0|626

In [49]:
from pyspark.sql.window import Window
from pyspark.sql.functions import lag,lead, desc

windowspec=Window.partitionBy("Symbol").orderBy("DATEFORMAT")


hourlylagDf=hourlyDf.withColumn("lag",lag("Close",1).over(windowspec))\
                    .withColumn("DATEFROM",lag("DATETO",1).over(windowspec))\
                    .withColumn("Gain",col("Close")-col("lag"))\
                    .withColumn("GainP",col("Gain")/col("lag") * 100)

hourlylagDf.show(10)



+-------+--------+-----+-------+-------+-------+-------+------+-----+--------------+-------------------+---+--------------------+-------+--------------------+-----+--------------------+
| Symbol|    Date| Time|   Open|   High|    Low|  Close|Volume|   OI|   DateTimeStr|         DATEFORMAT|Min|              DATETO|    lag|            DATEFROM| Gain|               GainP|
+-------+--------+-----+-------+-------+-------+-------+------+-----+--------------+-------------------+---+--------------------+-------+--------------------+-----+--------------------+
|GOLD_F1|20210101|10:00|50178.0|50178.0|50169.0|50169.0|     3|10065|20210101 10:00|2021-01-01 10:00:00| 00|Jan 01, 2021 10:0...|   null|                null| null|                null|
|GOLD_F1|20210101|11:00|50184.0|50184.0|50184.0|50184.0|     2|10095|20210101 11:00|2021-01-01 11:00:00| 00|Jan 01, 2021 11:0...|50169.0|Jan 01, 2021 10:0...| 15.0|0.029898941577468155|
|GOLD_F1|20210101|12:00|50185.0|50185.0|50185.0|50185.0|     1|10081|2

                                                                                

In [51]:
finalDf = hourlylagDf.select("Symbol","DATEFROM","DATETO","GainP")
finalDf.show(100)



+-------+--------------------+--------------------+--------------------+
| Symbol|            DATEFROM|              DATETO|               GainP|
+-------+--------------------+--------------------+--------------------+
|GOLD_F1|                null|Jan 01, 2021 10:0...|                null|
|GOLD_F1|Jan 01, 2021 10:0...|Jan 01, 2021 11:0...|0.029898941577468155|
|GOLD_F1|Jan 01, 2021 11:0...|Jan 01, 2021 12:0...|0.001992666985493384|
|GOLD_F1|Jan 01, 2021 12:0...|Jan 01, 2021 02:0...|0.029889409186011757|
|GOLD_F1|Jan 01, 2021 02:0...|Jan 01, 2021 04:0...|   0.049800796812749|
|GOLD_F1|Jan 01, 2021 04:0...|Jan 01, 2021 05:0...|0.019910403185664508|
|GOLD_F1|Jan 01, 2021 05:0...|Jan 04, 2021 10:0...|   1.122723200955509|
|GOLD_F1|Jan 04, 2021 10:0...|Jan 04, 2021 12:0...|  0.2578790921081124|
|GOLD_F1|Jan 04, 2021 12:0...|Jan 04, 2021 01:0...|-0.05497741998821...|
|GOLD_F1|Jan 04, 2021 01:0...|Jan 04, 2021 02:0...|  0.4754233625397823|
|GOLD_F1|Jan 04, 2021 02:0...|Jan 04, 2021 03:0...|

                                                                                