In [2]:
# Initialise findspark before the pyspark imports in the following cell,
# so that the pyspark package can be located by the interpreter.
import findspark
findspark.init()

In [3]:
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

# Spark configuration: run against the "local" master under the given app name.
# Further properties can be added with config.set("property", "value").
# NOTE(review): the app name is "MovieLens" although this notebook reads
# stock data -- confirm whether it is a leftover from another notebook.
config = SparkConf().setMaster("local").setAppName("MovieLens")

# SparkSession is the entry point for Spark SQL and the DataFrame API;
# getOrCreate() reuses an already-running session if there is one.
spark = (
    SparkSession.builder
    .config(conf=config)
    .getOrCreate()
)

# Underlying SparkContext of the session.
sc = spark.sparkContext

In [4]:
# Load the sector reference data (company -> industry mapping) from HDFS.
# header=True      : take column names from the first line of each file.
# inferSchema=True : let Spark derive the column types from the data.
sectorDf = (
    spark.read
    .format("csv")
    .options(header=True, inferSchema=True)
    .load("hdfs://localhost:9000/stocks/sectors")
)

# Quick sanity check: inferred schema plus the first few rows.
sectorDf.printSchema()
sectorDf.show(5)

root
 |-- Company Name: string (nullable = true)
 |-- Industry: string (nullable = true)
 |-- Symbol: string (nullable = true)
 |-- Series: string (nullable = true)
 |-- ISIN Code: string (nullable = true)

+--------------------+------------------+----------+------+------------+
|        Company Name|          Industry|    Symbol|Series|   ISIN Code|
+--------------------+------------------+----------+------+------------+
|      Axis Bank Ltd.|FINANCIAL SERVICES|  AXISBANK|    EQ|INE238A01034|
|  Bajaj Finance Ltd.|FINANCIAL SERVICES|BAJFINANCE|    EQ|INE296A01024|
|  Bajaj Finserv Ltd.|FINANCIAL SERVICES|BAJAJFINSV|    EQ|INE918I01018|
|Cholamandalam Inv...|FINANCIAL SERVICES|  CHOLAFIN|    EQ|INE121A01024|
|HDFC Asset Manage...|FINANCIAL SERVICES|   HDFCAMC|    EQ|INE127D01025|
+--------------------+------------------+----------+------+------------+
only showing top 5 rows



In [6]:
# Load the daily trading data from HDFS, taking column names from the header
# row and letting Spark infer the column types.
# The "_c13" column is dropped straight away; that is the name Spark assigns
# to an unnamed column at index 13 -- presumably caused by a trailing
# delimiter in the CSV header (TODO confirm against the raw files).
dailyDf = (
    spark.read
    .format("csv")
    .options(header=True, inferSchema=True)
    .load("hdfs://localhost:9000/stocks/daily")
    .drop("_c13")
)

# Quick sanity check: inferred schema plus the first few rows.
dailyDf.printSchema()
dailyDf.show(5)

root
 |-- SYMBOL: string (nullable = true)
 |-- SERIES: string (nullable = true)
 |-- OPEN: double (nullable = true)
 |-- HIGH: double (nullable = true)
 |-- LOW: double (nullable = true)
 |-- CLOSE: double (nullable = true)
 |-- LAST: double (nullable = true)
 |-- PREVCLOSE: double (nullable = true)
 |-- TOTTRDQTY: integer (nullable = true)
 |-- TOTTRDVAL: double (nullable = true)
 |-- TIMESTAMP: string (nullable = true)
 |-- TOTALTRADES: integer (nullable = true)
 |-- ISIN: string (nullable = true)

+----------+------+-------+-------+-------+-------+-------+---------+---------+-------------+-----------+-----------+------------+
|    SYMBOL|SERIES|   OPEN|   HIGH|    LOW|  CLOSE|   LAST|PREVCLOSE|TOTTRDQTY|    TOTTRDVAL|  TIMESTAMP|TOTALTRADES|        ISIN|
+----------+------+-------+-------+-------+-------+-------+---------+---------+-------------+-----------+-----------+------------+
| 20MICRONS|    EQ|   70.1|   73.6|   70.1|  71.85|  72.05|     71.2|   219912|1.583125505E7|02-MAR-

In [9]:
# Derive two columns with withColumn:
#   GAIN  = CLOSE - OPEN          (absolute move between open and close)
#   GAINP = (GAIN / OPEN) * 100   (the same move as a percentage of OPEN)
# `col` from pyspark.sql.functions builds the column expressions.
from pyspark.sql.functions import col

dailyDf = (
    dailyDf
    .withColumn("GAIN", col("CLOSE") - col("OPEN"))
    .withColumn("GAINP", (col("GAIN") / col("OPEN")) * 100)
)

# Confirm the two new double columns are present.
dailyDf.printSchema()
dailyDf.show(5)

root
 |-- SYMBOL: string (nullable = true)
 |-- SERIES: string (nullable = true)
 |-- OPEN: double (nullable = true)
 |-- HIGH: double (nullable = true)
 |-- LOW: double (nullable = true)
 |-- CLOSE: double (nullable = true)
 |-- LAST: double (nullable = true)
 |-- PREVCLOSE: double (nullable = true)
 |-- TOTTRDQTY: integer (nullable = true)
 |-- TOTTRDVAL: double (nullable = true)
 |-- TIMESTAMP: string (nullable = true)
 |-- TOTALTRADES: integer (nullable = true)
 |-- ISIN: string (nullable = true)
 |-- GAIN: double (nullable = true)
 |-- GAINP: double (nullable = true)

+----------+------+-------+-------+-------+-------+-------+---------+---------+-------------+-----------+-----------+------------+--------------------+-------------------+
|    SYMBOL|SERIES|   OPEN|   HIGH|    LOW|  CLOSE|   LAST|PREVCLOSE|TOTTRDQTY|    TOTTRDVAL|  TIMESTAMP|TOTALTRADES|        ISIN|                GAIN|              GAINP|
+----------+------+-------+-------+-------+-------+-------+---------+-------