In [2]:
# Project-local helpers used below: QuandlDataFetcher.fetch_data and SparkInitializer.init_spark.
from utils.quandl_data_fetcher import QuandlDataFetcher
from utils.spark_initializer import SparkInitializer
# NOTE(review): init_spark() runs BEFORE pyspark is imported — presumably it makes
# pyspark importable (paths / environment). Keep this ordering; TODO confirm.
SparkInitializer.init_spark()
import pyspark
from pyspark.sql import SparkSession
# NOTE(review): wildcard import; no name from pyspark.sql.types is referenced
# directly in the cells visible here.
from pyspark.sql.types import *

In [3]:
# Obtain the SparkSession for this notebook under the application name 'Quandl';
# getOrCreate() hands back an already-running session if there is one.
spark = (
    SparkSession.builder
    .appName('Quandl')
    .getOrCreate()
)

In [5]:
from utils.schema_repo import SchemaRepository

In [6]:
# Fetch the 'EURONEXT/INGA' dataset through the project's Quandl helper.
# Presumably returns a pandas DataFrame (it is passed to spark.createDataFrame
# later) — TODO confirm against QuandlDataFetcher.
INGA_pdf = QuandlDataFetcher.fetch_data('EURONEXT/INGA')

In [7]:
# Preview the first rows of the fetched data before handing it to Spark.
INGA_pdf.head()

Unnamed: 0,Date,Open,High,Low,Last,Volume,Turnover
0,2014-02-14,10.52,10.64,10.515,10.6,11457475.0,121313500.0
1,2014-02-17,10.655,10.68,10.59,10.61,7394038.0,78533650.0
2,2014-02-18,10.65,10.65,10.515,10.605,11580469.0,122761600.0
3,2014-02-19,10.575,10.595,10.415,10.475,11410268.0,119646300.0
4,2014-02-20,10.3,10.525,10.255,10.525,12632060.0,131836300.0


In [8]:
# Turn the fetched frame into a Spark DataFrame, imposing the project's explicit
# stock schema instead of letting Spark infer column types.
# NOTE(review): the resulting frame exposes the column as 'Volumn' while the
# fetched data calls it 'Volume' — looks like a typo inside
# SchemaRepository.stock_schema; fix it there, not here.
INGA_sdf = spark.createDataFrame(
    INGA_pdf,
    schema=SchemaRepository.stock_schema,
)

In [9]:
# Print the leading rows of the Spark DataFrame as a text table.
INGA_sdf.show()

+----------+------+------+------+------+-----------+---------------+
|      Date|  Open|  High|   Low|  Last|     Volumn|       Turnover|
+----------+------+------+------+------+-----------+---------------+
|2014-02-14| 10.52| 10.64|10.515|  10.6|1.1457475E7| 1.2131352577E8|
|2014-02-17|10.655| 10.68| 10.59| 10.61|  7394038.0|  7.853364632E7|
|2014-02-18| 10.65| 10.65|10.515|10.605|1.1580469E7| 1.2276160587E8|
|2014-02-19|10.575|10.595|10.415|10.475|1.1410268E7| 1.1964634487E8|
|2014-02-20|  10.3|10.525|10.255|10.525| 1.263206E7| 1.3183630136E8|
|2014-02-21|  10.6|10.625|  10.5|10.545|1.1276993E7| 1.1893421816E8|
|2014-02-24|  10.5|10.615| 10.47|10.615| 1.008318E7|1.06631915945E8|
|2014-02-25|10.605| 10.67| 10.51|10.665|1.0274855E7| 1.0905679842E8|
|2014-02-26| 10.68| 10.69| 10.51|10.525|  8740996.0|  9.249715499E7|
|2014-02-27| 10.56| 10.57| 10.35|10.505| 1.203792E7| 1.2596515846E8|
|2014-02-28|  10.6|  10.6|10.465| 10.57|1.0919098E7| 1.1520751868E8|
|2014-03-05|10.405| 10.44|10.315| 

In [10]:
# Show the column names and Spark types that the explicit schema produced.
# The price columns come out as `float`, which is why collected rows later show
# values such as 10.520000457763672 rather than 10.52.
INGA_sdf.printSchema()

root
 |-- Date: date (nullable = true)
 |-- Open: float (nullable = true)
 |-- High: float (nullable = true)
 |-- Low: float (nullable = true)
 |-- Last: float (nullable = true)
 |-- Volumn: double (nullable = true)
 |-- Turnover: double (nullable = true)



In [11]:
# Summary statistics (count, mean, stddev, min, max) for the numeric columns.
INGA_sdf.describe().show()

+-------+------------------+------------------+-----------------+-----------------+--------------------+--------------------+
|summary|              Open|              High|              Low|             Last|              Volumn|            Turnover|
+-------+------------------+------------------+-----------------+-----------------+--------------------+--------------------+
|  count|              1445|              1445|             1445|             1445|                1445|                1445|
|   mean|12.205299662553728|12.324435299408064|12.06736956467678|12.19555294010466|1.7261211542560555E7|2.0791255484968305E8|
| stddev| 2.022902487223828| 2.019372964767173|2.024748348180117|2.024231207669432|   7576531.360696883|  8.86876913860459E7|
|    min|              8.35|             8.471|            8.197|            8.335|           1730312.0|       2.023141016E7|
|    max|             16.64|            16.692|           16.516|           16.666|         9.7104846E7|     9.0223315

In [12]:
# Inspect one Row object to see how Spark represents a record on the Python side.
# first() retrieves a single row, instead of pulling five rows with head(5) and
# discarding four; it also yields None rather than raising IndexError when the
# DataFrame is empty.
INGA_sdf.first()

Row(Date=datetime.date(2014, 2, 14), Open=10.520000457763672, High=10.640000343322754, Low=10.515000343322754, Last=10.600000381469727, Volumn=11457475.0, Turnover=121313525.77)

In [13]:
# Derive a throw-away 'DoubleOpen' column (Open * 2) and display the result.
# withColumn returns a new DataFrame; INGA_sdf itself is left unchanged.
(
    INGA_sdf
    .withColumn('DoubleOpen', INGA_sdf['Open'] * 2)
    .show()
)

+----------+------+------+------+------+-----------+---------------+----------+
|      Date|  Open|  High|   Low|  Last|     Volumn|       Turnover|DoubleOpen|
+----------+------+------+------+------+-----------+---------------+----------+
|2014-02-14| 10.52| 10.64|10.515|  10.6|1.1457475E7| 1.2131352577E8|     21.04|
|2014-02-17|10.655| 10.68| 10.59| 10.61|  7394038.0|  7.853364632E7|     21.31|
|2014-02-18| 10.65| 10.65|10.515|10.605|1.1580469E7| 1.2276160587E8|      21.3|
|2014-02-19|10.575|10.595|10.415|10.475|1.1410268E7| 1.1964634487E8|     21.15|
|2014-02-20|  10.3|10.525|10.255|10.525| 1.263206E7| 1.3183630136E8|      20.6|
|2014-02-21|  10.6|10.625|  10.5|10.545|1.1276993E7| 1.1893421816E8|      21.2|
|2014-02-24|  10.5|10.615| 10.47|10.615| 1.008318E7|1.06631915945E8|      21.0|
|2014-02-25|10.605| 10.67| 10.51|10.665|1.0274855E7| 1.0905679842E8|     21.21|
|2014-02-26| 10.68| 10.69| 10.51|10.525|  8740996.0|  9.249715499E7|     21.36|
|2014-02-27| 10.56| 10.57| 10.35|10.505|

In [14]:
# Register the DataFrame as the temporary view 'inga' so that it can be queried
# by name through spark.sql(...).
INGA_sdf.createOrReplaceTempView('inga')

In [50]:
# Variant 1 — plain SQL against the 'inga' temp view: rows whose High exceeds
# 16.5 and whose Open exceeds 16.6.
# NOTE(review): the execution counter jumps from 14 to 50 at this cell; confirm
# the notebook still passes Restart & Run All.
sql_rslt = spark.sql(
    'select * from inga where high > 16.5 and open > 16.6'
)
sql_rslt.show()

+----------+-----+-----+------+------+-----------+----------------+
|      Date| Open| High|   Low|  Last|     Volumn|        Turnover|
+----------+-----+-----+------+------+-----------+----------------+
|2018-01-12|16.63|16.69|16.494|16.612|1.4569804E7| 2.41759308908E8|
|2018-01-15| 16.6|16.64|16.516| 16.59|  9268094.0|1.537521140062E8|
|2018-01-23|16.64|16.65|16.206|16.364|2.1670624E7| 3.55128283566E8|
+----------+-----+-----+------+------+-----------+----------------+



In [51]:
# Variant 2 — DataFrame.filter with a SQL expression string.
# Both predicates must live inside ONE string. The previous form,
# filter('high > 16.5' and 'open > 16.6'), applied Python's `and` to two
# non-empty strings, which evaluates to the right-hand string — so only
# 'open > 16.6' was ever sent to Spark and the High condition was silently
# dropped. It matched the SQL result only because every row with Open > 16.6
# happens to have High > 16.5 in this dataset.
spk_rslt = INGA_sdf.filter('high > 16.5 and open > 16.6')
spk_rslt.show()

+----------+-----+-----+------+------+-----------+----------------+
|      Date| Open| High|   Low|  Last|     Volumn|        Turnover|
+----------+-----+-----+------+------+-----------+----------------+
|2018-01-12|16.63|16.69|16.494|16.612|1.4569804E7| 2.41759308908E8|
|2018-01-15| 16.6|16.64|16.516| 16.59|  9268094.0|1.537521140062E8|
|2018-01-23|16.64|16.65|16.206|16.364|2.1670624E7| 3.55128283566E8|
+----------+-----+-----+------+------+-----------+----------------+



In [52]:
# Variant 3 — the same filter expressed with Column objects.
# Each comparison yields a Column predicate; they are combined with `&`
# (Python's `and` does not work on Column expressions).
# NOTE(review): Open is a 32-bit float column, so a stored 16.6 likely widens to
# a value slightly above the literal 16.6 — which would explain why the
# 2018-01-15 row (displayed Open 16.6) passes the strict '>' test.
cond1 = INGA_sdf.High > 16.5
cond2 = INGA_sdf.Open > 16.6
py_rslt = INGA_sdf.where(cond1 & cond2)
py_rslt.show()

+----------+-----+-----+------+------+-----------+----------------+
|      Date| Open| High|   Low|  Last|     Volumn|        Turnover|
+----------+-----+-----+------+------+-----------+----------------+
|2018-01-12|16.63|16.69|16.494|16.612|1.4569804E7| 2.41759308908E8|
|2018-01-15| 16.6|16.64|16.516| 16.59|  9268094.0|1.537521140062E8|
|2018-01-23|16.64|16.65|16.206|16.364|2.1670624E7| 3.55128283566E8|
+----------+-----+-----+------+------+-----------+----------------+



In [53]:
# Materialise the filtered rows as a Python list of Row objects.
# Safe here because the filter leaves only a handful of rows.
cpy_rslt = py_rslt.collect()

In [54]:
# Confirm that collect() handed back a plain Python list.
type(cpy_rslt)

list

In [59]:
# Convert the first collected Row into a regular dict keyed by column name.
cpy_rslt[0].asDict()

{'Date': datetime.date(2018, 1, 12),
 'Open': 16.6299991607666,
 'High': 16.690000534057617,
 'Low': 16.493999481201172,
 'Last': 16.61199951171875,
 'Volumn': 14569804.0,
 'Turnover': 241759308.908}