In [111]:
from pyspark.sql import SparkSession
from pyspark.sql.types import DateType
from pyspark.sql.functions import format_number, max, min, mean, count, countDistinct, corr, year, month

In [66]:
spark = SparkSession.builder.appName("Walmart Trading History").getOrCreate()

In [67]:
wmt = spark.read.csv('walmart_stock.csv', header=True, inferSchema=True)

In [68]:
wmt.printSchema()

root
 |-- Date: string (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Adj Close: double (nullable = true)



In [69]:
wmt.show(10)

+----------+------------------+------------------+------------------+------------------+--------+------------------+
|      Date|              Open|              High|               Low|             Close|  Volume|         Adj Close|
+----------+------------------+------------------+------------------+------------------+--------+------------------+
|2012-01-03|         59.970001|         61.060001|         59.869999|         60.330002|12668800|52.619234999999996|
|2012-01-04|60.209998999999996|         60.349998|         59.470001|59.709998999999996| 9593300|         52.078475|
|2012-01-05|         59.349998|         59.619999|         58.369999|         59.419998|12768200|         51.825539|
|2012-01-06|         59.419998|         59.450001|         58.869999|              59.0| 8069400|          51.45922|
|2012-01-09|         59.029999|         59.549999|         58.919998|             59.18| 6679300|51.616215000000004|
|2012-01-10|             59.43|59.709998999999996|             5

In [70]:
# Fix Date column which was infered to be a string
wmt = wmt.withColumn("Date",wmt['Date'].cast(DateType()))

In [71]:
wmt.printSchema()

root
 |-- Date: date (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Adj Close: double (nullable = true)



In [72]:
# Show first 5 columns
wmt.collect()[0:5]

[Row(Date=datetime.date(2012, 1, 3), Open=59.970001, High=61.060001, Low=59.869999, Close=60.330002, Volume=12668800, Adj Close=52.619234999999996),
 Row(Date=datetime.date(2012, 1, 4), Open=60.209998999999996, High=60.349998, Low=59.470001, Close=59.709998999999996, Volume=9593300, Adj Close=52.078475),
 Row(Date=datetime.date(2012, 1, 5), Open=59.349998, High=59.619999, Low=58.369999, Close=59.419998, Volume=12768200, Adj Close=51.825539),
 Row(Date=datetime.date(2012, 1, 6), Open=59.419998, High=59.450001, Low=58.869999, Close=59.0, Volume=8069400, Adj Close=51.45922),
 Row(Date=datetime.date(2012, 1, 9), Open=59.029999, High=59.549999, Low=58.919998, Close=59.18, Volume=6679300, Adj Close=51.616215000000004)]

In [73]:
for i in range(5):
    print(wmt.collect()[i])
    print('\n')

Row(Date=datetime.date(2012, 1, 3), Open=59.970001, High=61.060001, Low=59.869999, Close=60.330002, Volume=12668800, Adj Close=52.619234999999996)


Row(Date=datetime.date(2012, 1, 4), Open=60.209998999999996, High=60.349998, Low=59.470001, Close=59.709998999999996, Volume=9593300, Adj Close=52.078475)


Row(Date=datetime.date(2012, 1, 5), Open=59.349998, High=59.619999, Low=58.369999, Close=59.419998, Volume=12768200, Adj Close=51.825539)


Row(Date=datetime.date(2012, 1, 6), Open=59.419998, High=59.450001, Low=58.869999, Close=59.0, Volume=8069400, Adj Close=51.45922)


Row(Date=datetime.date(2012, 1, 9), Open=59.029999, High=59.549999, Low=58.919998, Close=59.18, Volume=6679300, Adj Close=51.616215000000004)




In [74]:
summary = wmt.describe()
summary.select(summary['Summary'],
               format_number(summary['Open'].cast('float'),2).alias('Open'),
               format_number(summary['High'].cast('float'),2).alias('High'),
               format_number(summary['Low'].cast('float'),2).alias('Low'),
               format_number(summary['Close'].cast('float'),2).alias('Close'),
               format_number(summary['Adj Close'].cast('float'),2).alias('Adj Close'),
               summary['Volume'].cast('int').alias('Volume'),             
              ).show()
            

+-------+--------+--------+--------+--------+---------+--------+
|Summary|    Open|    High|     Low|   Close|Adj Close|  Volume|
+-------+--------+--------+--------+--------+---------+--------+
|  count|1,258.00|1,258.00|1,258.00|1,258.00| 1,258.00|    1258|
|   mean|   72.36|   72.84|   71.92|   72.39|    67.24| 8222093|
| stddev|    6.77|    6.77|    6.74|    6.76|     6.72| 4519780|
|    min|   56.39|   57.06|   56.30|   56.42|    50.36| 2094900|
|    max|   90.80|   90.97|   89.25|   90.47|    84.91|80898100|
+-------+--------+--------+--------+--------+---------+--------+



In [76]:
wmt = wmt.withColumn('HV Ratio',wmt['High']/ wmt['Volume'])

In [77]:
wmt.show(10)

+----------+------------------+------------------+------------------+------------------+--------+------------------+--------------------+
|      Date|              Open|              High|               Low|             Close|  Volume|         Adj Close|            HV Ratio|
+----------+------------------+------------------+------------------+------------------+--------+------------------+--------------------+
|2012-01-03|         59.970001|         61.060001|         59.869999|         60.330002|12668800|52.619234999999996|4.819714653321546E-6|
|2012-01-04|60.209998999999996|         60.349998|         59.470001|59.709998999999996| 9593300|         52.078475|6.290848613094555E-6|
|2012-01-05|         59.349998|         59.619999|         58.369999|         59.419998|12768200|         51.825539|4.669412994783916E-6|
|2012-01-06|         59.419998|         59.450001|         58.869999|              59.0| 8069400|          51.45922|7.367338463826307E-6|
|2012-01-09|         59.029999|   

In [78]:
#Best Closing Price Days
wmt.orderBy(wmt['High'], ascending=False).show(5)

+----------+-----------------+-----------------+---------+---------+--------+-----------------+--------------------+
|      Date|             Open|             High|      Low|    Close|  Volume|        Adj Close|            HV Ratio|
+----------+-----------------+-----------------+---------+---------+--------+-----------------+--------------------+
|2015-01-13|        90.800003|        90.970001|    88.93|89.309998| 8215400|        83.825448|1.107310672639189...|
|2015-01-08|        89.209999|90.66999799999999|    89.07|90.470001|12713600|84.91421600000001|7.131732790083060...|
|2015-01-09|            90.32|        90.389999|    89.25|89.349998| 8522500|        83.862993|1.060604271047228E-5|
|2015-01-12|        89.360001|        90.309998|89.220001|90.019997| 7372500|        84.491846|1.224957585622244...|
|2015-01-23|88.41999799999999|        89.260002|87.889999|88.510002| 7565800|83.07458100000001|1.179782732824023...|
+----------+-----------------+-----------------+---------+------

In [79]:
# Average Close
wmt.select(format_number(mean('Close'),3)).withColumnRenamed('format_number(avg(Close), 3)','Avg Close').show()

+---------+
|Avg Close|
+---------+
|   72.388|
+---------+



In [80]:
# Max and Min Volumes
wmt.select(max('Volume'), min('Volume')).withColumnRenamed('max(Volume)','Max Volume') \
                    .withColumnRenamed('min(Volume)','Min Volume').show()

+----------+----------+
|Max Volume|Min Volume|
+----------+----------+
|  80898100|   2094900|
+----------+----------+



In [81]:
# Number of day close less than $60.
wmt.filter(wmt['Close'] < 60).count()

81

In [82]:
wmt.filter(wmt['High'] > 80).count()

115

In [83]:
wmt.count()

1258

In [84]:
#of days in the time range that the high is greather than $80.00
wmt.filter(wmt['High'] > 80).count() / wmt.count() * 100

9.141494435612083

In [85]:
# Correlation between high and volume
wmt.select(corr(wmt['High'], wmt['Volume'])).show()

+-------------------+
| corr(High, Volume)|
+-------------------+
|-0.3384326061737161|
+-------------------+



In [86]:
# Max High per year
wmt = wmt.withColumn('Year', year(wmt['Date']))

In [87]:
wmt.printSchema()

root
 |-- Date: date (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Adj Close: double (nullable = true)
 |-- HV Ratio: double (nullable = true)
 |-- Year: integer (nullable = true)



In [90]:
# Max High in each year
wmt.groupby('Year').max('High').show()

+----+---------+
|Year|max(High)|
+----+---------+
|2015|90.970001|
|2013|81.370003|
|2014|88.089996|
|2012|77.599998|
|2016|75.190002|
+----+---------+



In [98]:
wmt = wmt.withColumn('Month',month(wmt['Date']))

In [113]:
wmt.groupby('Month').mean('Close').show()

+-----+-----------------+
|Month|       avg(Close)|
+-----+-----------------+
|   12|72.84792478301885|
|    1|71.44801958415842|
|    6| 72.4953774245283|
|    3|71.77794377570092|
|    5|72.30971688679247|
|    9|72.18411785294116|
|    4|72.97361900952382|
|    8|73.02981855454546|
|    7|74.43971943925233|
|   10|71.57854545454543|
|   11| 72.1110893069307|
|    2|  71.306804443299|
+-----+-----------------+

