In [1]:
#Step 1: Install Dependencies
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.3.0/spark-3.3.0-bin-hadoop3.tgz
!tar xf spark-3.3.0-bin-hadoop3.tgz
!pip install -q findspark

#Step 2: Add environment variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "spark-3.3.0-bin-hadoop3"

#Step 3: Initialize Pyspark
import findspark
findspark.init()

In [2]:
#creating spark context
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()
sc = spark.sparkContext
sc

In [145]:
from pyspark.sql.functions import year, avg, desc, lead, col, abs, count, sum
from pyspark.sql.window import Window

In [146]:
from google.colab import files
uploaded = files.upload()

Saving WALMART.csv to WALMART (1).csv
Saving TESLA.csv to TESLA (1).csv
Saving AMAZON.csv to AMAZON (1).csv


In [147]:
WAL = spark.read.csv("WALMART.csv",inferSchema=True,header=True)

In [148]:
TSLA = spark.read.csv("TESLA.csv",inferSchema=True,header=True)

In [149]:
AMZN = spark.read.csv("AMAZON.csv",inferSchema=True,header=True)

In [150]:
WAL = WAL.withColumn("Year", WAL["Date"].substr(-4, 4))
avg_close_by_year_WAL = WAL.groupBy("Year") \
                                .agg(avg("Close").alias("avg_close")) \
                                .orderBy("Year")
print("Walmart")
avg_close_by_year_WAL.show()

TSLA = TSLA.withColumn("Year", TSLA["Date"].substr(-4, 4))
avg_close_by_year_TSLA = TSLA.groupBy("Year") \
                                .agg(avg("Close").alias("avg_close")) \
                                .orderBy("Year")
print("Tesla")
avg_close_by_year_TSLA.show()


AMZN = AMZN.withColumn("Year", AMZN["Date"].substr(-4, 4))
avg_close_by_year_AMZN = AMZN.groupBy("Year") \
                                .agg(avg("Close").alias("avg_close")) \
                                .orderBy("Year")
print("Amazon")
avg_close_by_year_AMZN.show()


Walmart
+----+------------------+
|Year|         avg_close|
+----+------------------+
|2012| 69.16402062371135|
|2013| 75.32051611904767|
|2014| 77.32738088888888|
|2015| 72.49111121825403|
|2016| 69.54706346825398|
|2017| 78.96239064940241|
|2018| 92.36984045418325|
|2019|108.40555563888886|
|2020|129.60134389328064|
|2021|141.65138907142858|
|2022|140.29105385964917|
+----+------------------+

Tesla
+----+------------------+
|Year|         avg_close|
+----+------------------+
|2012| 6.220257731958759|
|2013| 20.88024604365079|
|2014|  44.6658174126984|
|2015| 46.00857951587299|
|2016| 41.95345246428568|
|2017| 62.86325898406372|
|2018| 63.46198399999996|
|2019|54.706039706349195|
|2020|289.99706688142294|
|2021| 779.9944821031743|
|2022| 922.0924522280702|
+----+------------------+

Amazon
+----+------------------+
|Year|         avg_close|
+----+------------------+
|2012|230.68232002061865|
|2013|298.03158778968253|
|2014|332.55103267857135|
|2015| 478.1382951269842|
|2016| 699.5231

In [151]:
windowSpec = Window.orderBy("Date")
AMZN_next_day = AMZN.withColumn("next_close", lead("Close", 1).over(windowSpec))
AMZN = AMZN_next_day.withColumn("close_diff", col("next_close") - col("Close"))
result = AMZN.filter(abs(col("close_diff")) > 2).orderBy("Year","Date")
result.show()

+----------+----------+----------+----------+----------+----------+--------+----+----------+------------------+
|      Date|      Open|      High|       Low|     Close| Adj Close|  Volume|Year|next_close|        close_diff|
+----------+----------+----------+----------+----------+----------+--------+----+----------+------------------+
| 10/1/2012|255.399994|256.160004|250.490005|252.009995|252.009995| 2581200|2012|320.950012| 68.94001700000001|
|10/10/2012|     252.0|252.460007|244.009995|244.990005|244.990005| 3948300|2012|305.170013| 60.18000799999999|
|10/11/2012|     248.0|249.300003|241.889999|244.220001|244.220001| 3447300|2012|310.890015| 66.67001400000001|
|10/12/2012|243.179993|245.460007|241.910004|242.360001|242.360001| 2203200|2012|550.190002|307.83000100000004|
|10/15/2012|242.850006|244.779999|238.509995|244.179993|244.179993| 2959800|2012|306.399994|62.220000999999996|
|10/16/2012|244.869995|245.770004|242.080002|243.940002|243.940002| 2204100|2012| 310.48999| 66.54998799

In [152]:
WAL = WAL.orderBy(desc("High"))
highest_price_day_WAL = WAL.first()
print("Highest price day for Walmart", highest_price_day_WAL["Date"])

TSLA = TSLA.orderBy(desc("High"))
highest_price_day_TSLA = TSLA.first()
print("Highest price day for Tesla", highest_price_day_TSLA["Date"])

AMZN = AMZN.orderBy(desc("High"))
highest_price_day_AMZN = AMZN.first()
print("Highest price day for Amazon", highest_price_day_AMZN["Date"])


Highest price day for Walmart 12/1/2020
Highest price day for Tesla 11/4/2021
Highest price day for Amazon 7/13/2021


In [153]:
filtered_WAL = WAL.filter(WAL.Close < 50)
num_days_WAL = filtered_WAL.count()
print("Number of days below $50 for Walmart:",num_days_WAL)

filtered_TSLA = TSLA.filter(TSLA.Close < 50)
num_days_TSLA = filtered_TSLA.count()
print("Number of days below $50 for Tesla:",num_days_TSLA)

filtered_AMZN = AMZN.filter(AMZN.Close < 50)
num_days_AMZN = filtered_AMZN.count()
print("Number of days below $50 for Amazon:",num_days_AMZN)


Number of days below $50 for Walmart: 0
Number of days below $50 for Tesla: 1213
Number of days below $50 for Amazon: 0


In [154]:
WAL = WAL.withColumn("Year", WAL.Date.substr(-4, 4))
max_high_per_year_WAL = WAL.groupBy("Year").agg({"High": "max"}).orderBy("Year")
print("Walmart")
max_high_per_year_WAL.show()

TSLA = TSLA.withColumn("Year", TSLA.Date.substr(-4, 4))
max_high_per_year_TSLA = TSLA.groupBy("Year").agg({"High": "max"}).orderBy("Year")
print("Tesla")
max_high_per_year_TSLA.show()

AMZN = AMZN.withColumn("Year", AMZN.Date.substr(-4, 4))
max_high_per_year_AMZN = AMZN.groupBy("Year").agg({"High": "max"}).orderBy("Year")
print("Amazon")
max_high_per_year_AMZN.show()


Walmart
+----+----------+
|Year| max(High)|
+----+----------+
|2012| 77.599998|
|2013| 81.370003|
|2014| 88.089996|
|2015| 90.970001|
|2016| 75.190002|
|2017|100.129997|
|2018|109.980003|
|2019|125.379997|
|2020|153.660004|
|2021|152.570007|
|2022|147.289993|
+----+----------+

Tesla
+----+----------+
|Year| max(High)|
+----+----------+
|2012|      7.99|
|2013| 38.900002|
|2014|    58.284|
|2015| 57.330002|
|2016|    53.868|
|2017| 77.921997|
|2018| 77.491997|
|2019| 87.061996|
|2020|718.719971|
|2021|1243.48999|
|2022|    1208.0|
+----+----------+

Amazon
+----+-----------+
|Year|  max(High)|
+----+-----------+
|2012| 264.109985|
|2013| 405.630005|
|2014| 408.059998|
|2015| 696.440002|
|2016| 847.210022|
|2017|1213.410034|
|2018|     2050.5|
|2019|2035.800049|
|2020|    3552.25|
|2021|3773.080078|
|2022|     3428.0|
+----+-----------+

