In [7]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [8]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pandas as pd
from IPython.display import display

In [9]:
# Application name handed to SparkSession.builder.appName() when the session is created.
spark_application_name = "Spark_Application_Name"

In [10]:
# Create the Spark session under the configured application name, or reuse the one
# that is already running.
spark = (
    SparkSession.builder
    .appName(spark_application_name)
    .getOrCreate()
)

In [11]:
google_path = "GOOGLE.csv"

# Load the CSV: the first line supplies the column names, column types are inferred
# from the data, quoted values may span several lines, and '"' is the escape character.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("multiLine", "true")
    .option("escape", '"')
    .csv(google_path)
)

In [12]:
from functools import reduce
from pyspark.sql import DataFrame

In [13]:
# Show the DataFrame's repr; for a Spark DataFrame this lists column names and types, not rows.
display(df)

DataFrame[Date: timestamp, High: double, Low: double, Open: double, Close: double, Volume: int, Adj Close: double, company_name: string]

In [14]:
# Print the schema inferred from the CSV (column names, types and nullability).
df.printSchema()

root
 |-- Date: timestamp (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Open: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Adj Close: double (nullable = true)
 |-- company_name: string (nullable = true)



In [15]:
from pyspark.sql.functions import *

# Replace the inferred timestamp column "Date" with a plain date column of the same name.
df = df.withColumn(
    "Date",
    to_date("Date", "yyyy-MM-dd"),
)

# Preview the first rows of the converted frame.
df.show()

+----------+-----------------+-----------------+-----------------+-----------------+-------+-----------------+------------+
|      Date|             High|              Low|             Open|            Close| Volume|        Adj Close|company_name|
+----------+-----------------+-----------------+-----------------+-----------------+-------+-----------------+------------+
|2017-01-03|789.6300048828125|775.7999877929688|778.8099975585938|786.1400146484375|1657300|786.1400146484375|      GOOGLE|
|2017-01-04|791.3400268554688|783.1599731445312|788.3599853515625|786.9000244140625|1073000|786.9000244140625|      GOOGLE|
|2017-01-05|  794.47998046875|  785.02001953125|786.0800170898438|  794.02001953125|1335200|  794.02001953125|      GOOGLE|
|2017-01-06|807.9000244140625|792.2039794921875| 795.260009765625|806.1500244140625|1640200|806.1500244140625|      GOOGLE|
|2017-01-09|809.9660034179688|802.8300170898438|806.4000244140625|806.6500244140625|1274600|806.6500244140625|      GOOGLE|
|2017-01

In [16]:
# Re-check the schema after the conversion: Date should now be a date, not a timestamp.
df.printSchema()

root
 |-- Date: date (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Open: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Adj Close: double (nullable = true)
 |-- company_name: string (nullable = true)



In [17]:
# Missing Volume values are replaced with 0 rather than dropping the affected rows.
# Rationale from the original analysis: the apple, microsoft and tesla DataFrames have 987
# null Volume values between them; removing every row that contains a null would delete
# those 987 rows and, with them, the other columns' values for those days, which are still
# important for analysing the remaining information of each day.
# NOTE(review): only GOOGLE.csv is loaded in the cells shown here - confirm the figures above.
cleanDF = df.fillna(0, subset=["Volume"])
cleanDF.show()

+----------+-----------------+-----------------+-----------------+-----------------+-------+-----------------+------------+
|      Date|             High|              Low|             Open|            Close| Volume|        Adj Close|company_name|
+----------+-----------------+-----------------+-----------------+-----------------+-------+-----------------+------------+
|2017-01-03|789.6300048828125|775.7999877929688|778.8099975585938|786.1400146484375|1657300|786.1400146484375|      GOOGLE|
|2017-01-04|791.3400268554688|783.1599731445312|788.3599853515625|786.9000244140625|1073000|786.9000244140625|      GOOGLE|
|2017-01-05|  794.47998046875|  785.02001953125|786.0800170898438|  794.02001953125|1335200|  794.02001953125|      GOOGLE|
|2017-01-06|807.9000244140625|792.2039794921875| 795.260009765625|806.1500244140625|1640200|806.1500244140625|      GOOGLE|
|2017-01-09|809.9660034179688|802.8300170898438|806.4000244140625|806.6500244140625|1274600|806.6500244140625|      GOOGLE|
|2017-01

In [18]:
outputPath = "stocks-clean.parquet"

# Rename "Adj Close" to "AdjClose" before writing.
# NOTE(review): presumably done because of the space in the column name - confirm.
noNullsDF = cleanDF.withColumnRenamed("Adj Close", "AdjClose")

# Persist the cleaned data as Parquet, replacing any previous output at this path.
noNullsDF.write.parquet(outputPath, mode="overwrite")

## Add Prediction Column

In [19]:
# Read the cleaned Parquet data set back in as the starting point for building the label.
filePath = "stocks-clean.parquet"
stocksDF = spark.read.format("parquet").load(filePath)

In [20]:
from pyspark.sql.functions import lead, col
from pyspark.sql import Window

# Order rows by Date within each company, so the look-ahead never crosses from one
# company's series into another's. Partitioning also avoids an un-partitioned window,
# for which Spark moves every row into a single partition.
w = Window.partitionBy("company_name").orderBy("Date")

# "Next" holds the Volume of the following row in Date order. A row with no successor
# (the last row of a company) receives the default value 0.
stocksDF = stocksDF.withColumn("Next", lead("Volume", 1, 0).over(w))

In [21]:
# Keep only rows whose Next value is non-zero. This removes rows without a successor,
# whose Next is the default 0 supplied by lead().
# NOTE(review): null Volume values were filled with 0 earlier, so a row followed by such a
# day is dropped here as well - confirm that is intended.
stocksDF = stocksDF.where(col("Next") != 0.0)

# Show the resulting DataFrame's repr (column names and types).
display(stocksDF)

DataFrame[Date: date, High: double, Low: double, Open: double, Close: double, Volume: int, AdjClose: double, company_name: string, Next: int]

In [22]:
# Persist the labelled data set as Parquet, replacing any previous output at this path.
outputPath = "stocks-final.parquet"
stocksDF.write.parquet(outputPath, mode="overwrite")