# Spark DataFrame Basic Operations

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Get the active SparkSession for this notebook, creating it if none exists.
spark = (
    SparkSession.builder
    .appName('oops')
    .getOrCreate()
)

In [5]:
# Load the stock CSV: the first row supplies the column names and the
# column types are inferred from the data rather than read as strings.
df = spark.read.csv(
    'appl_stock.csv',
    header=True,
    inferSchema=True,
)

In [6]:
# Print the column names, inferred types and nullability as a tree.
df.printSchema()

root
 |-- Date: timestamp (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Adj Close: double (nullable = true)



In [7]:
# Render the leading rows of the DataFrame as a text table.
df.show()

+-------------------+------------------+------------------+------------------+------------------+---------+------------------+
|               Date|              Open|              High|               Low|             Close|   Volume|         Adj Close|
+-------------------+------------------+------------------+------------------+------------------+---------+------------------+
|2010-01-04 00:00:00|        213.429998|        214.499996|212.38000099999996|        214.009998|123432400|         27.727039|
|2010-01-05 00:00:00|        214.599998|        215.589994|        213.249994|        214.379993|150476200|27.774976000000002|
|2010-01-06 00:00:00|        214.379993|            215.23|        210.750004|        210.969995|138040000|27.333178000000004|
|2010-01-07 00:00:00|            211.75|        212.000006|        209.050005|            210.58|119282800|          27.28265|
|2010-01-08 00:00:00|        210.299994|        212.000006|209.06000500000002|211.98000499999998|111902700|    

In [11]:
# Filter with a SQL-style string predicate (Close below 200), then
# display only the Open and Close columns.
(
    df
    .filter("Close < 200")
    .select(["Open", "Close"])
    .show()
)

+------------------+------------------+
|              Open|             Close|
+------------------+------------------+
|206.78000600000001|            197.75|
|        204.930004|        199.289995|
|        201.079996|        192.060003|
|192.36999699999998|        194.729998|
|        195.909998|        195.859997|
|        195.169994|        199.229994|
|        196.730003|        192.050003|
|192.63000300000002|        195.460001|
|        195.690006|194.11999699999998|
|        196.419996|196.19000400000002|
|        195.889997|195.12000700000002|
|        194.880001|        198.669994|
|        199.999998|        197.059998|
|         92.699997|         93.699997|
|         94.730003|             94.25|
|         94.129997|         93.860001|
|         94.040001|         92.290001|
|         92.199997|         91.279999|
|         91.510002|         92.199997|
|         92.309998| 92.08000200000001|
+------------------+------------------+
only showing top 20 rows



In [13]:
# Same filter expressed with a Column comparison instead of a SQL string,
# then display only the Open and Close columns.
(
    df
    .filter(df['Close'] < 200)
    .select(["Open", "Close"])
    .show()
)

+------------------+------------------+
|              Open|             Close|
+------------------+------------------+
|206.78000600000001|            197.75|
|        204.930004|        199.289995|
|        201.079996|        192.060003|
|192.36999699999998|        194.729998|
|        195.909998|        195.859997|
|        195.169994|        199.229994|
|        196.730003|        192.050003|
|192.63000300000002|        195.460001|
|        195.690006|194.11999699999998|
|        196.419996|196.19000400000002|
|        195.889997|195.12000700000002|
|        194.880001|        198.669994|
|        199.999998|        197.059998|
|         92.699997|         93.699997|
|         94.730003|             94.25|
|         94.129997|         93.860001|
|         94.040001|         92.290001|
|         92.199997|         91.279999|
|         91.510002|         92.199997|
|         92.309998| 92.08000200000001|
+------------------+------------------+
only showing top 20 rows



In [15]:
# Rows that opened above 100 but closed below 100, as a single predicate.
# Each comparison needs its own parentheses because Python's `&` binds
# tighter than `>` and `<`.
df.filter(
    (df['Open'] > 100) & (df['Close'] < 100)
).show()

+-------------------+------------------+----------+-----------------+---------+---------+-----------------+
|               Date|              Open|      High|              Low|    Close|   Volume|        Adj Close|
+-------------------+------------------+----------+-----------------+---------+---------+-----------------+
|2014-09-03 00:00:00|        103.099998|103.199997|98.58000200000001|98.940002|125421000|94.34521099999999|
|2014-09-25 00:00:00|        100.510002|100.709999|        97.720001|97.870003|100092000|93.32490200000001|
|2014-10-01 00:00:00|        100.589996|100.690002|        98.699997|    99.18| 51491300|        94.574063|
|2014-10-13 00:00:00|101.33000200000001|101.779999|        99.809998|99.809998| 53583400|        95.174803|
|2014-10-14 00:00:00|        100.389999|100.519997|            98.57|    98.75| 63688600|94.16403199999999|
|2016-01-12 00:00:00|        100.550003|100.690002|        98.839996|99.959999| 49154200|        97.362258|
|2016-01-13 00:00:00|       

In [16]:
# Apply the two conditions as successive filters; only rows that pass
# both (Open above 100, Close below 100) remain.
(
    df
    .filter(df['Open'] > 100)
    .filter(df['Close'] < 100)
    .show()
)

+-------------------+------------------+----------+-----------------+---------+---------+-----------------+
|               Date|              Open|      High|              Low|    Close|   Volume|        Adj Close|
+-------------------+------------------+----------+-----------------+---------+---------+-----------------+
|2014-09-03 00:00:00|        103.099998|103.199997|98.58000200000001|98.940002|125421000|94.34521099999999|
|2014-09-25 00:00:00|        100.510002|100.709999|        97.720001|97.870003|100092000|93.32490200000001|
|2014-10-01 00:00:00|        100.589996|100.690002|        98.699997|    99.18| 51491300|        94.574063|
|2014-10-13 00:00:00|101.33000200000001|101.779999|        99.809998|99.809998| 53583400|        95.174803|
|2014-10-14 00:00:00|        100.389999|100.519997|            98.57|    98.75| 63688600|94.16403199999999|
|2016-01-12 00:00:00|        100.550003|100.690002|        98.839996|99.959999| 49154200|        97.362258|
|2016-01-13 00:00:00|       

In [17]:
# Find the date on which Low was exactly 197.16.
# NOTE: this is an exact floating-point comparison; it matches here because
# the stored value is exactly 197.16, but it would be fragile for computed
# values.
(
    df
    .filter(df['Low'] == 197.16)
    .select("Date")
    .show()
)

+-------------------+
|               Date|
+-------------------+
|2010-01-22 00:00:00|
+-------------------+



In [21]:
# Materialize the matching rows as a Python list of Row objects
# (collect() brings every matching row back to the driver).
result = (
    df
    .filter(df['Low'] == 197.16)
    .collect()
)

In [23]:
# Inspect the collected list of Row objects (a single match here).
result

[Row(Date=datetime.datetime(2010, 1, 22, 0, 0), Open=206.78000600000001, High=207.499996, Low=197.16, Close=197.75, Volume=220441900, Adj Close=25.620401)]

In [24]:
# Index into the list to get the first Row.
result[0]

Row(Date=datetime.datetime(2010, 1, 22, 0, 0), Open=206.78000600000001, High=207.499996, Low=197.16, Close=197.75, Volume=220441900, Adj Close=25.620401)

In [25]:
# Keep a reference to the first collected Row for further inspection.
row = result[0]

In [28]:
# Convert the Row into a plain dict keyed by column name.
dic = row.asDict()

In [30]:
# Display the dict form of the row (column name -> value).
dic

{'Date': datetime.datetime(2010, 1, 22, 0, 0),
 'Open': 206.78000600000001,
 'High': 207.499996,
 'Low': 197.16,
 'Close': 197.75,
 'Volume': 220441900,
 'Adj Close': 25.620401}

In [31]:
# Look up a single field of the row by its column name.
dic["Volume"]

220441900