# Some Basic DataFrame Operations

### Create a SparkSession with name 'Ops'

In [2]:
from pyspark.sql import SparkSession
# Obtain the session used throughout this notebook. getOrCreate() reuses an
# existing SparkSession when there is one; otherwise it creates a new one
# from this builder, whose application name is set to 'Ops'.
spark1 = (
    SparkSession.builder
    .appName('Ops')
    .getOrCreate()
)

### Read a CSV file and other basic operations

In [3]:
# Load the CSV file into a DataFrame: header=True takes the column names from
# the first line, and inferSchema=True asks Spark to infer each column's type
# from the data.
df = spark1.read.csv(
    'appl_data.csv',
    header=True,
    inferSchema=True,
)

In [4]:
# Print each column's name, inferred type and nullability as a tree.
df.printSchema()

root
 |-- Date: timestamp (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Adj Close: double (nullable = true)



In [5]:
# summary() builds a DataFrame of per-column statistics (count, mean, stddev,
# min, percentiles, ...); show() prints it as a text table.
(
    df
    .summary()
    .show()
)

+-------+------------------+------------------+------------------+------------------+-------------------+------------------+
|summary|              Open|              High|               Low|             Close|             Volume|         Adj Close|
+-------+------------------+------------------+------------------+------------------+-------------------+------------------+
|  count|              1762|              1762|              1762|              1762|               1762|              1762|
|   mean| 313.0763111589103| 315.9112880164581| 309.8282405079457| 312.9270656379113|9.422577587968218E7| 75.00174115607275|
| stddev|185.29946803981522|186.89817686485767|183.38391664371008| 185.1471036170943|6.020518776592709E7| 28.57492972179906|
|    min|              90.0|         90.699997|         89.470001|         90.279999|           11475900|         24.881912|
|    25%|        115.199997|        116.349998|             114.0|        115.190002|           49161400|         50.260037|


In [7]:
# Print the DataFrame as a text table; with no argument, show() prints at
# most the first 20 rows.
df.show()

+-------------------+------------------+------------------+------------------+------------------+---------+------------------+
|               Date|              Open|              High|               Low|             Close|   Volume|         Adj Close|
+-------------------+------------------+------------------+------------------+------------------+---------+------------------+
|2010-01-04 00:00:00|        213.429998|        214.499996|212.38000099999996|        214.009998|123432400|         27.727039|
|2010-01-05 00:00:00|        214.599998|        215.589994|        213.249994|        214.379993|150476200|27.774976000000002|
|2010-01-06 00:00:00|        214.379993|            215.23|        210.750004|        210.969995|138040000|27.333178000000004|
|2010-01-07 00:00:00|            211.75|        212.000006|        209.050005|            210.58|119282800|          27.28265|
|2010-01-08 00:00:00|        210.299994|        212.000006|209.06000500000002|211.98000499999998|111902700|    

In [8]:
# Column names as a plain Python list of strings.
df.columns

['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']

In [9]:
# head(2) returns the first two rows as a Python list of Row objects.
df.head(2)

[Row(Date=datetime.datetime(2010, 1, 4, 0, 0), Open=213.429998, High=214.499996, Low=212.38000099999996, Close=214.009998, Volume=123432400, Adj Close=27.727039),
 Row(Date=datetime.datetime(2010, 1, 5, 0, 0), Open=214.599998, High=215.589994, Low=213.249994, Close=214.379993, Volume=150476200, Adj Close=27.774976000000002)]

In [12]:
# Fetch the first two rows, then convert the second one (index 1) from a Row
# into a plain dict keyed by column name.
first_two_rows = df.head(2)
dict1 = first_two_rows[1].asDict()

In [13]:
# Display the dict built from the second row.
dict1

{'Date': datetime.datetime(2010, 1, 5, 0, 0),
 'Open': 214.599998,
 'High': 215.589994,
 'Low': 213.249994,
 'Close': 214.379993,
 'Volume': 150476200,
 'Adj Close': 27.774976000000002}

### Filtering Operations

In [14]:
# SQL-style filter: the condition is passed as a string expression.
# Prints the first 5 rows whose High is above 210.
(
    df
    .filter("High > 210")
    .show(5)
)

+-------------------+----------+----------+------------------+------------------+---------+------------------+
|               Date|      Open|      High|               Low|             Close|   Volume|         Adj Close|
+-------------------+----------+----------+------------------+------------------+---------+------------------+
|2010-01-04 00:00:00|213.429998|214.499996|212.38000099999996|        214.009998|123432400|         27.727039|
|2010-01-05 00:00:00|214.599998|215.589994|        213.249994|        214.379993|150476200|27.774976000000002|
|2010-01-06 00:00:00|214.379993|    215.23|        210.750004|        210.969995|138040000|27.333178000000004|
|2010-01-07 00:00:00|    211.75|212.000006|        209.050005|            210.58|119282800|          27.28265|
|2010-01-08 00:00:00|210.299994|212.000006|209.06000500000002|211.98000499999998|111902700|         27.464034|
+-------------------+----------+----------+------------------+------------------+---------+------------------+
only showing top 5 rows

In [18]:
# Keep the rows whose High is above 210, then project only the Date, Open
# and Close columns and print the first 3 rows.
high_rows = df.filter("High > 210")
high_rows.select('Date', 'Open', 'Close').show(3)

+-------------------+----------+----------+
|               Date|      Open|     Close|
+-------------------+----------+----------+
|2010-01-04 00:00:00|213.429998|214.009998|
|2010-01-05 00:00:00|214.599998|214.379993|
|2010-01-06 00:00:00|214.379993|210.969995|
+-------------------+----------+----------+
only showing top 3 rows



In [16]:
# Both conditions are combined with AND inside a single SQL expression string.
condition = "High > 210 AND Close > 215"
(
    df
    .filter(condition)
    .select(['Date', 'Open', 'Close'])
    .show(3)
)

+-------------------+------------------+------------------+
|               Date|              Open|             Close|
+-------------------+------------------+------------------+
|2010-01-19 00:00:00|        208.330002|        215.039995|
|2010-03-05 00:00:00|        214.940006|218.95000499999998|
|2010-03-08 00:00:00|220.01000200000001|        219.079994|
+-------------------+------------------+------------------+
only showing top 3 rows



#### The same operations can also be performed using DataFrame (column-expression) syntax

In [17]:
# Column-expression form of the earlier filter: comparing df['High'] with 210
# builds a Column condition instead of an SQL string.
high_above_210 = df['High'] > 210
df.filter(high_above_210).show(5)

+-------------------+----------+----------+------------------+------------------+---------+------------------+
|               Date|      Open|      High|               Low|             Close|   Volume|         Adj Close|
+-------------------+----------+----------+------------------+------------------+---------+------------------+
|2010-01-04 00:00:00|213.429998|214.499996|212.38000099999996|        214.009998|123432400|         27.727039|
|2010-01-05 00:00:00|214.599998|215.589994|        213.249994|        214.379993|150476200|27.774976000000002|
|2010-01-06 00:00:00|214.379993|    215.23|        210.750004|        210.969995|138040000|27.333178000000004|
|2010-01-07 00:00:00|    211.75|212.000006|        209.050005|            210.58|119282800|          27.28265|
|2010-01-08 00:00:00|210.299994|212.000006|209.06000500000002|211.98000499999998|111902700|         27.464034|
+-------------------+----------+----------+------------------+------------------+---------+------------------+
only showing top 5 rows

In [23]:
# Build each Column condition separately, then combine them with the `&`
# operator (Column conditions are combined with `&`, not Python's `and`).
high_above_210 = df["High"] > 210
close_above_215 = df["Close"] > 215
df.filter(high_above_210 & close_above_215).show(3)

+-------------------+------------------+------------------+------------------+------------------+---------+------------------+
|               Date|              Open|              High|               Low|             Close|   Volume|         Adj Close|
+-------------------+------------------+------------------+------------------+------------------+---------+------------------+
|2010-01-19 00:00:00|        208.330002|215.18999900000003|        207.240004|        215.039995|182501900|27.860484999999997|
|2010-03-05 00:00:00|        214.940006|219.69999500000003|214.62999900000003|218.95000499999998|224905100|28.367064000000003|
|2010-03-08 00:00:00|220.01000200000001|        220.090004|        218.250002|        219.079994|107472400|         28.383906|
+-------------------+------------------+------------------+------------------+------------------+---------+------------------+
only showing top 3 rows



In [24]:
# Exact-match filter on the Low column.
# NOTE(review): this is an equality test on a double column, so only rows whose
# stored value equals the literal 197.16 exactly will match.
low_is_197_16 = df['Low'] == 197.16
df.filter(low_is_197_16).show()

+-------------------+------------------+----------+------+------+---------+---------+
|               Date|              Open|      High|   Low| Close|   Volume|Adj Close|
+-------------------+------------------+----------+------+------+---------+---------+
|2010-01-22 00:00:00|206.78000600000001|207.499996|197.16|197.75|220441900|25.620401|
+-------------------+------------------+----------+------+------+---------+---------+



### Use the `collect` method instead of `show` to retrieve the actual data

In [25]:
# Same filter as the previous cell, but collect() returns the matching rows as
# a Python list of Row objects instead of printing a table.
low_data = (
    df
    .filter(df['Low'] == 197.16)
    .collect()
)

In [None]:
# Display the collected result.
low_data