# Spark DataFrame Basics

In [3]:
from pyspark.sql import SparkSession

In [4]:
# Entry point for the DataFrame API: build a session named "Basics", or reuse
# the session that is already active in this process (getOrCreate).
spark = SparkSession.builder.appName("Basics").getOrCreate()

In [6]:
# Load people.json into a DataFrame. No schema is passed, so Spark infers the
# column types from the data (see the printSchema() output further down).
df = spark.read.json('people.json')

In [7]:
# Print the DataFrame as a text table (show() displays at most 20 rows by default).
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [8]:
# Print column names, data types and nullability as a tree.
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [9]:
# Column names as a plain Python list of strings.
df.columns

['age', 'name']

**Specifying an explicit schema to control column data types**

In [12]:
from pyspark.sql.types import (StructField, StringType, 
                               IntegerType, StructType)

In [13]:
# One StructField per column: column name, Spark data type, and whether the
# column may contain nulls.
data_schema = [
    StructField(name='age', dataType=IntegerType(), nullable=True),
    StructField(name='name', dataType=StringType(), nullable=True),
]

In [14]:
# Wrap the list of fields in a StructType, the schema object accepted by the readers.
final_struc = StructType(fields=data_schema)

In [15]:
# Re-read the same file, this time with the explicit schema, so 'age' is loaded
# as integer instead of the inferred long. This rebinds df to the new DataFrame.
df = spark.read.json('people.json', schema=final_struc)

In [16]:
# Confirm that the explicit schema was applied ('age' should now be integer).
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- name: string (nullable = true)



In [17]:
# Indexing a DataFrame by column name yields a Column expression object,
# not the column's data.
type(df['age'])

pyspark.sql.column.Column

In [21]:
# select() returns a new DataFrame containing only the requested column,
# which can then be displayed with show().
df.select('age').show()

+----+
| age|
+----+
|null|
|  30|
|  19|
+----+



In [25]:
# head(2) returns the first two rows as a list of Row objects; [0] picks the first one.
df.head(2)[0]

Row(age=None, name='Michael')

In [27]:
# Project several columns at once; the column names are passed as separate arguments.
df.select('age', 'name').show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [31]:
# Add a derived column 'newage' = age + 1 (a null age yields a null result).
# withColumn() returns a new DataFrame; df itself is left unchanged unless the
# result is assigned back to a variable.
df.withColumn('newage', df['age'] + 1).show()

+----+-------+------+
| age|   name|newage|
+----+-------+------+
|null|Michael|  null|
|  30|   Andy|    31|
|  19| Justin|    20|
+----+-------+------+



In [30]:
# The original DataFrame still has only the 'age' and 'name' columns.
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [33]:
# Rename the 'age' column to 'my_new_age'. Like withColumn(), this returns a
# new DataFrame and does not modify df.
df.withColumnRenamed('age', 'my_new_age').show()

+----------+-------+
|my_new_age|   name|
+----------+-------+
|      null|Michael|
|        30|   Andy|
|        19| Justin|
+----------+-------+



In [38]:
# To query the DataFrame with SQL, register it as a temporary view named
# 'people' (replacing any existing view with that name in this session).
df.createOrReplaceTempView('people')

In [39]:
# spark.sql() runs the query against the registered view and returns a DataFrame.
results = spark.sql('SELECT * FROM people')

In [40]:
# Display the rows returned by the SQL query.
results.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [42]:
# Filter with a SQL WHERE clause; rows whose age is null do not match age=30.
new_results = spark.sql("SELECT * FROM people WHERE age=30")

In [43]:
# Display the filtered rows.
new_results.show()

+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+



# Spark DataFrames Basic Operations

In [44]:
# getOrCreate() returns the already-active session if one exists (e.g. the
# "Basics" session created earlier in this notebook) rather than building a
# new one, so the 'ops' app name may not take effect on the running application.
# NOTE(review): call spark.stop() beforehand if a fresh session is really needed.
spark = SparkSession.builder.appName('ops').getOrCreate()

In [45]:
# Load the stock-price CSV. header=True takes column names from the first
# line; inferSchema=True makes Spark scan the data to choose column types
# instead of reading every column as a string. This rebinds df.
df = spark.read.csv('appl_stock.csv', inferSchema=True, header=True)

In [46]:
# Preview the loaded data (show() prints at most 20 rows by default).
df.show()

+----------+------------------+------------------+------------------+------------------+---------+------------------+
|      Date|              Open|              High|               Low|             Close|   Volume|         Adj Close|
+----------+------------------+------------------+------------------+------------------+---------+------------------+
|2010-01-04|        213.429998|        214.499996|212.38000099999996|        214.009998|123432400|         27.727039|
|2010-01-05|        214.599998|        215.589994|        213.249994|        214.379993|150476200|27.774976000000002|
|2010-01-06|        214.379993|            215.23|        210.750004|        210.969995|138040000|27.333178000000004|
|2010-01-07|            211.75|        212.000006|        209.050005|            210.58|119282800|          27.28265|
|2010-01-08|        210.299994|        212.000006|209.06000500000002|211.98000499999998|111902700|         27.464034|
|2010-01-11|212.79999700000002|        213.000002|      