# Spark DataFrame Basics

In [1]:
import findspark

In [2]:
import os

# Tell findspark where the Spark installation lives. A SPARK_HOME value in the
# environment takes precedence so the notebook is not tied to one machine's
# directory layout; the original hard-coded location is kept as the fallback.
findspark.init(os.environ.get("SPARK_HOME",
                              "/usr/local/spark/spark-2.2.1-bin-hadoop2.7"))

In [3]:
import pyspark

In [4]:
from pyspark.sql import SparkSession

In [5]:
# Obtain the session used by every later cell: getOrCreate() reuses an existing
# session if there is one, otherwise it builds one named 'basics'.
spark = (
    SparkSession.builder
    .appName('basics')
    .getOrCreate()
)

In [6]:
# Read people.json into a DataFrame. No schema is passed here, so the column
# types are whatever Spark derives from the file (see printSchema() below).
df = spark.read.json('people.json')

In [7]:
# Print the DataFrame contents as a text table.
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [8]:
# Print each column's name, type and nullability as a tree.
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [9]:
# Column names as a plain Python list of strings.
df.columns

['age', 'name']

In [10]:
# describe() returns a new DataFrame of summary statistics (every column typed
# as string); evaluated on its own it only displays the repr, not the numbers.
df.describe()

DataFrame[summary: string, age: string, name: string]

In [11]:
# Call show() on the summary DataFrame to print count/mean/stddev/min/max.
df.describe().show()

+-------+------------------+-------+
|summary|               age|   name|
+-------+------------------+-------+
|  count|                 2|      3|
|   mean|              24.5|   null|
| stddev|7.7781745930520225|   null|
|    min|                19|   Andy|
|    max|                30|Michael|
+-------+------------------+-------+



The schema is an important part of a DataFrame and it should always be correct. Let's walk through how to define a schema explicitly.

In [12]:
from pyspark.sql.types import (StructField, StringType,
                               IntegerType, StructType)

    StructField('name of the column', Type or class, nullable or not)
  If nullable is specified as True, a missing value is OK; if it is False and the value is not present, it will throw an error.

In [13]:
# One StructField per column of the explicit schema: column name, Spark data
# type, and whether the column may contain nulls.
data_schema = [
    StructField(name='age', dataType=IntegerType(), nullable=True),
    StructField(name='name', dataType=StringType(), nullable=True),
]

In [14]:
# Wrap the list of fields in a StructType; this is the object handed to the
# reader as its schema.
final_struc = StructType(data_schema)

In [15]:
# Re-read the same file, this time with the explicit schema, so that `age` is
# loaded as an integer (the earlier schema-less read produced a long).
df = spark.read.json('people.json', schema=final_struc)

In [16]:
# Verify that the explicit schema was applied.
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- name: string (nullable = true)



In [17]:
# The displayed rows are the same as before the schema override.
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [18]:
# Attribute access on a DataFrame yields a Column object, not the data itself.
df.age

Column<b'age'>

In [19]:
# Confirm the type: the expression above evaluates to a pyspark Column.
type(df.age)

pyspark.sql.column.Column

In [20]:
# select() returns a DataFrame containing only the requested column, so it can be shown.
df.select('age').show()

+----+
| age|
+----+
|null|
|  30|
|  19|
+----+



In [21]:
# Unlike df.age (a Column), the result of select() is a DataFrame.
type(df.select('age'))

pyspark.sql.dataframe.DataFrame

In [22]:
# head(2) returns the first two rows as a Python list of Row objects.
df.head(2)

[Row(age=None, name='Michael'), Row(age=30, name='Andy')]

In [23]:
# Index into that list to get a single Row.
df.head(2)[0]

Row(age=None, name='Michael')

In [24]:
# Each element of the list is a pyspark Row.
type(df.head(2)[0])

pyspark.sql.types.Row

In [25]:
# Select several columns at once by passing a list of column names.
df.select(['age','name']).show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [26]:
# withColumn('new column name', column expression) returns a new DataFrame with
# the extra column. It does not change df in place: df stays exactly as it was
# unless the result is assigned back to it.
df.withColumn('double_age', df['age'] * 2).show()

+----+-------+----------+
| age|   name|double_age|
+----+-------+----------+
|null|Michael|      null|
|  30|   Andy|        60|
|  19| Justin|        38|
+----+-------+----------+



In [27]:
# Rename a column. This also returns a new DataFrame; it is not an in-place change.
df.withColumnRenamed('age','My_Age').show()

+------+-------+
|My_Age|   name|
+------+-------+
|  null|Michael|
|    30|   Andy|
|    19| Justin|
+------+-------+



In [28]:
# df is still the original: neither double_age nor My_Age appears in it.
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [29]:
# To query with Spark SQL, first register the DataFrame as a temporary view;
# the view name ('people') can then be used as a table name in SQL statements.
df.createOrReplaceTempView('people')
results = spark.sql("SELECT * FROM people")

In [30]:
# Display the rows returned by the SQL query.
results.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [31]:
# Filter through SQL: keep only the rows of the 'people' view whose age equals 30.
new_Results = spark.sql("SELECT * FROM people WHERE age=30")
new_Results.show()

+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+

