In [2]:
from pyspark.sql import SparkSession

# Obtain the session shared by every cell below: getOrCreate() hands back an
# existing session when there is one, otherwise it builds a new one with the
# application name "Spark Tutorials".
spark = (
    SparkSession.builder
    .appName("Spark Tutorials")
    .getOrCreate()
)

## Read a JSON file in Spark

In [74]:
# Load data.json into `df`; this is the DataFrame the later cells display,
# select from, union, join and sort.
df = spark.read.json(path="data.json")

## Read a Parquet file in Spark

In [None]:
# Read a Parquet file into its own DataFrame.
# The result is bound to `parquet_df` (not `df`) so that a top-to-bottom run
# does not overwrite the JSON-backed `df` that the following cells rely on.
# "filename.parquet" is a placeholder path (the earlier spelling
# "filname.paraquet" was a typo) -- point it at a real Parquet file.
parquet_df = spark.read.parquet("filename.parquet")

## Specify an explicit schema for a Spark DataFrame

In [None]:
# `types` supplies StructType / StructField and the column data types used below.
from pyspark.sql import types

# Explicit schema for the CSV: two string columns followed by two integer
# columns. The third StructField argument is `nullable`; False declares that
# the column must not contain nulls.
example_schema = types.StructType([
    types.StructField('page', types.StringType(), False),
    types.StructField('page_title', types.StringType(), False),
    types.StructField('page_views', types.IntegerType(), False),
    types.StructField('page_size', types.IntegerType(), False),
])

# NOTE(review): `inputs` (the CSV path to load) is not defined in any visible
# cell -- assign it before running this cell.
# The reader is given the schema defined above instead of inferring one.
data = spark.read.format('csv').load(inputs, schema=example_schema)

## Show a Spark DataFrame 

In [75]:
# Print the DataFrame as a text table; truncate=False keeps long cell values
# intact instead of shortening them.
df.show(
    truncate=False,
)

+---+-----+
|age|name |
+---+-----+
|25 |Sethu|
|26 |Manan|
|25 |Shree|
+---+-----+



## Select a column in Spark DataFrame 

In [134]:
# Project only the `name` column and display it.
df.select(df["name"]).show()

+-----+
| name|
+-----+
|Sethu|
|Manan|
|Shree|
+-----+



## Union of two DataFrames in Spark 

In [135]:
# Append df to itself. As the output below shows, union() keeps duplicate
# rows, so every row appears twice.
union_example = df.union(df)
union_example.show()

+---+-----+
|age| name|
+---+-----+
| 25|Sethu|
| 26|Manan|
| 25|Shree|
| 25|Sethu|
| 26|Manan|
| 25|Shree|
+---+-----+



## Explode an array column into rows in a Spark DataFrame

In [136]:
from pyspark.sql.functions import explode

# Second sample dataset; the output further down shows it has `age`,
# `grades` (an array) and `name` columns.
df2 = spark.read.json("data2.json")

# explode() emits one output row per element of the `grades` array; the
# resulting single column is named `grades_column`.
explode_example = df2.select(
    explode(df2["grades"]).alias("grades_column")
)
explode_example.show()

+-------------+
|grades_column|
+-------------+
|            A|
|            A|
|            B|
|            B|
|            B|
|            B|
|            A|
|            C|
|            B|
+-------------+



## Filter rows in a Spark DataFrame

In [137]:
# Keep only the rows whose `name` equals "Kashish".
filter_example = df2.filter(df2["name"] == "Kashish")
filter_example.show()

+---+---------+-------+
|age|   grades|   name|
+---+---------+-------+
| 24|[B, B, B]|Kashish|
+---+---------+-------+



## Where clause in Spark DataFrame

In [138]:
# Same row selection as the filter() cell, written with where(); the output
# below matches the filter() result.
where_example = df2.where(df2["name"] == "Kashish")
where_example.show()

+---+---------+-------+
|age|   grades|   name|
+---+---------+-------+
| 24|[B, B, B]|Kashish|
+---+---------+-------+



## Sorting in Spark DataFrame

In [139]:
# Order the rows by `age`; the output below is in ascending order.
sorting_example = df2.sort(df2["age"])
sorting_example.show()

+---+---------+-------+
|age|   grades|   name|
+---+---------+-------+
| 23|[A, C, B]| Slavvy|
| 24|[B, B, B]|Kashish|
| 25|[A, A, B]|  Ankit|
+---+---------+-------+



## Filling null values in Spark DataFrame

In [140]:
# Replace null values with the string "--". The sample data shown below
# contains no nulls, so the displayed table is unchanged.
fill_null_values = df.fillna(value="--")
fill_null_values.show()

+---+-----+
|age| name|
+---+-----+
| 25|Sethu|
| 26|Manan|
| 25|Shree|
+---+-----+



## Groupby in Spark DataFrame

In [141]:
# Count how many rows share each `name` in the self-unioned DataFrame
# (each name occurs twice because the data was unioned with itself).
groupby_Example = union_example.groupby(union_example["name"]).count()
groupby_Example.show()

+-----+-----+
| name|count|
+-----+-----+
|Sethu|    2|
|Manan|    2|
|Shree|    2|
+-----+-----+



## Sum of a column in Spark DataFrame

In [142]:
# Group the rows by `age` and total the `age` column within each group; the
# aggregate column is named `sum(age)`.
# A trailing `.alias("sum")` on the result was dropped: DataFrame.alias()
# names the DataFrame itself and does not rename the `sum(age)` column, so it
# had no effect on the displayed result. To rename the column instead, use
# `.agg(pyspark.sql.functions.sum("age").alias("sum"))`.
sum_example = df2.groupby(df2.age).sum("age")
sum_example.show()

+---+--------+
|age|sum(age)|
+---+--------+
| 25|      25|
| 23|      23|
| 24|      24|
+---+--------+



## Join in Spark DataFrame

In [143]:
# Join df with df2 on equal `age`. Only rows with a matching age appear in the
# output, and because the condition is an expression (not a shared column
# name) the result carries both frames' `age` and `name` columns.
join_example = df.join(df2, on=(df["age"] == df2["age"]))
join_example.show()

+---+-----+---+---------+-----+
|age| name|age|   grades| name|
+---+-----+---+---------+-----+
| 25|Shree| 25|[A, A, B]|Ankit|
| 25|Sethu| 25|[A, A, B]|Ankit|
+---+-----+---+---------+-----+



## Order by in Spark DataFrame

In [144]:
# Order df by `age`; the output below is in ascending order.
orderby_example = df.orderBy(df["age"])
orderby_example.show()

+---+-----+
|age| name|
+---+-----+
| 25|Shree|
| 25|Sethu|
| 26|Manan|
+---+-----+



## Add a new column in Spark DataFrame

In [145]:
from pyspark.sql.functions import lit

# Add a column `new_col` whose value is the literal 1 on every row; lit()
# wraps the Python value as a column expression.
new_column_example = df.withColumn(
    "new_col",
    lit(1),
)
new_column_example.show()

+---+-----+-------+
|age| name|new_col|
+---+-----+-------+
| 25|Sethu|      1|
| 26|Manan|      1|
| 25|Shree|      1|
+---+-----+-------+

