In [0]:
from pyspark.sql.types import StructType, StructField,StringType, IntegerType, LongType

In [0]:
df_flightdata = spark.read.format('json')\
    .option("header", True)\
        .option("inferSchema", True)\
            .load('/FileStore/tables/2015_summary.json')
#Transformation

In [0]:
display(df_flightdata.schema)

StructType([StructField('DEST_COUNTRY_NAME', StringType(), True), StructField('ORIGIN_COUNTRY_NAME', StringType(), True), StructField('count', LongType(), True)])

In [0]:
manualSchema = StructType(
    [
        StructField("DEST_COUNTRY_NAME", StringType(), True),
        StructField("ORIGIN_COUNTRY_NAME", StringType(), True),
        StructField("count", IntegerType(), True),
    ]
)

In [0]:
df =spark.read.format('json')\
    .option("header", True)\
        .schema(manualSchema)\
            .load('/FileStore/tables/2015_summary.json')

In [0]:
df.printSchema()
display(df.schema)

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: integer (nullable = true)

StructType([StructField('DEST_COUNTRY_NAME', StringType(), True), StructField('ORIGIN_COUNTRY_NAME', StringType(), True), StructField('count', IntegerType(), True)])

In [0]:
from pyspark.sql.functions import col, column
col("someColumnName")
column("someColumnName")

Out[53]: Column<'someColumnName'>

In [0]:
from pyspark.sql.functions import expr
expr("(((someCol + 5) * 200) - 6) < otherCol")

Out[54]: Column<'((((someCol + 5) * 200) - 6) < otherCol)'>

In [0]:
from pyspark.sql import Row
myRow = Row("Hello", None, 1, False)

In [0]:
print(myRow[0])
print(myRow[3])

Hello
False


In [0]:
from pyspark.sql import Row
from pyspark.sql.types import StructField, StructType, StringType, LongType
myManualSchema = StructType([
  StructField("some", StringType(), True),
  StructField("col", StringType(), True),
  StructField("names", LongType(), False)
])
myRow = Row("Hello", None, 1)
myDf = spark.createDataFrame([myRow], myManualSchema)

In [0]:
myDf.display()
myDf.printSchema()

some,col,names
Hello,,1


root
 |-- some: string (nullable = true)
 |-- col: string (nullable = true)
 |-- names: long (nullable = false)



In [0]:
df.select(expr("DEST_COUNTRY_NAME AS destination")).display()

destination
United States
United States
United States
Egypt
United States
United States
United States
Costa Rica
Senegal
Moldova


In [0]:
df.selectExpr("DEST_COUNTRY_NAME as destination", "DEST_COUNTRY_NAME").show(2)

+-------------+-----------------+
|  destination|DEST_COUNTRY_NAME|
+-------------+-----------------+
|United States|    United States|
|United States|    United States|
+-------------+-----------------+
only showing top 2 rows



In [0]:
df.display()

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count
United States,Romania,15
United States,Croatia,1
United States,Ireland,344
Egypt,United States,15
United States,India,62
United States,Singapore,1
United States,Grenada,62
Costa Rica,United States,588
Senegal,United States,40
Moldova,United States,1


In [0]:
df.selectExpr(
  "*", # all original columns
  "(DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME) as withinCountry")\
      .where(col("withinCountry") == True)\
  .display()

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count,withinCountry
United States,United States,370002,True


In [0]:
df.selectExpr("avg(count) as countAvg", "count(distinct(DEST_COUNTRY_NAME)) as distCountry").display()

countAvg,distCountry
1770.765625,132


In [0]:
from pyspark.sql.functions import lit
df.select(expr("*"), lit(100).alias("One")).show(2)

+-----------------+-------------------+-----+---+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|One|
+-----------------+-------------------+-----+---+
|    United States|            Romania|   15|100|
|    United States|            Croatia|    1|100|
+-----------------+-------------------+-----+---+
only showing top 2 rows



In [0]:
df.withColumn("numberOne", lit(1)).show(2)

+-----------------+-------------------+-----+---------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|numberOne|
+-----------------+-------------------+-----+---------+
|    United States|            Romania|   15|        1|
|    United States|            Croatia|    1|        1|
+-----------------+-------------------+-----+---------+
only showing top 2 rows



In [0]:
df.withColumn("withinCountry", expr("ORIGIN_COUNTRY_NAME == DEST_COUNTRY_NAME"))\
    .filter(col("withinCountry") == True).display()

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count,withinCountry
United States,United States,370002,True


In [0]:
df.withColumnRenamed("DEST_COUNTRY_NAME", "dest").columns

Out[67]: ['dest', 'ORIGIN_COUNTRY_NAME', 'count']

In [0]:
dfWithLongColName = df.withColumn(
    "This Long Column-Name",
    expr("ORIGIN_COUNTRY_NAME"))

In [0]:
dfWithLongColName.selectExpr(
    "`This Long Column-Name`",
    "`This Long Column-Name` as `new col`")\
  .show(2)

+---------------------+-------+
|This Long Column-Name|new col|
+---------------------+-------+
|              Romania|Romania|
|              Croatia|Croatia|
+---------------------+-------+
only showing top 2 rows



In [0]:
dfWithLongColName.select(expr("`This Long Column-Name`")).columns

Out[70]: ['This Long Column-Name']

In [0]:
df.filter(col("count") < 2).where(col("ORIGIN_COUNTRY_NAME") != "Croatia").limit(2).display()

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count
United States,Singapore,1
Moldova,United States,1


In [0]:
df.select("ORIGIN_COUNTRY_NAME", "DEST_COUNTRY_NAME").distinct()

Out[72]: DataFrame[ORIGIN_COUNTRY_NAME: string, DEST_COUNTRY_NAME: string]

In [0]:
from pyspark.sql import Row

schema = df.schema
newRows = [
    Row("New Country", "Other Country", 5),
    Row("New Country 2", "Other Country 3", 1),
]
parallelizedRows = spark.sparkContext.parallelize(newRows)
newDF = spark.createDataFrame(parallelizedRows, schema)
display(newDF)

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count
New Country,Other Country,5
New Country 2,Other Country 3,1


In [0]:
df.union(newDF).display()

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count
United States,Romania,15
United States,Croatia,1
United States,Ireland,344
Egypt,United States,15
United States,India,62
United States,Singapore,1
United States,Grenada,62
Costa Rica,United States,588
Senegal,United States,40
Moldova,United States,1


In [0]:
df.sort("count").show(5)
df.orderBy("count", "DEST_COUNTRY_NAME").show(5)
df.orderBy(col("count"), col("DEST_COUNTRY_NAME")).show(5)

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|               Malta|      United States|    1|
|Saint Vincent and...|      United States|    1|
|       United States|            Croatia|    1|
|       United States|          Gibraltar|    1|
|       United States|          Singapore|    1|
+--------------------+-------------------+-----+
only showing top 5 rows

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|     Burkina Faso|      United States|    1|
|    Cote d'Ivoire|      United States|    1|
|           Cyprus|      United States|    1|
|         Djibouti|      United States|    1|
|        Indonesia|      United States|    1|
+-----------------+-------------------+-----+
only showing top 5 rows

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--

In [0]:
from pyspark.sql.functions import desc, asc
df.orderBy(expr("count desc")).show(2)
df.orderBy(col("count").desc(), col("ORIGIN_COUNTRY_NAME").asc()).show(10)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|          Moldova|      United States|    1|
|    United States|            Croatia|    1|
+-----------------+-------------------+-----+
only showing top 2 rows

+-----------------+-------------------+------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME| count|
+-----------------+-------------------+------+
|    United States|      United States|370002|
|    United States|             Canada|  8483|
|           Canada|      United States|  8399|
|    United States|             Mexico|  7187|
|           Mexico|      United States|  7140|
|   United Kingdom|      United States|  2025|
|    United States|     United Kingdom|  1970|
|            Japan|      United States|  1548|
|    United States|              Japan|  1496|
|          Germany|      United States|  1468|
+-----------------+-------------------+------+
only showing top 10 rows



In [0]:
df.limit(5).display()

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count
United States,Romania,15
United States,Croatia,1
United States,Ireland,344
Egypt,United States,15
United States,India,62


In [0]:
df.orderBy(expr("count desc")).limit(6).show()

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|               Malta|      United States|    1|
|Saint Vincent and...|      United States|    1|
|       United States|            Croatia|    1|
|       United States|          Gibraltar|    1|
|       United States|          Singapore|    1|
|             Moldova|      United States|    1|
+--------------------+-------------------+-----+



In [0]:
df.orderBy(col("count").desc()).limit(6).display()

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count
United States,United States,370002
United States,Canada,8483
Canada,United States,8399
United States,Mexico,7187
Mexico,United States,7140
United Kingdom,United States,2025
