In [0]:
from pyspark.sql.types import StructType, StructField,StringType, IntegerType, LongType

In [0]:
df_flightdata = spark.read.format('json')\
    .option("header", True)\
        .option("inferSchema", True)\
            .load('/FileStore/tables/2015_summary.json')
#Transformation

In [0]:
display(df_flightdata.schema)

StructType([StructField('DEST_COUNTRY_NAME', StringType(), True), StructField('ORIGIN_COUNTRY_NAME', StringType(), True), StructField('count', LongType(), True)])

In [0]:
manualSchema = StructType(
    [
        StructField("DEST_COUNTRY_NAME", StringType(), True),
        StructField("ORIGIN_COUNTRY_NAME", StringType(), True),
        StructField("count", IntegerType(), True),
    ]
)

In [0]:
df =spark.read.format('json')\
    .option("header", True)\
        .schema(manualSchema)\
            .load('/FileStore/tables/2015_summary.json')

In [0]:
df.printSchema()
display(df.schema)

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: integer (nullable = true)

StructType([StructField('DEST_COUNTRY_NAME', StringType(), True), StructField('ORIGIN_COUNTRY_NAME', StringType(), True), StructField('count', IntegerType(), True)])

In [0]:
from pyspark.sql.functions import col, column
col("someColumnName")
column("someColumnName")

Out[20]: Column<'someColumnName'>

In [0]:
from pyspark.sql.functions import expr
expr("(((someCol + 5) * 200) - 6) < otherCol")

Out[21]: Column<'((((someCol + 5) * 200) - 6) < otherCol)'>

In [0]:
from pyspark.sql import Row
myRow = Row("Hello", None, 1, False)

In [0]:
print(myRow[0])
print(myRow[4])

Hello


[0;31m---------------------------------------------------------------------------[0m
[0;31mIndexError[0m                                Traceback (most recent call last)
File [0;32m<command-773797539336978>:2[0m
[1;32m      1[0m [38;5;28mprint[39m(myRow[[38;5;241m0[39m])
[0;32m----> 2[0m [38;5;28mprint[39m(myRow[[38;5;241m4[39m])

File [0;32m/databricks/spark/python/pyspark/sql/types.py:2027[0m, in [0;36mRow.__getitem__[0;34m(self, item)[0m
[1;32m   2025[0m [38;5;28;01mdef[39;00m [38;5;21m__getitem__[39m([38;5;28mself[39m, item: Any) [38;5;241m-[39m[38;5;241m>[39m Any:
[1;32m   2026[0m     [38;5;28;01mif[39;00m [38;5;28misinstance[39m(item, ([38;5;28mint[39m, [38;5;28mslice[39m)):
[0;32m-> 2027[0m         [38;5;28;01mreturn[39;00m [38;5;28;43msuper[39;49m[43m([49m[43mRow[49m[43m,[49m[43m [49m[38;5;28;43mself[39;49m[43m)[49m[38;5;241;43m.[39;49m[38;5;21;43m__getitem__[39;49m[43m([49m[43mitem[49m[43m)[49m
[1;32m   

In [0]:
from pyspark.sql import Row
from pyspark.sql.types import StructField, StructType, StringType, LongType
myManualSchema = StructType([
  StructField("some", StringType(), True),
  StructField("col", StringType(), True),
  StructField("names", LongType(), False)
])
myRow = Row("Hello", None, 1)
myDf = spark.createDataFrame([myRow], myManualSchema)

In [0]:
myDf.display()
myDf.printSchema()

some,col,names
Hello,,1


root
 |-- some: string (nullable = true)
 |-- col: string (nullable = true)
 |-- names: long (nullable = false)



In [0]:
df.select(expr("DEST_COUNTRY_NAME AS destination")).display()

destination
United States
United States
United States
Egypt
United States
United States
United States
Costa Rica
Senegal
Moldova


In [0]:
df.selectExpr("DEST_COUNTRY_NAME as destination", "DEST_COUNTRY_NAME").show(2)

+-------------+-----------------+
|  destination|DEST_COUNTRY_NAME|
+-------------+-----------------+
|United States|    United States|
|United States|    United States|
+-------------+-----------------+
only showing top 2 rows



In [0]:
df.display()

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count
United States,Romania,15
United States,Croatia,1
United States,Ireland,344
Egypt,United States,15
United States,India,62
United States,Singapore,1
United States,Grenada,62
Costa Rica,United States,588
Senegal,United States,40
Moldova,United States,1


In [0]:
df.selectExpr(
  "*", # all original columns
  "(DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME) as withinCountry")\
      .where(col("withinCountry") == True)\
  .display()

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count,withinCountry
United States,United States,370002,True


In [0]:
df.selectExpr("avg(count) as countAvg", "count(distinct(DEST_COUNTRY_NAME)) as distCountry").display()

countAvg,distCountry
1770.765625,132


In [0]:
from pyspark.sql.functions import lit
df.select(expr("*"), lit(100).alias("One")).show(2)

+-----------------+-------------------+-----+---+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|One|
+-----------------+-------------------+-----+---+
|    United States|            Romania|   15|100|
|    United States|            Croatia|    1|100|
+-----------------+-------------------+-----+---+
only showing top 2 rows



In [0]:
df.withColumn("numberOne", lit(1)).show(2)

+-----------------+-------------------+-----+---------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|numberOne|
+-----------------+-------------------+-----+---------+
|    United States|            Romania|   15|        1|
|    United States|            Croatia|    1|        1|
+-----------------+-------------------+-----+---------+
only showing top 2 rows



In [0]:
df.withColumn("withinCountry", expr("ORIGIN_COUNTRY_NAME == DEST_COUNTRY_NAME"))\
    .filter(col("withinCountry") == True).display()

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count,withinCountry
United States,United States,370002,True


In [0]:
df.withColumnRenamed("DEST_COUNTRY_NAME", "dest").columns

Out[49]: ['dest', 'ORIGIN_COUNTRY_NAME', 'count']

In [0]:
dfWithLongColName = df.withColumn(
    "This Long Column-Name",
    expr("ORIGIN_COUNTRY_NAME"))

In [0]:
dfWithLongColName.selectExpr(
    "`This Long Column-Name`",
    "`This Long Column-Name` as `new col`")\
  .show(2)

+---------------------+-------+
|This Long Column-Name|new col|
+---------------------+-------+
|              Romania|Romania|
|              Croatia|Croatia|
+---------------------+-------+
only showing top 2 rows

