### DataFrame

A DataFrame is equivalent to a relational table in Spark SQL and can be created using various functions in `SparkSession`.

In [2]:
from pyspark.sql import SparkSession
# Obtain the Spark session for this notebook (an existing one is reused,
# otherwise a new one is created) under the application name "joins".
spark = (
    SparkSession.builder
    .appName("joins")
    .getOrCreate()
)

In [3]:
from pyspark.sql.types import IntegerType,StructField,StructType,DoubleType,LongType,DecimalType

In [4]:
# Explicit column types for the fact table CSV (fato_seguro.csv), so Spark does
# not have to infer them. The large identifiers are declared as
# DecimalType(30, 0), i.e. up to 30 digits with no fractional part.
# NOTE(review): Spark file sources may relax nullable=False to nullable on
# load -- confirm against the printSchema() output.
data_schema = [
    StructField("id_adesao", DecimalType(30, 0), nullable=False),
    StructField("id_cancelado", DecimalType(30, 0), nullable=True),
    StructField("id_vendedor", LongType(), nullable=True),
    StructField("id_portal", IntegerType(), nullable=True),
    StructField("id_loja", DecimalType(30, 0), nullable=True),
    StructField("id_dt_emissao", IntegerType(), nullable=True),
    StructField("id_dt_canc", IntegerType(), nullable=True),
    StructField("valor_premio", DoubleType(), nullable=False),
    StructField("valor_produto", DoubleType(), nullable=False),
]

# Wrap the field list in the StructType that the CSV reader accepts as a schema.
final_struc = StructType(data_schema)

In [5]:
# Load the fact table using the explicit schema; the first CSV line is a header.
fato = (
    spark.read
    .schema(final_struc)
    .option("header", True)
    .csv('/FileStore/tables/fato_seguro.csv')
)

# Preview the first five rows.
fato.show(5)

In [6]:
# Print the column names, types and nullability Spark actually applied to `fato`.
fato.printSchema()

In [7]:
# Load the dim_adesao dimension; column types are inferred from the data and
# the first CSV line is treated as the header.
dim_adesao = (
    spark.read
    .options(inferSchema=True, header=True)
    .csv('/FileStore/tables/dim_adesao.csv')
)

# Show the first row as a list of Row objects.
dim_adesao.head(1)

### Joins with DataFrames

In [9]:
# Right join: every dim_adesao row is kept, matched to fato rows where present.
# Joining on a column expression (rather than a column name) keeps the
# id_adesao column from both sides in the result.
adesao_match = fato.id_adesao == dim_adesao.id_adesao
fato.join(
    dim_adesao,
    on=[adesao_match],
    how='right',
).head(1)

In [10]:
# Build the joined and filtered DataFrame once instead of on every iteration:
# inner join of fato with dim_adesao on id_adesao, keeping only the rows whose
# id_cancelado is null.
fato_sem_cancelamento = (
    fato.join(dim_adesao, on='id_adesao', how='inner')
    .filter(fato.id_cancelado.isNull())
)

# For x = 1 and x = 2, print the first x rows of that result, followed by a
# separator line.
for x in range(1, 3):
    print("Register number: %i" % x)
    print(fato_sem_cancelamento.head(x))
    print('---' * 30)

In [11]:
# Load the dim_cancelado dimension; column types are inferred from the data and
# the first CSV line is treated as the header.
dim_cancelado = (
    spark.read
    .options(inferSchema=True, header=True)
    .csv('/FileStore/tables/dim_cancelado.csv')
)

# Show the first row as a list of Row objects.
dim_cancelado.head(1)

In [12]:
# Left join: every fato row is kept; cancellation details are attached where
# id_cancelado matches. Joining by column name yields a single id_cancelado
# column in the result.
fato_com_cancelado = fato.join(dim_cancelado, on='id_cancelado', how='left')

# Project three columns under source-qualified labels and preview five rows.
# NOTE(review): aliases containing dots need backtick quoting if these columns
# are referenced by name later.
fato_com_cancelado.select(
    fato.id_adesao.alias('fato.id_adesao'),
    fato.id_cancelado.alias('fato.id_cancelado'),
    dim_cancelado.data_cancelamento.alias('dim_cancelado.dt_canc'),
).show(5)

### SQL

To run SQL queries directly against a DataFrame, first register it as a temporary view:

In [14]:
# Register each DataFrame as a SQL temporary view under its own name so it can
# be referenced from spark.sql(); an existing view with the same name is replaced.
for view_name, frame in (
    ("fato", fato),
    ("dim_adesao", dim_adesao),
    ("dim_cancelado", dim_cancelado),
):
    frame.createOrReplaceTempView(view_name)

In [15]:
# Query the temporary views: fato rows with valor_premio above 500, joined to
# dim_adesao (inner join) and to dim_cancelado where a matching id_cancelado
# exists (left join).
premium_query = '''SELECT d.nome_segurado, a.id_adesao, d.nome_aparelho,a.valor_premio, c.id_cancelado  
              FROM fato a
              JOIN dim_adesao d
              ON d.id_adesao = a.id_adesao
              LEFT JOIN dim_cancelado c
              ON c.id_cancelado = a.id_cancelado
              WHERE a.valor_premio > 500'''

# show() with no arguments prints at most the first 20 rows.
spark.sql(premium_query).show()