### DataFrame Joining and Grouping

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Obtain the notebook's Spark session (getOrCreate reuses an existing one if present).
spark = (
    SparkSession.builder
    .appName('Joining and Grouping')
    .getOrCreate()
)

In [22]:
# Load the pipe-delimited LogIdentifier reference table (header row, inferred
# column types) and keep only the rows whose PrimaryFG column equals 1.
logs = spark.read.csv(
    './DataAnalysisPythonPySpark/data/data/broadcast_logs/ReferenceTables/LogIdentifier.csv',
    sep="|",
    header=True,
    inferSchema=True,
).where(F.col("PrimaryFG") == 1)

In [23]:
# Display the column names and inferred types of the filtered `logs` DataFrame.
logs.printSchema()

# Report how many rows remain after the PrimaryFG == 1 filter.
primary_record_count = logs.count()
print("Number of Records: ", primary_record_count)

root
 |-- LogIdentifierID: string (nullable = true)
 |-- LogServiceID: integer (nullable = true)
 |-- PrimaryFG: integer (nullable = true)

Number of Records:  758


In [15]:
# Load the pipe-delimited broadcast-log sample (header row, inferred column
# types, timestamps parsed with the 'yyyy-MM-dd' pattern).
csvlogs = spark.read.csv(
    './DataAnalysisPythonPySpark/data/data/broadcast_logs/BroadcastLogs_2018_Q3_M8_sample.csv',
    sep='|',
    header=True,
    inferSchema=True,
    timestampFormat='yyyy-MM-dd',
)

# Preview the first five values of LogServiceID, the column used later as the join key.
csvlogs.select('LogServiceID').show(5)

+------------+
|LogServiceID|
+------------+
|        3157|
|        3157|
|        3157|
|        3157|
|        3157|
+------------+
only showing top 5 rows



In [41]:
# use of join method.
csvlogs.join(
    logs,
    on='LogServiceID',
    how='inner'
).select(F.col('LogServiceID')
    ,F.col('Subtitle')
    ,F.col('Language1')
    ,F.col('LogIdentifierID')
    ,F.col('PrimaryFG')
).where(F.col('Subtitle') != 'null').show(5)

+------------+--------------------+---------+---------------+---------+
|LogServiceID|            Subtitle|Language1|LogIdentifierID|PrimaryFG|
+------------+--------------------+---------+---------------+---------+
|        3157|          I Do, I Do|       94|           13ST|        1|
|        3157| Much Ado About Mara|       94|           13ST|        1|
|        3157|          I Do, I Do|       94|           13ST|        1|
|        3157| Much Ado About Mara|       94|           13ST|        1|
|        3157|Fly Into a Rage, ...|       94|           13ST|        1|
+------------+--------------------+---------+---------------+---------+
only showing top 5 rows



#### Join cases
Values accepted by the `how` argument of `DataFrame.join`:
- inner
- left_outer
- right_outer
- outer
- full
- full_outer
- left_semi
- left_anti
- cross