### DataFrame Joining and Grouping

In [3]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [5]:
import pandas as pd
import numpy as np

In [6]:
# Start (or reuse) the Spark session that the cells below read their data through.
spark = (
    SparkSession.builder
    .appName('Joining and Grouping')
    .getOrCreate()
)

In [22]:
# Load the pipe-delimited LogIdentifier CSV (header row, inferred types)
# and keep only the rows whose PrimaryFG column equals 1.
logs = spark.read.csv(
    './DataAnalysisPythonPySpark/data/data/broadcast_logs/ReferenceTables/LogIdentifier.csv',
    header=True,
    inferSchema=True,
    sep="|",
).where(F.col("PrimaryFG") == 1)

In [23]:
# Show the column names and inferred types of the `logs` DataFrame.
logs.printSchema()

# Report how many rows `logs` holds (it was already restricted to PrimaryFG == 1).
primary_log_count = logs.count()
print("Number of Records: ", primary_log_count)

root
 |-- LogIdentifierID: string (nullable = true)
 |-- LogServiceID: integer (nullable = true)
 |-- PrimaryFG: integer (nullable = true)

Number of Records:  758


In [15]:
# Read the sampled broadcast log CSV: pipe-delimited, header row present,
# column types inferred, timestamps parsed with the yyyy-MM-dd pattern.
csvlogs = spark.read.csv(
    './DataAnalysisPythonPySpark/data/data/broadcast_logs/BroadcastLogs_2018_Q3_M8_sample.csv',
    sep='|',
    header=True,
    inferSchema=True,
    timestampFormat='yyyy-MM-dd',
)

# Preview the first five LogServiceID values.
csvlogs.select('LogServiceID').show(5)

+------------+
|LogServiceID|
+------------+
|        3157|
|        3157|
|        3157|
|        3157|
|        3157|
+------------+
only showing top 5 rows



In [41]:
# Demonstrates DataFrame.join: inner-join the broadcast logs with the
# log identifiers on their shared LogServiceID column, keep a few columns
# for inspection, and show the first five rows that have a subtitle.
(
    csvlogs
    .join(logs, on='LogServiceID', how='inner')
    .select(
        'LogServiceID',
        'Subtitle',
        'Language1',
        'LogIdentifierID',
        'PrimaryFG',
    )
    # Drop rows without a subtitle using an explicit NULL test. Comparing
    # against the string 'null' only excluded NULLs as a side effect of SQL
    # three-valued logic (NULL != 'null' evaluates to NULL, which `where`
    # treats as false) and would also discard a subtitle literally spelled
    # "null".
    .where(F.col('Subtitle').isNotNull())
    .show(5)
)

+------------+--------------------+---------+---------------+---------+
|LogServiceID|            Subtitle|Language1|LogIdentifierID|PrimaryFG|
+------------+--------------------+---------+---------------+---------+
|        3157|          I Do, I Do|       94|           13ST|        1|
|        3157| Much Ado About Mara|       94|           13ST|        1|
|        3157|          I Do, I Do|       94|           13ST|        1|
|        3157| Much Ado About Mara|       94|           13ST|        1|
|        3157|Fly Into a Rage, ...|       94|           13ST|        1|
+------------+--------------------+---------+---------------+---------+
only showing top 5 rows



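#### Join cases
Values accepted by the `how` argument of `DataFrame.join` (`outer`, `full` and `full_outer` are synonyms for the same full outer join):
- inner (the default)
- left_outer
- right_outer
- outer
- full
- full_outer
- left_semi
- left_anti
- cross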

In [7]:
# Both CSVs read below expose an `EnglishDescription` column, so each
# DataFrame gives it a table-specific alias.

# CD_Category: keep the ID and code columns plus the renamed description.
cdcat = (
    spark.read
    .csv(
        './DataAnalysisPythonPySpark/data/data/broadcast_logs/ReferenceTables/CD_Category.csv',
        sep='|',
        header=True,
        inferSchema=True,
    )
    .select(
        "CategoryID",
        "CategoryCD",
        F.col('EnglishDescription').alias('Category_Description'),
    )
)

# CD_ProgramClass: same shape as above.
# NOTE: the alias spelling 'ProgramClass_Desciption' is kept exactly as-is
# because later cells refer to the column by that name.
cdprog = (
    spark.read
    .csv(
        './DataAnalysisPythonPySpark/data/data/broadcast_logs/ReferenceTables/CD_ProgramClass.csv',
        sep="|",
        header=True,
        inferSchema=True,
    )
    .select(
        "ProgramClassID",
        "ProgramClassCD",
        F.col("EnglishDescription").alias('ProgramClass_Desciption'),
    )
)

In [13]:
# For each of the two DataFrames: print its column names, then show a
# five-row preview without truncating long cell values.
print("CD Category: \n", cdcat.columns)
cdcat.show(5, truncate=False)

print("CD programm: \n", cdprog.columns)
cdprog.show(5, truncate=False)

CD Category: 
 ['CategoryID', 'CategoryCD', 'Category_Description']
+----------+----------+---------------------------+
|CategoryID|CategoryCD|Category_Description       |
+----------+----------+---------------------------+
|1         |010       |NEWS                       |
|2         |02        |CANREC  ANALYSIS (old)     |
|3         |02A       |ANALYSIS AND INTERPRETATION|
|4         |02B       |LONG-FORM DOCUMENTARY      |
|5         |030       |REPORTING & ACTUALITIES    |
+----------+----------+---------------------------+
only showing top 5 rows

CD programm: 
 ['ProgramClassID', 'ProgramClassCD', 'ProgramClass_Desciption']
+--------------+--------------+-----------------------+
|ProgramClassID|ProgramClassCD|ProgramClass_Desciption|
+--------------+--------------+-----------------------+
|1             |AUT           |AUTOPROMOTION          |
|2             |BAL           |BALANCE PROGRAMMING    |
|3             |COM           |COMMERCIAL MESSAGE     |
|4             |COR     

In [38]:
# Join + groupBy demo: pair each category row with the program-class row
# that has the same ID value, sum the matched IDs per program class, and
# list the groups from the largest sum to the smallest.
(
    cdcat
    .join(
        cdprog,
        on=cdcat['CategoryID'] == cdprog['ProgramClassID'],
        how='inner',
    )
    .select(
        F.col('CategoryID').alias('ID'),
        'ProgramClassID',
        'CategoryCD',
        'Category_Description',
        'ProgramClassCD',
        'ProgramClass_Desciption',
    )
    .groupBy("ProgramClassCD", "ProgramClass_Desciption")
    .agg(F.sum('ID').alias('ID sum'))
    .orderBy('ID sum', ascending=False)
    .show(truncate=False)
)

+--------------+----------------------------------------+------+
|ProgramClassCD|ProgramClass_Desciption                 |ID sum|
+--------------+----------------------------------------+------+
|MOS           |Mosaic                                  |30    |
|SPO           |SPONSORSHIP MESSAGE                     |29    |
|SOL           |SOLICITATION MESSAGE                    |28    |
|SO            |MAY IDENTIFY THE SIGN ON\OFF OF A DAY   |27    |
|SEG           |SEGMENT OF A PROGRAM                    |26    |
|REG           |REGIONAL                                |25    |
|REF           |REFERENDUM                              |24    |
|PSA           |PUBLIC SERVICE ANNOUNCEMENT             |23    |
|PRP           |PROMOTION DURING DESIGNATED PREVIEW TIME|22    |
|PRO           |PROMOTION OF NON-CANADIAN PROGRAM       |21    |
|PRC           |PROMOTION OF UPCOMING CANADIAN PROGRAM  |20    |
|PGR           |PROGRAM                                 |19    |
|PGI           |PROGRAM I