## Data Lab: Load and Join Employees and Departments data

In [None]:
import logging
from pyspark.sql import SparkSession

#### Create the Spark session. Remember: the SparkSession is our entry point for DataFrames and Datasets; RDDs are created through the SparkContext.

In [None]:

def rdd_to_dataframe(data, schema):
    """
    Build a Spark DataFrame from in-memory records by way of an RDD.

    The records are first turned into an RDD through the SparkContext and the
    RDD is then converted into a DataFrame through the SparkSession.

    Args:
        data: Collection of records (e.g. a list of tuples) to parallelize.
        schema: Schema handed to ``createDataFrame`` (e.g. a list of column names).

    Returns:
        The DataFrame created from ``data``. The SparkSession is left running
        so the caller can keep working with the DataFrame.

    Raises:
        Exception: Any error raised while building the RDD or the DataFrame is
            logged and then re-raised unchanged.
    """
    # Get the active SparkSession, or create one if none exists yet
    spark = SparkSession.builder.appName("RDDToDataFrame").getOrCreate()

    try:
        # Create an RDD from the input data, using the Spark Context, not the Session!
        rdd = spark.sparkContext.parallelize(data)

        # Convert the RDD to a DataFrame and return it without stopping the session
        return spark.createDataFrame(rdd, schema)

    except Exception as e:
        # Log and re-raise instead of silently returning None, so the failure
        # surfaces here rather than later as an AttributeError on None.
        # The session is deliberately NOT stopped: getOrCreate() may have handed
        # back a session shared with DataFrames created by earlier calls.
        logging.error('Error while transforming RDD to DF: {}'.format(e))
        raise


----
Create some sample data

In [None]:
# Departments sample: column names first, then one tuple per department row
dept_schema = ["department_id", "department_name"]
dept_data = [
    (1, "Big Data"),
    (2, "Finance"),
    (3, "Marketing"),
]

In [None]:
# Employees sample: column names first, then one tuple per employee row
emp_schema = ["department_id", "employee_name", "age"]
emp_data = [
    (1, "Carlos", 17),
    (1, "Bob", 30),
    (2, "Jasmin", 26),
]

---

### Let's now use the Spark RDD as a Spark Dataframe

In [None]:
# Turn each sample (records + column names) into a DataFrame via rdd_to_dataframe
df_emp = rdd_to_dataframe(data=emp_data, schema=emp_schema)
df_dept = rdd_to_dataframe(data=dept_data, schema=dept_schema)

In [None]:
# Display the rows of the departments DataFrame (show() prints data, not the schema)
df_dept.show()

In [None]:
# Print the schema (column names and types) of the employees DataFrame
df_emp.printSchema()

### Use Spark SQL to join the two datasets

In [None]:
# Grab a session handle for the SQL cells below: getOrCreate() hands back the
# already-active session if there is one, otherwise it starts a new one
spark = (
    SparkSession.builder
    .appName("RDDToDataFrame")
    .getOrCreate()
)

In [None]:
# Register both DataFrames as temporary views so Spark SQL queries can refer
# to them by name ('employees' and 'departments')
df_emp.createOrReplaceTempView('employees')
df_dept.createOrReplaceTempView('departments')

In [None]:
# Query sample, using Spark SQL: inner-join employees to departments on
# department_id and display only the rows where age >= 18.
# Both emp.* and dept.* are selected, so department_id appears twice in the output.
spark.sql('''
            select emp.*, dept.*
            from employees as emp
                inner join departments as dept on (emp.department_id = dept.department_id) 
            where age >= 18
            ''').show()

In [None]:
# Save the joined result set into a new temporary view named 'dept_employees',
# selecting explicit columns so department_id appears only once.
# NOTE(review): this comment previously said "NO WHERE CLAUSE", but the query
# does filter on age >= 18 -- confirm which of the two is intended.
spark.sql('''
        select emp.employee_name, emp.age, emp.department_id, dept.department_name
        from employees as emp
            inner join departments as dept on (emp.department_id = dept.department_id)
             where age >= 18
        ''').createOrReplaceTempView('dept_employees')

In [None]:
# Display the rows of the 'dept_employees' temporary view that have a department_id
spark.sql('''
        select * from dept_employees where department_id is not null
        ''').show()

## 💾 Let's save this output for our Business Data Consumers

In [37]:
# Define output location
output_location = 'output/dept_employees/'

# Save the joined result set as CSV to local storage. This could be Amazon S3 or other.
# 'overwrite' replaces any previous output, so re-running this cell (or the whole
# notebook) does not pile up duplicate rows the way 'append' did.
spark.sql('''
        select * from dept_employees where department_id is not null
        ''').write.mode('overwrite').csv(output_location)