# Data Access Exercise

In [None]:
import sys
sys.path.append("..")
from pyspark.sql import DataFrameReader
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType
from helpers.path_translation import translate_to_file_string
from helpers.data_prep_and_print import print_df

### Create Spark session

In [None]:
# Obtain the SparkSession used by every cell below, registered under the
# application name "Data Access".
spark = SparkSession.builder.appName("Data Access").getOrCreate()

# Set the log level of the underlying SparkContext to "ERROR".
spark.sparkContext.setLogLevel("ERROR")

### Read CSV File

In [None]:
# Create a DataFrame from the churn CSV file using an inferred schema.
# The reader is configured with a header row and ";" as the field delimiter.
inputFile = translate_to_file_string("../data/churn.csv")
df = (
    spark.read
    .options(header="true", inferSchema="true", delimiter=";")
    .csv(inputFile)
)
print_df(df, 10)


### Read JSON file

In [None]:
# Create a DataFrame from the census JSON file and display it with print_df.
inputFile = translate_to_file_string("../data/census_2010.json")
df = spark.read.format("json").load(inputFile)
print_df(df, 10)

### Read HDFS

In [None]:
# Disabled example: the HDFS read below is wrapped in a triple-quoted string
# literal, so none of it is executed. It mirrors the CSV cell but points at an
# "hdfs:///" path. To try it, remove the surrounding triple quotes
# (presumably requires a reachable HDFS holding /data/churn.csv -- confirm).
""" inputFile = "hdfs:///data/churn.csv"
df = spark.read.option("header", "true") \
       .option("inferSchema", "true") \
       .option("delimiter", ";") \
       .csv(inputFile)   
print_df(df,10) """

### Read text file

In [None]:
# Count how often each ";"-separated token occurs in the text file and turn
# the (token, count) pairs into a DataFrame.
inputFile = translate_to_file_string("../data/ghEmployees.txt")

# Named `lines` rather than `input` so the builtin input() is not shadowed
# for the remainder of the notebook session.
lines = spark.sparkContext.textFile(inputFile)

counts = (
    lines.flatMap(lambda line: line.split(";"))  # one record per token
    .map(lambda word: (word, 1))                 # (key, value) pair per token
    .reduceByKey(lambda a, b: a + b)             # sum the 1s for each token
)

df = spark.createDataFrame(counts)
print_df(df, 10)

### Read Parquet Files

In [None]:
# Create a DataFrame from the census Parquet file and display it with print_df.
inputFile = translate_to_file_string("../data/census_2010.parquet")
df = spark.read.load(inputFile, format="parquet")
print_df(df, 10)

In [None]:
# Stop the SparkSession created at the top of the notebook; cells that use
# `spark` must not be run after this one without recreating the session.
spark.stop()