## Pyspark Data Loading and Writing 
How to use basic spark guide for all the components needed to create Data Science Projects with spark


In [51]:
# imports and logging setup
from pyspark.ml import PipelineModel
from pyspark.sql.types import  IntegerType, FloatType
from pyspark.sql.functions import udf
from pyspark.sql import SparkSession
from pyspark import SparkContext 
from pyspark import SparkFiles
import numpy as np
import pandas as pd
import os

## setups logging
import logging

try: 
    logger.debug('logger is up')
except:
    name = 'pysparkCustomEstimator'
    formatter = logging.Formatter(fmt='%(asctime)s -  %(name)s - %(levelname)s  - %(message)s')
    handler = logging.StreamHandler()
    handler.setFormatter(formatter)
    logger = logging.getLogger(name)
    logger.setLevel(logging.DEBUG)
    logger.addHandler(handler)



## create a temp_data directory to work from
data_dir = 'temp_data'
try:
    os.mkdir(data_dir)
except:
    pass

#### Starting  Spark Context
This starts a local spark context


In [52]:
try:
    sc = SparkContext("local", "PysparkHowtoGuide")
    spark = SparkSession.builder.getOrCreate()
except ValueError:
    pass

#### Downloading Data Frome a URL
In this case the downloading the age of every congressman from 538


In [53]:
url = 'https://raw.githubusercontent.com/fivethirtyeight/data/master/congress-age/congress-terms.csv'
sc.addFile(url)
df = spark.read.csv(SparkFiles.get("congress-terms.csv"), header=True, inferSchema= True)
# displays the first two rows
df.limit(2).show()

+--------+-------+--------+---------+----------+---------+------+-------------------+-----+-----+---------+-------------------+----+
|congress|chamber|bioguide|firstname|middlename| lastname|suffix|           birthday|state|party|incumbent|          termstart| age|
+--------+-------+--------+---------+----------+---------+------+-------------------+-----+-----+---------+-------------------+----+
|      80|  house| M000112|   Joseph| Jefferson|Mansfield|  null|1861-02-09 00:00:00|   TX|    D|      Yes|1947-01-03 00:00:00|85.9|
|      80|  house| D000448|   Robert|       Lee| Doughton|  null|1863-11-07 00:00:00|   NC|    D|      Yes|1947-01-03 00:00:00|83.2|
+--------+-------+--------+---------+----------+---------+------+-------------------+-----+-----+---------+-------------------+----+



In [54]:
#prints out the data schema
df.schema

StructType(List(StructField(congress,IntegerType,true),StructField(chamber,StringType,true),StructField(bioguide,StringType,true),StructField(firstname,StringType,true),StructField(middlename,StringType,true),StructField(lastname,StringType,true),StructField(suffix,StringType,true),StructField(birthday,TimestampType,true),StructField(state,StringType,true),StructField(party,StringType,true),StructField(incumbent,StringType,true),StructField(termstart,TimestampType,true),StructField(age,DoubleType,true)))

#### Write Data to a CSV File 

In [55]:
path = data_dir + '/congress.csv'
df.write.csv(path,  header=True, mode='overwrite') ## write dataframe to csv, using overwrite and including colnams

#### Read Data From a CSV File
Reads in a single csv file with head from the path

In [56]:
df_read = spark.read.format("csv").option("header", "true").load(path)

#### Parquet vs CSV
'Apache Parquet is designed to bring efficient columnar storage of data compared to row-based files like CSV.'
+ Pyspark and read and write both csv and parquet, be parquet is much much quicker.
+ Parque has rules about columnames and enforces schema

#### Writing Parque Files in Chunks to a Directory
here partition is demonstrated where, data is written with paritioning and essentially every partition creates it's own sub directory
+ a sub directory of the data_dir is create
+ data is written with partitioning 

Note, with a spark session active, spark essentially maintains a connection csv files that are read into data frames

In [57]:
# creates a new subdirectory inside data dir to store the partition data
partitioned_dir = data_dir + '/part'
try:
    os.mkdir(partitioned_dir )
except:
    pass

# Choose a column in the data frame to parition by
partition_col = 'congress'

# writes the data
df.write.parquet(partitioned_dir, mode='overwrite', partitionBy=partition_col)

logger.info('writing df to {0} using col: {1} for paritioning'.format(partitioned_dir,partition_col  ))
# This creates on parquet file for each congress
os.listdir(partitioned_dir)[0:5]

['._SUCCESS.crc',
 'congress=100',
 'congress=101',
 'congress=102',
 'congress=103']

#### Reading a Directory of files as one Data Frame
Since is is essentually one file for each congress, all the files can be read in once 
+ enter only the directory path

In [58]:
df_read = spark.read.format("parquet").option("header", "true").load(partitioned_dir)
logger.debug('rows form original df {}'.format(df.count()))
logger.debug('rows form parque batch files read in df {}'.format(df_read.count()))