In [1]:
# import pyspark
import pyspark

In [2]:
# Create the Spark session for this notebook; getOrCreate() hands back an
# already-running session when there is one instead of starting a second.
spark = (
    pyspark.sql.SparkSession
    .builder
    .getOrCreate()
)

In [4]:
# Load the CSV as a Spark DataFrame. No header option is given, so the columns
# get Spark's default names (_c0, _c1); the bare expression displays the schema.
spark.read.csv(path='./sa311/source.csv')

DataFrame[_c0: string, _c1: string]

In [5]:
# spark can also read other file formats, e.g.:
# spark.read.json
# spark.read.parquet
# + more

In [6]:
# Same read through the generic format(...).load(...) reader API, then print
# the first 5 rows. Without the header option the header line shows up as data.
(
    spark.read
    .format('csv')
    .load('./sa311/source.csv')
    .show(5)
)

+---------+----------------+
|      _c0|             _c1|
+---------+----------------+
|source_id| source_username|
|   100137|Merlene Blodgett|
|   103582|     Carmen Cura|
|   106463| Richard Sanchez|
|   119403|  Betty De Hoyos|
+---------+----------------+
only showing top 5 rows



In [8]:
# Read the CSV again, this time treating the first line as column names,
# and print the rows (show() defaults to the first 20).
(
    spark.read
    .option('header', True)
    .format('csv')
    .load('./sa311/source.csv')
    .show()
)

+---------+--------------------+
|source_id|     source_username|
+---------+--------------------+
|   100137|    Merlene Blodgett|
|   103582|         Carmen Cura|
|   106463|     Richard Sanchez|
|   119403|      Betty De Hoyos|
|   119555|      Socorro Quiara|
|   119868| Michelle San Miguel|
|   120752|      Eva T. Kleiber|
|   124405|           Lori Lara|
|   132408|       Leonard Silva|
|   135723|        Amy Cardenas|
|   136202|    Michelle Urrutia|
|   136979|      Leticia Garcia|
|   137943|    Pamela K. Baccus|
|   138605|        Marisa Ozuna|
|   138650|      Kimberly Green|
|   138650|Kimberly Green-Woods|
|   138793| Guadalupe Rodriguez|
|   138810|       Tawona Martin|
|   139342|     Jessica Mendoza|
|   139344|        Isis Mendoza|
+---------+--------------------+
only showing top 20 rows



In [9]:
from pyspark.sql.types import StructType, StructField, StringType

# Explicit schema for the source file: both columns are plain strings.
# NOTE(review): no later cell in this notebook passes `schema` to a reader.
schema = StructType([
    StructField(column_name, StringType())
    for column_name in ('source_id', 'source_username')
])

In [13]:
# in one notebook:
# NOTE(review): `df` is not assigned by any earlier cell in this notebook (its
# first assignment is the later case.csv read), so this cell depends on kernel
# state that cannot be reproduced from the cells shown -- confirm what `df`
# is meant to hold here.
# mode='overwrite' replaces a previous export; with the default save mode a
# second run of this cell fails because the output path already exists.
df.write.csv('sa311_source.csv', mode='overwrite')
# read into another notebook:
# No header row is written above and none is requested here, so the columns
# come back with the default names (_c0, _c1). Pass header=True to both the
# write and the read if the original column names should survive the trip.
spark.read.csv('sa311_source.csv')

DataFrame[_c0: string, _c1: string]

In [43]:
# Load the case data with the first line as column names. Every column is
# read as a string because no schema or type inference is requested.
df = spark.read.csv('./sa311/case.csv', header=True)

In [44]:
# Expose `df` to Spark SQL under the view name 'sources'.
# NOTE(review): `df` holds the case.csv data at this point, so the view name
# is misleading; the same frame is registered again later as 'cases'.
df.createOrReplaceTempView(name='sources')

In [45]:
# Query the 'sources' temp view with Spark SQL. No action such as .show() is
# called, so the cell output is only the resulting DataFrame's column/type repr.
spark.sql('SELECT * FROM sources')

DataFrame[case_id: string, case_opened_date: string, case_closed_date: string, SLA_due_date: string, case_late: string, num_days_late: string, case_closed: string, dept_division: string, service_request_type: string, SLA_days: string, case_status: string, source_id: string, request_address: string, council_district: string]

In [46]:
# Re-read the case file and print just the two lateness columns.
(
    spark.read
    .csv('./sa311/case.csv', header=True)
    .select('case_late', 'num_days_late')
    .show()
)

+---------+-------------------+
|case_late|      num_days_late|
+---------+-------------------+
|       NO| -998.5087616000001|
|       NO|-2.0126041669999997|
|       NO|       -3.022337963|
|       NO|       -15.01148148|
|      YES|0.37216435200000003|
|       NO|       -29.74398148|
|       NO|       -14.70673611|
|       NO|       -14.70662037|
|       NO|       -14.70662037|
|       NO|       -14.70649306|
|       NO|       -14.70649306|
|       NO|       -14.70636574|
|       NO|          -14.70625|
|       NO|       -14.70636574|
|       NO|       -14.70623843|
|       NO|-14.705891199999998|
|       NO|       -14.70600694|
|       NO|       -14.70576389|
|       NO|       -14.70576389|
|       NO|       -14.70564815|
+---------+-------------------+
only showing top 20 rows



In [47]:
## NOTE: some steps are missing from the cells above (missed while figuring things out); see GitHub for the complete version

In [48]:
# List the distinct values that appear in the case_late column.
(
    df
    .select(df.case_late)
    .distinct()
    .show()
)

+---------+
|case_late|
+---------+
|      YES|
|       NO|
+---------+



In [49]:
# Register `df` (the case data) as the temp view 'cases' so it can be queried
# by name from Spark SQL.
df.createOrReplaceTempView(name='cases')

In [50]:
# Count the cases in each case_late group using Spark SQL against the
# 'cases' temp view; COUNT(*) is reported under the column name count(1).
spark.sql('''
SELECT case_late, COUNT(*)
FROM cases
GROUP BY case_late
''').show()

+---------+--------+
|case_late|count(1)|
+---------+--------+
|      YES|   94503|
|       NO|  747201|
+---------+--------+



In [51]:
# Same per-group row count, using the DataFrame API's groupBy(...).count().
(
    df
    .groupBy(df.case_late)
    .count()
    .show()
)

+---------+------+
|case_late| count|
+---------+------+
|      YES| 94503|
|       NO|747201|
+---------+------+



In [52]:
from pyspark.sql.functions import col, expr, count

# Same count once more, with the aggregate written as a SQL expression string
# and the grouping column referenced through col().
(
    df
    .groupBy(col('case_late'))
    .agg(expr('count(*)'))
    .show()
)

+---------+--------+
|case_late|count(1)|
+---------+--------+
|      YES|   94503|
|       NO|  747201|
+---------+--------+



In [53]:
# Same grouping, aggregating with the count() column function instead of a
# SQL expression string (`count` comes from pyspark.sql.functions).
(
    df
    .groupBy(df.case_late)
    .agg(count(df.case_late))
    .show()
)

+---------+----------------+
|case_late|count(case_late)|
+---------+----------------+
|      YES|           94503|
|       NO|          747201|
+---------+----------------+

