In [1]:
# Import spark packages
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql.functions import col,count
import pyspark.sql.functions as fn
from pyspark.sql.catalog import Catalog
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import IntegerType, StringType

Start the `SparkSession`

In [2]:
conf = SparkConf().setAppName("Spark SQL")
sc = SparkContext(conf=conf)

spark = (SparkSession
    .builder
    .appName("Spark SQL")
    .getOrCreate()
)

# Aggregations

## Examples using the API

In [47]:
from pyspark.sql import functions as fn

products = spark.createDataFrame([
    ('1', 'mouse', 'microsoft', 39.99),
    ('2', 'mouse', 'microsoft', 59.99),
    ('3', 'keyboard', 'microsoft', 59.99),
    ('4', 'keyboard', 'logitech', 59.99),
    ('5', 'mouse', 'logitech', 29.99),
], ['prod_id', 'prod_cat', 'prod_brand', 'prod_value'])

products.groupBy('prod_cat').avg('prod_value').show()

+--------+-----------------+
|prod_cat|  avg(prod_value)|
+--------+-----------------+
|keyboard|            59.99|
|   mouse|43.32333333333333|
+--------+-----------------+



In [48]:
products.groupBy('prod_cat').agg(fn.avg('prod_value')).show()

+--------+-----------------+
|prod_cat|  avg(prod_value)|
+--------+-----------------+
|keyboard|            59.99|
|   mouse|43.32333333333333|
+--------+-----------------+



In [49]:
from pyspark.sql import functions as fn

products.groupBy('prod_brand', 'prod_cat')\
    .agg(fn.avg('prod_value')).show()

+----------+--------+---------------+
|prod_brand|prod_cat|avg(prod_value)|
+----------+--------+---------------+
| microsoft|   mouse|          49.99|
|  logitech|keyboard|          59.99|
| microsoft|keyboard|          59.99|
|  logitech|   mouse|          29.99|
+----------+--------+---------------+



In [64]:
from pyspark.sql import functions as fn

In [50]:
from pyspark.sql import functions as fn

products.groupBy('prod_brand').agg(
    fn.round(fn.avg('prod_value'), 1).alias('average'),
    fn.ceil(fn.sum('prod_value')).alias('sum'),
    fn.min('prod_value').alias('min')
).show()

+----------+-------+---+-----+
|prod_brand|average|sum|  min|
+----------+-------+---+-----+
|  logitech|   45.0| 90|29.99|
| microsoft|   53.3|160|39.99|
+----------+-------+---+-----+



## Analsye US Baby Names 1880-2017
=======================


Description
: US baby names provided by the SSA. 

This dataset contains all names used
for at least 5 children of either sex during a year. 


The file is made of `1924665` lines and  4 columns.

```
|-- name: string (nullable = true)
    |-- n: integer (nullable = true)
    |-- sex: string (nullable = true)
    |-- year: integer (nullable = true)
```

Each row indicates for a given name, sex, and year the number of babies 
of the given sex who were given that name during the given year. Names 
with less than 5 occurrences during the year were note recorded. 

|    name|  n|sex|year|
|:--------|:---:|:---:|:----:|
|  Emilia|112|  F|1985|
|   Kelsi|112|  F|1985|
|  Margot|112|  F|1985|
|  Mariam|112|  F|1985|
|Scarlett|112|  F|1985|

Load `babynames` from a `csv` or a `parquet` file

In [5]:
df_sp = spark.read\
             .format('csv')\
             .option("header", "true")\
             .option("mode", "FAILFAST")\
             .option("inferSchema", "true")\
             .option("sep", ",")\
             .load("babynames_short.csv")
# from parquet
#df_sp = spark.read.parquet('../data/baby_names_unclean.parquet')
df_sp.printSchema()

root
 |-- name: string (nullable = true)
 |-- n: integer (nullable = true)
 |-- sex: string (nullable = true)
 |-- year: double (nullable = true)




Ensure that the dataframe has the following schema:

    root
        |-- name: string (nullable = true)
        |-- n: integer (nullable = true)
        |-- sex: string (nullable = true)
        |-- year: integer (nullable = true)




For the first 4 tasks we will use The dataframe API 

1. What are the 10 most popular names for Females in year 2000.
2. What are the 10 most popular names for Males in year 2000.

In [5]:
#Girls Dataframe
df_girls=df_sp.select('name','n','sex','year').where((df_sp.sex=='F') & (df_sp.year==2000)).orderBy(df_sp.n,ascending=False).limit(10)
df_girls.show()

+---------+-----+---+------+
|     name|    n|sex|  year|
+---------+-----+---+------+
|    Emily|25953|  F|2000.0|
|   Hannah|23080|  F|2000.0|
|  Madison|19967|  F|2000.0|
|   Ashley|17997|  F|2000.0|
|    Sarah|17697|  F|2000.0|
|   Alexis|17629|  F|2000.0|
| Samantha|17266|  F|2000.0|
|  Jessica|15709|  F|2000.0|
|Elizabeth|15094|  F|2000.0|
|   Taylor|15078|  F|2000.0|
+---------+-----+---+------+



In [6]:
#Boys Dataframe
df_boys=df_sp.select('name','n','sex','year').where((df_sp.sex=='M') & (df_sp.year==2000)).orderBy(df_sp.n,ascending=False).limit(10)
df_boys.show()

+-----------+-----+---+------+
|       name|    n|sex|  year|
+-----------+-----+---+------+
|      Jacob|34471|  M|2000.0|
|    Michael|32035|  M|2000.0|
|    Matthew|28572|  M|2000.0|
|     Joshua|27538|  M|2000.0|
|Christopher|24931|  M|2000.0|
|   Nicholas|24652|  M|2000.0|
|     Andrew|23639|  M|2000.0|
|     Joseph|22825|  M|2000.0|
|     Daniel|22312|  M|2000.0|
|      Tyler|21503|  M|2000.0|
+-----------+-----+---+------+



3. Which year had

- a) the most distinct female names 

In [7]:
df_sp.groupBy('year','sex').agg(count('*').alias('count')).where(col('sex')=='F').orderBy('count',ascending=False).limit(1).show()

+------+---+-----+
|  year|sex|count|
+------+---+-----+
|2007.0|  F|20560|
+------+---+-----+



- b) the most distinct male names

In [9]:
df_sp.groupBy('year','sex').agg(count('*').alias('count')).where(col('sex')=='M').orderBy('count',ascending=False).limit(1).show()

+------+---+-----+
|  year|sex|count|
+------+---+-----+
|2008.0|  M|14613|
+------+---+-----+



- c) the most distict names (both male and female)

In [10]:
df_sp.groupBy('year').agg(count('*').alias('count')).orderBy('count',ascending=False).limit(1).show()

+------+-----+
|  year|count|
+------+-----+
|2008.0|35070|
+------+-----+



4. In the year 2010, how many names where assigned to both males and females.

In [25]:
#filter on 2010
#count per name
#df_x=filter >2
#count df_x 
df_2010=df_sp.select('*').where(df_sp.year==2010)
df_2010.groupBy('name').agg(count('*').alias('count')).where(col('count')>1).count()

2444

5. Create a new column that shows the length of each name.


In [56]:
df_sp.withColumn("name_length", fn.length("name")).show()

+----------+---+---+------+-----------+
|      name|  n|sex|  year|name_length|
+----------+---+---+------+-----------+
|    Emilia|112|  F|1985.0|          6|
|     Kelsi|112|  F|1985.0|          5|
|    Margot|112|  F|1985.0|          6|
|    Mariam|112|  F|1985.0|          6|
|  Scarlett|112|  F|1985.0|          8|
|      Aida|111|  F|1985.0|          4|
|    Ashlei|111|  F|1985.0|          6|
|     Greta|111|  F|1985.0|          5|
|    Jaimee|111|  F|1985.0|          6|
|     Lorna|111|  F|1985.0|          5|
|   Rosario|111|  F|1985.0|          7|
|     Sandi|111|  F|1985.0|          5|
|   Sharina|111|  F|1985.0|          7|
|    Tashia|111|  F|1985.0|          6|
|     Adina|110|  F|1985.0|          5|
|    Ahsley|110|  F|1985.0|          6|
|Alessandra|110|  F|1985.0|         10|
|    Amalia|110|  F|1985.0|          6|
|    Chelsi|110|  F|1985.0|          6|
|    Darcie|110|  F|1985.0|          6|
+----------+---+---+------+-----------+
only showing top 20 rows



6. Create a new column that shows the total number of times the name have been given to a baby across all years.

In [24]:
df_sp.groupBy(df_sp.name).agg(fn.sum(df_sp.n)).orderBy('sum(n)' , ascending = False).limit(20).show()

+-----------+-------+
|       name| sum(n)|
+-----------+-------+
|      James|5173828|
|       John|5137142|
|     Robert|4834915|
|    Michael|4372536|
|       Mary|4138360|
|    William|4118553|
|      David|3624225|
|     Joseph|2614083|
|    Richard|2572613|
|    Charles|2398453|
|     Thomas|2313415|
|Christopher|2031626|
|     Daniel|1915517|
|  Elizabeth|1634860|
|    Matthew|1595949|
|   Patricia|1576654|
|     George|1474117|
|   Jennifer|1471118|
|      Linda|1456006|
|    Anthony|1439784|
+-----------+-------+

