In [0]:
items_exploded_path = "s3://full-stack-bigdata-datasets/Big_Data/YOUTUBE/items_exploded.json"

df = spark.read.json(items_exploded_path)

In [0]:
df.count()

Out[2]: 3907

In [0]:
df.printSchema()

root
 |-- items: struct (nullable = true)
 |    |-- contentDetails: struct (nullable = true)
 |    |    |-- caption: string (nullable = true)
 |    |    |-- contentRating: struct (nullable = true)
 |    |    |    |-- ytRating: string (nullable = true)
 |    |    |-- definition: string (nullable = true)
 |    |    |-- dimension: string (nullable = true)
 |    |    |-- duration: string (nullable = true)
 |    |    |-- licensedContent: boolean (nullable = true)
 |    |    |-- projection: string (nullable = true)
 |    |    |-- regionRestriction: struct (nullable = true)
 |    |    |    |-- allowed: array (nullable = true)
 |    |    |    |    |-- element: string (containsNull = true)
 |    |    |    |-- blocked: array (nullable = true)
 |    |    |    |    |-- element: string (containsNull = true)
 |    |-- etag: string (nullable = true)
 |    |-- id: string (nullable = true)
 |    |-- kind: string (nullable = true)
 |    |-- snippet: struct (nullable = true)
 |    |    |-- categoryId: st

In [0]:
from pyspark.sql import functions as F

df.select('items.snippet.title').show(5)
df.withColumn('items.snippet.title', F.col('items.snippet.title'))
# Alternative
# df.select(F.col('items').getField('snippet').getField('title')).show(5)

+--------------------+
|               title|
+--------------------+
|VOLO. "L'air d'un...|
|Julian Jeweil - A...|
| Justice - D.A.N.C.E|
|Gramatik - Tortur...|
|Ben Howard - Oats...|
+--------------------+
only showing top 5 rows

Out[4]: DataFrame[items: struct<contentDetails:struct<caption:string,contentRating:struct<ytRating:string>,definition:string,dimension:string,duration:string,licensedContent:boolean,projection:string,regionRestriction:struct<allowed:array<string>,blocked:array<string>>>,etag:string,id:string,kind:string,snippet:struct<categoryId:string,channelId:string,channelTitle:string,defaultAudioLanguage:string,defaultLanguage:string,description:string,liveBroadcastContent:string,localized:struct<description:string,title:string>,publishedAt:string,tags:array<string>,thumbnails:struct<default:struct<height:bigint,url:string,width:bigint>,high:struct<height:bigint,url:string,width:bigint>,maxres:struct<height:bigint,url:string,width:bigint>,medium:struct<height:bigint,url

In [0]:
# Let's give you the intuition for the flattening function we will share with you now
# The idea is to automatically dig deeper and deeper into the schema in order to extract
# all the column names in the form "array1.array2.array3.field1", let's go!

# we'll work with the schema in json format, which will be way easier to manipulate
df.schema.jsonValue()

# It's nothing more than a dictionnary with keys

Out[5]: {'type': 'struct',
 'fields': [{'name': 'items',
   'type': {'type': 'struct',
    'fields': [{'name': 'contentDetails',
      'type': {'type': 'struct',
       'fields': [{'name': 'caption',
         'type': 'string',
         'nullable': True,
         'metadata': {}},
        {'name': 'contentRating',
         'type': {'type': 'struct',
          'fields': [{'name': 'ytRating',
            'type': 'string',
            'nullable': True,
            'metadata': {}}]},
         'nullable': True,
         'metadata': {}},
        {'name': 'definition',
         'type': 'string',
         'nullable': True,
         'metadata': {}},
        {'name': 'dimension',
         'type': 'string',
         'nullable': True,
         'metadata': {}},
        {'name': 'duration',
         'type': 'string',
         'nullable': True,
         'metadata': {}},
        {'name': 'licensedContent',
         'type': 'boolean',
         'nullable': True,
         'metadata': {}},
        {'name': 

In [0]:
df.schema.jsonValue().keys()
# Only two keys at this stage, type and fields, let's explore the type key

Out[6]: dict_keys(['type', 'fields'])

In [0]:
df.schema.jsonValue()["type"]
# The value associated is struct

Out[7]: 'struct'

In [0]:
# let's explore the content of the other key
df.schema.jsonValue()["fields"]

Out[8]: [{'name': 'items',
  'type': {'type': 'struct',
   'fields': [{'name': 'contentDetails',
     'type': {'type': 'struct',
      'fields': [{'name': 'caption',
        'type': 'string',
        'nullable': True,
        'metadata': {}},
       {'name': 'contentRating',
        'type': {'type': 'struct',
         'fields': [{'name': 'ytRating',
           'type': 'string',
           'nullable': True,
           'metadata': {}}]},
        'nullable': True,
        'metadata': {}},
       {'name': 'definition',
        'type': 'string',
        'nullable': True,
        'metadata': {}},
       {'name': 'dimension',
        'type': 'string',
        'nullable': True,
        'metadata': {}},
       {'name': 'duration',
        'type': 'string',
        'nullable': True,
        'metadata': {}},
       {'name': 'licensedContent',
        'type': 'boolean',
        'nullable': True,
        'metadata': {}},
       {'name': 'projection',
        'type': 'string',
        'nullable': Tr

In [0]:
#it's a list, what's the first element?
df.schema.jsonValue()["fields"][0]

Out[9]: {'name': 'items',
 'type': {'type': 'struct',
  'fields': [{'name': 'contentDetails',
    'type': {'type': 'struct',
     'fields': [{'name': 'caption',
       'type': 'string',
       'nullable': True,
       'metadata': {}},
      {'name': 'contentRating',
       'type': {'type': 'struct',
        'fields': [{'name': 'ytRating',
          'type': 'string',
          'nullable': True,
          'metadata': {}}]},
       'nullable': True,
       'metadata': {}},
      {'name': 'definition',
       'type': 'string',
       'nullable': True,
       'metadata': {}},
      {'name': 'dimension',
       'type': 'string',
       'nullable': True,
       'metadata': {}},
      {'name': 'duration', 'type': 'string', 'nullable': True, 'metadata': {}},
      {'name': 'licensedContent',
       'type': 'boolean',
       'nullable': True,
       'metadata': {}},
      {'name': 'projection',
       'type': 'string',
       'nullable': True,
       'metadata': {}},
      {'name': 'regionRestri

In [0]:
# what keys does it have ?
df.schema.jsonValue()["fields"][0].keys()

Out[10]: dict_keys(['name', 'type', 'nullable', 'metadata'])

In [0]:
# the key name contains the name of the field
df.schema.jsonValue()["fields"][0]["name"]

Out[11]: 'items'

In [0]:
# if we have the key type then we have subfields inside it
df.schema.jsonValue()["fields"][0]["type"]
# and we are back to the same structure we had at the beginning and we can start digging again
# that's the spirit of the function below

Out[12]: {'type': 'struct',
 'fields': [{'name': 'contentDetails',
   'type': {'type': 'struct',
    'fields': [{'name': 'caption',
      'type': 'string',
      'nullable': True,
      'metadata': {}},
     {'name': 'contentRating',
      'type': {'type': 'struct',
       'fields': [{'name': 'ytRating',
         'type': 'string',
         'nullable': True,
         'metadata': {}}]},
      'nullable': True,
      'metadata': {}},
     {'name': 'definition',
      'type': 'string',
      'nullable': True,
      'metadata': {}},
     {'name': 'dimension', 'type': 'string', 'nullable': True, 'metadata': {}},
     {'name': 'duration', 'type': 'string', 'nullable': True, 'metadata': {}},
     {'name': 'licensedContent',
      'type': 'boolean',
      'nullable': True,
      'metadata': {}},
     {'name': 'projection',
      'type': 'string',
      'nullable': True,
      'metadata': {}},
     {'name': 'regionRestriction',
      'type': {'type': 'struct',
       'fields': [{'name': 'allowed

In [0]:
from pyspark.sql.types import StructType, StructField
from typing import List, Dict, Generator, Union, Callable

# This is actually written like a scala function, we'll walk you through it
def walkSchema(schema: Union[StructType, StructField]) -> Generator[str, None, None]:
    """Explores a PySpark schema:
    
    schema: StructType | StructField
    
    Yield
    -----
    A generator of strings, the name of each field in the schema
    """
    
    # we define a function _walk that produces a string generator from
    # a dictionnary "schema_dct", and a string "prefix"
    def _walk(schema_dct: Dict['str', Union['str', list, dict]],
              prefix: str = "") -> Generator[str, None, None]:
        assert isinstance(prefix, str), "prefix should be a string" # check if prefix is a string
        
        # this function returns "name" if there's no prefix and "prefix.name" if prefix exists
        fullName: Callable[str, str] = lambda name: ( 
            name if not prefix else f"{prefix}.{name}")
        
        # we get the next name one level lower from the dictionnary
        name = schema_dct.get('name', '')
        
        # if the type is struct then we search for the fields key
        # if fields is there we apply the function again and dig one level deeper in
        # the schema and set a prefix
        if schema_dct['type'] == 'struct':
            assert 'fields' in schema_dct, (
                "It's a StructType, we should have some fields")
            for field in schema_dct['fields']:
                yield from _walk(field, prefix=prefix)
        # if we have a dict type and we can't find fields then we
        # dig one level deeper and apply the _walk function again
        elif isinstance(schema_dct['type'], dict):
            assert 'fields' not in schema_dct, (
                "We're missing some keys here")
            yield from _walk(schema_dct['type'], prefix=fullName(name))
        # If we finally reached the end and found a name we yield the full name
        elif name:
            yield fullName(name)
    
    yield from _walk(schema.jsonValue())

# yield as opposed to return, returns a result but does not stop the function from running, it keeps
# running even after returning one result.

In [0]:
col_names = walkSchema(df.schema)
col_names

Out[14]: <generator object walkSchema at 0x7fc424480dd0>

In [0]:
for col_name in walkSchema(df.schema):
  print(col_name)

items.contentDetails.caption
items.contentDetails.contentRating.ytRating
items.contentDetails.definition
items.contentDetails.dimension
items.contentDetails.duration
items.contentDetails.licensedContent
items.contentDetails.projection
items.etag
items.id
items.kind
items.snippet.categoryId
items.snippet.channelId
items.snippet.channelTitle
items.snippet.defaultAudioLanguage
items.snippet.defaultLanguage
items.snippet.description
items.snippet.liveBroadcastContent
items.snippet.localized.description
items.snippet.localized.title
items.snippet.publishedAt
items.snippet.thumbnails.default.height
items.snippet.thumbnails.default.url
items.snippet.thumbnails.default.width
items.snippet.thumbnails.high.height
items.snippet.thumbnails.high.url
items.snippet.thumbnails.high.width
items.snippet.thumbnails.maxres.height
items.snippet.thumbnails.maxres.url
items.snippet.thumbnails.maxres.width
items.snippet.thumbnails.medium.height
items.snippet.thumbnails.medium.url
items.snippet.thumbnails.medi

In [0]:
from functools import reduce

from pyspark.sql import functions as F

# Non-functional way: unpacking the generator
# exploded_df = df.select(*walkSchema(df.schema))

# The functional way, using functools' reduce
exploded_df = reduce(
  lambda memo_df, col_name: memo_df.withColumn(col_name, F.col(col_name)),
  walkSchema(df.schema), df
).drop('items')

exploded_df.limit(5).toPandas()

Unnamed: 0,items.contentDetails.caption,items.contentDetails.contentRating.ytRating,items.contentDetails.definition,items.contentDetails.dimension,items.contentDetails.duration,items.contentDetails.licensedContent,items.contentDetails.projection,items.etag,items.id,items.kind,...,items.statistics.dislikeCount,items.statistics.favoriteCount,items.statistics.likeCount,items.statistics.viewCount,items.status.embeddable,items.status.license,items.status.madeForKids,items.status.privacyStatus,items.status.publicStatsViewable,items.status.uploadStatus
0,False,,sd,2d,PT3M33S,True,rectangular,SqP7uUVSol30dxvuScN6JUny6T4,t1l8Z6gLPzo,youtube#video,...,26,0,1028,223172,True,youtube,False,public,True,processed
1,False,,hd,2d,PT7M46S,False,rectangular,m3DnhzTEw9ABiqzBvdasfk5Av_8,we5gzZq5Avg,youtube#video,...,3,0,124,13409,True,youtube,False,public,True,processed
2,False,,sd,2d,PT3M7S,False,rectangular,zyzs7STAR3NG-_pZe-0nGkbKoqg,49esza4eiK4,youtube#video,...,780,0,25540,10106655,True,youtube,False,public,True,processed
3,False,,hd,2d,PT3M43S,False,rectangular,hX2C15F6fdO5A-stUFMU5Az2PvI,BoO6LfR7ca0,youtube#video,...,0,0,255,29153,True,youtube,False,public,True,processed
4,False,,hd,2d,PT5M,False,rectangular,rYHoV38PLpMbRuX_zhGTVBKNotw,DaH4W1rY9us,youtube#video,...,1784,0,136033,16488714,True,youtube,False,public,True,processed


In [0]:
output_filename = 's3://full-stack-bigdata-datasets/Big_Data/YOUTUBE/items_tidy.parquet'
exploded_df.write \
  .parquet(output_filename, mode='overwrite')

[0;31m---------------------------------------------------------------------------[0m
[0;31mPy4JJavaError[0m                             Traceback (most recent call last)
File [0;32m<command-262717713763680>:2[0m
[1;32m      1[0m output_filename [38;5;241m=[39m [38;5;124m'[39m[38;5;124ms3://full-stack-bigdata-datasets/Big_Data/YOUTUBE/items_tidy.parquet[39m[38;5;124m'[39m
[0;32m----> 2[0m exploded_df[38;5;241m.[39mwrite \
[1;32m      3[0m   [38;5;241m.[39mparquet(output_filename, mode[38;5;241m=[39m[38;5;124m'[39m[38;5;124moverwrite[39m[38;5;124m'[39m)

File [0;32m/databricks/spark/python/pyspark/instrumentation_utils.py:48[0m, in [0;36m_wrap_function.<locals>.wrapper[0;34m(*args, **kwargs)[0m
[1;32m     46[0m start [38;5;241m=[39m time[38;5;241m.[39mperf_counter()
[1;32m     47[0m [38;5;28;01mtry[39;00m:
[0;32m---> 48[0m     res [38;5;241m=[39m [43mfunc[49m[43m([49m[38;5;241;43m*[39;49m[43margs[49m[43m,[49m[43m [49m[38;5;2