In [0]:
# S3 location of the input Parquet dataset.
INPUT_FILENAME = 's3://full-stack-bigdata-datasets/Big_Data/YOUTUBE/items_clean.parquet'

# Load the dataset and print its schema so the available columns can be
# reviewed before deciding which ones to keep.
songs_raw = spark.read.format('parquet').load(INPUT_FILENAME)
songs_raw.printSchema()

root
 |-- contentDetails_caption: string (nullable = true)
 |-- contentDetails_contentRating_ytRating: string (nullable = true)
 |-- contentDetails_definition: string (nullable = true)
 |-- contentDetails_dimension: string (nullable = true)
 |-- contentDetails_duration: string (nullable = true)
 |-- contentDetails_licensedContent: boolean (nullable = true)
 |-- contentDetails_projection: string (nullable = true)
 |-- etag: string (nullable = true)
 |-- id: string (nullable = true)
 |-- kind: string (nullable = true)
 |-- snippet_categoryId: string (nullable = true)
 |-- snippet_channelId: string (nullable = true)
 |-- snippet_channelTitle: string (nullable = true)
 |-- snippet_defaultAudioLanguage: string (nullable = true)
 |-- snippet_defaultLanguage: string (nullable = true)
 |-- snippet_liveBroadcastContent: string (nullable = true)
 |-- snippet_publishedAt: string (nullable = true)
 |-- snippet_thumbnails_default_height: long (nullable = true)
 |-- snippet_thumbnails_default_url: s

In [0]:
# Number of columns in the raw dataset (displayed as the cell result).
len(songs_raw.columns)

Out[5]: 44

In [0]:
from pyspark.sql import functions as F
from pyspark.sql import DataFrame

In [0]:
def value_counts(songs_raw, col_name):
  """Count how often each distinct value of one column occurs.

  Args:
    songs_raw: DataFrame to analyse (must support `select`, `groupBy`).
    col_name: Name of the column whose values are tallied.

  Returns:
    A DataFrame with one row per distinct value of `col_name` plus a
    `count` column, ordered by `count` from highest to lowest.
  """
  # Narrow to the single column of interest before grouping.
  only_column = songs_raw.select(col_name)
  frequencies = only_column.groupBy(col_name).count()
  # Most frequent values first.
  most_frequent_first = F.desc('count')
  return frequencies.orderBy(most_frequent_first)

In [0]:
# Profile every column: show its most frequent values so that constant,
# mostly-null or otherwise uninformative columns are easy to spot.
# Each iteration runs its own aggregation over the full dataset.
for col_name in songs_raw.columns:
  value_counts(songs_raw, col_name).show()

+----------------------+-----+
|contentDetails_caption|count|
+----------------------+-----+
|                 false| 3844|
|                  true|   63|
+----------------------+-----+

+-------------------------------------+-----+
|contentDetails_contentRating_ytRating|count|
+-------------------------------------+-----+
|                                 null| 3880|
|                      ytAgeRestricted|   27|
+-------------------------------------+-----+

+-------------------------+-----+
|contentDetails_definition|count|
+-------------------------+-----+
|                       hd| 1991|
|                       sd| 1916|
+-------------------------+-----+

+------------------------+-----+
|contentDetails_dimension|count|
+------------------------+-----+
|                      2d| 3907|
+------------------------+-----+

+-----------------------+-----+
|contentDetails_duration|count|
+-----------------------+-----+
|                PT3M31S|   38|
|                 PT4M1S|   34|
|    

In [0]:
from enum import Enum

class Status(Enum):
  """Triage decision assigned to a column of the raw dataset.

  The member values are the human-readable labels that end up in the
  `status` column of the selection table.
  """
  # Column is not kept.
  NOT_SELECTED = 'not selected'
  # Column is kept; this is the label the selection filter matches on.
  SELECTED = 'selected'
  # Presumably: revisit in a later iteration -- TODO confirm intent.
  LATER = 'later'
  # Presumably: undecided, could be useful -- TODO confirm intent.
  MAYBE = 'maybe'

In [0]:
# Manual triage of the dataset columns: maps a column name to the `Status`
# decided for it after inspecting the per-column value counts.
#
# NOTE(review): this mapping has 45 entries while `len(songs_raw.columns)`
# reported 44, and `snippet_description`, `snippet_localized_description`
# and `snippet_localized_title` do not appear in the visible part of the
# printed schema. Confirm the keys are in sync with the actual columns
# (only the SELECTED ones are used by the `select` further down).
selection_dict = {
  'contentDetails_caption': Status.NOT_SELECTED,
  'contentDetails_contentRating_ytRating': Status.NOT_SELECTED,
  'contentDetails_definition': Status.MAYBE,
  'contentDetails_dimension': Status.NOT_SELECTED,
  'contentDetails_duration': Status.SELECTED,
  'contentDetails_licensedContent': Status.MAYBE,
  'contentDetails_projection': Status.NOT_SELECTED,
  'etag': Status.NOT_SELECTED,
  'id': Status.SELECTED,
  'kind': Status.NOT_SELECTED,
  'snippet_categoryId': Status.MAYBE,
  'snippet_channelId': Status.SELECTED,
  'snippet_channelTitle': Status.SELECTED,
  'snippet_defaultAudioLanguage': Status.LATER,
  'snippet_defaultLanguage': Status.LATER,
  'snippet_description': Status.LATER,
  'snippet_liveBroadcastContent': Status.NOT_SELECTED,
  'snippet_localized_description': Status.NOT_SELECTED,
  'snippet_localized_title': Status.LATER,
  'snippet_publishedAt': Status.SELECTED,
  'snippet_thumbnails_default_height': Status.NOT_SELECTED,
  'snippet_thumbnails_default_url': Status.NOT_SELECTED,
  'snippet_thumbnails_default_width': Status.NOT_SELECTED,
  'snippet_thumbnails_high_height': Status.NOT_SELECTED,
  'snippet_thumbnails_high_url': Status.NOT_SELECTED,
  'snippet_thumbnails_high_width': Status.NOT_SELECTED,
  'snippet_thumbnails_maxres_height': Status.NOT_SELECTED,
  'snippet_thumbnails_maxres_url': Status.NOT_SELECTED,
  'snippet_thumbnails_maxres_width': Status.NOT_SELECTED,
  'snippet_thumbnails_medium_height': Status.NOT_SELECTED,
  'snippet_thumbnails_medium_url': Status.NOT_SELECTED,
  'snippet_thumbnails_medium_width': Status.NOT_SELECTED,
  'snippet_thumbnails_standard_height': Status.NOT_SELECTED,
  'snippet_thumbnails_standard_url': Status.NOT_SELECTED,
  'snippet_thumbnails_standard_width': Status.NOT_SELECTED,
  'snippet_title': Status.SELECTED,
  'statistics_commentCount': Status.SELECTED,
  'statistics_dislikeCount': Status.SELECTED,
  'statistics_favoriteCount': Status.NOT_SELECTED,
  'statistics_viewCount': Status.SELECTED,
  'status_embeddable': Status.LATER,
  'status_license': Status.LATER,
  'status_privacyStatus': Status.LATER,
  'status_publicStatsViewable': Status.LATER,
  'status_uploadStatus': Status.NOT_SELECTED
}

import pandas as pd

# Turn the triage mapping into a one-column table: the index holds the
# column names and `status` holds the enum's string label.
status_label_by_column = {
  column_name: decision.value
  for column_name, decision in selection_dict.items()
}
selection_df = pd.DataFrame.from_dict(
  status_label_by_column,
  orient='index',
  columns=['status'],
)
selection_df



Unnamed: 0,status
contentDetails_caption,not selected
contentDetails_contentRating_ytRating,not selected
contentDetails_definition,maybe
contentDetails_dimension,not selected
contentDetails_duration,selected
contentDetails_licensedContent,maybe
contentDetails_projection,not selected
etag,not selected
id,selected
kind,not selected


In [0]:
# Tally how many columns fall into each triage status; the result is a
# table indexed by `status` with a single `count` column.
(
  selection_df
  .groupby('status')['status']
  .count()
  .to_frame('count')
)

Unnamed: 0_level_0,count
status,Unnamed: 1_level_1
later,8
maybe,3
not selected,25
selected,9


In [0]:
# Names of the columns marked as selected, in the order they were declared.
# The filter compares against the enum's value rather than repeating the
# literal label, so it cannot silently drift from the `Status` definition
# (a mismatch would otherwise yield an empty list with no error).
is_selected = selection_df['status'] == Status.SELECTED.value
selected_columns = selection_df.index[is_selected].tolist()
selected_columns

Out[12]: ['contentDetails_duration',
 'id',
 'snippet_channelId',
 'snippet_channelTitle',
 'snippet_publishedAt',
 'snippet_title',
 'statistics_commentCount',
 'statistics_dislikeCount',
 'statistics_viewCount']

In [0]:
# Keep only the selected columns, then report the resulting schema and
# the (row count, column count) of the reduced dataset.
songs = songs_raw.select(*selected_columns)
songs.printSchema()
songs_shape = (songs.count(), len(songs.columns))
print("Shape: ", songs_shape)

root
 |-- contentDetails_duration: string (nullable = true)
 |-- id: string (nullable = true)
 |-- snippet_channelId: string (nullable = true)
 |-- snippet_channelTitle: string (nullable = true)
 |-- snippet_publishedAt: string (nullable = true)
 |-- snippet_title: string (nullable = true)
 |-- statistics_commentCount: double (nullable = true)
 |-- statistics_dislikeCount: double (nullable = true)
 |-- statistics_viewCount: long (nullable = true)

Shape:  (3907, 9)


In [0]:
# Persist the reduced dataset as Parquet, replacing any previous export.
# NOTE(review): the saved output of this cell shows a Py4JJavaError; the
# traceback is truncated, so the root cause (for example missing write
# access to the destination) still needs to be confirmed.
output_path = "s3://full-stack-bigdata-datasets/Big_Data/YOUTUBE/items_selected.parquet"
songs.write.mode('overwrite').parquet(output_path)

[0;31m---------------------------------------------------------------------------[0m
[0;31mPy4JJavaError[0m                             Traceback (most recent call last)
File [0;32m<command-4417087527232138>:1[0m
[0;32m----> 1[0m [43msongs[49m[38;5;241;43m.[39;49m[43mwrite[49m[43m [49m[43m\[49m
[1;32m      2[0m [43m  [49m[38;5;241;43m.[39;49m[43mparquet[49m[43m([49m[38;5;124;43m"[39;49m[38;5;124;43ms3://full-stack-bigdata-datasets/Big_Data/YOUTUBE/items_selected.parquet[39;49m[38;5;124;43m"[39;49m[43m,[49m[43m [49m[43mmode[49m[38;5;241;43m=[39;49m[38;5;124;43m'[39;49m[38;5;124;43moverwrite[39;49m[38;5;124;43m'[39;49m[43m)[49m

File [0;32m/databricks/spark/python/pyspark/instrumentation_utils.py:48[0m, in [0;36m_wrap_function.<locals>.wrapper[0;34m(*args, **kwargs)[0m
[1;32m     46[0m start [38;5;241m=[39m time[38;5;241m.[39mperf_counter()
[1;32m     47[0m [38;5;28;01mtry[39;00m:
[0;32m---> 48[0m     res [38;5;241m=[39