# Transform Channel Data - Notebook

####  Set up and start session


In [1]:
%idle_timeout 2880
%glue_version 5.0
%worker_type G.1X
%number_of_workers 5

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrame
from pyspark.sql.functions import col, when, year, month, dayofmonth

sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.7 
Current idle_timeout is None minutes.
idle_timeout has been set to 2880 minutes.
Setting Glue version to: 5.0
Previous worker type: None
Setting new worker type to: G.1X
Previous number of workers: None
Setting new number of workers to: 5
Trying to create a Glue session for the kernel.
Session Type: glueetl
Worker Type: G.1X
Number of Workers: 5
Idle Timeout: 2880
Session ID: 231c86c2-0ba3-46d8-8167-3192ecc88b37
Applying the following default arguments:
--glue_kernel_version 1.0.7
--enable-glue-datacatalog true
Waiting for session 231c86c2-0ba3-46d8-8167-3192ecc88b37 to get into ready status...
Session 231c86c2-0ba3-46d8-8167-3192ecc88b37 ha

In [8]:
def format_channel_df(base_df):
    """
    Format table containing details of YT channels
    """
    
    base_df = base_df.withColumn(
    "channelTier",
    when(col("subscriberCount") < 999, "Graphite")
    .when((col("subscriberCount") >= 1000) & (col("subscriberCount") < 9999), "Opal")
    .when((col("subscriberCount") >= 10000) & (col("subscriberCount") < 99999), "Bronze")
    .when((col("subscriberCount") >= 100000) & (col("subscriberCount") < 499999), "Silver-Low")
    .when((col("subscriberCount") >= 500000) & (col("subscriberCount") < 999999), "Silver-High")
    .when((col("subscriberCount") >= 1000000) & (col("subscriberCount") < 9999999), "Gold")
    .otherwise("Diamond")
    )
    # Seperating year, month and date of Extraction Date - allowing partitioning
    base_df = base_df.withColumn('extractYear',year(col('extractDate')))
    base_df = base_df.withColumn('extractMonth',month(col('extractDate')))
    base_df = base_df.withColumn("extractDay", dayofmonth(col("extractDate")))
    
    return base_df




#### Configuring current date to read from S3 bucket

In [14]:
from datetime import datetime
current_date = datetime.now().strftime("%Y/%m/%d")
#current_date = "2025/03/09"
current_date

'2025/03/06'


In [3]:
s3_path = f"s3://youtube-channel-data-v1-02032025/raw/{current_date}/channel_data.parquet"
s3_path

's3://youtube-channel-data-v1-02032025/raw/2025/03/05/channel_data.parquet'


#### Reading from S3 bucket

In [4]:
dynamic_frame = glueContext.create_dynamic_frame.from_options(
    connection_type="s3",  # Source type
    connection_options={"paths": [s3_path]},  # Path to the S3 data
    format="parquet"  # Specify the format of the data (parquet in this case)
)




In [5]:
# Convert DynamicFrame to DataFrame for processing
df = dynamic_frame.toDF()



In [6]:
df.printSchema()

root
 |-- channelId: string (nullable = true)
 |-- channelName: string (nullable = true)
 |-- subscriberCount: long (nullable = true)
 |-- videoCount: long (nullable = true)
 |-- viewCount: long (nullable = true)
 |-- uploadPlaylistId: string (nullable = true)
 |-- extractDate: date (nullable = true)


#### Transforming dataframe

In [9]:
df_transformed = format_channel_df(df)
df_transformed.show()

+--------------------+--------------------+---------------+----------+-----------+--------------------+-----------+-----------+-----------+------------+----------+
|           channelId|         channelName|subscriberCount|videoCount|  viewCount|    uploadPlaylistId|extractDate|channelTier|extractYear|extractMonth|extractDay|
+--------------------+--------------------+---------------+----------+-----------+--------------------+-----------+-----------+-----------+------------+----------+
|UC7cs8q-gJRlGwj4A...|    Alex The Analyst|         999000|       349|   46609471|UU7cs8q-gJRlGwj4A...| 2025-03-05|Silver-High|       2025|           3|         5|
|UCVpWDEFirsEfz2WG...|Tiny Technical Tu...|          76300|       209|    4983727|UUVpWDEFirsEfz2WG...| 2025-03-05|     Bronze|       2025|           3|         5|
|UCmLGJ3VYBcfRaWbP...|    Seattle Data Guy|         108000|       293|    5844552|UUmLGJ3VYBcfRaWbP...| 2025-03-05| Silver-Low|       2025|           3|         5|
|UCAq9f7jFEA7Mtl

In [10]:
df_transformed.printSchema()

root
 |-- channelId: string (nullable = true)
 |-- channelName: string (nullable = true)
 |-- subscriberCount: long (nullable = true)
 |-- videoCount: long (nullable = true)
 |-- viewCount: long (nullable = true)
 |-- uploadPlaylistId: string (nullable = true)
 |-- extractDate: date (nullable = true)
 |-- channelTier: string (nullable = false)
 |-- extractYear: integer (nullable = true)
 |-- extractMonth: integer (nullable = true)
 |-- extractDay: integer (nullable = true)


#### Writing back to S3 bucket

In [11]:
dynamic_frame_transformed_for_analysis = DynamicFrame.fromDF(df_transformed, glueContext, "dynamic_frame_transformed")




In [12]:
# Define the S3 path dynamically
s3_path_analysis = f"s3://youtube-channel-data-v1-02032025/analysis/channel_data/"
s3_path_analysis

's3://youtube-channel-data-v1-02032025/analysis/channel_data/'


In [None]:
# Write file to S3
s3output = glueContext.getSink(
    path=s3_path_analysis,
    connection_type="s3",
    updateBehavior="UPDATE_IN_DATABASE",
    partitionKeys=['channelId','extractYear','extractMonth','extractDay'],
    compression="snappy",
    enableUpdateCatalog=True
)
s3output.setCatalogInfo(catalogDatabase="glue_metadata_db_analysis_files", catalogTableName="channel_data")
s3output.setFormat("glueparquet")
s3output.writeFrame(dynamic_frame_transformed_for_analysis)

<awsglue.dynamicframe.DynamicFrame object at 0x7ff73e88bf90>
