
Commit 66b3161

Merge pull request #190 from microsoft/SnehaGunda-patch-4
Updating docs-samples/data-engineering/Lakehouse Tutorial Source Code/0…
2 parents c44ab33 + b3391a6 commit 66b3161


2 files changed (+2, -1417 lines)

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@

The notebook source is stored as single-line JSON, so the diff shows the whole file as one deleted line and one added line. The cells are reconstructed below, with -/+ markers on the two code lines that actually changed.
{"cells":[{"cell_type":"markdown","source":["### Spark session configuration\n","This cell sets Spark session settings to enable _Verti-Parquet_ and _Optimize on Write_. More details about _Verti-Parquet_ and _Optimize on Write_ in tutorial document."],"metadata":{"nteract":{"transient":{"deleting":false}}},"id":"372795e6-01a9-4914-8c4f-93e94971bd1f"},{"cell_type":"code","source":["# Copyright (c) Microsoft Corporation.\n","# Licensed under the MIT License.\n","\n","spark.conf.set(\"spark.sql.parquet.vorder.enabled\", \"true\")\n","spark.conf.set(\"spark.microsoft.delta.optimizeWrite.enabled\", \"true\")\n","spark.conf.set(\"spark.microsoft.delta.optimizeWrite.binSize\", \"1073741824\")"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":3,"statement_ids":[3],"livy_statement_state":"available","session_id":"080a2fa1-6a46-4087-b392-f7536a0c8802","state":"finished","normalized_state":"finished","queued_time":"2024-07-19T18:02:35.1114499Z","session_start_time":"2024-07-19T18:02:35.3453281Z","execution_start_time":"2024-07-19T18:02:46.2233778Z","execution_finish_time":"2024-07-19T18:02:48.728715Z","parent_msg_id":"7ca3fbc0-d822-455e-85a6-fa7bfec4dd7d"},"text/plain":"StatementMeta(, 080a2fa1-6a46-4087-b392-f7536a0c8802, 3, Finished, Available, Finished)"},"metadata":{}}],"execution_count":1,"metadata":{"jupyter":{"source_hidden":false,"outputs_hidden":false},"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"96a19297-6e9f-4020-937b-5d0ae7a10dd6"},{"cell_type":"markdown","source":["### Fact - Sale\n","\n","This cell reads raw data from the _Files_ section of the lakehouse, adds additional columns for different date parts and the same information is being used to create partitioned fact delta table."],"metadata":{"nteract":{"transient":{"deleting":false}}},"id":"19b25a84-a630-470a-9b52-a546214a1b86"},{"cell_type":"code","source":["from pyspark.sql.functions import col, year, month, quarter\n","\n","table_name = 'fact_sale'\n","\n","df = spark.read.format(\"parquet\").load('Files/wwi-raw-data/WideWorldImportersDW/parquet/full/fact_sale_1y_full')\n","df = df.withColumn('Year', year(col(\"InvoiceDateKey\")))\n","df = df.withColumn('Quarter', quarter(col(\"InvoiceDateKey\")))\n","df = df.withColumn('Month', month(col(\"InvoiceDateKey\")))\n","\n","df.write.mode(\"overwrite\").format(\"delta\").partitionBy(\"Year\",\"Quarter\").save(\"Tables/\" + table_name)"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":5,"statement_ids":[5],"livy_statement_state":"available","session_id":"080a2fa1-6a46-4087-b392-f7536a0c8802","state":"finished","normalized_state":"finished","queued_time":"2024-07-19T18:03:39.9856435Z","session_start_time":null,"execution_start_time":"2024-07-19T18:03:40.3888878Z","execution_finish_time":"2024-07-19T18:05:13.1549177Z","parent_msg_id":"4daa6a66-b7e1-4424-93e9-a4b4b75d883f"},"text/plain":"StatementMeta(, 080a2fa1-6a46-4087-b392-f7536a0c8802, 5, Finished, Available, Finished)"},"metadata":{}}],"execution_count":3,"metadata":{"jupyter":{"source_hidden":false,"outputs_hidden":false},"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"23da9c7a-a5ef-413c-98c0-38d9d344f958"},{"cell_type":"markdown","source":["### Dimensions\n","This cell creates a function to read raw data from the _Files_ section of the 
### Dimensions

This cell creates a function that reads raw data from the _Files_ section of the lakehouse for the table name passed as a parameter. Next, it creates a list of dimension tables. Finally, a _for loop_ iterates over that list and calls the function with each table name to read the data for that specific table and create its delta table.

    from pyspark.sql.types import *

    def loadFullDataFromSource(table_name):
-       df = spark.read.format("parquet").load('Files/wwi-raw-data/WideWorldImportersDW/parquet/full/' + table_name)
+       df = spark.read.format("parquet").load('Files/wwi-raw-data/full/' + table_name)
        df = df.select([c for c in df.columns if c != 'Photo'])  # drop the binary Photo column
        df.write.mode("overwrite").format("delta").save("Tables/" + table_name)

    full_tables = [
        'dimension_city',
        'dimension_customer',
        'dimension_date',
        'dimension_employee',
        'dimension_stock_item'
    ]

    for table in full_tables:
        loadFullDataFromSource(table)
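As an illustration only (reusing the full_tables list and the same session from the cell above), each dimension delta table created by the loop can be sanity-checked with a row count:

    # Hypothetical verification, not in the notebook: row count per dimension table.
    for table in full_tables:
        rows = spark.read.format("delta").load("Tables/" + table).count()
        print(table, rows)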
Beyond the two path changes marked above, the added line differs from the deleted one only in notebook metadata: the cell outputs record a fresh run (session 08013a4a-347d-4246-9218-ca8e26e49480, executed 2025-08-29, replacing session 080a2fa1-6a46-4087-b392-f7536a0c8802 from 2024-07-19), the session options gain a spark.synapse.nbs.session.timeout of 1200000, and the default lakehouse binding for wwilakehouse moves to 4ef0d3a2-2a6a-451f-b69e-65ae4441c156 in workspace e7126b3a-f0ff-4e7f-9def-81473274c10c.
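The substantive change in this commit is just the raw-data prefix. A small refactor, offered here as a suggestion rather than anything in the notebook, would keep that prefix in one variable so a future move of the source files needs a single edit:

    # Hypothetical refactor: one source of truth for the raw-data location.
    RAW_DATA_ROOT = 'Files/wwi-raw-data/full/'

    df = spark.read.format("parquet").load(RAW_DATA_ROOT + 'fact_sale_1y_full')

    def loadFullDataFromSource(table_name):
        df = spark.read.format("parquet").load(RAW_DATA_ROOT + table_name)
        df = df.select([c for c in df.columns if c != 'Photo'])
        df.write.mode("overwrite").format("delta").save("Tables/" + table_name)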
