From b2ef5f6b229a5cde5a1b926537916805904b20ae Mon Sep 17 00:00:00 2001
From: shivsood
Date: Wed, 17 Apr 2019 16:18:00 -0700
Subject: [PATCH 1/4] minor doc fixes

---
 .../sql-big-data-cluster/spark/README.md | 28 +++++++++--
 .../spark_to_sql/spark_to_sql_jdbc.ipynb | 46 +++++++++++++------
 ...in_score_export_ml_models_with_spark.ipynb | 2 +-
 3 files changed, 56 insertions(+), 20 deletions(-)

diff --git a/samples/features/sql-big-data-cluster/spark/README.md b/samples/features/sql-big-data-cluster/spark/README.md
index 4708699695..01c8321c58 100644
--- a/samples/features/sql-big-data-cluster/spark/README.md
+++ b/samples/features/sql-big-data-cluster/spark/README.md
@@ -2,12 +2,30 @@
 The new built-in notebooks in Azure Data Studio enables data scientists and data engineers to run Python, R, or Scala code against the cluster.
 
-## Instructions
+## Instructions to open a notebook from Azure Data Studio
+
+1. Connect to the SQL Server Master instance in a big data cluster
+
+1. Right-click on the server name, select **Manage**, switch to the **SQL Server Big Data Cluster** tab, and use **Open Notebook**
+
+## __[dataloading](dataloading/)__
+
+This folder contains samples that show how to load data using Spark.
+
+[PySpark Hello World](dataloading/hello_PySpark.ipynb)
 
-1. Download and save the notebook file [spark-sql.ipynb](spark-sql.ipynb/) locally.
+[Scala Hello World ](dataloading/hello_Scala.ipynb)
+
+[SparkR Hello World ](dataloading/hello_sparkR.ipynb)
+
+[DataLoading - Transforming CSV to Parquet](dataloading/transform-csv-files.ipynb/)
+
+[Data Transfer - Spark to SQL using JDBC ](spark_to_sql/spark_to_sql_jdbc.ipynb/)
+
+## Instructions
 
-1. Open the notebook file in Azure Data Studio (right click on the SQL Server big data cluster server name-> **Manage**-> Open Notebook.
+1. Download and save the notebook file [dataloading/transform-csv-files.ipynb](dataloading/transform-csv-files.ipynb/) locally.
 
-1. Wait for the “Kernel” and the target context (“Attach to”) to be populated. Set the “Kernel” to **PySpark3** and “Attach to” needs to be the IP address of your big data cluster endpoint.
+2. Open the notebook in Azure Data Studio and wait for the “Kernel” and the target context (“Attach to”) to be populated. Set the “Kernel” to **PySpark3** and set **Attach to** to the IP address of your big data cluster endpoint.
 
-1. Run each cell in the Notebook sequentially using Azure Data Studio.
\ No newline at end of file
+3. Run each cell in the Notebook sequentially.
\ No newline at end of file
diff --git a/samples/features/sql-big-data-cluster/spark/spark_to_sql/spark_to_sql_jdbc.ipynb b/samples/features/sql-big-data-cluster/spark/spark_to_sql/spark_to_sql_jdbc.ipynb
index 0531f80d7a..d574d79298 100644
--- a/samples/features/sql-big-data-cluster/spark/spark_to_sql/spark_to_sql_jdbc.ipynb
+++ b/samples/features/sql-big-data-cluster/spark/spark_to_sql/spark_to_sql_jdbc.ipynb
@@ -19,7 +19,7 @@
 "cells": [
 {
 "cell_type": "markdown",
- "source": "# Read and write from Spark to SQL\r\nA typical big data scenario is large scale ETL in Spark and writing the processed data to SQLServer. The following samples shows \r\n- reading a HDFS file, \r\n- some basic processing on it and \r\n- then processed data to SQL Server table.\r\n\r\nNeed a database precreated in SQL for this sample. 
Here we are using database name \"MyTestDatabase\" that can be created using SQL statements below.\r\n\r\n``` sql\r\nCreate DATABASE MyTestDatabase\r\nGO \r\n``` \r\n ",
+ "source": "# Read and write from Spark to SQL\r\nA typical big data scenario is large scale ETL in Spark, after which the processed data is written out to SQL Server for access by LOB applications. This sample shows how to write to SQL Server from Spark. The main steps in the sample are:\r\n- Reading an HDFS file, \r\n- Doing some basic processing on it, and \r\n- Writing the processed data to a SQL Server table using JDBC\r\n\r\nPrerequisites: \r\n- The sample uses a SQL database named \"MyTestDatabase\". Create it before you run this sample. The database can be created as follows:\r\n ``` sql\r\n Create DATABASE MyTestDatabase\r\n GO \r\n ``` \r\n- Download [AdultCensusIncome.csv]( https://amldockerdatasets.azureedge.net/AdultCensusIncome.csv ) to your local machine. Create an HDFS folder named spark_data and upload the file there. \r\n\r\n \r\n ",
 "metadata": {}
 },
 {
@@ -28,12 +28,30 @@
 "metadata": {},
 "outputs": [
 {
- "output_type": "stream",
 "name": "stdout",
- "text": "+---+----------------+------+---------+-------------+------------------+-----------------+-------------+-----+------+------------+------------+--------------+--------------+------+\n|age| workclass|fnlwgt|education|education-num| marital-status| occupation| relationship| race| sex|capital-gain|capital-loss|hours-per-week|native-country|income|\n+---+----------------+------+---------+-------------+------------------+-----------------+-------------+-----+------+------------+------------+--------------+--------------+------+\n| 39| State-gov| 77516|Bachelors| 13| Never-married| Adm-clerical|Not-in-family|White| Male| 2174| 0| 40| United-States| <=50K|\n| 50|Self-emp-not-inc| 83311|Bachelors| 13|Married-civ-spouse| Exec-managerial| Husband|White| Male| 0| 0| 13| United-States| <=50K|\n| 38| Private|215646| HS-grad| 9| Divorced|Handlers-cleaners|Not-in-family|White| Male| 0| 0| 40| United-States| <=50K|\n| 53| Private|234721| 11th| 7|Married-civ-spouse|Handlers-cleaners| Husband|Black| Male| 0| 0| 40| United-States| <=50K|\n| 28| Private|338409|Bachelors| 13|Married-civ-spouse| Prof-specialty| Wife|Black|Female| 0| 0| 40| Cuba| <=50K|\n+---+----------------+------+---------+-------------+------------------+-----------------+-------------+-----+------+------------+------------+--------------+--------------+------+\nonly showing top 5 rows"
+ "text": "Starting Spark application\n",
+ "output_type": "stream"
+ },
+ {
+ "data": {
+ "text/plain": "",
+ "text/html": "\n
<table>\n<tr><th>ID</th><th>YARN Application ID</th><th>Kind</th><th>State</th><th>Spark UI</th><th>Driver log</th><th>Current session?</th></tr>
<tr><td>2</td><td>application_1554755839506_0003</td><td>pyspark3</td><td>idle</td><td>Link</td><td>Link</td><td></td></tr>\n</table>
" + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "text": "SparkSession available as 'spark'.\n", + "output_type": "stream" + }, + { + "name": "stdout", + "text": "+---+----------------+------+---------+-------------+------------------+-----------------+-------------+-----+------+------------+------------+--------------+--------------+------+\n|age| workclass|fnlwgt|education|education-num| marital-status| occupation| relationship| race| sex|capital-gain|capital-loss|hours-per-week|native-country|income|\n+---+----------------+------+---------+-------------+------------------+-----------------+-------------+-----+------+------------+------------+--------------+--------------+------+\n| 39| State-gov| 77516|Bachelors| 13| Never-married| Adm-clerical|Not-in-family|White| Male| 2174| 0| 40| United-States| <=50K|\n| 50|Self-emp-not-inc| 83311|Bachelors| 13|Married-civ-spouse| Exec-managerial| Husband|White| Male| 0| 0| 13| United-States| <=50K|\n| 38| Private|215646| HS-grad| 9| Divorced|Handlers-cleaners|Not-in-family|White| Male| 0| 0| 40| United-States| <=50K|\n| 53| Private|234721| 11th| 7|Married-civ-spouse|Handlers-cleaners| Husband|Black| Male| 0| 0| 40| United-States| <=50K|\n| 28| Private|338409|Bachelors| 13|Married-civ-spouse| Prof-specialty| Wife|Black|Female| 0| 0| 40| Cuba| <=50K|\n+---+----------------+------+---------+-------------+------------------+-----------------+-------------+-----+------+------------+------------+--------------+--------------+------+\nonly showing top 5 rows", + "output_type": "stream" } ], - "execution_count": 8 + "execution_count": 3 }, { "cell_type": "code", @@ -41,25 +59,25 @@ "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", - "text": "+---+----------------+------+---------+-------------+------------------+-----------------+-------------+-----+------+------------+------------+--------------+--------------+------+\n|age| workclass|fnlwgt|education|education_num| marital_status| occupation| relationship| race| sex|capital_gain|capital_loss|hours_per_week|native_country|income|\n+---+----------------+------+---------+-------------+------------------+-----------------+-------------+-----+------+------------+------------+--------------+--------------+------+\n| 39| State-gov| 77516|Bachelors| 13| Never-married| Adm-clerical|Not-in-family|White| Male| 2174| 0| 40| United-States| <=50K|\n| 50|Self-emp-not-inc| 83311|Bachelors| 13|Married-civ-spouse| Exec-managerial| Husband|White| Male| 0| 0| 13| United-States| <=50K|\n| 38| Private|215646| HS-grad| 9| Divorced|Handlers-cleaners|Not-in-family|White| Male| 0| 0| 40| United-States| <=50K|\n| 53| Private|234721| 11th| 7|Married-civ-spouse|Handlers-cleaners| Husband|Black| Male| 0| 0| 40| United-States| <=50K|\n| 28| Private|338409|Bachelors| 13|Married-civ-spouse| Prof-specialty| Wife|Black|Female| 0| 0| 40| Cuba| <=50K|\n+---+----------------+------+---------+-------------+------------------+-----------------+-------------+-----+------+------------+------------+--------------+--------------+------+\nonly showing top 5 rows" + "text": "+---+----------------+------+---------+-------------+------------------+-----------------+-------------+-----+------+------------+------------+--------------+--------------+------+\n|age| workclass|fnlwgt|education|education_num| marital_status| occupation| relationship| race| 
sex|capital_gain|capital_loss|hours_per_week|native_country|income|\n+---+----------------+------+---------+-------------+------------------+-----------------+-------------+-----+------+------------+------------+--------------+--------------+------+\n| 39| State-gov| 77516|Bachelors| 13| Never-married| Adm-clerical|Not-in-family|White| Male| 2174| 0| 40| United-States| <=50K|\n| 50|Self-emp-not-inc| 83311|Bachelors| 13|Married-civ-spouse| Exec-managerial| Husband|White| Male| 0| 0| 13| United-States| <=50K|\n| 38| Private|215646| HS-grad| 9| Divorced|Handlers-cleaners|Not-in-family|White| Male| 0| 0| 40| United-States| <=50K|\n| 53| Private|234721| 11th| 7|Married-civ-spouse|Handlers-cleaners| Husband|Black| Male| 0| 0| 40| United-States| <=50K|\n| 28| Private|338409|Bachelors| 13|Married-civ-spouse| Prof-specialty| Wife|Black|Female| 0| 0| 40| Cuba| <=50K|\n+---+----------------+------+---------+-------------+------------------+-----------------+-------------+-----+------+------------+------------+--------------+--------------+------+\nonly showing top 5 rows",
+ "output_type": "stream"
 }
 ],
- "execution_count": 9
+ "execution_count": 4
 },
 {
 "cell_type": "code",
- "source": "#Write from Spark to SQL table using JDBC\r\nprint(\"Use build in JDBC connector to write to SQLServer master instance in Big data \")\r\n\r\nservername = \"jdbc:sqlserver://mssql-master-pool-0.service-master-pool\"\r\ndbname = \"MyTestDatabase\"\r\nurl = servername + \";\" + \"databaseName=\" + dbname + \";\"\r\n\r\nc = \"dbo.AdultCensus\"\r\nuser = \"sa\"\r\npassword = \"****\"\r\n\r\nprint(\"url is \", url)\r\n\r\ntry:\r\n df.write \\\r\n .format(\"jdbc\") \\\r\n .mode(\"overwrite\") \\\r\n .option(\"url\", url) \\\r\n .option(\"dbtable\", dbtable) \\\r\n .option(\"user\", user) \\\r\n .option(\"password\", password)\\\r\n .save()\r\nexcept ValueError as error :\r\n print(\"JDBC Write failed\", error)\r\n\r\nprint(\"JDBC Write done \")\r\n\r\n\r\n",
+ "source": "#Write from Spark to SQL table using JDBC\r\nprint(\"Use built-in JDBC connector to write to SQLServer master instance in Big data \")\r\n\r\nservername = \"jdbc:sqlserver://master-0.master-svc\"\r\ndbname = \"MyTestDatabase\"\r\nurl = servername + \";\" + \"databaseName=\" + dbname + \";\"\r\n\r\ndbtable = \"dbo.AdultCensus\"\r\nuser = \"sa\"\r\npassword = \"Yukon900\"\r\n\r\nprint(\"url is \", url)\r\n\r\ntry:\r\n df.write \\\r\n .format(\"jdbc\") \\\r\n .mode(\"overwrite\") \\\r\n .option(\"url\", url) \\\r\n .option(\"dbtable\", dbtable) \\\r\n .option(\"user\", user) \\\r\n .option(\"password\", password)\\\r\n .save()\r\nexcept ValueError as error :\r\n print(\"JDBC Write failed\", error)\r\n\r\nprint(\"JDBC Write done \")\r\n\r\n\r\n",
 "metadata": {},
 "outputs": [
 {
- "output_type": "stream",
 "name": "stdout",
- "text": "Use build in JDBC connector to write to SQLServer master instance in Big data \nurl is jdbc:sqlserver://mssql-master-pool-0.service-master-pool;databaseName=MyTestDatabase;\nJDBC Write done"
+ "text": "Use built-in JDBC connector to write to SQLServer master instance in Big data \nurl is jdbc:sqlserver://master-0.master-svc;databaseName=MyTestDatabase;\nJDBC Write done",
+ "output_type": "stream"
 }
 ],
- "execution_count": 10
+ "execution_count": 9
 },
 {
 "cell_type": "code",
@@ -67,12 +85,12 @@
 "metadata": {},
 "outputs": [
 {
- "output_type": "stream",
 "name": "stdout",
- "text": "read data from SQL server table 
\n+---+----------------+------+---------+-------------+------------------+-----------------+-------------+-----+------+------------+------------+--------------+--------------+------+\n|age| workclass|fnlwgt|education|education_num| marital_status| occupation| relationship| race| sex|capital_gain|capital_loss|hours_per_week|native_country|income|\n+---+----------------+------+---------+-------------+------------------+-----------------+-------------+-----+------+------------+------------+--------------+--------------+------+\n| 39| State-gov| 77516|Bachelors| 13| Never-married| Adm-clerical|Not-in-family|White| Male| 2174| 0| 40| United-States| <=50K|\n| 50|Self-emp-not-inc| 83311|Bachelors| 13|Married-civ-spouse| Exec-managerial| Husband|White| Male| 0| 0| 13| United-States| <=50K|\n| 38| Private|215646| HS-grad| 9| Divorced|Handlers-cleaners|Not-in-family|White| Male| 0| 0| 40| United-States| <=50K|\n| 53| Private|234721| 11th| 7|Married-civ-spouse|Handlers-cleaners| Husband|Black| Male| 0| 0| 40| United-States| <=50K|\n| 28| Private|338409|Bachelors| 13|Married-civ-spouse| Prof-specialty| Wife|Black|Female| 0| 0| 40| Cuba| <=50K|\n+---+----------------+------+---------+-------------+------------------+-----------------+-------------+-----+------+------------+------------+--------------+--------------+------+\nonly showing top 5 rows" + "text": "read data from SQL server table \n+---+----------------+------+---------+-------------+------------------+-----------------+-------------+-----+------+------------+------------+--------------+--------------+------+\n|age| workclass|fnlwgt|education|education_num| marital_status| occupation| relationship| race| sex|capital_gain|capital_loss|hours_per_week|native_country|income|\n+---+----------------+------+---------+-------------+------------------+-----------------+-------------+-----+------+------------+------------+--------------+--------------+------+\n| 39| State-gov| 77516|Bachelors| 13| Never-married| Adm-clerical|Not-in-family|White| Male| 2174| 0| 40| United-States| <=50K|\n| 50|Self-emp-not-inc| 83311|Bachelors| 13|Married-civ-spouse| Exec-managerial| Husband|White| Male| 0| 0| 13| United-States| <=50K|\n| 38| Private|215646| HS-grad| 9| Divorced|Handlers-cleaners|Not-in-family|White| Male| 0| 0| 40| United-States| <=50K|\n| 53| Private|234721| 11th| 7|Married-civ-spouse|Handlers-cleaners| Husband|Black| Male| 0| 0| 40| United-States| <=50K|\n| 28| Private|338409|Bachelors| 13|Married-civ-spouse| Prof-specialty| Wife|Black|Female| 0| 0| 40| Cuba| <=50K|\n+---+----------------+------+---------+-------------+------------------+-----------------+-------------+-----+------+------------+------------+--------------+--------------+------+\nonly showing top 5 rows", + "output_type": "stream" } ], - "execution_count": 13 + "execution_count": 11 } ] } \ No newline at end of file diff --git a/samples/features/sql-big-data-cluster/spark/sparkml/train_score_export_ml_models_with_spark.ipynb b/samples/features/sql-big-data-cluster/spark/sparkml/train_score_export_ml_models_with_spark.ipynb index 26567e6d8c..df4bd26895 100644 --- a/samples/features/sql-big-data-cluster/spark/sparkml/train_score_export_ml_models_with_spark.ipynb +++ b/samples/features/sql-big-data-cluster/spark/sparkml/train_score_export_ml_models_with_spark.ipynb @@ -24,7 +24,7 @@ }, { "cell_type": "markdown", - "source": "## Step 1 - Explore your data\r\n### Load the data\r\nFor this example we'll use **AdultCensusIncome** data from [here]( 
https://amldockerdatasets.azureedge.net/AdultCensusIncome.csv ). From your Azure Data Studio connect to the HDFS/Spark gateway and create a directory called spark_ml under HDFS. \r\nDownload [AdultCensusIncome.csv]( https://amldockerdatasets.azureedge.net/AdultCensusIncome.csv ) to your local machine and upload to HDFS.Upload AdultCensusIncome.csv to the folder we created.\r\n\r\n### Exploratory Analysis\r\n- Baisc exploration on the data\r\n- Labels & Features\r\n1. **Label** - This refers to predicted value. This is represented as a column in the data. Label is **income** \r\n2. **Features** - This refers to the characteristics that are used to predict. **age** and **hours_per_week**\r\n\r\nNote : In reality features are chosen by applying some correlations techniques to understand what best characterize the Label we are predicting.\r\n\r\n### The Model we will build\r\nIn AdultCensusIncome.csv contains several columsn like Income range, age, hours-per-week, education, occupation etc. We'll build a model that can predict income range would be >50K or <50K.\r\n",
+ "source": "## Step 1 - Explore your data\r\n### Load the data\r\nFor this example we'll use **AdultCensusIncome** data from [here]( https://amldockerdatasets.azureedge.net/AdultCensusIncome.csv ). From your Azure Data Studio connect to the HDFS/Spark gateway and create a directory called spark_data under HDFS. \r\nDownload [AdultCensusIncome.csv]( https://amldockerdatasets.azureedge.net/AdultCensusIncome.csv ) to your local machine and upload it to the spark_data folder we created in HDFS.\r\n\r\n### Exploratory Analysis\r\n- Basic exploration on the data\r\n- Labels & Features\r\n1. **Label** - This refers to the predicted value. This is represented as a column in the data. The label is **income** \r\n2. **Features** - This refers to the characteristics that are used to predict: **age** and **hours_per_week**\r\n\r\nNote: In reality, features are chosen by applying correlation techniques to understand what best characterizes the label we are predicting.\r\n\r\n### The Model we will build\r\nAdultCensusIncome.csv contains several columns like income range, age, hours-per-week, education, occupation, etc. We'll build a model that can predict whether the income range is >50K or <=50K.\r\n",
 "metadata": {}
 },
 {
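
End to end, the flow that patch 1's spark_to_sql_jdbc notebook walks through condenses to the PySpark sketch below: read the CSV from HDFS, replace dashes in the column names, and overwrite a SQL Server table over JDBC. This is a minimal sketch of the same calls the notebook cells make; the server endpoint, user, and password are placeholders to fill in for your own cluster.

```python
from pyspark.sql import SparkSession

# In the notebook the PySpark3 kernel supplies `spark`; standalone we build one.
spark = SparkSession.builder.appName("spark_to_sql_jdbc_sketch").getOrCreate()

# Read the census CSV from HDFS, inferring column types from the header row.
df = spark.read.format("csv").options(
    header="true", inferSchema="true",
    ignoreLeadingWhiteSpace="true", ignoreTrailingWhiteSpace="true"
).load("/spark_data/AdultCensusIncome.csv")

# Basic processing: SQL Server-friendly column names (dashes to underscores).
df = df.toDF(*[c.replace("-", "_") for c in df.columns])

# <master-endpoint>, <user>, and <password> are placeholders, not values from
# the sample.
url = "jdbc:sqlserver://<master-endpoint>;databaseName=MyTestDatabase;"

# Overwrite (create or replace) dbo.AdultCensus with the processed rows.
df.write.format("jdbc") \
    .mode("overwrite") \
    .option("url", url) \
    .option("dbtable", "dbo.AdultCensus") \
    .option("user", "<user>") \
    .option("password", "<password>") \
    .save()
```
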
From e67a4c87fdb9b725f0391500735e2da37fedac91 Mon Sep 17 00:00:00 2001
From: shivsood
Date: Wed, 17 Apr 2019 18:06:11 -0700
Subject: [PATCH 2/4] fixed the readme doc

---
 .../sql-big-data-cluster/spark/README.md | 31 ++++---------------
 1 file changed, 6 insertions(+), 25 deletions(-)

diff --git a/samples/features/sql-big-data-cluster/spark/README.md b/samples/features/sql-big-data-cluster/spark/README.md
index b81b1b592f..e2d7ac009a 100644
--- a/samples/features/sql-big-data-cluster/spark/README.md
+++ b/samples/features/sql-big-data-cluster/spark/README.md
@@ -1,17 +1,8 @@
 # SQL Server big data clusters
 
-The new built-in notebooks in Azure Data Studio enables data scientists and data engineers to run Python, R, or Scala code against the cluster.
+SQL Server Big Data Clusters bundle Spark and HDFS together with SQL Server. Azure Data Studio provides built-in notebooks that enable data scientists and data engineers to run Spark notebooks and jobs in Python, R, or Scala against the big data cluster. This folder contains sample notebooks on using Spark in a SQL Server big data cluster.
 
-## Instructions to open a notebook from Azure Data Studio
-
-1. Connect to the SQL Server Master instance in a big data cluster
-
-1. Right-click on the server name, select **Manage**, switch to the **SQL Server Big Data Cluster** tab, and use **Open Notebook**
-
-## __[dataloading](dataloading/)__
-<<<<<<< HEAD
-
-This folder contains samples that show how to load data using Spark.
+## Folder contents
 
 [PySpark Hello World](dataloading/hello_PySpark.ipynb)
 
 [Scala Hello World ](dataloading/hello_Scala.ipynb)
 
 [SparkR Hello World ](dataloading/hello_sparkR.ipynb)
 
 [DataLoading - Transforming CSV to Parquet](dataloading/transform-csv-files.ipynb/)
 
 [Data Transfer - Spark to SQL using JDBC ](spark_to_sql/spark_to_sql_jdbc.ipynb/)
-=======
-
-This folder contains samples that show how to load data using Spark.
-
-[dataloading/transform-csv-files.ipynb](dataloading/transform-csv-files.ipynb/)
->>>>>>> upstream/master
 
-## Instructions
+## Instructions on how to run in Azure Data Studio
 
 1. Download and save the notebook file [dataloading/transform-csv-files.ipynb](dataloading/transform-csv-files.ipynb/) locally.
 
-<<<<<<< HEAD
-2. Open the notebook in Azure Data Studio and wait for the “Kernel” and the target context (“Attach to”) to be populated. Set the “Kernel” to **PySpark3** and set **Attach to** to the IP address of your big data cluster endpoint.
+2. From Azure Data Studio, connect to the SQL Server master instance in a big data cluster.
 
-3. Run each cell in the Notebook sequentially.
-=======
-1. Open the notebook in Azure Data Studio, wait for the “Kernel” and the target context (“Attach to”) to be populated. Set the “Kernel” to **PySpark3** and **Attach to** needs to be the IP address of your big data cluster endpoint.
+3. Right-click on the server name, select **Manage**, switch to the **SQL Server Big Data Cluster** tab, and open the notebook in Azure Data Studio. Wait for the “Kernel” and the target context (“Attach to”) to be populated. If required, set the relevant “Kernel” (e.g. **PySpark3**) and set **Attach to** to the IP address of your big data cluster endpoint.
 
-1. Run each cell in the Notebook sequentially.
->>>>>>> upstream/master
+4. Run each cell in the Notebook sequentially.

From 720268a5d863b7e9835ab4f6006c1eb7819a6954 Mon Sep 17 00:00:00 2001
From: shivsood
Date: Tue, 23 Apr 2019 12:42:41 -0700
Subject: [PATCH 3/4] Read me fixes

---
 samples/features/sql-big-data-cluster/spark/README.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/samples/features/sql-big-data-cluster/spark/README.md b/samples/features/sql-big-data-cluster/spark/README.md
index c638796393..199ab595a4 100644
--- a/samples/features/sql-big-data-cluster/spark/README.md
+++ b/samples/features/sql-big-data-cluster/spark/README.md
@@ -12,7 +12,9 @@ SQL Server Big Data Clusters bundle Spark and HDFS together with SQL Server. 
Azu [DataLoading - Transforming CSV to Parquet](dataloading/transform-csv-files.ipynb/) -[Data Transfer - Spark to SQL using JDBC ](spark_to_sql/spark_to_sql_jdbc.ipynb/) +[Data Transfer - Spark to SQL using Spark JDBC connector](data-virtualization/spark_to_sql_jdbc.ipynb/) + +[Data Transfer - Spark to SQL using MSSQL Spark connector](spark_to_sql/mssql_spark_connector.ipynb/) ## Instructions on how to run in Azure Data Studio From 6d5bdc9b8ebb7403122ba88fc5f7732fa1418236 Mon Sep 17 00:00:00 2001 From: shivsood Date: Tue, 23 Apr 2019 12:46:23 -0700 Subject: [PATCH 4/4] cleaup of results, remove pass --- .../spark_to_sql/mssql_spark_connector.ipynb | 58 ++----------------- 1 file changed, 6 insertions(+), 52 deletions(-) diff --git a/samples/features/sql-big-data-cluster/spark/spark_to_sql/mssql_spark_connector.ipynb b/samples/features/sql-big-data-cluster/spark/spark_to_sql/mssql_spark_connector.ipynb index 2af3eb51fe..def8a13a1c 100644 --- a/samples/features/sql-big-data-cluster/spark/spark_to_sql/mssql_spark_connector.ipynb +++ b/samples/features/sql-big-data-cluster/spark/spark_to_sql/mssql_spark_connector.ipynb @@ -31,24 +31,7 @@ "cell_type": "code", "source": "%%configure -f\r\n{\"conf\": {\"spark.jars\": \"/jar/spark-mssql-connector-assembly-1.0.0.jar\"}}\r\n\r\n\r\n\r\n", "metadata": {}, - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/plain": "", - "text/html": "Current session configs: {'conf': {'spark.jars': '/jar/spark-mssql-connector-assembly-1.0.0.jar'}, 'kind': 'pyspark3'}
" - }, - "metadata": {} - }, - { - "output_type": "display_data", - "data": { - "text/plain": "", - "text/html": "No active sessions." - }, - "metadata": {} - } - ], + "outputs": [], "execution_count": 4 }, { @@ -60,31 +43,14 @@ "cell_type": "code", "source": "#Read a file and then write it to the SQL table\r\ndatafile = \"/spark_data/AdultCensusIncome.csv\"\r\ndf = spark.read.format('csv').options(header='true', inferSchema='true', ignoreLeadingWhiteSpace='true', ignoreTrailingWhiteSpace='true').load(datafile)\r\ndf.show(5)\r\n", "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": "Starting Spark application\n" - }, - { - "output_type": "stream", - "name": "stderr", - "text": "The code failed because of a fatal error:\n\tSession 98 unexpectedly reached final status 'error'. See logs:\nstdout: \n\nstderr: \n19/04/20 02:07:37 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n19/04/20 02:07:38 WARN deploy.DependencyUtils: Skip remote jar hdfs://mssql-master-pool-0.service-master-pool:9000/livy/rsc-jars/livy-api-0.5.33476.jar.\n19/04/20 02:07:38 WARN deploy.DependencyUtils: Skip remote jar hdfs://mssql-master-pool-0.service-master-pool:9000/livy/rsc-jars/livy-rsc-0.5.33476.jar.\n19/04/20 02:07:38 WARN deploy.DependencyUtils: Skip remote jar hdfs://mssql-master-pool-0.service-master-pool:9000/livy/rsc-jars/netty-all-4.1.17.Final.jar.\n19/04/20 02:07:38 WARN deploy.DependencyUtils: Skip remote jar hdfs://mssql-master-pool-0.service-master-pool:9000/livy/spark/datanucleus-api-jdo-3.2.6.jar.\n19/04/20 02:07:38 WARN deploy.DependencyUtils: Skip remote jar hdfs://mssql-master-pool-0.service-master-pool:9000/livy/spark/datanucleus-core-3.2.10.jar.\n19/04/20 02:07:38 WARN deploy.DependencyUtils: Skip remote jar hdfs://mssql-master-pool-0.service-master-pool:9000/livy/spark/datanucleus-rdbms-3.2.9.jar.\n19/04/20 02:07:38 WARN deploy.DependencyUtils: Skip remote jar hdfs://mssql-master-pool-0.service-master-pool:9000/spark/metrics-influxdb-1.1.8.jar.\n19/04/20 02:07:38 WARN deploy.DependencyUtils: Skip remote jar hdfs://mssql-master-pool-0.service-master-pool:9000/spark/spark-influx-sink-0.4.0.jar.\n19/04/20 02:07:38 WARN deploy.DependencyUtils: Skip remote jar hdfs://mssql-master-pool-0.service-master-pool:9000/jar/spark-mssql-connector-assembly-1.0.0.jar.\n19/04/20 02:07:38 WARN deploy.DependencyUtils: Skip remote jar hdfs://mssql-master-pool-0.service-master-pool:9000/livy/repl_2.11-jars/commons-codec-1.9.jar.\n19/04/20 02:07:38 WARN deploy.DependencyUtils: Skip remote jar hdfs://mssql-master-pool-0.service-master-pool:9000/livy/repl_2.11-jars/livy-core_2.11-0.5.33476.jar.\n19/04/20 02:07:38 WARN deploy.DependencyUtils: Skip remote jar hdfs://mssql-master-pool-0.service-master-pool:9000/livy/repl_2.11-jars/livy-repl_2.11-0.5.33476.jar.\n19/04/20 02:07:38 INFO client.RMProxy: Connecting to ResourceManager at mssql-master-pool-0.service-master-pool/10.244.1.8:8032\n19/04/20 02:07:38 INFO yarn.Client: Requesting a new application from cluster with 2 NodeManagers\n19/04/20 02:07:38 INFO yarn.Client: Verifying our application has not requested more than the maximum memory capability of the cluster (18432 MB per container)\n19/04/20 02:07:38 INFO yarn.Client: Will allocate AM container, with 2432 MB memory including 384 MB overhead\n19/04/20 02:07:38 INFO yarn.Client: Setting up container launch context for our AM\n19/04/20 02:07:38 INFO yarn.Client: Setting up the launch environment 
for our AM container\n19/04/20 02:07:38 INFO yarn.Client: Preparing resources for our AM container\n19/04/20 02:07:38 INFO yarn.Client: Source and destination file systems are the same. Not copying hdfs:/spark/spark_libs.zip\n19/04/20 02:07:38 INFO yarn.Client: Source and destination file systems are the same. Not copying hdfs://mssql-master-pool-0.service-master-pool:9000/livy/rsc-jars/livy-api-0.5.33476.jar\n19/04/20 02:07:38 INFO yarn.Client: Source and destination file systems are the same. Not copying hdfs://mssql-master-pool-0.service-master-pool:9000/livy/rsc-jars/livy-rsc-0.5.33476.jar\n19/04/20 02:07:38 INFO yarn.Client: Source and destination file systems are the same. Not copying hdfs://mssql-master-pool-0.service-master-pool:9000/livy/rsc-jars/netty-all-4.1.17.Final.jar\n19/04/20 02:07:38 INFO yarn.Client: Source and destination file systems are the same. Not copying hdfs://mssql-master-pool-0.service-master-pool:9000/livy/spark/datanucleus-api-jdo-3.2.6.jar\n19/04/20 02:07:38 INFO yarn.Client: Source and destination file systems are the same. Not copying hdfs://mssql-master-pool-0.service-master-pool:9000/livy/spark/datanucleus-core-3.2.10.jar\n19/04/20 02:07:38 INFO yarn.Client: Source and destination file systems are the same. Not copying hdfs://mssql-master-pool-0.service-master-pool:9000/livy/spark/datanucleus-rdbms-3.2.9.jar\n19/04/20 02:07:38 INFO yarn.Client: Source and destination file systems are the same. Not copying hdfs://mssql-master-pool-0.service-master-pool:9000/spark/metrics-influxdb-1.1.8.jar\n19/04/20 02:07:38 INFO yarn.Client: Source and destination file systems are the same. Not copying hdfs://mssql-master-pool-0.service-master-pool:9000/spark/spark-influx-sink-0.4.0.jar\n19/04/20 02:07:38 INFO yarn.Client: Source and destination file systems are the same. 
Not copying hdfs://mssql-master-pool-0.service-master-pool:9000/jar/spark-mssql-connector-assembly-1.0.0.jar\n19/04/20 02:07:38 INFO yarn.Client: Deleted staging directory hdfs://mssql-master-pool-0.service-master-pool:9000/user/root/.sparkStaging/application_1554316083160_0119\nException in thread \"main\" java.io.FileNotFoundException: File does not exist: hdfs://mssql-master-pool-0.service-master-pool:9000/jar/spark-mssql-connector-assembly-1.0.0.jar\n\tat org.apache.hadoop.hdfs.DistributedFileSystem$29.doCall(DistributedFileSystem.java:1533)\n\tat org.apache.hadoop.hdfs.DistributedFileSystem$29.doCall(DistributedFileSystem.java:1526)\n\tat org.apache.hadoop.fs.FileSystemLinkResolver.resolve(FileSystemLinkResolver.java:81)\n\tat org.apache.hadoop.hdfs.DistributedFileSystem.getFileStatus(DistributedFileSystem.java:1541)\n\tat org.apache.spark.deploy.yarn.ClientDistributedCacheManager$$anonfun$1.apply(ClientDistributedCacheManager.scala:71)\n\tat org.apache.spark.deploy.yarn.ClientDistributedCacheManager$$anonfun$1.apply(ClientDistributedCacheManager.scala:71)\n\tat scala.collection.MapLike$class.getOrElse(MapLike.scala:128)\n\tat scala.collection.AbstractMap.getOrElse(Map.scala:59)\n\tat org.apache.spark.deploy.yarn.ClientDistributedCacheManager.addResource(ClientDistributedCacheManager.scala:71)\n\tat org.apache.spark.deploy.yarn.Client.org$apache$spark$deploy$yarn$Client$$distribute$1(Client.scala:478)\n\tat org.apache.spark.deploy.yarn.Client$$anonfun$prepareLocalResources$10$$anonfun$apply$6.apply(Client.scala:598)\n\tat org.apache.spark.deploy.yarn.Client$$anonfun$prepareLocalResources$10$$anonfun$apply$6.apply(Client.scala:597)\n\tat scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)\n\tat scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)\n\tat org.apache.spark.deploy.yarn.Client$$anonfun$prepareLocalResources$10.apply(Client.scala:597)\n\tat org.apache.spark.deploy.yarn.Client$$anonfun$prepareLocalResources$10.apply(Client.scala:596)\n\tat scala.collection.immutable.List.foreach(List.scala:392)\n\tat org.apache.spark.deploy.yarn.Client.prepareLocalResources(Client.scala:596)\n\tat org.apache.spark.deploy.yarn.Client.createContainerLaunchContext(Client.scala:864)\n\tat org.apache.spark.deploy.yarn.Client.submitApplication(Client.scala:178)\n\tat org.apache.spark.deploy.yarn.Client.run(Client.scala:1134)\n\tat org.apache.spark.deploy.yarn.YarnClusterApplication.start(Client.scala:1526)\n\tat org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:849)\n\tat org.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:167)\n\tat org.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:195)\n\tat org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:86)\n\tat org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:924)\n\tat org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:933)\n\tat org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)\n19/04/20 02:07:38 INFO util.ShutdownHookManager: Shutdown hook called\n19/04/20 02:07:38 INFO util.ShutdownHookManager: Deleting directory /tmp/spark-57b2c705-b20a-46e7-8fa4-d4b3ef432906\n\nYARN Diagnostics: .\n\nSome things to try:\na) Make sure Spark has enough available resources for Jupyter to create a Spark context.\nb) Contact your Jupyter administrator to make sure the Spark magics library is configured correctly.\nc) Restart the kernel.\n" - } - ], + "outputs": [], "execution_count": 6 }, { "cell_type": 
"code", "source": "\r\n#Process this data. Very simple data cleanup steps. Replacing \"-\" with \"_\" in column names\r\ncolumns_new = [col.replace(\"-\", \"_\") for col in df.columns]\r\ndf = df.toDF(*columns_new)\r\ndf.show(5)\r\n\r\n", "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": "The code failed because of a fatal error:\n\tSession 96 unexpectedly reached final status 'error'. See logs:\nstdout: \n\nstderr: \n19/04/20 02:02:03 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n19/04/20 02:02:04 WARN deploy.DependencyUtils: Skip remote jar hdfs://mssql-master-pool-0.service-master-pool:9000/livy/rsc-jars/livy-api-0.5.33476.jar.\n19/04/20 02:02:04 WARN deploy.DependencyUtils: Skip remote jar hdfs://mssql-master-pool-0.service-master-pool:9000/livy/rsc-jars/livy-rsc-0.5.33476.jar.\n19/04/20 02:02:04 WARN deploy.DependencyUtils: Skip remote jar hdfs://mssql-master-pool-0.service-master-pool:9000/livy/rsc-jars/netty-all-4.1.17.Final.jar.\n19/04/20 02:02:04 WARN deploy.DependencyUtils: Skip remote jar hdfs://mssql-master-pool-0.service-master-pool:9000/livy/spark/datanucleus-api-jdo-3.2.6.jar.\n19/04/20 02:02:04 WARN deploy.DependencyUtils: Skip remote jar hdfs://mssql-master-pool-0.service-master-pool:9000/livy/spark/datanucleus-core-3.2.10.jar.\n19/04/20 02:02:04 WARN deploy.DependencyUtils: Skip remote jar hdfs://mssql-master-pool-0.service-master-pool:9000/livy/spark/datanucleus-rdbms-3.2.9.jar.\n19/04/20 02:02:04 WARN deploy.DependencyUtils: Skip remote jar hdfs://mssql-master-pool-0.service-master-pool:9000/spark/metrics-influxdb-1.1.8.jar.\n19/04/20 02:02:04 WARN deploy.DependencyUtils: Skip remote jar hdfs://mssql-master-pool-0.service-master-pool:9000/spark/spark-influx-sink-0.4.0.jar.\n19/04/20 02:02:04 WARN deploy.DependencyUtils: Skip remote jar hdfs://mssql-master-pool-0.service-master-pool:9000/jar/spark-mssql-connector-assembly-1.0.0.jar.\n19/04/20 02:02:04 WARN deploy.DependencyUtils: Skip remote jar hdfs://mssql-master-pool-0.service-master-pool:9000/livy/repl_2.11-jars/commons-codec-1.9.jar.\n19/04/20 02:02:04 WARN deploy.DependencyUtils: Skip remote jar hdfs://mssql-master-pool-0.service-master-pool:9000/livy/repl_2.11-jars/livy-core_2.11-0.5.33476.jar.\n19/04/20 02:02:04 WARN deploy.DependencyUtils: Skip remote jar hdfs://mssql-master-pool-0.service-master-pool:9000/livy/repl_2.11-jars/livy-repl_2.11-0.5.33476.jar.\n19/04/20 02:02:04 INFO client.RMProxy: Connecting to ResourceManager at mssql-master-pool-0.service-master-pool/10.244.1.8:8032\n19/04/20 02:02:04 INFO yarn.Client: Requesting a new application from cluster with 2 NodeManagers\n19/04/20 02:02:04 INFO yarn.Client: Verifying our application has not requested more than the maximum memory capability of the cluster (18432 MB per container)\n19/04/20 02:02:04 INFO yarn.Client: Will allocate AM container, with 2432 MB memory including 384 MB overhead\n19/04/20 02:02:04 INFO yarn.Client: Setting up container launch context for our AM\n19/04/20 02:02:04 INFO yarn.Client: Setting up the launch environment for our AM container\n19/04/20 02:02:04 INFO yarn.Client: Preparing resources for our AM container\n19/04/20 02:02:04 INFO yarn.Client: Source and destination file systems are the same. Not copying hdfs:/spark/spark_libs.zip\n19/04/20 02:02:04 INFO yarn.Client: Source and destination file systems are the same. 
Not copying hdfs://mssql-master-pool-0.service-master-pool:9000/livy/rsc-jars/livy-api-0.5.33476.jar\n19/04/20 02:02:04 INFO yarn.Client: Source and destination file systems are the same. Not copying hdfs://mssql-master-pool-0.service-master-pool:9000/livy/rsc-jars/livy-rsc-0.5.33476.jar\n19/04/20 02:02:04 INFO yarn.Client: Source and destination file systems are the same. Not copying hdfs://mssql-master-pool-0.service-master-pool:9000/livy/rsc-jars/netty-all-4.1.17.Final.jar\n19/04/20 02:02:04 INFO yarn.Client: Source and destination file systems are the same. Not copying hdfs://mssql-master-pool-0.service-master-pool:9000/livy/spark/datanucleus-api-jdo-3.2.6.jar\n19/04/20 02:02:04 INFO yarn.Client: Source and destination file systems are the same. Not copying hdfs://mssql-master-pool-0.service-master-pool:9000/livy/spark/datanucleus-core-3.2.10.jar\n19/04/20 02:02:04 INFO yarn.Client: Source and destination file systems are the same. Not copying hdfs://mssql-master-pool-0.service-master-pool:9000/livy/spark/datanucleus-rdbms-3.2.9.jar\n19/04/20 02:02:04 INFO yarn.Client: Source and destination file systems are the same. Not copying hdfs://mssql-master-pool-0.service-master-pool:9000/spark/metrics-influxdb-1.1.8.jar\n19/04/20 02:02:04 INFO yarn.Client: Source and destination file systems are the same. Not copying hdfs://mssql-master-pool-0.service-master-pool:9000/spark/spark-influx-sink-0.4.0.jar\n19/04/20 02:02:04 INFO yarn.Client: Source and destination file systems are the same. Not copying hdfs://mssql-master-pool-0.service-master-pool:9000/jar/spark-mssql-connector-assembly-1.0.0.jar\n19/04/20 02:02:04 INFO yarn.Client: Deleted staging directory hdfs://mssql-master-pool-0.service-master-pool:9000/user/root/.sparkStaging/application_1554316083160_0117\nException in thread \"main\" java.io.FileNotFoundException: File does not exist: hdfs://mssql-master-pool-0.service-master-pool:9000/jar/spark-mssql-connector-assembly-1.0.0.jar\n\tat org.apache.hadoop.hdfs.DistributedFileSystem$29.doCall(DistributedFileSystem.java:1533)\n\tat org.apache.hadoop.hdfs.DistributedFileSystem$29.doCall(DistributedFileSystem.java:1526)\n\tat org.apache.hadoop.fs.FileSystemLinkResolver.resolve(FileSystemLinkResolver.java:81)\n\tat org.apache.hadoop.hdfs.DistributedFileSystem.getFileStatus(DistributedFileSystem.java:1541)\n\tat org.apache.spark.deploy.yarn.ClientDistributedCacheManager$$anonfun$1.apply(ClientDistributedCacheManager.scala:71)\n\tat org.apache.spark.deploy.yarn.ClientDistributedCacheManager$$anonfun$1.apply(ClientDistributedCacheManager.scala:71)\n\tat scala.collection.MapLike$class.getOrElse(MapLike.scala:128)\n\tat scala.collection.AbstractMap.getOrElse(Map.scala:59)\n\tat org.apache.spark.deploy.yarn.ClientDistributedCacheManager.addResource(ClientDistributedCacheManager.scala:71)\n\tat org.apache.spark.deploy.yarn.Client.org$apache$spark$deploy$yarn$Client$$distribute$1(Client.scala:478)\n\tat org.apache.spark.deploy.yarn.Client$$anonfun$prepareLocalResources$10$$anonfun$apply$6.apply(Client.scala:598)\n\tat org.apache.spark.deploy.yarn.Client$$anonfun$prepareLocalResources$10$$anonfun$apply$6.apply(Client.scala:597)\n\tat scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)\n\tat scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)\n\tat org.apache.spark.deploy.yarn.Client$$anonfun$prepareLocalResources$10.apply(Client.scala:597)\n\tat org.apache.spark.deploy.yarn.Client$$anonfun$prepareLocalResources$10.apply(Client.scala:596)\n\tat 
scala.collection.immutable.List.foreach(List.scala:392)\n\tat org.apache.spark.deploy.yarn.Client.prepareLocalResources(Client.scala:596)\n\tat org.apache.spark.deploy.yarn.Client.createContainerLaunchContext(Client.scala:864)\n\tat org.apache.spark.deploy.yarn.Client.submitApplication(Client.scala:178)\n\tat org.apache.spark.deploy.yarn.Client.run(Client.scala:1134)\n\tat org.apache.spark.deploy.yarn.YarnClusterApplication.start(Client.scala:1526)\n\tat org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:849)\n\tat org.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:167)\n\tat org.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:195)\n\tat org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:86)\n\tat org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:924)\n\tat org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:933)\n\tat org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)\n19/04/20 02:02:04 INFO util.ShutdownHookManager: Shutdown hook called\n19/04/20 02:02:04 INFO util.ShutdownHookManager: Deleting directory /tmp/spark-fecb2af2-6fa9-4098-aeae-fc5a91bb022e\n\nYARN Diagnostics: .\n\nSome things to try:\na) Make sure Spark has enough available resources for Jupyter to create a Spark context.\nb) Contact your Jupyter administrator to make sure the Spark magics library is configured correctly.\nc) Restart the kernel.\n" - } - ], + "outputs": [], "execution_count": 8 }, { @@ -94,15 +60,9 @@ }, { "cell_type": "code", - "source": "#Write from Spark to SQL table using MSSQL Spark Connector\r\nprint(\"Use MSSQL connector to write to master SQL instance \")\r\n\r\nservername = \"jdbc:sqlserver://master-0.master-svc\"\r\ndbname = \"MyTestDatabase\"\r\nurl = servername + \";\" + \"databaseName=\" + dbname + \";\"\r\n\r\ndbtable = \"dbo.AdultCensus\"\r\nuser = \"sa\"\r\npassword = \"Yukon900\"\r\n\r\n\r\ntry:\r\n df.write \\\r\n .format(\"com.microsoft.sqlserver.jdbc.spark\") \\\r\n .mode(\"overwrite\") \\\r\n .option(\"url\", url) \\\r\n .option(\"dbtable\", dbtable) \\\r\n .option(\"user\", user) \\\r\n .option(\"password\", password)\\\r\n .save()\r\nexcept ValueError as error :\r\n print(\"MSSQL Connector write failed\", error)\r\n\r\nprint(\"MSSQL Connector write succeeded \")\r\n\r\n\r\n", + "source": "#Write from Spark to SQL table using MSSQL Spark Connector\r\nprint(\"Use MSSQL connector to write to master SQL instance \")\r\n\r\nservername = \"jdbc:sqlserver://master-0.master-svc\"\r\ndbname = \"MyTestDatabase\"\r\nurl = servername + \";\" + \"databaseName=\" + dbname + \";\"\r\n\r\ndbtable = \"dbo.AdultCensus\"\r\nuser = \"sa\"\r\npassword = \"****\" # Please specify password here\r\n\r\n\r\ntry:\r\n df.write \\\r\n .format(\"com.microsoft.sqlserver.jdbc.spark\") \\\r\n .mode(\"overwrite\") \\\r\n .option(\"url\", url) \\\r\n .option(\"dbtable\", dbtable) \\\r\n .option(\"user\", user) \\\r\n .option(\"password\", password)\\\r\n .save()\r\nexcept ValueError as error :\r\n print(\"MSSQL Connector write failed\", error)\r\n\r\nprint(\"MSSQL Connector write succeeded \")\r\n\r\n\r\n", "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": "Use build in JDBC connector to write to SQLServer master instance in Big data \nMSSQL Connector write succeeded" - } - ], + "outputs": [], "execution_count": 10 }, { @@ -114,13 +74,7 @@ "cell_type": "code", "source": "#Read from SQL table using MSSQ Connector\r\nprint(\"read data from SQL server table 
\")\r\njdbcDF = spark.read \\\r\n .format(\"com.microsoft.sqlserver.jdbc.spark\") \\\r\n .option(\"url\", url) \\\r\n .option(\"dbtable\", dbtable) \\\r\n .option(\"user\", user) \\\r\n .option(\"password\", password) \\\r\n .load()\r\n\r\njdbcDF.show(5)", "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": "read data from SQL server table \n+---+----------------+------+---------+-------------+------------------+-----------------+-------------+-----+------+------------+------------+--------------+--------------+------+\n|age| workclass|fnlwgt|education|education_num| marital_status| occupation| relationship| race| sex|capital_gain|capital_loss|hours_per_week|native_country|income|\n+---+----------------+------+---------+-------------+------------------+-----------------+-------------+-----+------+------------+------------+--------------+--------------+------+\n| 39| State-gov| 77516|Bachelors| 13| Never-married| Adm-clerical|Not-in-family|White| Male| 2174| 0| 40| United-States| <=50K|\n| 50|Self-emp-not-inc| 83311|Bachelors| 13|Married-civ-spouse| Exec-managerial| Husband|White| Male| 0| 0| 13| United-States| <=50K|\n| 38| Private|215646| HS-grad| 9| Divorced|Handlers-cleaners|Not-in-family|White| Male| 0| 0| 40| United-States| <=50K|\n| 53| Private|234721| 11th| 7|Married-civ-spouse|Handlers-cleaners| Husband|Black| Male| 0| 0| 40| United-States| <=50K|\n| 28| Private|338409|Bachelors| 13|Married-civ-spouse| Prof-specialty| Wife|Black|Female| 0| 0| 40| Cuba| <=50K|\n+---+----------------+------+---------+-------------+------------------+-----------------+-------------+-----+------+------------+------------+--------------+--------------+------+\nonly showing top 5 rows" - } - ], + "outputs": [], "execution_count": 11 } ]