In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import os

In [2]:
# Absolute locations of the SQL Server JDBC driver jar and of the native
# authentication DLL that ships with it (both resolved against the current
# working directory).
jar_path = os.path.abspath("./conector_dll/mssql-jdbc-13.2.1.jre11.jar")
dll_path = os.path.abspath("./conector_dll/mssql-jdbc_auth-13.2.1.x64.dll")

# Put the DLL's directory at the front of PATH -- presumably so the JVM can
# find the native library needed by integratedSecurity=true in the JDBC URL
# (TODO confirm). The guard keeps the cell idempotent: re-running it no longer
# prepends the same directory again, and a missing PATH no longer raises.
dll_dir = os.path.dirname(dll_path)
current_path = os.environ.get("PATH", "")
if dll_dir not in current_path.split(os.pathsep):
    os.environ["PATH"] = (dll_dir + os.pathsep + current_path) if current_path else dll_dir

In [3]:
# Create (or reuse) the Spark session for this notebook, handing Spark the
# JDBC driver jar through the "spark.jars" setting.
spark = (
    SparkSession.builder
    .appName("storeToClean")
    .config("spark.jars", jar_path)
    .getOrCreate()
)

In [4]:
# Relative path of the store CSV that the read cells below load.
csv_path = "./dataset_files/store.csv"

In [5]:
# First look at the raw file: read it with the header row as column names
# (no schema is supplied here) and preview the first five rows.
# `header` is passed as a real boolean instead of the string "True", matching
# the other reads in this notebook.
store_df = spark.read.csv(csv_path, header=True)
store_df.show(5)

+-----+---------+----------+-------------------+-------------------------+------------------------+------+---------------+---------------+---------------+
|Store|StoreType|Assortment|CompetitionDistance|CompetitionOpenSinceMonth|CompetitionOpenSinceYear|Promo2|Promo2SinceWeek|Promo2SinceYear|  PromoInterval|
+-----+---------+----------+-------------------+-------------------------+------------------------+------+---------------+---------------+---------------+
|    1|        c|         a|               1270|                        9|                    2008|     0|           NULL|           NULL|           NULL|
|    2|        a|         a|                570|                       11|                    2007|     1|             13|           2010|Jan,Apr,Jul,Oct|
|    3|        a|         a|              14130|                       12|                    2006|     1|             14|           2011|Jan,Apr,Jul,Oct|
|    4|        c|         c|                620|                      

In [6]:
# Count the columns of the raw DataFrame and print the total.
num_colunas = len(store_df.columns)
print("Número de colunas:", num_colunas)

Número de colunas: 10


In [7]:
# Explicit schema for store.csv. Every column is nullable. The identifier and
# category-like columns stay as strings, CompetitionDistance is a float and
# the remaining month/year/week/flag columns are integers.
Schema = (
    StructType()
    .add('Store', StringType(), True)
    .add('StoreType', StringType(), True)
    .add('Assortment', StringType(), True)
    .add('CompetitionDistance', FloatType(), True)
    .add('CompetitionOpenSinceMonth', IntegerType(), True)
    .add('CompetitionOpenSinceYear', IntegerType(), True)
    .add('Promo2', IntegerType(), True)
    .add('Promo2SinceWeek', IntegerType(), True)
    .add('Promo2SinceYear', IntegerType(), True)
    .add('PromoInterval', StringType(), True)
)

# Re-read the same CSV with the header row and the typed schema, then preview.
df = spark.read.csv(csv_path, header=True, schema=Schema)
df.show()

+-----+---------+----------+-------------------+-------------------------+------------------------+------+---------------+---------------+----------------+
|Store|StoreType|Assortment|CompetitionDistance|CompetitionOpenSinceMonth|CompetitionOpenSinceYear|Promo2|Promo2SinceWeek|Promo2SinceYear|   PromoInterval|
+-----+---------+----------+-------------------+-------------------------+------------------------+------+---------------+---------------+----------------+
|    1|        c|         a|             1270.0|                        9|                    2008|     0|           NULL|           NULL|            NULL|
|    2|        a|         a|              570.0|                       11|                    2007|     1|             13|           2010| Jan,Apr,Jul,Oct|
|    3|        a|         a|            14130.0|                       12|                    2006|     1|             14|           2011| Jan,Apr,Jul,Oct|
|    4|        c|         c|              620.0|                

In [8]:
# Display the (column name, type) pairs of the typed DataFrame.
df.dtypes

[('Store', 'string'),
 ('StoreType', 'string'),
 ('Assortment', 'string'),
 ('CompetitionDistance', 'float'),
 ('CompetitionOpenSinceMonth', 'int'),
 ('CompetitionOpenSinceYear', 'int'),
 ('Promo2', 'int'),
 ('Promo2SinceWeek', 'int'),
 ('Promo2SinceYear', 'int'),
 ('PromoInterval', 'string')]

In [9]:
# Read the CSV once more with the parser mode set to DROPMALFORMED.
# NOTE(review): no schema is passed on this read and `df_malformed` is not
# used by any later cell visible here -- confirm whether `Schema` was meant to
# be applied and whether this DataFrame is still needed.
df_malformed = spark.read.csv(csv_path, header=True, mode="DROPMALFORMED")

In [10]:
# Preview the data with nulls replaced by 0. The filled result is only shown,
# not assigned, so `df` itself still contains its nulls afterwards.
# NOTE(review): the write cell below therefore persists the unfilled `df` --
# confirm that is intended.
df.na.fill(0).show()

+-----+---------+----------+-------------------+-------------------------+------------------------+------+---------------+---------------+----------------+
|Store|StoreType|Assortment|CompetitionDistance|CompetitionOpenSinceMonth|CompetitionOpenSinceYear|Promo2|Promo2SinceWeek|Promo2SinceYear|   PromoInterval|
+-----+---------+----------+-------------------+-------------------------+------------------------+------+---------------+---------------+----------------+
|    1|        c|         a|             1270.0|                        9|                    2008|     0|              0|              0|            NULL|
|    2|        a|         a|              570.0|                       11|                    2007|     1|             13|           2010| Jan,Apr,Jul,Oct|
|    3|        a|         a|            14130.0|                       12|                    2006|     1|             14|           2011| Jan,Apr,Jul,Oct|
|    4|        c|         c|              620.0|                

In [None]:
# Persist `df` to SQL Server over JDBC, replacing the contents of
# dbo.ETLBasico ("overwrite" mode). The URL uses integratedSecurity=true, so
# no user name or password appears here.
(
    df.write
    .format("jdbc")
    .mode("overwrite")
    .options(
        url="jdbc:sqlserver://localhost:1433;databaseName=Datasets;integratedSecurity=true;encrypt=true;trustServerCertificate=true;",
        dbtable="dbo.ETLBasico",
        driver="com.microsoft.sqlserver.jdbc.SQLServerDriver",
    )
    .save()
)

Tentando gravar no SQL Server...
DADOS GRAVADOS COM SUCESSO!
