In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import os

In [None]:
# Absolute locations of the SQL Server JDBC driver jar and of the companion
# native authentication DLL, both kept in ./conector_dll next to the notebook.
jar_path = os.path.abspath("./conector_dll/mssql-jdbc-13.2.1.jre11.jar")
dll_path = os.path.abspath("./conector_dll/mssql-jdbc_auth-13.2.1.x64.dll")

# Put the DLL's folder at the front of PATH so the JVM launched later can load it.
# The membership check keeps the cell idempotent: re-running it no longer prepends
# the same folder again, and a missing PATH variable no longer raises KeyError.
dll_dir = os.path.dirname(dll_path)
current_path = os.environ.get("PATH", "")
if dll_dir not in current_path.split(os.pathsep):
    os.environ["PATH"] = dll_dir + os.pathsep + current_path if current_path else dll_dir

In [None]:
# Create the Spark session for this notebook, or reuse the one already running.
# `spark.jars` points Spark at the SQL Server JDBC driver jar located above.
spark = (
    SparkSession.builder
    .appName("storeToClean")
    .config("spark.jars", jar_path)
    .getOrCreate()
)

In [None]:
# Relative path of the store CSV that both read cells below load.
csv_path = "./dataset_files/store.csv"

In [None]:
# Untyped preview of the raw file: no schema is supplied here, so Spark reads
# every column as a string. `header` is passed as a real boolean, matching the
# typed read further down, instead of the string "True".
store_df = spark.read.csv(csv_path, header=True)
store_df.show(5)

In [None]:
# Explicit schema for store.csv: one nullable field per column, in file order.
Schema = StructType([
    StructField(column_name, column_type, nullable=True)
    for column_name, column_type in (
        ('Store', StringType()),
        ('StoreType', StringType()),
        ('Assortment', StringType()),
        ('CompetitionDistance', FloatType()),
        ('CompetitionOpenSinceMonth', IntegerType()),
        ('CompetitionOpenSinceYear', IntegerType()),
        ('Promo2', IntegerType()),
        ('Promo2SinceWeek', IntegerType()),
        ('Promo2SinceYear', IntegerType()),
        ('PromoInterval', StringType()),
    )
])

# Typed read of the same CSV. NOTE: mode DROPMALFORMED silently discards any row
# that cannot be parsed with the schema above, so the row count may be lower than
# in the untyped preview.
df = (
    spark.read
    .option("header", True)
    .option("mode", "DROPMALFORMED")
    .schema(Schema)
    .csv(csv_path)
)
df.show()

In [None]:
# Display the (column name, type) pairs of the typed frame to confirm the schema was applied.
df.dtypes

In [None]:
df.fillna(value=0).show()

In [None]:
df.write \
    .format("jdbc") \
    .mode("overwrite") \
    .option("url", "jdbc:sqlserver://localhost:1433;databaseName=Datasets;integratedSecurity=true;encrypt=true;trustServerCertificate=true;") \
    .option("dbtable", "dbo.ETLBasico") \
    .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver") \
    .save()