### step 1. configuration and lib imports

In [0]:
# Register the two text widgets that carry the source-file URLs into the notebook.
# The second argument ("url") is the default value each widget starts with.
for widget_name in ("csv_url", "par_url"):
    dbutils.widgets.text(widget_name, "url")

In [0]:
# Read the current widget values: the CSV URL first, then the parquet URL.
csv_file, parquet_file = (
    dbutils.widgets.get(widget_name) for widget_name in ("csv_url", "par_url")
)

In [0]:
import pandas
import requests
from io import StringIO, BytesIO

from pyspark.sql.functions import lit, col, when


### step 2. fetching files from online storage 
_(files can't be imported directly on the free tier of Databricks, so this is an alternative)_

In [0]:
# Download both source files over HTTP.
# - timeout: requests waits indefinitely by default, so a stalled server would
#   hang the notebook; fail after a bounded wait instead.
# - raise_for_status(): stop here on a 4xx/5xx answer rather than handing an
#   error page to the CSV/parquet parsers in the next cell.
REQUEST_TIMEOUT_SECONDS = 60

csv_response = requests.get(csv_file, timeout=REQUEST_TIMEOUT_SECONDS)
csv_response.raise_for_status()

par_response = requests.get(parquet_file, timeout=REQUEST_TIMEOUT_SECONDS)
par_response.raise_for_status()

In [0]:
# Parse the downloaded payloads with pandas: the CSV from the decoded response
# text, the parquet file from the raw response bytes.
csv_buffer = StringIO(csv_response.text)
parquet_buffer = BytesIO(par_response.content)

pandas_csv = pandas.read_csv(csv_buffer, sep=',', header=0)
pandas_parquet = pandas.read_parquet(parquet_buffer)

### step 3. Creating databricks dataframe

In [0]:
# Turn the pandas CSV frame into a Spark DataFrame.
df_csv = spark.createDataFrame(data=pandas_csv)

In [0]:
# Anonymize merchant names (requested in the Data Dictionary): every row gets
# the same constant string in the merchant_name column.
anonymized_name = lit('Merchant')
df_csv = df_csv.withColumn('merchant_name', anonymized_name)

In [0]:
# Turn the pandas parquet frame into a Spark DataFrame.
df_par = spark.createDataFrame(data=pandas_parquet)

In [0]:
# Show at most 10 rows of the CSV-based frame.
csv_preview = df_csv.limit(10)
display(csv_preview)

In [0]:
# Show at most 10 rows of the parquet-based frame.
parquet_preview = df_par.limit(10)
display(parquet_preview)

### step 4. CSV data quality checks

In [0]:
# Duplicate check: count rows per merchant_id, keep only the ids that occur
# more than once, and list the most frequent ones first.
rows_per_merchant = df_csv.groupBy(df_csv.merchant_id).count()
duplicate_rows_csv = (
    rows_per_merchant
    .where(col("count") > 1)
    .orderBy(col("count").desc())
)
display(duplicate_rows_csv.limit(10))

In [0]:
# Label every CSV row as "clean" or "duplicate" depending on whether its
# merchant_id appears in duplicate_rows_csv.
#
# duplicate_rows_csv is derived from df_csv, so both frames expose the same
# underlying merchant_id attribute. Referring to duplicate_rows_csv.merchant_id
# after joining the two is an ambiguous self-join reference: Spark may reject
# it, or resolve it to the left-hand column, in which case the isNull() test
# would no longer look at the joined side. Giving the right-hand key its own
# name and addressing columns by name removes the ambiguity.
duplicate_keys = duplicate_rows_csv.select(
    col("merchant_id").alias("duplicate_merchant_id")
)

cleaned_df_csv = (
    df_csv
    .join(duplicate_keys, col("merchant_id") == col("duplicate_merchant_id"), "left")
    # Left join: no match on the right means the merchant_id was not flagged.
    # NOTE(review): "quailty_status" is misspelled but kept as-is, since later
    # cells may read the column under this name.
    .withColumn(
        "quailty_status",
        when(col("duplicate_merchant_id").isNull(), lit("clean")).otherwise(lit("duplicate")),
    )
    # Drop the helper key so the result is df_csv's columns plus quailty_status.
    .drop("duplicate_merchant_id")
)
display(cleaned_df_csv.limit(10))

### step 5. Parquet data quality checks

In [0]:
# Consistency checks on the parquet data: list the distinct values of the
# flag/category columns, then show the largest state_id.
for column_name in ("authorized_flag", "category"):
    display(df_par.select(column_name).distinct())

display(df_par.selectExpr("max(state_id)"))