In [0]:
import pyspark.sql.types as T
import pyspark.sql.functions as F

In [0]:
def exec_eda(df):
    """Print a quick exploratory summary of a Spark DataFrame.

    Output, in order:
      1. Row counts: total / distinct / rows surviving ``dropna(how='all')``.
      2. The schema (``printSchema``).
      3. The first 5 rows (``show(5)``).
      4. ``describe()`` statistics.
      5. For every Integer/Float/Double/Long column, the number of rows that
         fall outside the 1.5 * IQR fences built from approximate quartiles
         (relative error 0.05).

    Args:
        df: Spark DataFrame to profile. The frame is scanned several times
            (each count, ``describe`` and the quantile/outlier passes are
            separate Spark jobs).

    Returns:
        None. Everything is printed or displayed.
    """
    numerical_cols = [
        f.name
        for f in df.schema.fields
        if isinstance(f.dataType, (T.IntegerType, T.FloatType, T.DoubleType, T.LongType))
    ]

    # dropna(how='all') only drops rows in which *every* column is null, so the
    # third figure is "rows with at least one non-null value".
    print(f" Total rows/ distinct rows / not null rows: {df.count()}/{df.dropDuplicates().count()}/{df.dropna(how='all').count()}")
    df.printSchema()
    df.show(5)
    # NOTE(review): .display() is not part of the open-source PySpark DataFrame
    # API; presumably this notebook runs on Databricks -- confirm.
    df.describe().display()

    if not numerical_cols:
        return

    # A single approxQuantile call over all numeric columns replaces one Spark
    # job per column; the result is one [Q1, Q3] list per input column.
    all_quantiles = df.approxQuantile(numerical_cols, [0.25, 0.75], 0.05)

    fenced_cols = []
    outlier_counters = []
    for col_name, quantiles in zip(numerical_cols, all_quantiles):
        # Columns for which no quantiles come back are skipped silently,
        # as in the per-column version of this loop.
        if len(quantiles) != 2:
            continue
        q1, q3 = quantiles
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        is_outlier = (F.col(col_name) < lower_bound) | (F.col(col_name) > upper_bound)
        # count() ignores nulls, so this counts exactly the rows where the
        # condition is true -- the same rows a filter(...).count() would keep.
        fenced_cols.append(col_name)
        outlier_counters.append(F.count(F.when(is_outlier, 1)))

    if not fenced_cols:
        return

    # All outlier counts are computed in one aggregation (one scan of the data)
    # and read back positionally, so column names never need to be valid aliases.
    counts_row = df.agg(*outlier_counters).first()
    for col_name, outlier_count in zip(fenced_cols, counts_row):
        print(f"Outliers in {col_name}: {outlier_count}")

In [0]:
def compare_schemas(schema1, schema2):
    """Summarise how two schemas differ by column name and column type.

    Args:
        schema1: Schema object exposing ``fields``; each field must provide
            ``name`` and ``dataType`` (e.g. a Spark ``StructType``).
        schema2: Second schema, same shape as ``schema1``.

    Returns:
        dict with three keys:
          - ``"only_in_schema1"``: sorted list of column names found only in
            ``schema1``.
          - ``"only_in_schema2"``: sorted list of column names found only in
            ``schema2``.
          - ``"type_mismatches"``: dict mapping each column present in both
            schemas whose ``dataType`` differs to a
            ``(str(type_in_schema1), str(type_in_schema2))`` tuple.

        Differences in nullability or field metadata are not reported.
    """
    types1 = {field.name: field.dataType for field in schema1.fields}
    types2 = {field.name: field.dataType for field in schema2.fields}

    # Two schemas can share every column name and still compare unequal because
    # a column changed type; surface that instead of returning two empty lists.
    type_mismatches = {
        name: (str(types1[name]), str(types2[name]))
        for name in sorted(types1.keys() & types2.keys())
        if types1[name] != types2[name]
    }

    # sorted() keeps the printed result stable between runs (set order is not).
    return {
        "only_in_schema1": sorted(types1.keys() - types2.keys()),
        "only_in_schema2": sorted(types2.keys() - types1.keys()),
        "type_mismatches": type_mismatches,
    }

In [0]:
# All monthly extracts sit under the same S3 prefix; keeping it in one constant
# means the bucket/location only has to be edited in a single place.
S3_RAW_FHVHV_PREFIX = "s3://case-tecnico-ifood/raw/fhvhv"

s3_path_202301 = f"{S3_RAW_FHVHV_PREFIX}/2023-01"
s3_path_202302 = f"{S3_RAW_FHVHV_PREFIX}/2023-02"
s3_path_202303 = f"{S3_RAW_FHVHV_PREFIX}/2023-03"
s3_path_202304 = f"{S3_RAW_FHVHV_PREFIX}/2023-04"
s3_path_202305 = f"{S3_RAW_FHVHV_PREFIX}/2023-05"

In [0]:
# Read each monthly parquet location into its own DataFrame, in month order.
(
    df_202301,
    df_202302,
    df_202303,
    df_202304,
    df_202305,
) = (
    spark.read.parquet(month_path)
    for month_path in (
        s3_path_202301,
        s3_path_202302,
        s3_path_202303,
        s3_path_202304,
        s3_path_202305,
    )
)

In [0]:
# Use the 2023-01 schema as the reference and check every later month against it.
reference_schema = df_202301.schema
later_months = (df_202302, df_202303, df_202304, df_202305)

if all(month_df.schema == reference_schema for month_df in later_months):
    print("Schemas are the same")
else:
    print("Schemas are different")
    # One diff per later month, always relative to the 2023-01 schema.
    for month_df in later_months:
        print(compare_schemas(reference_schema, month_df.schema))

In [0]:
# EDA report (counts, schema, sample rows, describe, IQR outlier counts) for the 2023-01 data.
exec_eda(df_202301)

In [0]:
# EDA report (counts, schema, sample rows, describe, IQR outlier counts) for the 2023-02 data.
exec_eda(df_202302)

In [0]:
# EDA report (counts, schema, sample rows, describe, IQR outlier counts) for the 2023-03 data.
exec_eda(df_202303)

In [0]:
# EDA report (counts, schema, sample rows, describe, IQR outlier counts) for the 2023-04 data.
exec_eda(df_202304)

In [0]:
# EDA report (counts, schema, sample rows, describe, IQR outlier counts) for the 2023-05 data.
exec_eda(df_202305)