In [91]:
from pyspark.sql import SparkSession

In [92]:
spark = SparkSession \
    .builder \
    .appName("Unifica_Arquivos_CSV") \
    .getOrCreate()
    

In [124]:
from airflow.models import DAG
from pyspark.sql import SparkSession
from datetime import datetime, timedelta
from airflow.operators.python_operator import PythonOperator
from airflow.operators.dummy_operator import DummyOperator
import os
from airflow.utils.dates import days_ago
from pyspark.sql.functions import col, year, month, to_timestamp, count


In [94]:
# Define base do fs do projeto
base_path = '/opt/airflow/dags/desafio-data-engineer/'
# Define base do fs dos arquivos de input e output
relative_path_input = os.path.join(base_path,'datalake', 'raw','covid19')
relative_path_output = os.path.join(base_path, 'datalake', 'trusted')

# Para os arquivos de input, geera os respectivos dfs
path_confirmados = os.path.join(relative_path_input,'time_series_covid19_confirmed_global.csv')
df_confirmados = spark.read.csv(path_confirmados, header=True, inferSchema=True)
path_mortes = os.path.join(relative_path_input,'time_series_covid19_deaths_global.csv')
df_mortes = spark.read.csv(path_mortes, header=True, inferSchema=True)
path_recuperados = os.path.join(relative_path_input,'time_series_covid19_recovered_global.csv')
df_recuperados = spark.read.csv(path_recuperados, header=True, inferSchema=True)

In [135]:
df_confirmados.summary().show()

23/07/18 21:00:56 WARN DAGScheduler: Broadcasting large task binary with size 1397.5 KiB
23/07/18 21:01:04 WARN DAGScheduler: Broadcasting large task binary with size 3.8 MiB
                                                                                

+-------+--------------+--------------+------------------+-----------------+------------------+-----------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+-----------------+-----------------+-----------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+------------------+-----------------+------------------+------------------+---------------+------------------+------------------+------------------+------------------+-----------------+------------------+----------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+-----------------+-----------------+-----------------+------------------+------------------+-------------

In [129]:
print((df_confirmados.count(), len(df_confirmados.columns)))

(275, 481)


In [136]:
df_mortes.summary().show()

23/07/18 21:01:09 WARN DAGScheduler: Broadcasting large task binary with size 1397.5 KiB
23/07/18 21:01:12 WARN DAGScheduler: Broadcasting large task binary with size 3.8 MiB
                                                                                

+-------+--------------+--------------+------------------+-----------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+------------------+-------------------+------------------+------------------+------------------+------------------+-----------------+------------------+-----------------+------------------+------------------+-----------------+------------------+------------------+-----------------+-----------------+-----------------+-----------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+-----------------+----------------

In [130]:
print((df_mortes.count(), len(df_mortes.columns)))

(275, 481)


In [137]:
df_recuperados.summary().show()

23/07/18 21:01:18 WARN DAGScheduler: Broadcasting large task binary with size 1397.5 KiB
23/07/18 21:01:20 WARN DAGScheduler: Broadcasting large task binary with size 3.8 MiB
[Stage 485:>                                                        (0 + 1) / 1]

+-------+--------------+--------------+------------------+------------------+-------------------+-------------------+------------------+-------------------+------------------+------------------+------------------+-------------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+-----------------+------------------+------------------+------------------+------------------+-----------------+------------------+-----------------+-----------------+------------------+------------------+-----------------+------------------+-----------------+-----------------+-----------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+------------------

                                                                                

In [131]:
print((df_recuperados.count(), len(df_recuperados.columns)))

(260, 481)


In [95]:
from pyspark.sql.functions import array, col, explode, struct, lit

def ajusta_df(df, by, kname, vname):

    # Filter dtypes and split into column names and type description
    cols, dtypes = zip(*((c, t) for (c, t) in df.dtypes if c not in by))
    # Spark SQL supports only homogeneous columns
    assert len(set(dtypes)) == 1, "All columns have to be of the same type"

    # Create and explode an array of (column_name, column_value) structs
    kvs = explode(array([
      struct(lit(c).alias(kname), col(c).alias(vname)) for c in cols
    ])).alias("kvs")

    return df.select(by + [kvs]).select(by + [f"kvs.{kname}", f"kvs.{vname}"])
   

In [96]:
df_confirmados_aj = ajusta_df(df_confirmados, \
                ["Province/State", "Country/Region", "Lat", "Long"], \
                "data_t", \
                "quantidade_confirmados") \
                .na.fill(value="-1",subset=["Province/State"])\
                .na.fill(value=-1,subset=["Lat", "Long"])
                     
df_mortes_aj = ajusta_df(df_mortes, \
                ["Province/State", "Country/Region", "Lat", "Long"], \
                "data_t", \
                "quantidade_mortes") \
                .na.fill(value="-1",subset=["Province/State"])\
                .na.fill(value=-1,subset=["Lat", "Long"])
df_recuperados_aj = ajusta_df(df_recuperados, \
                ["Province/State", "Country/Region", "Lat", "Long"], \
                "data_t", \
                "quantidade_recuperados") \
                .na.fill(value="-1",subset=["Province/State"])\
                .na.fill(value=-1,subset=["Lat", "Long"])

In [98]:
df_confirmados_aj.summary().show()

[Stage 307:>                                                        (0 + 1) / 1]

+-------+--------------+--------------+------------------+------------------+------+----------------------+
|summary|Province/State|Country/Region|               Lat|              Long|data_t|quantidade_confirmados|
+-------+--------------+--------------+------------------+------------------+------+----------------------+
|  count|        131175|        131175|            131175|            131175|131175|                131175|
|   mean|          -1.0|          null|20.378187454548296|22.853393141819883|  null|    178641.85499523536|
| stddev|           0.0|          null| 25.12360910363382| 73.22238037277491|  null|    1223316.1245162636|
|    min|            -1|   Afghanistan|          -51.7963|         -178.1165|1/1/21|                     0|
|    25%|          -1.0|          null|            4.5709|          -19.0208|  null|                    81|
|    50%|          -1.0|          null|         21.521757|         20.902977|  null|                  1291|
|    75%|          -1.0|    

                                                                                

In [128]:
print((df_confirmados_aj.count(), len(df_confirmados_aj.columns)))

(131175, 6)


In [99]:
df_mortes_aj.summary().show()

[Stage 310:>                                                        (0 + 1) / 1]

+-------+--------------+--------------+------------------+------------------+------+------------------+
|summary|Province/State|Country/Region|               Lat|              Long|data_t| quantidade_mortes|
+-------+--------------+--------------+------------------+------------------+------+------------------+
|  count|        131175|        131175|            131175|            131175|131175|            131175|
|   mean|          -1.0|          null|20.378187454548296|22.853393141819883|  null| 4311.777091671432|
| stddev|           0.0|          null| 25.12360910363382| 73.22238037277491|  null|24884.883496583665|
|    min|            -1|   Afghanistan|          -51.7963|         -178.1165|1/1/21|                 0|
|    25%|          -1.0|          null|            4.5709|          -19.0208|  null|                 1|
|    50%|          -1.0|          null|         21.521757|         20.902977|  null|                20|
|    75%|          -1.0|          null|           41.1129|      

                                                                                

In [100]:
df_recuperados_aj.summary().show()

[Stage 313:>                                                        (0 + 1) / 1]

+-------+--------------+--------------+------------------+-----------------+------+----------------------+
|summary|Province/State|Country/Region|               Lat|             Long|data_t|quantidade_recuperados|
+-------+--------------+--------------+------------------+-----------------+------+----------------------+
|  count|        124020|        124020|            124020|           124020|124020|                124020|
|   mean|          -1.0|          null|19.025420719234248|28.38237537693291|  null|    110515.60972423802|
| stddev|           0.0|          null|24.577543028073567| 70.6865522696299|  null|     657530.0087847405|
|    min|            -1|   Afghanistan|          -51.7963|        -178.1165|1/1/21|                     0|
|    25%|          -1.0|          null|            4.5353|        -9.429499|  null|                    29|
|    50%|          -1.0|          null|           19.3133|          23.8813|  null|                   923|
|    75%|          -1.0|          nul

                                                                                

In [116]:
merged_df = df_confirmados_aj.join(df_mortes_aj, \
                                   ["Province/State", "Country/Region", "Lat", "Long", "data_t"], \
                                    how="full") \
                                .join(df_recuperados_aj, \
                                    ["Province/State", "Country/Region", "Lat", "Long", "data_t"], \
                                    how="full")

#merged_df.select("*").sort("data_t", "Country/Region", "Province/State").where(col("data_t")=="1/22/20").show()

# eqNullSafe

In [132]:
merged_df = df_confirmados_aj.join(df_recuperados_aj, \
                                   ["Province/State", "Country/Region", "Lat", "Long", "data_t"], \
                                    how="full")

#merged_df.select("*").sort("data_t", "Country/Region", "Province/State").where(col("data_t")=="1/22/20").show()

# eqNullSafe

In [134]:
merged_df.show()

+--------------+--------------+--------+---------+-------+----------------------+----------------------+
|Province/State|Country/Region|     Lat|     Long| data_t|quantidade_confirmados|quantidade_recuperados|
+--------------+--------------+--------+---------+-------+----------------------+----------------------+
|            -1|   Afghanistan|33.93911|67.709953| 1/1/21|                 51526|                 41727|
|            -1|   Afghanistan|33.93911|67.709953|1/10/21|                 53489|                 43948|
|            -1|   Afghanistan|33.93911|67.709953|1/11/21|                 53538|                 44137|
|            -1|   Afghanistan|33.93911|67.709953|1/12/21|                 53584|                 44608|
|            -1|   Afghanistan|33.93911|67.709953|1/13/21|                 53584|                 44850|
|            -1|   Afghanistan|33.93911|67.709953|1/15/21|                 53831|                 45434|
|            -1|   Afghanistan|33.93911|67.709953|1/18/

In [133]:
merged_df.summary().show()



+-------+--------------+--------------+------------------+------------------+------+----------------------+----------------------+
|summary|Province/State|Country/Region|               Lat|              Long|data_t|quantidade_confirmados|quantidade_recuperados|
+-------+--------------+--------------+------------------+------------------+------+----------------------+----------------------+
|  count|        134037|        134037|            134037|            134037|134037|                131175|                124020|
|   mean|          -1.0|          null| 20.42409911031998|23.512841330959787|  null|    178641.85499523536|    110515.60972423802|
| stddev|           0.0|          null|25.163794555234247| 73.51623563186928|  null|    1223316.1245162508|      657530.008784744|
|    min|            -1|   Afghanistan|          -51.7963|         -178.1165|1/1/21|                     0|                     0|
|    25%|          -1.0|          null|            4.5709|          -15.3101|  null

                                                                                

In [117]:
merged_df.summary().show()

23/07/18 20:46:20 WARN DAGScheduler: Broadcasting large task binary with size 1150.9 KiB
[Stage 404:>                                                        (0 + 2) / 2]

+-------+--------------+--------------+------------------+------------------+------+----------------------+-----------------+----------------------+
|summary|Province/State|Country/Region|               Lat|              Long|data_t|quantidade_confirmados|quantidade_mortes|quantidade_recuperados|
+-------+--------------+--------------+------------------+------------------+------+----------------------+-----------------+----------------------+
|  count|        134037|        134037|            134037|            134037|134037|                131175|           131175|                124020|
|   mean|          -1.0|          null| 20.42409911031998|23.512841330959787|  null|    178641.85499523536|4311.777091671432|    110515.60972423802|
| stddev|           0.0|          null|25.163794555234247| 73.51623563186928|  null|    1223316.1245162508| 24884.8834965837|      657530.008784744|
|    min|            -1|   Afghanistan|          -51.7963|         -178.1165|1/1/21|                     0

23/07/18 20:46:22 WARN DAGScheduler: Broadcasting large task binary with size 1184.8 KiB
                                                                                

In [118]:
merged_df = merged_df.select("*") \
    .withColumnRenamed("Country/Region","pais") \
    .withColumnRenamed("Province/State","estado") \
    .withColumnRenamed("Lat","latitude") \
    .withColumnRenamed("Long","longitude") \
    .withColumn("data", to_timestamp(col("data_t"),"M/d/yy")) \
    .drop("data_t") \
    .withColumn("ano",year(col("data"))) \
    .withColumn("mes",month(col("data")))

In [119]:
merged_df = merged_df.selectExpr("pais", \
                            "estado", \
                            "latitude", \
                            "longitude", \
                            "data", \
                            "cast(quantidade_confirmados as long) quantidade_confirmados", \
                            "cast(quantidade_mortes as long) quantidade_mortes", \
                            "cast(quantidade_recuperados as long) quantidade_recuperados", \
                            "ano",
                            "mes"
                            )

In [127]:
merged_df.selectExpr("pais", "estado", "latitude", "longitude", "data") \
    .groupBy("pais", "estado", "latitude", "longitude", "data") \
    .agg(count("*").alias('count')) \
    .filter(col('count')==1) \
    .show()

+-----------------+------+---------+---------+-------------------+-----+
|             pais|estado| latitude|longitude|               data|count|
+-----------------+------+---------+---------+-------------------+-----+
|        Argentina|    -1| -38.4161| -63.6167|2021-01-02 00:00:00|    1|
|           Belize|    -1|  17.1899| -88.4976|2020-01-22 00:00:00|    1|
|            Burma|    -1|  21.9162|   95.956|2020-01-25 00:00:00|    1|
|            Chile|    -1| -35.6751|  -71.543|2021-01-09 00:00:00|    1|
|         Colombia|    -1|   4.5709| -74.2973|2020-01-25 00:00:00|    1|
|          Czechia|    -1|  49.8175|   15.473|2020-10-25 00:00:00|    1|
|          Ecuador|    -1|  -1.8312| -78.1834|2020-10-15 00:00:00|    1|
|            Egypt|    -1|26.820553|30.802498|2021-01-04 00:00:00|    1|
|      El Salvador|    -1|  13.7942| -88.8965|2021-01-01 00:00:00|    1|
|Equatorial Guinea|    -1|   1.6508|  10.2679|2021-01-08 00:00:00|    1|
|           Gambia|    -1|  13.4432| -15.3101|2020-

In [109]:
df_confirmados_aj.selectExpr("max(to_date(data_t,'M/d/yy'))","min(to_date(data_t,'M/d/yy'))").show()

+----------------------------+----------------------------+
|max(to_date(data_t, M/d/yy))|min(to_date(data_t, M/d/yy))|
+----------------------------+----------------------------+
|                  2021-05-12|                  2020-01-22|
+----------------------------+----------------------------+



In [110]:
df_mortes_aj.selectExpr("max(to_date(data_t,'M/d/yy'))","min(to_date(data_t,'M/d/yy'))").show()

+----------------------------+----------------------------+
|max(to_date(data_t, M/d/yy))|min(to_date(data_t, M/d/yy))|
+----------------------------+----------------------------+
|                  2021-05-12|                  2020-01-22|
+----------------------------+----------------------------+



In [111]:
df_recuperados_aj.selectExpr("max(to_date(data_t,'M/d/yy'))","min(to_date(data_t,'M/d/yy'))").show()

+----------------------------+----------------------------+
|max(to_date(data_t, M/d/yy))|min(to_date(data_t, M/d/yy))|
+----------------------------+----------------------------+
|                  2021-05-12|                  2020-01-22|
+----------------------------+----------------------------+



In [86]:
merged_df.printSchema()

root
 |-- pais: string (nullable = true)
 |-- estado: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- data: timestamp (nullable = true)
 |-- quantidade_confirmados: long (nullable = true)
 |-- quantidade_mortes: long (nullable = true)
 |-- quantidade_recuperados: long (nullable = true)
 |-- ano: integer (nullable = true)
 |-- mes: integer (nullable = true)



In [89]:
merged_df.show()

23/07/18 20:29:04 WARN DAGScheduler: Broadcasting large task binary with size 1144.6 KiB


+-----------+------+--------+---------+-------------------+----------------------+-----------------+----------------------+----+---+
|       pais|estado|latitude|longitude|               data|quantidade_confirmados|quantidade_mortes|quantidade_recuperados| ano|mes|
+-----------+------+--------+---------+-------------------+----------------------+-----------------+----------------------+----+---+
|Afghanistan|    -1|33.93911|67.709953|2021-01-01 00:00:00|                 51526|             2191|                 41727|2021|  1|
|Afghanistan|    -1|33.93911|67.709953|2021-01-10 00:00:00|                 53489|             2277|                 43948|2021|  1|
|Afghanistan|    -1|33.93911|67.709953|2021-01-11 00:00:00|                 53538|             2288|                 44137|2021|  1|
|Afghanistan|    -1|33.93911|67.709953|2021-01-12 00:00:00|                 53584|             2301|                 44608|2021|  1|
|Afghanistan|    -1|33.93911|67.709953|2021-01-13 00:00:00|          

                                                                                

In [90]:
merged_df.summary().show()

23/07/18 20:29:39 WARN DAGScheduler: Broadcasting large task binary with size 1158.1 KiB
23/07/18 20:29:41 WARN DAGScheduler: Broadcasting large task binary with size 1194.6 KiB


+-------+-----------+--------+------------------+------------------+----------------------+-----------------+----------------------+------------------+------------------+
|summary|       pais|  estado|          latitude|         longitude|quantidade_confirmados|quantidade_mortes|quantidade_recuperados|               ano|               mes|
+-------+-----------+--------+------------------+------------------+----------------------+-----------------+----------------------+------------------+------------------+
|  count|     134037|  134037|            134037|            134037|                131175|           131175|                124020|            134037|            134037|
|   mean|       null|    -1.0| 20.42409911031998|23.512841330959787|    178641.85499523536|4311.777091671432|    110515.60972423802|2020.2767295597484|5.7085953878406706|
| stddev|       null|     0.0|25.163794555234247| 73.51623563186928|    1223316.1245162508| 24884.8834965837|      657530.008784744|0.44738328507

                                                                                

In [88]:
merged_df.printSchema()

root
 |-- pais: string (nullable = true)
 |-- estado: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- data: timestamp (nullable = true)
 |-- quantidade_confirmados: long (nullable = true)
 |-- quantidade_mortes: long (nullable = true)
 |-- quantidade_recuperados: long (nullable = true)
 |-- ano: integer (nullable = true)
 |-- mes: integer (nullable = true)



In [None]:
merged_df = df_confirmados_aj.join(df_mortes_aj, \
                                   ( \
                                       df_confirmados_aj.col("Province/State").eqNullSafe(df_mortes_aj.col("Province/State")) & \
                                        df_confirmados_aj.col("Country/Region").eqNullSafe(df_mortes_aj.col("Country/Region")) & \
                                        df_confirmados_aj.col("Lat").eqNullSafe(df_mortes_aj.col("Lat")) & \
                                        df_confirmados_aj.col("Long").eqNullSafe(df_mortes_aj.col("Long")) & \
                                        df_confirmados_aj.col("data").eqNullSafe(df_mortes_aj.col("data")) \
                                        ), \
                                        how="full")
merged_df.select("*").sort("data", "Country/Region", "Province/State").where(col("data")=="1/22/20").show()

In [None]:
df_confirmados

In [35]:
teste.show()

+--------------+--------------+--------+---------+-------+----------+
|Province/State|Country/Region|     Lat|     Long|   data|quantidade|
+--------------+--------------+--------+---------+-------+----------+
|          null|   Afghanistan|33.93911|67.709953|1/22/20|         0|
|          null|   Afghanistan|33.93911|67.709953|1/23/20|         0|
|          null|   Afghanistan|33.93911|67.709953|1/24/20|         0|
|          null|   Afghanistan|33.93911|67.709953|1/25/20|         0|
|          null|   Afghanistan|33.93911|67.709953|1/26/20|         0|
|          null|   Afghanistan|33.93911|67.709953|1/27/20|         0|
|          null|   Afghanistan|33.93911|67.709953|1/28/20|         0|
|          null|   Afghanistan|33.93911|67.709953|1/29/20|         0|
|          null|   Afghanistan|33.93911|67.709953|1/30/20|         0|
|          null|   Afghanistan|33.93911|67.709953|1/31/20|         0|
|          null|   Afghanistan|33.93911|67.709953| 2/1/20|         0|
|          null|   A

In [18]:
cols_confirmados = list(set(df_confirmados.columns) - {"Province/State", "Country/Region", "Lat", "Long"})
df_confirmados.melt(ids=["Province/State", "Country/Region", "Lat", "Long"], values=[cols_confirmados], variableColumnName="key", valueColumnName="val").show()

AttributeError: 'DataFrame' object has no attribute 'melt'

In [7]:
dfs

[DataFrame[Province/State: string, Country/Region: string, Lat: double, Long: double, 1/22/20: int, 1/23/20: int, 1/24/20: int, 1/25/20: int, 1/26/20: int, 1/27/20: int, 1/28/20: int, 1/29/20: int, 1/30/20: int, 1/31/20: int, 2/1/20: int, 2/2/20: int, 2/3/20: int, 2/4/20: int, 2/5/20: int, 2/6/20: int, 2/7/20: int, 2/8/20: int, 2/9/20: int, 2/10/20: int, 2/11/20: int, 2/12/20: int, 2/13/20: int, 2/14/20: int, 2/15/20: int, 2/16/20: int, 2/17/20: int, 2/18/20: int, 2/19/20: int, 2/20/20: int, 2/21/20: int, 2/22/20: int, 2/23/20: int, 2/24/20: int, 2/25/20: int, 2/26/20: int, 2/27/20: int, 2/28/20: int, 2/29/20: int, 3/1/20: int, 3/2/20: int, 3/3/20: int, 3/4/20: int, 3/5/20: int, 3/6/20: int, 3/7/20: int, 3/8/20: int, 3/9/20: int, 3/10/20: int, 3/11/20: int, 3/12/20: int, 3/13/20: int, 3/14/20: int, 3/15/20: int, 3/16/20: int, 3/17/20: int, 3/18/20: int, 3/19/20: int, 3/20/20: int, 3/21/20: int, 3/22/20: int, 3/23/20: int, 3/24/20: int, 3/25/20: int, 3/26/20: int, 3/27/20: int, 3/28/20:

In [8]:
joined_df = dfs[0]
for df in dfs[1:]:
    joined_df = joined_df.union(df)

In [10]:
joined_df.show()

+--------------------+-------------------+---------+----------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+------+------+------+------+------+------+------+------+------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+------+------+------+------+------+------+------+------+------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+------+------+------+------+------+------+------+------+------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+------+------+------+------+------+------+------+------+------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-----

In [13]:
joined_df = joined_df.select( \
    col("country/region").alias("pais"), \
    col("Province/State").alias("estado"), \
    col("Lat").alias("latitude"),
    col("Long").alias("longitude")).show()

+-------------------+--------------------+---------+----------+
|               pais|              estado| latitude| longitude|
+-------------------+--------------------+---------+----------+
|        Afghanistan|                null| 33.93911| 67.709953|
|            Albania|                null|  41.1533|   20.1683|
|            Algeria|                null|  28.0339|    1.6596|
|            Andorra|                null|  42.5063|    1.5218|
|             Angola|                null| -11.2027|   17.8739|
|Antigua and Barbuda|                null|  17.0608|  -61.7964|
|          Argentina|                null| -38.4161|  -63.6167|
|            Armenia|                null|  40.0691|   45.0382|
|          Australia|Australian Capita...| -35.4735|  149.0124|
|          Australia|     New South Wales| -33.8688|  151.2093|
|          Australia|  Northern Territory| -12.4634|  130.8456|
|          Australia|          Queensland| -27.4698|  153.0251|
|          Australia|     South Australi

In [4]:
def unifica_arquivos(filepath_input, filepath_output, **kwargs):
    # Inicializa SparkSession
    spark = SparkSession.builder \
        .appName("Unifica_Arquivos_CSV") \
        .getOrCreate()

    # Lista dos arquivos no diretório de entrada
    lista_arquivos = [f for f in os.listdir(filepath_input) if f.endswith('.csv')]

    # Carrega os arquivos CSV em DataFrames e faz o join
    dfs = []
    for filename in lista_arquivos:
        file_path = os.path.join(filepath_input, filename)
        df = spark.read.csv(file_path, header=True, inferSchema=True)
        dfs.append(df)

    joined_df = dfs[0]
    for df in dfs[1:]:
        joined_df = joined_df.union(df)

    # Salva o DataFrame resultante em um arquivo CSV
    joined_df.write.parquet(filepath_output, mode="overwrite", header=True)

    # Encerra a sessão do Spark
    spark.stop()

In [5]:
# Definindo a DAG e seus argumentos
default_args = {
    'owner': 'airflow',
    'start_date': datetime.now(),
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}


In [None]:
with DAG(
    default_args=default_args,
    dag_id="covid19",
    schedule_interval="@daily",
    description='ETL de arquivos CSV de Covid19 e join com PySpark'
) as dag:

    # Caminho relativo à pasta "dags"
    relative_path_input = os.path.join(os.path.dirname(__file__),'..','datalake', 'raw')
    relative_path_output = os.path.join(os.path.dirname(__file__), '..', 'datalake', 'trusted')
    

    # Caminho absoluto do diretório "datalake/raw"
    filepath_input = os.path.abspath(relative_path_input)

    def create_parent_folder(self):
        (relative_path_output).mkdir(parents=True, exist_ok=True)

    # Definindo as tarefas da DAG
    t_start = DummyOperator(task_id='t_start', dag=dag)

    t_carga_arquivos = PythonOperator(
        task_id="t_carga_arquivos",
        python_callable=unifica_arquivos,
        op_args=[relative_path_input, relative_path_output],
        provide_context=True
    )

    t_start >> t_carga_arquivos

