In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.\
            builder.\
            appName("Spark ETL").\
            master("spark://spark-master:7077").\
            config("spark.jars", "mysql-connector-j-8.0.33.jar").\
            config("hive.metastore.uris", "thrift://hive-server:9083").\
            enableHiveSupport().\
            getOrCreate()

In [5]:
spark.sql("show tables from covid19;").show()

+---------+--------------+-----------+
|namespace|     tableName|isTemporary|
+---------+--------------+-----------+
|  covid19|      dim_date|      false|
|  covid19|  dim_location|      false|
|  covid19|dim_population|      false|
|  covid19|dim_who_region|      false|
|  covid19|    fact_covid|      false|
+---------+--------------+-----------+



In [79]:
merged_df = spark.sql("""select f.uuid 
                , f.confirmed 
                , l.country 
                , l.continent
                , r.who_region
                , l.lat_ 
                , l.long_ 
                , cast(concat(year, "-",month,"-", day) as date) as date
                from covid19.fact_covid as f
         join covid19.dim_location as l on f.pop_loc_id = l.pop_loc_id
         join covid19.dim_date as d on d.date_id = f.date_id
         join covid19.dim_who_region as r on r.who_id = f.who_id;""").toPandas()

In [56]:
import pandas as pd

In [80]:
merged_df.sort_values(by=["date"], inplace=True)

In [58]:
merged_df[merged_df.date == pd.to_datetime("2020-03-06")]


Comparison of Timestamp with datetime.date is deprecated in order to match the standard library behavior. In a future version these will be considered non-comparable. Use 'ts == pd.Timestamp(date)' or 'ts.date() == date' instead.



Unnamed: 0,uuid,confirmed,country,continent,who_region,lat_,long_,date
2746,11603,670,Germany,Europe,Europe,51.16570,10.4515,2020-03-06
30667,11492,1,Armenia,Asia,Europe,40.06910,45.0382,2020-03-06
8994,11487,17,Algeria,Africa,Africa,28.03390,1.6596,2020-03-06
33262,11553,10,China,Asia,Western Pacific,22.16670,113.5500,2020-03-06
13491,11706,373,United Kingdom,Europe,Europe,55.37810,-3.4360,2020-03-06
...,...,...,...,...,...,...,...,...
16560,11658,108,Norway,Europe,Europe,60.47200,8.4689,2020-03-06
25448,11564,174,China,Asia,Western Pacific,24.97400,101.4870,2020-03-06
17391,11693,1,Togo,Africa,Africa,8.61950,0.8248,2020-03-06
25478,11571,11,Croatia,Europe,Europe,45.10000,15.2000,2020-03-06


In [88]:
merged_df.sort_values(by=["date"], inplace=True)
# Tạo biểu đồ density_mapbox
fig = px.density_mapbox(merged_df, lat='lat_', lon='long_', z='confirmed', radius=20,
                        animation_frame='date', title='Hot Spot of COVID-19',
                        center=dict(lat=0, lon=0), zoom=1, range_color=[0, 10000],
                        hover_name='country', hover_data=['country', 'who_region', 'confirmed'],
                        mapbox_style="carto-positron")

# Tùy chỉnh kích thước của bản đồ (chiều rộng và chiều cao)
fig.update_layout(
    mapbox=dict(
        style="carto-positron",  # Loại bản đồ
        center=dict(lat=0, lon=0),  # Tọa độ trung tâm
        zoom=1,  # Mức độ phóng to
    ),
    margin={"r": 0, "t": 30, "l": 0, "b": 0},  # Khoảng cách từ biểu đồ đến viền
    width=1000,  # Chiều rộng
    height=600,  # Chiều cao
)

fig.show()
