<h1 align="center"> Challenge Presentation </h1>

<div align="center">
<img src="..\..\..\01_aux_files\img\UK_Traffic_Data\7.png" height="20%" width="30%"/>
<img src="..\..\..\01_aux_files\img\UK_Traffic_Data\6.png" height="20%" width="30%"/>
</div>

## We will create the following tables for our Power BI report
* dim_region
* dim_vehicle
* fact_road_vehicle_count
### We will save them to the results folder

In [50]:
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import col, split, lpad, lit, regexp_replace, concat, upper, trim, coalesce, sum, count

In [51]:
# Session shared by every Spark cell below; getOrCreate() hands back an
# already-running session when one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("myapp")
    .getOrCreate()
)

## Creating and storing our dim_time

In [14]:
import pandas as pd;

In [16]:
# One row per calendar day between 2000-01-01 and 2024-12-31 (both inclusive).
dates = pd.date_range(start="2000-01-01", end="2024-12-31")

# Derive every attribute from the same DatetimeIndex so all columns stay aligned.
date_keys = dates.strftime("%Y%m%d").astype(int)      # e.g. 20000101
month_numbers = dates.strftime("%m")                  # zero-padded text, "01".."12"
year_months = dates.strftime("%Y-%m")                 # e.g. "2000-01"
month_labels = dates.strftime("%B").str.upper()       # e.g. "JANUARY"
day_numbers = dates.strftime("%d")                    # zero-padded text, "01".."31"

df = pd.DataFrame(
    {
        "date_id": date_keys,
        "year": dates.year,
        "month": month_numbers,
        "period": year_months,
        "month_name": month_labels,
        "day": day_numbers,
        "date": dates,
    }
)
df

Unnamed: 0,date_id,year,month,period,month_name,day,date
0,20000101,2000,01,2000-01,JANUARY,01,2000-01-01
1,20000102,2000,01,2000-01,JANUARY,02,2000-01-02
2,20000103,2000,01,2000-01,JANUARY,03,2000-01-03
3,20000104,2000,01,2000-01,JANUARY,04,2000-01-04
4,20000105,2000,01,2000-01,JANUARY,05,2000-01-05
...,...,...,...,...,...,...,...
9127,20241227,2024,12,2024-12,DECEMBER,27,2024-12-27
9128,20241228,2024,12,2024-12,DECEMBER,28,2024-12-28
9129,20241229,2024,12,2024-12,DECEMBER,29,2024-12-29
9130,20241230,2024,12,2024-12,DECEMBER,30,2024-12-30


In [17]:
# Persist the calendar dimension as a single parquet file.
dim_time_path = r"..\..\..\01_aux_files\results\UK_Traffic_Data\Power_BI\dim_time\dim_time.parquet"
df.to_parquet(dim_time_path)
print("dim_time stored!")

dim_time stored!


## Extracting the main source

In [55]:
# Load the raw traffic counts and normalise every column name to lower case
# so the rest of the notebook can reference columns consistently.
source_path = r"..\..\..\01_aux_files\source\UK_Traffic_Data\dft_traffic_counts_raw_counts\dft_traffic_counts_raw_counts.parquet"
df = spark.read.parquet(source_path)
new_columns = [col(name).alias(name.lower()) for name in df.columns]
df = df.select(new_columns)

In [56]:
# Preview the source rows to confirm the lower-cased column names.
df.show()

+--------------+-------------------+----+-------------------+----+---------+-----------+---------------+------------------+--------------------+--------------------+---------+-------------+---------+------------------------+----------------------+-------+--------+------------+------------+--------------+-------------------+------------+--------------------------+--------------+-----------------+----+-----------------+-----------------+-------------------------+----------------------------+-----------------------+-----------------------+--------+------------------+
|count_point_id|direction_of_travel|year|         count_date|hour|region_id|region_name|region_ons_code|local_authority_id|local_authority_name|local_authority_code|road_name|road_category|road_type|start_junction_road_name|end_junction_road_name|easting|northing|    latitude|   longitude|link_length_km|  link_length_miles|pedal_cycles|two_wheeled_motor_vehicles|cars_and_taxis|buses_and_coaches|lgvs|hgvs_2_rigid_axle|hgvs_3_r

In [57]:
# List every column together with the data type Spark read it as.
df.dtypes

[('count_point_id', 'string'),
 ('direction_of_travel', 'string'),
 ('year', 'string'),
 ('count_date', 'string'),
 ('hour', 'string'),
 ('region_id', 'string'),
 ('region_name', 'string'),
 ('region_ons_code', 'string'),
 ('local_authority_id', 'string'),
 ('local_authority_name', 'string'),
 ('local_authority_code', 'string'),
 ('road_name', 'string'),
 ('road_category', 'string'),
 ('road_type', 'string'),
 ('start_junction_road_name', 'string'),
 ('end_junction_road_name', 'string'),
 ('easting', 'string'),
 ('northing', 'string'),
 ('latitude', 'string'),
 ('longitude', 'string'),
 ('link_length_km', 'string'),
 ('link_length_miles', 'string'),
 ('pedal_cycles', 'string'),
 ('two_wheeled_motor_vehicles', 'string'),
 ('cars_and_taxis', 'string'),
 ('buses_and_coaches', 'string'),
 ('lgvs', 'string'),
 ('hgvs_2_rigid_axle', 'string'),
 ('hgvs_3_rigid_axle', 'string'),
 ('hgvs_4_or_more_rigid_axle', 'string'),
 ('hgvs_3_or_4_articulated_axle', 'string'),
 ('hgvs_5_articulated_axle', 

### Creating dim_region

In [58]:
# Show the source rows where any field that feeds the region key is missing.
key_fields = ["region_id", "road_name", "road_category", "latitude", "longitude"]

# OR the per-field null tests together, left to right.
has_missing_key_part = col(key_fields[0]).isNull()
for field_name in key_fields[1:]:
    has_missing_key_part = has_missing_key_part | col(field_name).isNull()

df.filter(has_missing_key_part).show()

+--------------+-------------------+----+----------+----+---------+-----------+---------------+------------------+--------------------+--------------------+---------+-------------+---------+------------------------+----------------------+-------+--------+--------+---------+--------------+-----------------+------------+--------------------------+--------------+-----------------+----+-----------------+-----------------+-------------------------+----------------------------+-----------------------+-----------------------+--------+------------------+
|count_point_id|direction_of_travel|year|count_date|hour|region_id|region_name|region_ons_code|local_authority_id|local_authority_name|local_authority_code|road_name|road_category|road_type|start_junction_road_name|end_junction_road_name|easting|northing|latitude|longitude|link_length_km|link_length_miles|pedal_cycles|two_wheeled_motor_vehicles|cars_and_taxis|buses_and_coaches|lgvs|hgvs_2_rigid_axle|hgvs_3_rigid_axle|hgvs_4_or_more_rigid_axle|

In [59]:
# Build the region dimension.
#
# Rows without a region id or coordinates are discarded. The text attributes
# are trimmed and upper-cased (missing region/road names become
# "NOT SPECIFIED"), and region_id is rebuilt as the composite key
#   <region_id>-<road_name>-<road_category>-<latitude>, <longitude>
dim_region = (
    df.select(
        "region_id", "region_name", "road_name", "road_category", "road_type",
        "latitude", "longitude",
    )
    .filter(
        col("region_id").isNotNull()
        & col("latitude").isNotNull()
        & col("longitude").isNotNull()
    )
    .withColumn("lat_long", concat(trim(col("latitude")), lit(", "), trim(col("longitude"))))
    .withColumn("region_name", coalesce(upper(trim(col("region_name"))), lit("NOT SPECIFIED")))
    .withColumn("road_name", coalesce(upper(trim(col("road_name"))), lit("NOT SPECIFIED")))
    .withColumn("road_category", upper(trim(col("road_category"))))
    .withColumn("road_type", upper(trim(col("road_type"))))
    .withColumn(
        "region_id",
        concat(
            col("region_id"), lit("-"), col("road_name"), lit("-"),
            col("road_category"), lit("-"), col("lat_long"),
        ),
    )
    # De-duplicate AFTER normalising: rows that differed only by letter case or
    # surrounding whitespace in the text attributes are identical at this point,
    # whereas a distinct() on the raw values would keep them as duplicate keys.
    .distinct()
    # Single partition so the write produces one parquet file instead of many.
    .coalesce(1)
)



In [60]:
# Sanity check: every generated region_id must appear exactly once.
# An empty result means the key is unique.
rows_per_key = dim_region.groupBy('region_id').agg(count(lit(1)).alias('cuenta'))
repeated_keys = rows_per_key.filter(col('cuenta') != 1)
repeated_keys.show()

+---------+------+
|region_id|cuenta|
+---------+------+
+---------+------+



In [61]:
# Replace any previous export of the region dimension.
dim_region_dir = r"..\..\..\01_aux_files\results\UK_Traffic_Data\Power_BI\dim_region"
dim_region.write.mode("overwrite").parquet(dim_region_dir)
print("dim_region stored!")

dim_region stored!


### Defining fact (we will make some extra transformations in Power BI)

In [62]:
# Vehicle-count measures carried into the fact table, in output column order.
list_of_float_fields = [
    "pedal_cycles",
    "two_wheeled_motor_vehicles",
    "cars_and_taxis",
    "buses_and_coaches",
    "lgvs",
    "hgvs_2_rigid_axle",
    "hgvs_3_rigid_axle",
    "hgvs_4_or_more_rigid_axle",
    "hgvs_3_or_4_articulated_axle",
    "hgvs_5_articulated_axle",
    "hgvs_6_articulated_axle",
]

In [63]:
# Build the fact rows: the composite region key, an integer date key and the
# vehicle-count measures.
#
# The region key has to match the one generated for dim_region character for
# character, so the road attributes and coordinates get the same normalisation
# here (trim, upper-case, "NOT SPECIFIED" for a missing road name) before they
# are concatenated. Without it, any case/whitespace variance breaks the
# relationship, and a null road_name turns the whole key into null because
# concat() yields null when any argument is null.
fact = (
    df.select(
        "region_id", "road_name", "road_category", "latitude", "longitude", "count_date",
        *list_of_float_fields,
    )
    .withColumn("region_id", coalesce(col("region_id").cast("int"), lit(-1)))
    # count_date looks like "<yyyy-mm-dd> <time>": keep the date part, strip the dashes.
    .withColumn(
        "date_id",
        regexp_replace(split(col("count_date"), " ").getItem(0), "-", "").cast("int"),
    )
    .withColumn("road_name", coalesce(upper(trim(col("road_name"))), lit("NOT SPECIFIED")))
    .withColumn("road_category", upper(trim(col("road_category"))))
    .withColumn(
        "region_id",
        concat(
            col("region_id"), lit("-"), col("road_name"), lit("-"),
            col("road_category"), lit("-"),
            trim(col("latitude")), lit(", "), trim(col("longitude")),
        ),
    )
    .drop("road_name", "road_category", "latitude", "longitude", "count_date")
)

# DataFrames are immutable: withColumn returns a NEW frame, so the result must
# be assigned back or the cast is silently thrown away.
# NOTE(review): assumes the raw count strings are castable to int — confirm.
for count_field in list_of_float_fields:
    fact = fact.withColumn(count_field, col(count_field).cast("int"))

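## The next cell only generates the aggregation expressions used in the group by below, to save typing.
## Keep in mind that sum returns a wider type than int (bigint for integer input, double for fractional or string input), which is heavier for our data model than an int, hence the explicit cast.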

In [64]:
# Emit one ready-to-paste aggregation expression per measure column.
expression_template = "sum('{0}').cast('int').alias('{0}'),"
for measure in list_of_float_fields:
    print(expression_template.format(measure))

sum('pedal_cycles').cast('int').alias('pedal_cycles'),
sum('two_wheeled_motor_vehicles').cast('int').alias('two_wheeled_motor_vehicles'),
sum('cars_and_taxis').cast('int').alias('cars_and_taxis'),
sum('buses_and_coaches').cast('int').alias('buses_and_coaches'),
sum('lgvs').cast('int').alias('lgvs'),
sum('hgvs_2_rigid_axle').cast('int').alias('hgvs_2_rigid_axle'),
sum('hgvs_3_rigid_axle').cast('int').alias('hgvs_3_rigid_axle'),
sum('hgvs_4_or_more_rigid_axle').cast('int').alias('hgvs_4_or_more_rigid_axle'),
sum('hgvs_3_or_4_articulated_axle').cast('int').alias('hgvs_3_or_4_articulated_axle'),
sum('hgvs_5_articulated_axle').cast('int').alias('hgvs_5_articulated_axle'),
sum('hgvs_6_articulated_axle').cast('int').alias('hgvs_6_articulated_axle'),


### As we are not considering hours of the day, we must summarize our data

In [65]:
# Collapse the hourly rows into one row per (region key, date).
# Each measure is summed and cast back to int, keeping its original column name;
# the expressions are generated from list_of_float_fields in list order.
daily_totals = [
    sum(measure).cast('int').alias(measure)
    for measure in list_of_float_fields
]

fact = (
    fact.groupBy('region_id', 'date_id')
    .agg(*daily_totals)
    .coalesce(1)
    .persist()  # reused by the count, preview and write cells that follow
)

In [66]:
# Report how many rows remain once the hourly detail has been aggregated away.
summarized_rows = fact.count()
print(f"total rows after summarizing: {summarized_rows}")
print("The less, the better for our model")

total rows after summarizing: 202926
The less, the better for our model


In [67]:
# Preview the aggregated fact rows before exporting them.
fact.show()

+--------------------+--------+------------+--------------------------+--------------+-----------------+-----+-----------------+-----------------+-------------------------+----------------------------+-----------------------+-----------------------+
|           region_id| date_id|pedal_cycles|two_wheeled_motor_vehicles|cars_and_taxis|buses_and_coaches| lgvs|hgvs_2_rigid_axle|hgvs_3_rigid_axle|hgvs_4_or_more_rigid_axle|hgvs_3_or_4_articulated_axle|hgvs_5_articulated_axle|hgvs_6_articulated_axle|
+--------------------+--------+------------+--------------------------+--------------+-----------------+-----+-----------------+-----------------+-------------------------+----------------------------+-----------------------+-----------------------+
|5-A56-PA-53.34049...|20140520|          21|                        91|         15017|               70| 2377|              426|               83|                       91|                          46|                    479|                    328|


In [68]:
# Replace any previous export of the fact table; coalesce(1) keeps the output
# in a single partition.
fact_output_dir = r"..\..\..\01_aux_files\results\UK_Traffic_Data\Power_BI\fact_road_vehicle_count"
fact.coalesce(1).write.mode("overwrite").parquet(fact_output_dir)

print("fact_road_vehicle_count stored!")

fact_road_vehicle_count stored!
