<h1 align="center"> Challenge Presentation </h1>

<div align="center">
<img src="..\..\..\01_aux_files\img\UK_Traffic_Data\7.png" height="20%" width="30%"/>
<img src="..\..\..\01_aux_files\img\UK_Traffic_Data\6.png" height="20%" width="30%"/>
</div>

## We will create the following tables for our Power BI report
* dim_time
* dim_region
* dim_vehicle
* fact_road_vehicle_count
### We will save them to the results folder

In [61]:
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import col, split, lpad, lit, regexp_replace, concat, upper, trim, coalesce

In [62]:
# Obtain the notebook's Spark session via the builder, registered under the app name "myapp".
spark = (
    SparkSession.builder
    .appName("myapp")
    .getOrCreate()
)

## Creating and storing our dim_time

In [63]:
import pandas as pd;

In [64]:
# Calendar backing the time dimension: one row per day, 2000-01-01 through 2024-12-31.
dates = pd.date_range(start="2000-01-01", end="2024-12-31")

# date_id is the integer yyyymmdd key, month is a zero-padded string, year is numeric.
df = pd.DataFrame(
    dict(
        date_id=dates.strftime("%Y%m%d").astype(int),
        month=dates.strftime("%m"),
        year=dates.year,
        date=dates,
    )
)
df

Unnamed: 0,date_id,month,year,date
0,20000101,01,2000,2000-01-01
1,20000102,01,2000,2000-01-02
2,20000103,01,2000,2000-01-03
3,20000104,01,2000,2000-01-04
4,20000105,01,2000,2000-01-05
...,...,...,...,...
9127,20241227,12,2024,2024-12-27
9128,20241228,12,2024,2024-12-28
9129,20241229,12,2024,2024-12-29
9130,20241230,12,2024,2024-12-30


In [65]:
# Persist the calendar frame as a single parquet file under the Power BI results folder.
df.to_parquet(
    r"..\..\..\01_aux_files\results\UK_Traffic_Data\Power_BI\dim_time\dim_time.parquet"
)
print("dim_time stored!")

dim_time stored!


## Extracting the main source

In [66]:
# Load the raw traffic counts with Spark.
df = spark.read.parquet(
    r"..\..\..\01_aux_files\source\UK_Traffic_Data\dft_traffic_counts_raw_counts\dft_traffic_counts_raw_counts.parquet"
)

# Re-select every column under its lower-cased name so later cells can use one casing.
new_columns = [col(name).alias(name.lower()) for name in df.columns]
df = df.select(*new_columns)

In [67]:
# Print a tabular preview of the source rows to check the lower-cased column names.
df.show()

+--------------+-------------------+----+-------------------+----+---------+-----------+---------------+------------------+--------------------+--------------------+---------+-------------+---------+------------------------+----------------------+-------+--------+------------+------------+--------------+-------------------+------------+--------------------------+--------------+-----------------+----+-----------------+-----------------+-------------------------+----------------------------+-----------------------+-----------------------+--------+------------------+
|count_point_id|direction_of_travel|year|         count_date|hour|region_id|region_name|region_ons_code|local_authority_id|local_authority_name|local_authority_code|road_name|road_category|road_type|start_junction_road_name|end_junction_road_name|easting|northing|    latitude|   longitude|link_length_km|  link_length_miles|pedal_cycles|two_wheeled_motor_vehicles|cars_and_taxis|buses_and_coaches|lgvs|hgvs_2_rigid_axle|hgvs_3_r

In [68]:
# List each column with its Spark type; the columns visible in the output are all strings,
# which is why later cells cast ids, dates and counts explicitly.
df.dtypes

[('count_point_id', 'string'),
 ('direction_of_travel', 'string'),
 ('year', 'string'),
 ('count_date', 'string'),
 ('hour', 'string'),
 ('region_id', 'string'),
 ('region_name', 'string'),
 ('region_ons_code', 'string'),
 ('local_authority_id', 'string'),
 ('local_authority_name', 'string'),
 ('local_authority_code', 'string'),
 ('road_name', 'string'),
 ('road_category', 'string'),
 ('road_type', 'string'),
 ('start_junction_road_name', 'string'),
 ('end_junction_road_name', 'string'),
 ('easting', 'string'),
 ('northing', 'string'),
 ('latitude', 'string'),
 ('longitude', 'string'),
 ('link_length_km', 'string'),
 ('link_length_miles', 'string'),
 ('pedal_cycles', 'string'),
 ('two_wheeled_motor_vehicles', 'string'),
 ('cars_and_taxis', 'string'),
 ('buses_and_coaches', 'string'),
 ('lgvs', 'string'),
 ('hgvs_2_rigid_axle', 'string'),
 ('hgvs_3_rigid_axle', 'string'),
 ('hgvs_4_or_more_rigid_axle', 'string'),
 ('hgvs_3_or_4_articulated_axle', 'string'),
 ('hgvs_5_articulated_axle', 

### Creating dim_region

In [69]:
# Build dim_region: one row per distinct (region, road, coordinates) combination.
# Rows lacking a region id or coordinates are dropped up front.
dim_region = (
    df.select(
        "region_id", "region_name", "road_name", "road_category", "road_type",
        "latitude", "longitude",
    )
    .filter(
        col("region_id").isNotNull()
        & col("latitude").isNotNull()
        & col("longitude").isNotNull()
    )
    .withColumn("lat_long", concat(trim(col("latitude")), lit(", "), trim(col("longitude"))))
    # Fall back to the integer sentinel -1 (the same one the fact table uses) when the id
    # is not numeric. A string fallback would force the whole column away from int and
    # break the join key between the dimension and the fact.
    .withColumn("region_id", coalesce(col("region_id").cast("int"), lit(-1)))
    .withColumn("region_name", coalesce(upper(trim(col("region_name"))), lit("NOT SPECIFIED")))
    .withColumn("road_name", coalesce(upper(trim(col("road_name"))), lit("NOT SPECIFIED")))
    .withColumn("road_category", upper(trim(col("road_category"))))
    .withColumn("road_type", upper(trim(col("road_type"))))
    # Deduplicate only after trimming/upper-casing, so rows that differed merely by
    # case or surrounding whitespace collapse into one.
    .distinct()
    # Single partition to avoid creating multiple output files.
    .coalesce(1)
)

dim_region.write.parquet(r"..\..\..\01_aux_files\results\UK_Traffic_Data\Power_BI\dim_region", mode="overwrite")
print("dim_region stored!")


dim_region stored!


### Defining the fact table (some extra transformations will be made in Power BI)

In [70]:
# Vehicle-count measure columns of the fact table, in output order.
list_of_float_fields = [
    "pedal_cycles",
    "two_wheeled_motor_vehicles",
    "cars_and_taxis",
    "buses_and_coaches",
    "lgvs",
    "hgvs_2_rigid_axle",
    "hgvs_3_rigid_axle",
    "hgvs_4_or_more_rigid_axle",
    "hgvs_3_or_4_articulated_axle",
    "hgvs_5_articulated_axle",
    "hgvs_6_articulated_axle",
    "all_hgvs",
    "all_motor_vehicles",
]

In [72]:
# Build the fact table: region key, integer yyyymmdd date key, and the vehicle-count measures.
fact = (
    df.select("region_id", "count_date", *list_of_float_fields)
    # Non-numeric or missing region ids map to the sentinel -1.
    .withColumn("region_id", coalesce(col("region_id").cast("int"), lit(-1)))
    # count_date is a string; keep the part before the first space, strip the dashes and
    # cast to int to obtain the yyyymmdd date key.
    .withColumn(
        "date_id",
        regexp_replace(split(col("count_date"), " ").getItem(0), "-", "").cast("int"),
    )
    .drop("count_date")
)

# Spark DataFrames are immutable: withColumn returns a new DataFrame, so the result must be
# assigned back. Without the reassignment the casts are silently discarded and every
# measure is written out as a string.
for field_name in list_of_float_fields:
    fact = fact.withColumn(field_name, col(field_name).cast("int"))

# Single partition so only one parquet part file is produced.
fact.coalesce(1).write.parquet(r"..\..\..\01_aux_files\results\UK_Traffic_Data\Power_BI\fact_road_vehicle_count", mode="overwrite")

print("fact_road_vehicle_count stored!")

fact_road_vehicle_count stored!
