In [5]:
# Read the preprocessed flight records from the trusted zone of the bucket.
trusted_path = "gs://flight-analysis-ms-bucket/trusted/preprocessed_with_frequencies.parquet"
flight_data = spark.read.format("parquet").load(trusted_path)

# Sanity check: print column names/types, then preview the first five rows.
flight_data.printSchema()
flight_data.show(n=5)


root
 |-- Dest: string (nullable = true)
 |-- Origin: string (nullable = true)
 |-- FlightDate: timestamp_ntz (nullable = true)
 |-- Airline: string (nullable = true)
 |-- Cancelled: byte (nullable = true)
 |-- Diverted: byte (nullable = true)
 |-- DepDel15: byte (nullable = true)
 |-- ArrivalDelayGroups: byte (nullable = true)
 |-- DistanceGroup: byte (nullable = true)
 |-- OriginState: string (nullable = true)
 |-- DestState: string (nullable = true)
 |-- Operating_Airline: string (nullable = true)
 |-- Year: short (nullable = true)
 |-- Quarter: byte (nullable = true)
 |-- Month: byte (nullable = true)
 |-- DayofMonth: byte (nullable = true)
 |-- DayOfWeek: byte (nullable = true)
 |-- Distance: integer (nullable = true)
 |-- numerical_features: vector (nullable = true)
 |-- scaled_features: vector (nullable = true)
 |-- ArrivalDelayGroups_indexed: double (nullable = true)
 |-- Airline_indexed: double (nullable = true)
 |-- Origin_frequency: long (nullable = true)
 |-- Dest_frequency

In [6]:
from pyspark.sql.functions import dayofyear, weekofyear

# Derive two calendar-position features from FlightDate in a single chained expression.
# NOTE(review): Spark's weekofyear uses ISO 8601 week numbering, so dates at the very
# start/end of a year can land in week 52/53 or week 1 -- confirm that is intended.
flight_data = (
    flight_data
    .withColumn("DayOfYear", dayofyear("FlightDate"))
    .withColumn("WeekOfYear", weekofyear("FlightDate"))
)

# Preview the source date next to the two derived columns.
flight_data.select(["FlightDate", "DayOfYear", "WeekOfYear"]).show(n=5)


+-------------------+---------+----------+
|         FlightDate|DayOfYear|WeekOfYear|
+-------------------+---------+----------+
|2018-01-23 00:00:00|       23|         4|
|2018-01-24 00:00:00|       24|         4|
|2018-01-25 00:00:00|       25|         4|
|2018-01-26 00:00:00|       26|         4|
|2018-01-27 00:00:00|       27|         4|
+-------------------+---------+----------+
only showing top 5 rows



In [10]:
from pyspark.ml.functions import vector_to_array
from pyspark.sql.functions import col

# Step 1: turn the ML vector into an array and keep its first element as a plain
#         numeric column named Distance_scaled.
#         NOTE(review): assumes element 0 of scaled_features is the scaled Distance --
#         confirm against the preprocessing step that assembled the vector.
# Step 2: multiply that value by the DepDel15 flag to form the interaction feature.
#         NOTE(review): DepDel15 is nullable in the loaded schema; a null flag yields a
#         null interaction value -- verify nulls are handled before modeling.
flight_data = (
    flight_data
    .withColumn("Distance_scaled", vector_to_array("scaled_features").getItem(0))
    .withColumn("Distance_Delay_Interaction", col("Distance_scaled") * col("DepDel15"))
)

# Preview the two inputs alongside their product, without truncating long values.
flight_data.select(
    ["Distance_scaled", "DepDel15", "Distance_Delay_Interaction"]
).show(n=5, truncate=False)


[Stage 10:>                                                         (0 + 1) / 1]

+--------------------+--------+--------------------------+
|Distance_scaled     |DepDel15|Distance_Delay_Interaction|
+--------------------+--------+--------------------------+
|0.025971411314676868|0       |0.0                       |
|0.025971411314676868|0       |0.0                       |
|0.025971411314676868|0       |0.0                       |
|0.025971411314676868|0       |0.0                       |
|0.025971411314676868|0       |0.0                       |
+--------------------+--------+--------------------------+
only showing top 5 rows



                                                                                

In [11]:
# Columns carried into the modeling dataset: the target first, then the features.
final_columns = [
    # Target variable
    "Cancelled",
    # Encoded categorical features
    "Airline_indexed",
    "Origin_frequency",
    "Dest_frequency",
    # Normalized Distance
    "Distance_scaled",
    # Date-derived features
    "DayOfYear",
    "WeekOfYear",
    # Engineered interaction feature
    "Distance_Delay_Interaction",
]

# Project the working frame down to exactly those columns, in that order.
final_data = flight_data.select(final_columns)

# Preview the first five rows of the final dataset.
final_data.show(n=5)


+---------+---------------+----------------+--------------+--------------------+---------+----------+--------------------------+
|Cancelled|Airline_indexed|Origin_frequency|Dest_frequency|     Distance_scaled|DayOfYear|WeekOfYear|Distance_Delay_Interaction|
+---------+---------------+----------------+--------------+--------------------+---------+----------+--------------------------+
|        0|           13.0|             501|        223860|0.025971411314676868|       23|         4|                       0.0|
|        0|           13.0|             501|        223860|0.025971411314676868|       24|         4|                       0.0|
|        0|           13.0|             501|        223860|0.025971411314676868|       25|         4|                       0.0|
|        0|           13.0|             501|        223860|0.025971411314676868|       26|         4|                       0.0|
|        0|           13.0|             501|        223860|0.025971411314676868|       27|       

[Stage 11:>                                                         (0 + 1) / 1]                                                                                

In [12]:
# Destination of the modeling-ready dataset inside the trusted zone.
final_dataset_path = "gs://flight-analysis-ms-bucket/trusted/final_dataset.parquet"

# Persist as parquet; "overwrite" replaces any dataset already at that path.
final_data.write.parquet(final_dataset_path, mode="overwrite")

# Confirm where the output was written.
print(f"Final dataset saved to {final_dataset_path}")


                                                                                

Final dataset saved to gs://flight-analysis-ms-bucket/trusted/final_dataset.parquet
