# Model Creation Script

In [1]:
spark

# Get clean data

In [2]:
# Url for flight data
url = "gs://my-bigdata-project-mp/cleaned"

# Load data into a PySpark DataFrame
df = spark.read.parquet(url)

                                                                                

In [3]:
# Display our inital Schema before all changes.
df.printSchema()

root
 |-- legId: string (nullable = true)
 |-- searchDate: date (nullable = true)
 |-- flightDate: date (nullable = true)
 |-- startingAirport: string (nullable = true)
 |-- destinationAirport: string (nullable = true)
 |-- fareBasisCode: string (nullable = true)
 |-- elapsedDays: integer (nullable = true)
 |-- isBasicEconomy: boolean (nullable = true)
 |-- isRefundable: boolean (nullable = true)
 |-- isNonStop: boolean (nullable = true)
 |-- baseFare: double (nullable = true)
 |-- totalFare: double (nullable = true)
 |-- seatsRemaining: integer (nullable = true)
 |-- totalTravelDistance: integer (nullable = true)
 |-- segmentsDepartureTimeEpochSeconds: string (nullable = true)
 |-- segmentsDepartureTimeRaw: string (nullable = true)
 |-- segmentsArrivalTimeEpochSeconds: string (nullable = true)
 |-- segmentsArrivalTimeRaw: string (nullable = true)
 |-- segmentsArrivalAirportCode: string (nullable = true)
 |-- segmentsDepartureAirportCode: string (nullable = true)
 |-- segmentsAirlineNa

In [4]:
# Drop legId since it will not be useful for ML as it is an identifer.
df.drop("legId")

# Drop baseFare because of its high correlation to totalFare. 
df.drop("baseFare")

DataFrame[legId: string, searchDate: date, flightDate: date, startingAirport: string, destinationAirport: string, fareBasisCode: string, elapsedDays: int, isBasicEconomy: boolean, isRefundable: boolean, isNonStop: boolean, totalFare: double, seatsRemaining: int, totalTravelDistance: int, segmentsDepartureTimeEpochSeconds: string, segmentsDepartureTimeRaw: string, segmentsArrivalTimeEpochSeconds: string, segmentsArrivalTimeRaw: string, segmentsArrivalAirportCode: string, segmentsDepartureAirportCode: string, segmentsAirlineName: string, segmentsAirlineCode: string, segmentsEquipmentDescription: string, segmentsDurationInSeconds: string, segmentsDistance: string, segmentsCabinCode: string, travelDurationMinutes: int]

# Import libraries for ML

In [5]:
from pyspark.sql.functions import *
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler, Binarizer
from pyspark.ml import Pipeline
# Import the logistic regression model
from pyspark.ml.regression import LinearRegression, GeneralizedLinearRegression
# Import the evaluation module
from pyspark.ml.evaluation import *
# Import the model tuning module
from pyspark.ml.tuning import *
# We will use these functions to extract the numbers from the strings in travelDuration
from pyspark.sql.functions import regexp_extract, col, when, expr, split, rand

In [6]:
# Sample a portion of the dataset. Using all roughly 76 million records, with all features, will take too long.
SEED = 645
sample_percentage = 0.05 # 1% (or 0.1) of the dataset represents around 7,600,000 rows. 
df = df.sample(withReplacement=False, fraction=sample_percentage, seed=SEED)

# Create transform functions
   - Function to seperate segments
   - Function to extract features from date columns
   - Function to binarize certain features

In [8]:
# Function to seperate a segment column into new columns.
# Three new columns will be created, and split values will enter them.
# ARG1: Pyspark dataframe you want to transform
# ARG2: the name of the column you want to seperate
# ARG3: How many columns you want to create
#   EX- If you expect 3 values in a segment colum, set split to 3 so all values can be captured
#       If set too high, there will be columns with all 0's
#       If set too low, then some data will not be captured.

def seperate_segment(df, column_name, splits=2):
    
    # Will keep track of how many segments are made then return them as a list
    name_list_of_split_segments = []
    
    # Split the column using '||' and extract elements
    split_col = split(col(column_name), r'\|\|')  # Split on '||'
    
    # Add three new columns for the first three parts of the split, replacing nulls with 0 and casting result as integers
    for i in range(splits):
        df = df.withColumn(
            f"{column_name}[{i}]", 
            when(split_col.getItem(i).isNull(), lit(0))
            .otherwise(split_col.getItem(i))
            .cast("int"))  # Cast to integer
        
        # Add name of new split to list
        name_list_of_split_segments.append(f"{column_name}[{i}]")
    
    return df, name_list_of_split_segments

In [9]:
# Function to extract features from a date column (Month,Day,DayOfWeek,isWeekend)
# ARG1: Pyspark dataframe you want to transform
# ARG2: Name of date column you want to extract features from

def date_feature_extraction(df, col_name: str):
    df = df.withColumn(col_name+"Month", month(col(col_name)))
    df = df.withColumn(col_name+"Day", day(col(col_name)))
    df = df.withColumn(col_name+"DayOfWeek", dayofweek(col(col_name)))
    df = df.withColumn(col_name+"isWeekend", when(df[col_name+"DayOfWeek"] == 1, 1.0).\
                       when(df[col_name+"DayOfWeek"] == 7, 1.0).otherwise(0))
    return df

In [10]:
# Function that turns a boolean into either a 0 (False) or 1 (True)
# ARG1: Pyspark dataframe you want to transform
# ARG2: Name of boolean column you want to transform
def boolean_binarizer(df, col_name: str):
    df = df.withColumn(col_name+"Binarized", when(df[col_name] == True, 1.0).\
                      otherwise(0))
    return df

# Transform the dataframe with newly created functions

In [11]:
# Seperate certain columns which could allow for better model performance when using seperated columns to train.
# COLUMNS TO POSSIBLY SEPERATE
# - segmentsArrivalTimeRaw
# - segmentsArrivalTimeEpochSeconds
# - segmentsDepartureTimeRaw
# - segmentsDepartureTimeEpochSeconds
# - segmentsDurationInSeconds
#
# The only column may be worth seperating is "segmentsDurationInSeconds", which technically could  be correlated 
# to both arrival and departure time. This cant be proven visually since those columns are all different in format,
# but it may be possible to derive arrival and departure time if we have duration. To truly prove this may require
# reformatting the data quite a bit, but for now, we will only seperate one column.

df, new_split_segments = seperate_segment(df, "segmentsDurationInSeconds", splits=2)
split_segments = split_segments + new_split_segments

# Visualize if columns are separated correctly
#
#df.select("segmentsDurationInSeconds","segmentsDurationInSeconds[0]",
#          "segmentsDurationInSeconds[1]","segmentsDurationInSeconds[2]").show(10)

In [None]:
# Checking outliers for the split segmentsDuration column. These outliers have yet to be dropped for the ML model.

# Function from Clean Data script that will be used to remove outliers on newly split columns
def set_min_max_col(spark_df, col_name: str, min: float, max: float):
    new_df = spark_df.where((col(col_name) <= max) & (col(col_name) >= min))
    return new_df

# Function to view outliers on newly split columns.
def view_seg_splits(sdf, col_name, splits=2):
    for i in range(splits):
        col_to_select = col_name+"["+str(i)+"]"
        df = sdf.select(col_to_select).sample(False, 0.08, seed=SEED).toPandas()
        df.boxplot(column=[col_to_select])
        plt.show()
        
# Count before and after setting min-max on columns. 
count = df.count()
print("BEFORE-------"+"[ OLD Count: "+str(count)+" ]---------------------")    
view_seg_splits(df, "segmentsDurationInSeconds", splits=2)

df = set_min_max_col(df,"segmentsDurationInSeconds[0]", 0, 20000)
df = set_min_max_col(df,"segmentsDurationInSeconds[1]", 0, 19000)
#sdf_adj = set_min_max_col(sdf_adj,"segmentsDurationInSeconds[2]", 0, 4000)

count = df.count()
print("AFTER-------"+"[ NEW Count: "+str(count)+" ]---------------------")    
view_seg_splits(df, "segmentsDurationInSeconds", splits=2)

In [12]:
# Extract features from date columns
df = date_feature_extraction(df, "searchDate")
df = date_feature_extraction(df, "flightDate")

# Visualize old cols vs new cols
#
#df.select("searchDate","searchDateMonth","searchDateDay","searchDateisWeekend","searchDateDayOfWeek").filter(col("searchDateisWeekend") == 1).show(10)
#df.select("flightDate","flightDateMonth","flightDateDay","flightDateisWeekend","flightDateDayOfWeek").show(10)

In [13]:
# Columns to binarize: isBasicEconomy, isRefundable, isNonStop
df = boolean_binarizer(df, "isBasicEconomy")
df = boolean_binarizer(df, "isRefundable")
df = boolean_binarizer(df, "isNonStop")

# Visualize old cols vs new cols
#
#df.select("isBasicEconomy","isBasicEconomyBinarized").show()
#df.select("isRefundable","isRefundableBinarized").show()
#df.select("isNonStop","isNonStopBinarized").show()

In [14]:
# Check schema to make all features are correct datatypes
df.printSchema()

root
 |-- legId: string (nullable = true)
 |-- searchDate: date (nullable = true)
 |-- flightDate: date (nullable = true)
 |-- startingAirport: string (nullable = true)
 |-- destinationAirport: string (nullable = true)
 |-- fareBasisCode: string (nullable = true)
 |-- elapsedDays: integer (nullable = true)
 |-- isBasicEconomy: boolean (nullable = true)
 |-- isRefundable: boolean (nullable = true)
 |-- isNonStop: boolean (nullable = true)
 |-- baseFare: double (nullable = true)
 |-- totalFare: double (nullable = true)
 |-- seatsRemaining: integer (nullable = true)
 |-- totalTravelDistance: integer (nullable = true)
 |-- segmentsDepartureTimeEpochSeconds: string (nullable = true)
 |-- segmentsDepartureTimeRaw: string (nullable = true)
 |-- segmentsArrivalTimeEpochSeconds: string (nullable = true)
 |-- segmentsArrivalTimeRaw: string (nullable = true)
 |-- segmentsArrivalAirportCode: string (nullable = true)
 |-- segmentsDepartureAirportCode: string (nullable = true)
 |-- segmentsAirlineNa

# Create pipeline

In [15]:
# Define the string columns to be indexed and encoded
string_columns = [
    "startingAirport", "destinationAirport", "fareBasisCode",
    "segmentsArrivalAirportCode", "segmentsDepartureAirportCode",
    "segmentsAirlineName", "segmentsAirlineCode", 
    "segmentsEquipmentDescription", "segmentsDistance", "segmentsCabinCode"
]

# Dynamically generate input and output column names for indexer and encoder
indexer_output_columns = [f"{col}Index" for col in string_columns]
encoder_output_columns = [f"{col}Vector" for col in string_columns]

# Create the StringIndexer
indexer = StringIndexer(
    inputCols=string_columns, 
    outputCols=indexer_output_columns,
    handleInvalid="keep"
)

# Create OneHotEncoder
encoder = OneHotEncoder(
    inputCols=indexer_output_columns, 
    outputCols=encoder_output_columns, 
    dropLast=True,
    handleInvalid="keep"
)


# Define all numerical columns that will be put through their own assembler, then scaled. 
num_columns = [ "searchDateMonth", "searchDateDay", "flightDateMonth", "flightDateDay", 
            "elapsedDays", "seatsRemaining", "totalTravelDistance", 
            "travelDurationMinutes", "searchDateDayOfWeek", "flightDateDayOfWeek",
              ] + split_segments

numerical_assembler = VectorAssembler(inputCols=num_columns,outputCol="numVector")
scaler = StandardScaler(inputCol="numVector",outputCol="numScaled")


# Create an assembler
assembler = VectorAssembler(inputCols=["startingAirportVector", "destinationAirportVector", "fareBasisCodeVector",
                                  "segmentsArrivalAirportCodeVector","segmentsDepartureAirportCodeVector",
                                  "segmentsAirlineNameVector","segmentsAirlineCodeVector","segmentsEquipmentDescriptionVector",
                                  "segmentsDistanceVector","segmentsCabinCodeVector","numScaled","searchDateisWeekend","flightDateisWeekend",
                                      "isBasicEconomyBinarized","isRefundableBinarized","isNonStopBinarized"], 
                            outputCol="features")

# Create a Linear Regression Estimator
linear_reg = LinearRegression(labelCol="totalFare")

In [16]:
# Create the pipeline
flights_pipe = Pipeline(stages=[indexer, encoder, 
                                numerical_assembler,
                                scaler, assembler, 
                                linear_reg])

# Call .fit to transform the data
transformed_df = flights_pipe.fit(df).transform(df)

24/12/02 07:20:51 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
24/12/02 07:21:11 WARN DAGScheduler: Broadcasting large task binary with size 8.7 MiB
24/12/02 07:21:47 WARN DAGScheduler: Broadcasting large task binary with size 8.8 MiB
24/12/02 07:21:51 WARN DAGScheduler: Broadcasting large task binary with size 8.7 MiB
24/12/02 07:22:21 WARN DAGScheduler: Broadcasting large task binary with size 8.8 MiB
24/12/02 07:22:23 WARN DAGScheduler: Broadcasting large task binary with size 8.7 MiB
24/12/02 07:22:26 WARN DAGScheduler: Broadcasting large task binary with size 8.8 MiB
24/12/02 07:22:28 WARN DAGScheduler: Broadcasting large task binary with size 8.7 MiB
24/12/02 07:22:32 WARN DAGScheduler: Broadcasting large task binary with size 8.8 MiB
24/12/02 07:22:33 WARN DAGScheduler: Broadcasting large task binary with size 8.7 MiB
24/12/02 07:22:37 WARN DAGScheduler: 

24/12/02 07:29:59 WARN DAGScheduler: Broadcasting large task binary with size 8.7 MiB
24/12/02 07:30:03 WARN DAGScheduler: Broadcasting large task binary with size 8.8 MiB
24/12/02 07:30:04 WARN DAGScheduler: Broadcasting large task binary with size 8.7 MiB
24/12/02 07:30:08 WARN DAGScheduler: Broadcasting large task binary with size 8.8 MiB
24/12/02 07:30:09 WARN DAGScheduler: Broadcasting large task binary with size 8.7 MiB
24/12/02 07:30:13 WARN DAGScheduler: Broadcasting large task binary with size 8.8 MiB
24/12/02 07:30:14 WARN DAGScheduler: Broadcasting large task binary with size 8.7 MiB
24/12/02 07:30:18 WARN DAGScheduler: Broadcasting large task binary with size 8.8 MiB
24/12/02 07:30:19 WARN DAGScheduler: Broadcasting large task binary with size 8.7 MiB
24/12/02 07:30:23 WARN DAGScheduler: Broadcasting large task binary with size 8.8 MiB
24/12/02 07:30:24 WARN DAGScheduler: Broadcasting large task binary with size 8.7 MiB
24/12/02 07:30:27 WARN DAGScheduler: Broadcasting larg

In [17]:
# Review features
transformed_df.select("features").show(10, truncate=False)

24/12/02 07:31:25 WARN DAGScheduler: Broadcasting large task binary with size 2.2 MiB


+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|features                                                                                                                                                                                                                                                                                                                                                                                                                               |
+---------------------------------------------------------------------------------------------------------------------------------------------------

# Train Models

In [18]:
# Split the data into training and test sets
trainingData, testData = df.randomSplit([0.70, 0.3], seed=SEED)

In [19]:
# Create a regression evaluator (to get RMSE, R2, RME, etc.)
evaluator = RegressionEvaluator(labelCol="totalFare")

In [20]:
# Create a grid to hold hyperparameters 
grid = ParamGridBuilder()

In [21]:
# Build the parameter grid
grid = grid.build()

In [22]:
# Create the CrossValidator using the hyperparameter grid
cv = CrossValidator(estimator=flights_pipe, 
                    estimatorParamMaps=grid, 
                    evaluator=evaluator, 
                    numFolds=3)

In [23]:
# Train the models
all_models  = cv.fit(trainingData)

24/12/02 07:32:07 WARN DAGScheduler: Broadcasting large task binary with size 7.6 MiB
24/12/02 07:32:17 WARN DAGScheduler: Broadcasting large task binary with size 7.6 MiB
24/12/02 07:32:18 WARN DAGScheduler: Broadcasting large task binary with size 7.6 MiB
24/12/02 07:32:28 WARN DAGScheduler: Broadcasting large task binary with size 7.6 MiB
24/12/02 07:32:29 WARN DAGScheduler: Broadcasting large task binary with size 7.6 MiB
24/12/02 07:32:32 WARN DAGScheduler: Broadcasting large task binary with size 7.6 MiB
24/12/02 07:32:32 WARN DAGScheduler: Broadcasting large task binary with size 7.6 MiB
24/12/02 07:32:35 WARN DAGScheduler: Broadcasting large task binary with size 7.6 MiB
24/12/02 07:32:36 WARN DAGScheduler: Broadcasting large task binary with size 7.6 MiB
24/12/02 07:32:39 WARN DAGScheduler: Broadcasting large task binary with size 7.6 MiB
24/12/02 07:32:40 WARN DAGScheduler: Broadcasting large task binary with size 7.6 MiB
24/12/02 07:32:43 WARN DAGScheduler: Broadcasting larg

24/12/02 07:38:12 WARN DAGScheduler: Broadcasting large task binary with size 7.6 MiB
24/12/02 07:38:15 WARN DAGScheduler: Broadcasting large task binary with size 7.6 MiB
24/12/02 07:38:16 WARN DAGScheduler: Broadcasting large task binary with size 7.6 MiB
24/12/02 07:38:19 WARN DAGScheduler: Broadcasting large task binary with size 7.6 MiB
24/12/02 07:38:19 WARN DAGScheduler: Broadcasting large task binary with size 7.6 MiB
24/12/02 07:38:22 WARN DAGScheduler: Broadcasting large task binary with size 7.6 MiB
24/12/02 07:38:23 WARN DAGScheduler: Broadcasting large task binary with size 7.6 MiB
24/12/02 07:38:26 WARN DAGScheduler: Broadcasting large task binary with size 7.6 MiB
24/12/02 07:38:27 WARN DAGScheduler: Broadcasting large task binary with size 7.6 MiB
24/12/02 07:38:29 WARN DAGScheduler: Broadcasting large task binary with size 7.6 MiB
24/12/02 07:38:30 WARN DAGScheduler: Broadcasting large task binary with size 7.6 MiB
24/12/02 07:38:33 WARN DAGScheduler: Broadcasting larg

24/12/02 07:45:34 WARN DAGScheduler: Broadcasting large task binary with size 7.6 MiB
24/12/02 07:45:37 WARN DAGScheduler: Broadcasting large task binary with size 7.6 MiB
24/12/02 07:45:37 WARN DAGScheduler: Broadcasting large task binary with size 7.6 MiB
24/12/02 07:45:40 WARN DAGScheduler: Broadcasting large task binary with size 7.6 MiB
24/12/02 07:45:41 WARN DAGScheduler: Broadcasting large task binary with size 7.6 MiB
24/12/02 07:45:44 WARN DAGScheduler: Broadcasting large task binary with size 7.6 MiB
24/12/02 07:45:45 WARN DAGScheduler: Broadcasting large task binary with size 7.6 MiB
24/12/02 07:45:48 WARN DAGScheduler: Broadcasting large task binary with size 7.6 MiB
24/12/02 07:45:49 WARN DAGScheduler: Broadcasting large task binary with size 7.6 MiB
24/12/02 07:45:52 WARN DAGScheduler: Broadcasting large task binary with size 7.6 MiB
24/12/02 07:45:53 WARN DAGScheduler: Broadcasting large task binary with size 7.6 MiB
24/12/02 07:45:55 WARN DAGScheduler: Broadcasting larg

24/12/02 07:52:55 WARN DAGScheduler: Broadcasting large task binary with size 7.6 MiB
24/12/02 07:52:58 WARN DAGScheduler: Broadcasting large task binary with size 7.6 MiB
24/12/02 07:52:58 WARN DAGScheduler: Broadcasting large task binary with size 7.6 MiB
24/12/02 07:53:01 WARN DAGScheduler: Broadcasting large task binary with size 7.6 MiB
24/12/02 07:53:02 WARN DAGScheduler: Broadcasting large task binary with size 7.6 MiB
24/12/02 07:53:05 WARN DAGScheduler: Broadcasting large task binary with size 7.6 MiB
24/12/02 07:53:06 WARN DAGScheduler: Broadcasting large task binary with size 7.6 MiB
24/12/02 07:53:09 WARN DAGScheduler: Broadcasting large task binary with size 7.6 MiB
24/12/02 07:53:10 WARN DAGScheduler: Broadcasting large task binary with size 7.6 MiB
24/12/02 07:53:13 WARN DAGScheduler: Broadcasting large task binary with size 7.6 MiB
24/12/02 07:53:13 WARN DAGScheduler: Broadcasting large task binary with size 7.6 MiB
24/12/02 07:53:16 WARN DAGScheduler: Broadcasting larg

24/12/02 08:02:17 WARN DAGScheduler: Broadcasting large task binary with size 8.3 MiB
24/12/02 08:02:21 WARN DAGScheduler: Broadcasting large task binary with size 8.3 MiB
24/12/02 08:02:22 WARN DAGScheduler: Broadcasting large task binary with size 8.3 MiB
24/12/02 08:02:25 WARN DAGScheduler: Broadcasting large task binary with size 8.3 MiB
24/12/02 08:02:27 WARN DAGScheduler: Broadcasting large task binary with size 8.3 MiB
24/12/02 08:02:30 WARN DAGScheduler: Broadcasting large task binary with size 8.3 MiB
24/12/02 08:02:31 WARN DAGScheduler: Broadcasting large task binary with size 8.3 MiB
24/12/02 08:02:35 WARN DAGScheduler: Broadcasting large task binary with size 8.3 MiB
24/12/02 08:02:36 WARN DAGScheduler: Broadcasting large task binary with size 8.3 MiB
24/12/02 08:02:40 WARN DAGScheduler: Broadcasting large task binary with size 8.3 MiB
24/12/02 08:02:41 WARN DAGScheduler: Broadcasting large task binary with size 8.3 MiB
24/12/02 08:02:44 WARN DAGScheduler: Broadcasting larg

# Metrics

In [24]:
# Show the average performance over the six folds
print(f"Average metric {all_models.avgMetrics}")

Average metric [55.308132794724465]


# Get best model

In [25]:
# Get the best model from all of the models trained
bestModel = all_models.bestModel

# Use the model 'bestModel' to predict the test set
test_results = bestModel.transform(testData)

# Show the predicted totalFare
test_results.select('totalFare', 'prediction').orderBy(rand(seed=SEED)).limit(20).show(truncate=False)

# Calculate RMSE and R2
rmse = evaluator.evaluate(test_results, {evaluator.metricName:'rmse'})
r2 =evaluator.evaluate(test_results,{evaluator.metricName:'r2'})
print(f"RMSE: {rmse}  R-squared:{r2}")

24/12/02 08:06:54 WARN DAGScheduler: Broadcasting large task binary with size 8.6 MiB
                                                                                

+---------+------------------+
|totalFare|prediction        |
+---------+------------------+
|457.6    |335.6070943229785 |
|331.6    |236.161544762181  |
|181.6    |165.59969797758777|
|463.7    |504.63027023209065|
|228.6    |213.32221487097178|
|241.6    |233.97993287584342|
|281.6    |445.0608382548361 |
|128.6    |129.632597412518  |
|212.6    |227.7614131802248 |
|588.6    |478.7053221167554 |
|317.6    |340.5538972065891 |
|571.6    |600.2815471526486 |
|178.6    |129.72774665694737|
|198.6    |230.55948983520022|
|378.6    |393.5438723331017 |
|208.6    |230.3580659377818 |
|497.6    |445.7897225970332 |
|767.6    |781.7053092850845 |
|597.2    |433.913655429527  |
|317.6    |310.4454567379491 |
+---------+------------------+



24/12/02 08:07:27 WARN DAGScheduler: Broadcasting large task binary with size 8.6 MiB
24/12/02 08:07:59 WARN DAGScheduler: Broadcasting large task binary with size 8.6 MiB
24/12/02 08:08:00 WARN DAGScheduler: Broadcasting large task binary with size 8.6 MiB
24/12/02 08:08:29 WARN DAGScheduler: Broadcasting large task binary with size 8.6 MiB


RMSE: 55.019262513041525  R-squared:0.8873606113076086




# Save model and data

In [26]:
# Save best model
url_model = "gs://my-bigdata-project-mp/models/flight_prices_linear_regression_model" # save model here
bestModel.write().overwrite().save(url_model)

# Save transformed data
url_trusted = "gs://my-bigdata-project-mp/trusted" # save data with features here
transformed_df.write.parquet(path=url_trusted, mode="overwrite")

24/12/02 08:08:33 WARN TaskSetManager: Stage 1119 contains a task of very large size (1181 KiB). The maximum recommended task size is 1000 KiB.
24/12/02 08:08:47 WARN DAGScheduler: Broadcasting large task binary with size 15.2 MiB
                                                                                