In [None]:
# Display the active SparkSession object.
# NOTE(review): `spark` is not created in any visible cell — presumably the
# notebook kernel/cluster environment provides it; confirm before running elsewhere.
spark

In [1]:
import io
import mimetypes

import numpy as np
import pandas as pd
import seaborn as sns
from google.cloud import storage
from matplotlib import pyplot as plt
from pyspark.ml import PipelineModel
from pyspark.sql.types import DecimalType, NumericType

# Seed passed to every `.sample(...)` call below so the sampled plots are reproducible
SEED = 645
# Name of the Google Cloud Storage bucket that `save_fig` uploads figures to
bucket_name = "my-bigdata-project-mp"

# Get data with features vector and model

In [None]:
# Path for data used for model (the "trusted" folder of the project bucket).
# Built from `bucket_name` so the bucket is configured in exactly one place.
data_path = f"gs://{bucket_name}/trusted"

# Load data into a PySpark DataFrame
sdf = spark.read.parquet(data_path)

In [None]:
# Checking schema: print the column names and types of the loaded DataFrame
sdf.printSchema()

In [None]:
# Number of rows in the loaded dataset (the plots below use 1% samples of it)
sdf.count()

In [None]:
# Path for the linear regression model.
# Built from `bucket_name` so the bucket is configured in exactly one place.
model_path = f"gs://{bucket_name}/models/flight_prices_linear_regression_model"

# Load PipelineModel into a variable
pipeline = PipelineModel.load(model_path)

# Extract the model: the fitted estimator is taken to be the final pipeline stage
lr_model = pipeline.stages[-1]

# Fail fast with a clear message if the last stage is not a fitted linear model;
# the feature-coefficient cells further down read `lr_model.coefficients`.
assert hasattr(lr_model, "coefficients"), (
    f"Last pipeline stage is {type(lr_model).__name__}, which exposes no coefficients"
)

In [None]:
def save_fig(plt, img_name, img_type="png"):
    """Upload the current matplotlib figure to the project's Cloud Storage bucket.

    The figure is rendered into an in-memory buffer (nothing is written to local
    disk) and uploaded as ``<img_name>.<img_type>`` to the bucket named by the
    module-level ``bucket_name``.

    Args:
        plt: The matplotlib pyplot module (or any object with a compatible
            ``savefig``) that holds the figure to save.
        img_name: Name to give the image, without the file extension.
        img_type: Image format, also used as the file extension. Defaults to "png".
    """
    print("Saving figure...")
    blob_name = img_name + "." + img_type
    # Without an explicit content type the upload is stored as generic binary data,
    # so the image would download instead of rendering when opened from the bucket.
    content_type = mimetypes.guess_type(blob_name)[0] or "application/octet-stream"

    # Memory buffer holding the rendered figure; closed automatically after the upload
    with io.BytesIO() as img_data:
        # Write the figure to the buffer
        plt.savefig(img_data, format=img_type, bbox_inches='tight')
        # Rewind the pointer to the start of the data so the upload reads all of it
        img_data.seek(0)
        # Connect to Google Cloud Storage and point to the bucket
        bucket = storage.Client().get_bucket(bucket_name)
        # Create a blob with the target file name and upload the buffer contents to it
        bucket.blob(blob_name).upload_from_file(img_data, content_type=content_type)
    print("Picture successfully uploaded!")

 # Predicted vs Actual
 - Scatter plot of predicted vs actual
 - Shows how accurate the model is (points closer to the reference line mean better predictions)

In [None]:
# Scatter plot of predicted vs. actual

# Define what name the image file for this picture will have and the type of image it will be saved as
img_name = "actual_vs_predicted"
img_type = "png"

# A 1% sample with a fixed seed keeps the pandas conversion small and reproducible
df = sdf.select("prediction","totalFare").sample(False, 0.01, seed=SEED).toPandas()

# End points of the y = x reference line, spanning the observed actual fares
fare_range = [df['totalFare'].min(), df['totalFare'].max()]

# Tick marks from 0 to 1000 with step of 100, shared by BOTH axes: on this square
# figure the ideal-fit line is only the visual diagonal when x and y cover the same range.
axis_ticks = np.arange(0, 1001, 100)

plt.figure(figsize=(8, 8))
sns.scatterplot(x=df['totalFare'], y=df['prediction'], alpha=0.2)
# Add a reference line for ideal fit (prediction == actual)
plt.plot(fare_range, fare_range, color='red', linestyle='--', label='Ideal Fit')
plt.title('Predicted vs Actual')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.xticks(axis_ticks)
plt.yticks(axis_ticks)
plt.legend()
plt.grid()

# Upload before plt.show(): the saved image is taken from the figure that is current here
save_fig(plt,img_name,img_type)
plt.show()

 # Histogram of Residuals
 - Normality: If the residuals are normally distributed (bell-shaped curve), this supports the normality assumption of linear regression. If the residuals are skewed or have outliers, this suggests violations of the normality assumption.

In [None]:

# File name and image format used when the figure is uploaded
img_name = "histogram_of_residuals"
img_type = "png"

# Pull a reproducible 1% sample of predicted and actual fares into pandas
result_df = (
    sdf.select("prediction", "totalFare")
    .sample(False, 0.01, seed=SEED)
    .toPandas()
)

# Residual = actual value minus predicted value
result_df['residual'] = result_df['totalFare'].sub(result_df['prediction'])

# Histogram of the residuals with a kernel-density curve overlaid
plt.figure(figsize=(10, 6))
sns.histplot(result_df['residual'], bins=30, kde=True, color='blue')

plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.title('Residuals Histogram')
plt.grid(True)

# Upload the figure, then render it in the notebook
save_fig(plt, img_name, img_type)
plt.show()

# Drop the sampled frame; it is not needed after this cell
del result_df

 # Correlation Matrix
 - With many new features created, most of them numerical, we can check correlations across more columns

In [None]:
# Correlation Matrix

# 1st, get all numerical columns to do the correlation matrix

# Step 1: Read the numeric columns straight from the Spark schema.
# Inferring dtypes from one collected row is fragile (a null in that row can make a
# numeric column look like `object` in pandas and drop it silently) and costs a Spark job.
# Decimal columns are skipped because pandas receives them as `object`, not as a number dtype.
numeric_columns = [
    field.name
    for field in sdf.schema.fields
    if isinstance(field.dataType, NumericType)
    and not isinstance(field.dataType, DecimalType)
]
# Step 2: Re-select the numerical columns from the original PySpark DataFrame,
# leaving out the two columns that are excluded from the matrix
sdf_numeric = sdf.select(*numeric_columns).drop("isRefundableBinarized","baseFare")
# Step 3: Convert a reproducible 1% sample of the selected columns to a Pandas DataFrame
df = sdf_numeric.sample(False, 0.01, seed=SEED).toPandas()

In [None]:
# 2nd, Compute correlation matrix

# File name and image format used when the figure is uploaded
img_name = "correlation_matrix_post_pipeline"
img_type = "png"

# Pairwise correlations between the sampled numeric columns
corr_matrix = df.corr()

# Boolean mask that is True on the diagonal and everywhere above it
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool))

# Keep only the cells below the diagonal; every masked cell becomes NaN
masked_corr_matrix = corr_matrix.where(~mask)

# Draw the lower-triangle heatmap
plt.figure(figsize=(10, 6))
sns.heatmap(
    masked_corr_matrix,
    cmap='coolwarm',
    annot=False,
    fmt=".2f",
    linewidths=1,
)
plt.title('Correlation Matrix')

# Upload the figure, then render it in the notebook
save_fig(plt, img_name, img_type)
plt.show()

 # Feature Coefficients
 - Plots the coefficients of each feature
 - Note: If anything related to the ordering of features changes in model creation,
   then the features here must also be updated to reflect the same order. Otherwise, the coefficients will not correspond to the correct features.

In [None]:
# Get the coefficients: the model holds one value per slot of the feature vector
coefficients = lr_model.coefficients
coefficient_values = [float(value) for value in coefficients.toArray()]

# Get the feature names (order must match the feature vector in assembler).
# These are the assembler's INPUT columns; a vector column may occupy many slots.
feature_columns = [
    "startingAirportVector", "destinationAirportVector", "fareBasisCodeVector",
    "segmentsArrivalAirportCodeVector", "segmentsDepartureAirportCodeVector",
    "segmentsAirlineNameVector", "segmentsAirlineCodeVector", "segmentsEquipmentDescriptionVector",
    "segmentsDistanceVector", "segmentsCabinCodeVector", "numScaled",
    "searchDateisWeekend", "flightDateisWeekend",
    "isBasicEconomyBinarized", "isRefundableBinarized", "isNonStopBinarized"
]


def _slot_names_from_metadata(frame, features_col):
    """Map feature-vector slot index -> slot name from the column's ML attribute metadata.

    Returns an empty dict when the column is absent or carries no such metadata.
    NOTE(review): assumes the features column of `frame` kept the metadata written
    at assembly time — confirm for this dataset.
    """
    if features_col not in frame.columns:
        return {}
    ml_attr = frame.schema[features_col].metadata.get("ml_attr", {})
    return {
        attr["idx"]: attr["name"]
        for attr_group in ml_attr.get("attrs", {}).values()
        for attr in attr_group
        if "idx" in attr and "name" in attr
    }


if len(coefficient_values) == len(feature_columns):
    # One coefficient per listed column: the names line up directly
    feature_names = feature_columns
else:
    # zip() would silently truncate to the shorter list and attach the first few
    # coefficients to unrelated names, so label every slot individually instead.
    slot_names = _slot_names_from_metadata(sdf, lr_model.getFeaturesCol())
    feature_names = [
        slot_names.get(slot, f"feature_{slot}")
        for slot in range(len(coefficient_values))
    ]
    print(
        f"Note: the model has {len(coefficient_values)} coefficients for "
        f"{len(feature_columns)} input columns; labelling coefficients per feature-vector slot."
    )

# Combine coefficients with feature names
feature_coefficients = list(zip(feature_names, coefficient_values))

# Optionally, use pandas to display the coefficients for easier interpretation
feature_coefficients_df = pd.DataFrame(feature_coefficients, columns=["Feature", "Coefficient"])

# Display the DataFrame
print(feature_coefficients_df)


In [None]:
# Define what name the image file for this picture will have and the type of image it will be saved as
img_name = "feature_coefficients"
img_type = "png"

# Largest number of bars that stays legible at this figure size
MAX_BARS = 20

# Plot every coefficient when they fit; otherwise keep only the largest by magnitude
plot_df = feature_coefficients_df
plot_title = 'Feature Coefficients from Linear Regression'
if len(plot_df) > MAX_BARS:
    strongest = plot_df['Coefficient'].abs().nlargest(MAX_BARS).index
    plot_df = plot_df.loc[strongest]
    plot_title = f'Top {MAX_BARS} Feature Coefficients (by magnitude) from Linear Regression'

# Plot the coefficients
plt.figure(figsize=(10, 6))
sns.barplot(x='Coefficient', y='Feature', data=plot_df)
plt.title(plot_title)
plt.xlabel('Coefficient Value')
plt.ylabel('Feature')

# Upload the figure, then render it in the notebook
save_fig(plt,img_name,img_type)
plt.show()

# Get model Hyperparameters
- These are the parameters of the best model produced by the ModelCreation script

In [None]:
# List every hyperparameter of the linear regression model as "name: value"
print("Best Model Parameters:")
param_map = lr_model.extractParamMap()
for hyperparam in param_map:
    print(f"{hyperparam.name}: {param_map[hyperparam]}")