In [0]:
# Columns needed downstream: the two route endpoints, the departure date
# (used to derive the weekday) and the seat count that gets averaged.
required_columns = (
    "trip_origin_city",
    "trip_destination_city",
    "flight_leg_departure_date",
    "flight_leg_total_seats",
)

# Read the source table and project it down to just those columns.
source_table = spark.table("mc.amadeus2.data_jan26")
df = source_table.select(*required_columns)

# Preview the first 10 rows of the projection.
display(df.limit(10))

In [0]:
from pyspark.sql.functions import concat, lit, dayofweek

# Route label of the form "<origin>_to_<destination>".
# NOTE(review): concat presumably yields NULL when either city is NULL --
# confirm whether such rows should be filtered out before grouping.
route_expr = concat("trip_origin_city", lit("_to_"), "trip_destination_city")

# Weekday of the departure date (1=Sunday, 7=Saturday), used as a grouping key.
weekday_expr = dayofweek("flight_leg_departure_date")

# Attach both derived columns in a single chained expression.
df_with_route = (
    df
    .withColumn("route", route_expr)
    .withColumn("day_of_week", weekday_expr)
)

# Preview the first 10 rows with the new columns.
display(df_with_route.limit(10))

In [0]:
from pyspark.sql.functions import avg, round

# Mean seat count, rounded to 0 decimal places and exposed as "avg_seats".
# NOTE: `round` here is pyspark.sql.functions.round (imported at the top of
# this cell); that import shadows Python's builtin round() from here on.
mean_seats_expr = round(avg("flight_leg_total_seats"), 0).alias("avg_seats")

# One output row per (route, day_of_week) pair.
seats_by_route_day = df_with_route.groupBy("route", "day_of_week")
df_aggregated = seats_by_route_day.agg(mean_seats_expr)

# Preview the first 10 aggregated rows.
display(df_aggregated.limit(10))

In [0]:
from sklearn.ensemble import IsolationForest
import pandas as pd

# Collect the aggregated Spark result onto the driver as a pandas DataFrame.
df_pandas = df_aggregated.toPandas()

# A route/day group whose seat values are all NULL has a NULL average, which
# arrives here as a missing value. Passing that to the estimator either fails
# with an opaque validation error or silently trains on missing data,
# depending on the scikit-learn version, so drop those rows up front and
# report how many were removed. When nothing is missing this is a no-op.
rows_before = len(df_pandas)
df_pandas = df_pandas.dropna(subset=["avg_seats"]).reset_index(drop=True)
rows_dropped = rows_before - len(df_pandas)
if rows_dropped:
    print(f"Dropped {rows_dropped} route-day combinations with missing avg_seats")

# Fail early with an actionable message instead of a generic estimator error.
if df_pandas.empty:
    raise ValueError("No route-day combinations with a valid avg_seats value to train on")

# Initialize Isolation Forest with a fixed seed so repeated runs agree.
iso_forest = IsolationForest(random_state=42)

# Fit on avg_seats only; double brackets keep X two-dimensional
# (n_samples x 1), which is the shape sklearn estimators expect.
X = df_pandas[['avg_seats']]
iso_forest.fit(X)

print(f"Model trained on {len(df_pandas)} route-day combinations")
print(f"avg_seats range: {df_pandas['avg_seats'].min()} - {df_pandas['avg_seats'].max()}")
display(df_pandas.head(10))

In [0]:
# Label every route-day row with the fitted model: -1 = anomaly, 1 = normal.
df_pandas['anomaly'] = iso_forest.predict(X)

# Tally each label once; the percentages below reuse these figures.
total_records = len(df_pandas)
anomaly_mask = df_pandas['anomaly'] == -1
anomalies_count = anomaly_mask.sum()
normal_count = (df_pandas['anomaly'] == 1).sum()
anomalies_pct = anomalies_count / total_records * 100
normal_pct = normal_count / total_records * 100

print(f"Total records: {total_records}")
print(f"Anomalies detected: {anomalies_count} ({anomalies_pct:.2f}%)")
print(f"Normal records: {normal_count} ({normal_pct:.2f}%)")

# Show up to 10 of the rows flagged as anomalous.
print("\nSample of detected anomalies:")
display(df_pandas[anomaly_mask].head(10))

In [0]:
import mlflow
import mlflow.sklearn

from mlflow.models import infer_signature

# Record the model's input/output schema alongside the artifact so that
# consumers (and model registries that require a signature) know what the
# model expects and returns.
model_signature = infer_signature(X, iso_forest.predict(X))

# Parameters describing how the model was built. random_state is read from
# the estimator itself so the logged value cannot drift from the one that
# was actually used for training.
run_params = {
    "model_type": "IsolationForest",
    "random_state": iso_forest.random_state,
    "feature": "avg_seats",
    "total_records": len(df_pandas),
}

# Summary statistics of the scoring step and of the training feature.
run_metrics = {
    "anomalies_count": anomalies_count,
    "anomalies_percentage": anomalies_count / len(df_pandas) * 100,
    "normal_count": normal_count,
    "avg_seats_min": df_pandas['avg_seats'].min(),
    "avg_seats_max": df_pandas['avg_seats'].max(),
}

# Start MLflow run; the context manager ends the run even if logging fails.
with mlflow.start_run(run_name="flight_seat_anomaly_detection") as run:

    # Log parameters
    for param_name, param_value in run_params.items():
        mlflow.log_param(param_name, param_value)

    # Log metrics
    for metric_name, metric_value in run_metrics.items():
        mlflow.log_metric(metric_name, metric_value)

    # Log the fitted model together with its inferred signature
    mlflow.sklearn.log_model(
        iso_forest,
        "isolation_forest_model",
        signature=model_signature,
    )

    print(f"MLflow Run ID: {run.info.run_id}")
    print("Model logged successfully!")
    print(f"Experiment ID: {run.info.experiment_id}")
    print(f"Artifact URI: {run.info.artifact_uri}")