# 04 Baseline Model

In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import joblib
import boto3
import sagemaker
from pyathena import connect


# Query Values from Development Database

In [None]:
# Load the development dataset into `df`.
# A local CSV acts as a cache: when it exists it is read directly; otherwise
# the table is queried from Athena and the result is written to the CSV so
# later runs can skip the query.
csv_path = "development_data.csv"

if os.path.exists(csv_path):
    print("Loading data from local CSV...")
    df = pd.read_csv(csv_path)
else:
    print("CSV not found! Querying from Athena...")

    # Athena source table
    database_name = "db_airline_delay_cause"
    table_name = "development_data"

    # The session object was previously referenced without ever being
    # created (NameError); build it from the imported sagemaker module.
    sagemaker_session = sagemaker.Session()
    bucket = sagemaker_session.default_bucket()
    s3_staging_dir = f"s3://{bucket}/athena-results/"

    # Athena connection; closed in `finally` so it is released even when
    # the query fails.
    conn = connect(s3_staging_dir=s3_staging_dir, region_name="us-east-1")
    try:
        # Query development dataset
        query = f"SELECT * FROM {database_name}.{table_name};"
        df = pd.read_sql(query, conn)
    finally:
        conn.close()

    # Save locally for future use
    df.to_csv(csv_path, index=False)
    print(f"Data saved locally as {csv_path}")

# Input columns for the baseline model, kept in the order they are passed
# to the estimator.
# NOTE(review): "delay_rate" may be derived from the same quantity as the
# target "on_time" - confirm this does not leak the target into the features.
features = [
    # arrival totals and per-cause counts
    "arr_flights",
    "arr_del15",
    "carrier_ct",
    "weather_ct",
    "nas_ct",
    "security_ct",
    "late_aircraft_ct",
    # cancellations / diversions
    "arr_cancelled",
    "arr_diverted",
    # delay columns
    "arr_delay",
    "carrier_delay",
    "weather_delay",
    "nas_delay",
    "security_delay",
    "late_aircraft_delay",
    # rate column
    "delay_rate",
]

# Column the model is trained to predict
target = "on_time"

# Discard every row that has a missing value in any column of the frame
df = df.dropna(axis=0, how="any")



# Create Linear Regression Baseline Model

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df.loc[:, features],
    df.loc[:, target],
    test_size=0.2,
    random_state=42,
)

# Fit an ordinary least-squares baseline on the training split
model = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out rows (evaluated in the next cell)
y_pred = model.predict(X_test)



# Evaluate Model Performance

In [None]:
# Score the held-out predictions: mean squared error and the coefficient
# of determination.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.4f}")
# The label previously contained mojibake ("RÂ²"); the superscript two is
# written as an escape so it survives re-encoding of this file.
print(f"R\u00b2 Score: {r2:.4f}")




# Visualization

In [None]:
# Visualization: actual vs. predicted target, one scatter panel per split.

# pyplot is not among the notebook's top-level imports, so it is imported
# here to keep this cell runnable.
import matplotlib.pyplot as plt

# The scatter plots need predictions for both splits; neither was computed
# in an earlier cell, so derive them from the fitted model here.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# (actual values, predicted values, marker color, panel title)
panels = [
    (y_train, y_train_pred, "blue", "Training Set Predictions"),
    (y_test, y_test_pred, "red", "Test Set Predictions"),
]

plt.figure(figsize=(12, 5))
for position, (actual, predicted, color, title) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.scatter(actual, predicted, alpha=0.5, color=color)
    plt.xlabel("Actual On-Time")
    plt.ylabel("Predicted On-Time")
    plt.title(title)
    plt.grid()

plt.tight_layout()
plt.show()


# Store Model

In [None]:
# Serialize the fitted baseline model to a pickle file in the working
# directory, then report where it was written.
joblib.dump(value=model, filename="baseline_model.pkl")
print("Model saved as baseline_model.pkl")