# Build and evaluate ensemble model

In [23]:
import os, io, boto3, zipfile, requests
import pandas as pd
from sklearn.model_selection import train_test_split
from sagemaker import get_execution_role, Session, image_uris
import sagemaker

In [24]:
bucket='c182567a4701745l12374482t1w262869721852-labbucket-jw0boyepjqpu'

In [25]:
df = pd.read_csv('combined_csv_v1_20percent.csv')

# label column is named 'target' and is first column 
if 'target' not in df.columns:
    raise ValueError("Expected a 'target' column in the dataset.")
cols = df.columns.tolist()
cols = ['target'] + [c for c in cols if c != 'target']
df = df[cols]

# Optional: ensure target is 0/1 ints
df['target'] = pd.to_numeric(df['target'], errors='coerce').fillna(0).astype(int)

# Train/Val/Test split (70/15/15)
train, temp = train_test_split(
    df, test_size=0.30, random_state=42, stratify=df['target']
)
validate, test = train_test_split(
    temp, test_size=0.50, random_state=42, stratify=temp['target']
)

print(f"Train: {len(train)}  Validate: {len(validate)}  Test: {len(test)}")

Train: 228982  Validate: 49068  Test: 49068


In [14]:
prefix = 'lab3'                  # change if you like

s3_resource = boto3.Session().resource('s3')

def upload_s3_csv(filename, folder, dataframe):
    csv_buffer = io.StringIO()
    # header=False, index=False like your format
    dataframe.to_csv(csv_buffer, header=False, index=False)
    s3_resource.Bucket(bucket).Object(os.path.join(prefix, folder, filename)).put(Body=csv_buffer.getvalue())

# Filenames (mirroring your naming convention)
train_file    = 'flight_delay_train.csv'
validate_file = 'flight_delay_validate.csv'
test_file     = 'flight_delay_test.csv'

# Upload splits
upload_s3_csv(train_file, 'train', train)
upload_s3_csv(validate_file, 'validate', validate)
upload_s3_csv(test_file, 'test', test)

print(f"S3 paths:\n  s3://{bucket}/{prefix}/train/{train_file}\n  s3://{bucket}/{prefix}/validate/{validate_file}\n  s3://{bucket}/{prefix}/test/{test_file}")

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


S3 paths:
  s3://c182567a4701745l12374482t1w262869721852-labbucket-jw0boyepjqpu/lab3/train/flight_delay_train.csv
  s3://c182567a4701745l12374482t1w262869721852-labbucket-jw0boyepjqpu/lab3/validate/flight_delay_validate.csv
  s3://c182567a4701745l12374482t1w262869721852-labbucket-jw0boyepjqpu/lab3/test/flight_delay_test.csv


In [26]:
region = boto3.Session().region_name
region

'us-east-1'

In [27]:
container = image_uris.retrieve('xgboost',boto3.Session().region_name,'1.0-1')

hyperparams={"num_round":"42",
             "eval_metric": "auc",
             "objective": "binary:logistic"}

s3_output_location="s3://{}/{}/output/".format(bucket,prefix)
xgb_model=sagemaker.estimator.Estimator(container,
                                       sagemaker.get_execution_role(),
                                       instance_count=1,
                                       instance_type='ml.m4.xlarge',
                                       output_path=s3_output_location,
                                        hyperparameters=hyperparams,
                                        sagemaker_session=sagemaker.Session())

train_channel = sagemaker.inputs.TrainingInput(
    "s3://{}/{}/train/".format(bucket,prefix,train_file),
    content_type='text/csv')

validate_channel = sagemaker.inputs.TrainingInput(
    "s3://{}/{}/validate/".format(bucket,prefix,validate_file),
    content_type='text/csv')

data_channels = {'train': train_channel, 'validation': validate_channel}

xgb_model.fit(inputs=data_channels, logs=False)

print('ready for hosting!')

INFO:sagemaker.image_uris:Defaulting to only available Python version: py3
INFO:sagemaker.image_uris:Defaulting to only supported image scope: cpu.
INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2025-10-30-12-53-43-736



2025-10-30 12:53:45 Starting - Starting the training job.
2025-10-30 12:53:59 Starting - Preparing the instances for training....
2025-10-30 12:54:23 Downloading - Downloading input data.....
2025-10-30 12:54:53 Downloading - Downloading the training image..........
2025-10-30 12:55:49 Training - Training image download completed. Training in progress....
2025-10-30 12:56:10 Uploading - Uploading generated training model...
2025-10-30 12:56:28 Completed - Training job completed
ready for hosting!


In [28]:
xgb_predictor = xgb_model.deploy(initial_instance_count=1,
                serializer = sagemaker.serializers.CSVSerializer(),
                instance_type='ml.m4.xlarge')

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2025-10-30-12-57-34-652
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2025-10-30-12-57-34-652
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2025-10-30-12-57-34-652


-------!

In [29]:
non_numeric_cols = test.select_dtypes(exclude=['number']).columns
print("Non-numeric columns:", list(non_numeric_cols))

Non-numeric columns: ['Quarter_2', 'Quarter_3', 'Quarter_4', 'Month_2', 'Month_3', 'Month_4', 'Month_5', 'Month_6', 'Month_7', 'Month_8', 'Month_9', 'Month_10', 'Month_11', 'Month_12', 'DayofMonth_2', 'DayofMonth_3', 'DayofMonth_4', 'DayofMonth_5', 'DayofMonth_6', 'DayofMonth_7', 'DayofMonth_8', 'DayofMonth_9', 'DayofMonth_10', 'DayofMonth_11', 'DayofMonth_12', 'DayofMonth_13', 'DayofMonth_14', 'DayofMonth_15', 'DayofMonth_16', 'DayofMonth_17', 'DayofMonth_18', 'DayofMonth_19', 'DayofMonth_20', 'DayofMonth_21', 'DayofMonth_22', 'DayofMonth_23', 'DayofMonth_24', 'DayofMonth_25', 'DayofMonth_26', 'DayofMonth_27', 'DayofMonth_28', 'DayofMonth_29', 'DayofMonth_30', 'DayofMonth_31', 'DayOfWeek_2', 'DayOfWeek_3', 'DayOfWeek_4', 'DayOfWeek_5', 'DayOfWeek_6', 'DayOfWeek_7', 'Reporting_Airline_DL', 'Reporting_Airline_OO', 'Reporting_Airline_UA', 'Reporting_Airline_WN', 'Origin_CLT', 'Origin_DEN', 'Origin_DFW', 'Origin_IAH', 'Origin_LAX', 'Origin_ORD', 'Origin_PHX', 'Origin_SFO', 'Dest_CLT', 'De

In [30]:
# Convert boolean True/False to 1/0
for col in test.select_dtypes(include=['bool']).columns:
    test[col] = test[col].astype(int)

# Convert 'True'/'False' strings to numeric 1/0
for col in test.columns:
    if test[col].dtype == object:
        if set(test[col].unique()) <= {'True', 'False'}:
            test[col] = test[col].map({'True': 1, 'False': 0})

In [31]:
non_numeric_cols = test.select_dtypes(exclude=['number']).columns
print("Non-numeric columns:", list(non_numeric_cols))

Non-numeric columns: []


In [32]:
test.head()

Unnamed: 0,target,Distance,Quarter_2,Quarter_3,Quarter_4,Month_2,Month_3,Month_4,Month_5,Month_6,...,DepHourofDay_14,DepHourofDay_15,DepHourofDay_16,DepHourofDay_17,DepHourofDay_18,DepHourofDay_19,DepHourofDay_20,DepHourofDay_21,DepHourofDay_22,DepHourofDay_23
234564,0,602.0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
32337,0,802.0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
10046,1,370.0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
228703,0,862.0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
101846,0,1199.0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0


In [33]:
import re
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import BytesDeserializer
import numpy as np

# Prepare test data
X_test = test.drop('target', axis=1).astype('float32')
y_test = test['target'].astype(int).to_numpy()

# Configure predictor
xgb_predictor.serializer   = CSVSerializer()
xgb_predictor.deserializer = BytesDeserializer()  # return raw bytes
xgb_predictor.content_type = 'text/csv'
xgb_predictor.accept       = 'text/csv'

def parse_scores(resp_bytes, expect_n=None):
    """Parse CSV text that may be one line with commas or many lines."""
    text = resp_bytes.decode('utf-8', errors='ignore').strip()
    # Split on commas, whitespace, or newlines; filter empties
    tokens = [t for t in re.split(r'[,\r\n\s]+', text) if t]
    scores = np.array([float(t) for t in tokens], dtype=float)
    if expect_n is not None and scores.size != expect_n:
        print(f"Parsed {scores.size} scores, expected {expect_n}. First 120 chars:\n{text[:120]}")
    return scores

pred_scores = []
batch_size = 200

for start in range(0, len(X_test), batch_size):
    batch = X_test.iloc[start:start + batch_size]
    payload = batch.to_csv(header=False, index=False).strip()
    resp = xgb_predictor.predict(payload)           # bytes
    scores = parse_scores(resp, expect_n=len(batch))
    pred_scores.extend(scores.tolist())

pred_scores = np.array(pred_scores, dtype=float)
pred_labels = (pred_scores >= 0.5).astype(int)

In [34]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report

print("Accuracy :", accuracy_score(y_test, pred_labels))
print("Precision:", precision_score(y_test, pred_labels, zero_division=0))
print("Recall   :", recall_score(y_test, pred_labels, zero_division=0))
print("F1-score :", f1_score(y_test, pred_labels, zero_division=0))
try:
    print("ROC AUC  :", roc_auc_score(y_test, pred_scores))
except Exception as e:
    print("ROC AUC  : n/a (", e, ")")
print("\nConfusion Matrix:\n", confusion_matrix(y_test, pred_labels))
print("\nClassification Report:\n", classification_report(y_test, pred_labels, zero_division=0))

Accuracy : 0.7915749572022499
Precision: 0.0
Recall   : 0.0
F1-score : 0.0
ROC AUC  : 0.5584638278292663

Confusion Matrix:
 [[38841     0]
 [10227     0]]

Classification Report:
               precision    recall  f1-score   support

           0       0.79      1.00      0.88     38841
           1       0.00      0.00      0.00     10227

    accuracy                           0.79     49068
   macro avg       0.40      0.50      0.44     49068
weighted avg       0.63      0.79      0.70     49068



### Comparison between Linear and XGBoost on Combined Data:

When comparing the Linear Learner and XGBoost models trained on the combined dataset v1 (70/15/15 split), the results clearly show that increasing the model complexity from a simple linear approach to an ensemble method did not produce substantial improvement in predictive performance. Both models achieved an overall accuracy of around 79%, but this metric is deceptive in the context of a highly imbalanced dataset—where the number of non-delayed flights vastly exceeds the number of delayed ones. Accuracy alone does not reflect true predictive capability for minority classes, and in this case, both models predominantly classified almost every flight as “on time,” ignoring the delayed cases. This issue is evident in the confusion matrices, where the majority of predictions fell within the true negative category, and very few (if any) positive predictions were made.

The Linear Learner, which serves as a baseline linear classification model, performed marginally better than XGBoost in identifying at least a handful of delayed flights. It achieved a precision of 0.46, indicating that some of its positive predictions were correct, but its recall was only 0.0012, meaning it captured less than 1% of actual delayed cases. This resulted in a very low F1-score (0.0023), showing a poor balance between precision and recall. The confusion matrix confirms that while the model correctly classified most non-delayed flights (38,827 out of 38,841), it failed to detect almost all delayed ones (only 12 out of 10,227). The model’s inability to generalize delay patterns suggests that the relationships between the input features and delay events are either weakly linear or overshadowed by the dominance of the majority class.

The XGBoost model, a more sophisticated ensemble learner that typically excels in handling non-linear relationships and complex feature interactions, surprisingly performed no better. It yielded zero precision, recall, and F1-score for the delayed class, with a ROC AUC of 0.558, only marginally above random guessing. The confusion matrix reveals that XGBoost classified all flights as non-delayed, completely neglecting the minority class. This outcome suggests that the model overfitted to the dominant class during training or that its hyperparameters (such as scale_pos_weight, learning rate, or number of estimators) were not tuned to handle the severe class imbalance. Essentially, XGBoost failed to leverage its usual strengths in this setting because the imbalance in class distribution overwhelmed the model’s decision boundaries.

Overall, both models demonstrate the limitations of using standard training pipelines on imbalanced datasets. The Linear Learner offered minimal detection of delayed flights, whereas XGBoost, despite being an advanced ensemble algorithm, defaulted entirely to the majority prediction. This indicates that model complexity alone cannot overcome data imbalance issues without appropriate preprocessing and tuning strategies. To achieve meaningful improvement, future iterations should incorporate techniques such as resampling (SMOTE or undersampling), adjusting class weights, or focusing on alternative metrics like Precision-Recall AUC. Additionally, introducing more discriminative features, such as weather severity indices, traffic congestion, or time-of-day patterns, may help capture non-linear dependencies that simple and ensemble models alike failed to recognize in this experiment.