In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("combined_csv_v1.csv")

In [3]:
from sklearn.model_selection import train_test_split
train, validation = train_test_split(df, test_size=0.3)
validation, test = train_test_split(validation, test_size = 0.5)

In [None]:
train.to_csv("train.csv", index = False, header = False)
validation.to_csv("validation.csv", index = False, header = False)

In [10]:
test_X =  test.drop("target", axis = 1)
test_y = test["target"]
test_X.to_csv("test_X.csv", index = False, header = False)

In [8]:
import sagemaker
sess = sagemaker.Session()
bucket = sess.default_bucket()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [11]:
prefix = "flight_delay"
train_path = sess.upload_data(path="train.csv", key_prefix=prefix + "/part2/input/training")
valid_path = sess.upload_data(path="validation.csv", key_prefix=prefix + "/part2/input/validation")
test_X_path = sess.upload_data(path="test_X.csv", key_prefix=prefix + "/part2/input/test")

In [12]:
import boto3
from sagemaker import image_uris
region = boto3.Session().region_name
container = image_uris.retrieve('xgboost', region,version='latest')

In [14]:
from sagemaker.estimator import Estimator
xgb_estimator = Estimator(container, role=sagemaker.get_execution_role(), instance_count=1, instance_type='ml.m5.large',
                          output_path='s3://{}/{}/part2/output'.format(bucket,prefix))

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [21]:
xgb_estimator.set_hyperparameters(objective='binary:logistic', num_round=10)

In [22]:
training_data_channel = sagemaker.TrainingInput(s3_data=train_path, content_type='text/csv')
validation_data_channel = sagemaker.TrainingInput(s3_data=valid_path, content_type='text/csv')

In [23]:
xgb_estimator.fit({'train': training_data_channel,'validation': validation_data_channel})

INFO:sagemaker:Creating training-job with name: xgboost-2023-11-02-17-12-04-300


2023-11-02 17:12:04 Starting - Starting the training job......
2023-11-02 17:12:49 Starting - Preparing the instances for training......
2023-11-02 17:13:55 Downloading - Downloading input data......
2023-11-02 17:14:40 Training - Downloading the training image...
2023-11-02 17:15:16 Training - Training image download completed. Training in progress.[34mArguments: train[0m
[34m[2023-11-02:17:15:21:INFO] Running standalone xgboost training.[0m
[34m[2023-11-02:17:15:21:INFO] File size need to be processed in the node: 257.72mb. Available memory size in the node: 348.14mb[0m
[34m[2023-11-02:17:15:21:INFO] Determined delimiter of CSV input is ','[0m
[34m[17:15:21] S3DistributionType set as FullyReplicated[0m
[34m[17:15:24] 1144913x93 matrix with 106476909 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2023-11-02:17:15:24:INFO] Determined delimiter of CSV input is ','[0m
[34m[17:15:24] S3DistributionType set as FullyReplicated[0m


In [24]:
xgb_transformer = xgb_estimator.transformer(
    instance_count=1,
    instance_type="ml.m4.xlarge",
    strategy="MultiRecord",
    assemble_with="Line",
    output_path="s3://{}/{}/part2/transformer/output".format(bucket, prefix),
)

INFO:sagemaker:Creating model with name: xgboost-2023-11-02-17-19-41-958


In [25]:
xgb_transformer.transform(test_X_path, content_type="text/csv", split_type="Line")
xgb_transformer.wait()

INFO:sagemaker:Creating transform job with name: xgboost-2023-11-02-17-20-33-930


................................[34mArguments: serve[0m
[34m[2023-11-02 17:25:57 +0000] [1] [INFO] Starting gunicorn 19.9.0[0m
[34m[2023-11-02 17:25:57 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[34m[2023-11-02 17:25:57 +0000] [1] [INFO] Using worker: gevent[0m
[34m[2023-11-02 17:25:57 +0000] [21] [INFO] Booting worker with pid: 21[0m
[34m[2023-11-02 17:25:57 +0000] [22] [INFO] Booting worker with pid: 22[0m
[34m[2023-11-02 17:25:57 +0000] [23] [INFO] Booting worker with pid: 23[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2023-11-02:17:25:57:INFO] Model loaded successfully for worker : 21[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2023-11-02:17:25:57:INFO] Model loaded successfully for worker : 22[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2023-11-02:17:25:57:INFO] Model loaded successfully for worker : 23[0m
[34m[2023-11-02 17:25:57 +0000] [24] [INFO] Booting worker with pid: 24[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2023-1

In [27]:
import io
y_file = boto3.client("s3").get_object(Bucket = bucket, Key = f"{prefix}/part2/transformer/output/test_X.csv.out")
y_pred = pd.read_csv(io.BytesIO(y_file["Body"].read()), header = None, names = ["Predicted"])

In [28]:
y_pred

Unnamed: 0,Predicted
0,0.128531
1,0.152312
2,0.179380
3,0.122546
4,0.342384
...,...
245334,0.229901
245335,0.087173
245336,0.320495
245337,0.229604


In [30]:
y_pred["actual"] = y_pred["Predicted"].apply(lambda x : 1 if x > 0.5 else 0)

In [31]:
y_pred

Unnamed: 0,Predicted,actual
0,0.128531,0
1,0.152312,0
2,0.179380,0
3,0.122546,0
4,0.342384,0
...,...,...
245334,0.229901,0
245335,0.087173,0
245336,0.320495,0
245337,0.229604,0


In [32]:
from sklearn.metrics import accuracy_score, recall_score, f1_score
accuracy = accuracy_score(test_y, y_pred["actual"])
print("Accuracy:", accuracy)

Accuracy: 0.791260256216908
