# Part 2: combined_csv_v2.csv with xgboost
Fazal Mahmud Niloy (u3228358)

In [1]:
import pandas as pd

In [18]:
df = pd.read_csv("combined_csv_v2.csv")
df = df.replace({True: 1, False: 0})
df.head(2)

Unnamed: 0,target,Distance,DepHourofDay,AWND_O,PRCP_O,TAVG_O,AWND_D,PRCP_D,TAVG_D,SNOW_O,...,Origin_SFO,Dest_CLT,Dest_DEN,Dest_DFW,Dest_IAH,Dest_LAX,Dest_ORD,Dest_PHX,Dest_SFO,isHoliday_True
0,0.0,689.0,21,33,0,54.0,30,0,130.0,0.0,...,0,0,0,0,1,0,0,0,0,0
1,0.0,731.0,9,39,0,136.0,33,0,54.0,0.0,...,0,0,0,0,0,0,0,0,0,0


### Splitting the data into train, validation and test

In [19]:
from sklearn.model_selection import train_test_split


train, validation = train_test_split(df, test_size=0.3)
validation, test = train_test_split(validation, test_size = 0.5)

In [20]:
train.to_csv("data/v2b/train.csv", index = False, header = False)
validation.to_csv("data/v2b/validation.csv", index = False, header = False)

In [21]:
test_X =  test.drop("target", axis = 1)
test_y = test["target"]
test_X.to_csv("data/v2b/test_X.csv", index = False, header = False)

### Starting sagemaker session, getting the default s3 bucket and region, getting the xgboost container

In [22]:
import sagemaker
import boto3
from sagemaker import image_uris


sess = sagemaker.Session()
bucket = sess.default_bucket()


region = boto3.Session().region_name
container = image_uris.retrieve('xgboost', region,version='latest')

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


### Defining s3 bucket data paths

In [23]:
prefix = "flight_delay/v2b"
train_path = sess.upload_data(path="train.csv", key_prefix=prefix + "/input/training")
valid_path = sess.upload_data(path="validation.csv", key_prefix=prefix + "/input/validation")
test_X_path = sess.upload_data(path="test_X.csv", key_prefix=prefix + "/input/test")

### Making an estimator and setting the hyperparameters

In [25]:
from sagemaker.estimator import Estimator


xgb_estimator = Estimator(container,
                          role=sagemaker.get_execution_role(),
                          instance_count=1,
                          instance_type='ml.m5.large',
                          output_path='s3://{}/{}/output'.format(bucket,prefix))

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [26]:
xgb_estimator.set_hyperparameters(objective='binary:logistic', num_round=15)

In [27]:
training_data_channel = sagemaker.TrainingInput(s3_data=train_path, content_type='text/csv')
validation_data_channel = sagemaker.TrainingInput(s3_data=valid_path, content_type='text/csv')

### Fitting the model

In [28]:
xgb_estimator.fit({'train': training_data_channel,'validation': validation_data_channel})

INFO:sagemaker:Creating training-job with name: xgboost-2023-11-02-20-04-04-584


2023-11-02 20:04:04 Starting - Starting the training job...
2023-11-02 20:04:21 Starting - Preparing the instances for training......
2023-11-02 20:05:24 Downloading - Downloading input data......
2023-11-02 20:06:09 Training - Downloading the training image...
2023-11-02 20:06:44 Training - Training image download completed. Training in progress.[34mArguments: train[0m
[34m[2023-11-02:20:06:49:INFO] Running standalone xgboost training.[0m
[34m[2023-11-02:20:06:49:INFO] File size need to be processed in the node: 256.38mb. Available memory size in the node: 348.35mb[0m
[34m[2023-11-02:20:06:49:INFO] Determined delimiter of CSV input is ','[0m
[34m[20:06:49] S3DistributionType set as FullyReplicated[0m
[34m[20:06:52] 1144913x85 matrix with 97317605 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2023-11-02:20:06:52:INFO] Determined delimiter of CSV input is ','[0m
[34m[20:06:52] S3DistributionType set as FullyReplicated[0m
[34m

## Deployment
### Making a batch transformer and transforming the test data

In [29]:
xgb_transformer = xgb_estimator.transformer(
    instance_count=1,
    instance_type="ml.m4.xlarge",
    strategy="MultiRecord",
    assemble_with="Line",
    output_path=f"s3://{bucket}/{prefix}/batch_output"
)

INFO:sagemaker:Creating model with name: xgboost-2023-11-02-20-08-56-545


In [30]:
xgb_transformer.transform(test_X_path, content_type="text/csv", split_type="Line")
xgb_transformer.wait()

INFO:sagemaker:Creating transform job with name: xgboost-2023-11-02-20-08-59-364


...............................[34mArguments: serve[0m
[34m[2023-11-02 20:14:11 +0000] [1] [INFO] Starting gunicorn 19.9.0[0m
[34m[2023-11-02 20:14:11 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[34m[2023-11-02 20:14:11 +0000] [1] [INFO] Using worker: gevent[0m
[34m[2023-11-02 20:14:11 +0000] [21] [INFO] Booting worker with pid: 21[0m
[34m[2023-11-02 20:14:11 +0000] [22] [INFO] Booting worker with pid: 22[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2023-11-02:20:14:11:INFO] Model loaded successfully for worker : 21[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2023-11-02:20:14:11:INFO] Model loaded successfully for worker : 22[0m
[34m[2023-11-02 20:14:11 +0000] [23] [INFO] Booting worker with pid: 23[0m
[34m[2023-11-02 20:14:11 +0000] [24] [INFO] Booting worker with pid: 24[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2023-11-02:20:14:12:INFO] Model loaded successfully for worker : 23[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2023-11

### Downloading the predictions

In [31]:
import io


y_file = boto3.client("s3").get_object(Bucket = bucket, Key = prefix +"/transformer/output/test_X.csv.out")
y_pred = pd.read_csv(io.BytesIO(y_file["Body"].read()), header = None, names = ["Predicted"])

In [32]:
y_pred

Unnamed: 0,Predicted
0,0.229906
1,0.177965
2,0.116457
3,0.350774
4,0.230469
...,...
245334,0.208295
245335,0.162544
245336,0.305527
245337,0.199313


In [33]:
y_pred["actual"] = y_pred["Predicted"].apply(lambda x : 1 if x > 0.5 else 0)

In [34]:
y_pred

Unnamed: 0,Predicted,actual
0,0.229906,0
1,0.177965,0
2,0.116457,0
3,0.350774,0
4,0.230469,0
...,...,...
245334,0.208295,0
245335,0.162544,0
245336,0.305527,0
245337,0.199313,0


### Calculating the accuracy

In [35]:
from sklearn.metrics import accuracy_score


accuracy = accuracy_score(test_y, y_pred["actual"])
print("Accuracy:", accuracy)

Accuracy: 0.7973090295468719
Recall: 0.06855306959820391
F1 Score: 0.12469196648595365
