# Part 1: combined_csv_v2.csv Linear Estimator

In [2]:
import pandas as pd

In [6]:
df = pd.read_csv("combined_csv_v2.csv")
df = df.replace({True: 1, False: 0})
df.head(3)

Unnamed: 0,target,Distance,DepHourofDay,AWND_O,PRCP_O,TAVG_O,AWND_D,PRCP_D,TAVG_D,SNOW_O,...,Origin_SFO,Dest_CLT,Dest_DEN,Dest_DFW,Dest_IAH,Dest_LAX,Dest_ORD,Dest_PHX,Dest_SFO,isHoliday_True
0,0.0,689.0,21,33,0,54.0,30,0,130.0,0.0,...,0,0,0,0,1,0,0,0,0,0
1,0.0,731.0,9,39,0,136.0,33,0,54.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,1199.0,18,33,0,54.0,77,0,68.0,0.0,...,0,0,1,0,0,0,0,0,0,0


### Splitting the data into train, validation, and test sets

In [7]:
from sklearn.model_selection import train_test_split


train, validation = train_test_split(df, test_size=0.3)
validation, test = train_test_split(validation, test_size = 0.5)

In [8]:
train.to_csv("data/v2a/train.csv", index = False, header = False)
validation.to_csv("data/v2a/validation.csv", index = False, header = False)
test.to_csv("data/v2a/test.csv", index = False, header = False)

## Starting Sagemaker Session, getting default s3 bucket and loading the linear learner model container

In [9]:
import sagemaker
import boto3
from sagemaker import image_uris

sess = sagemaker.Session()
bucket = sess.default_bucket()

region = boto3.Session().region_name
container = image_uris.retrieve("linear-learner", region)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


## Uploading the data to s3

In [10]:
prefix = "flight_delays/v2a/"
train_path = sess.upload_data(path="train.csv", key_prefix=prefix + "input/training")
valid_path = sess.upload_data(path="validation.csv", key_prefix=prefix + "input/validation")
test_path = sess.upload_data(path="test.csv", key_prefix=prefix + "input/test")

## Creating the Linear Estimator

In [13]:
from sagemaker.estimator import Estimator


linear_estimator = Estimator(
        container,
        role=sagemaker.get_execution_role(),
        instance_count=1,
        instance_type='ml.m5.large',
        output_path='s3://{}/{}/output'.format(bucket,prefix))

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


## Setting the hyperparameters

In [14]:
linear_estimator.set_hyperparameters(predictor_type='binary_classifier')

#### Making input streams of data

In [15]:
training_data_channel = sagemaker.TrainingInput(s3_data=train_path,content_type='text/csv')
validation_data_channel = sagemaker.TrainingInput(s3_data=valid_path,content_type='text/csv')

# Training Phase

In [16]:
linear_estimator.fit({'train': training_data_channel,'validation': validation_data_channel})

INFO:sagemaker:Creating training-job with name: linear-learner-2023-11-02-20-27-31-654


2023-11-02 20:27:31 Starting - Starting the training job...
2023-11-02 20:27:54 Starting - Preparing the instances for training......
2023-11-02 20:28:50 Downloading - Downloading input data......
2023-11-02 20:29:45 Training - Downloading the training image......
2023-11-02 20:30:50 Training - Training image download completed. Training in progress.[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[11/02/2023 20:30:52 INFO 139741035947840] Reading default configuration from /opt/amazon/lib/python3.8/site-packages/algorithm/resources/default-input.json: {'mini_batch_size': '1000', 'epochs': '15', 'feature_dim': 'auto', 'use_bias': 'true', 'binary_classifier_model_selection_criteria': 'accuracy', 'f_beta': '1.0', 'target_recall': '0.8', 'target_precision': '0.8', 'num_models': 'auto', 'num_calibration_samples': '10000000', 'init_method': 'uniform', 'init_scale': '0.07', 'init_sigma': '0.01', 'init_bias': '0.0', 'opti

### Batch Transformer Deployment

In [17]:
transformer = linear_estimator.transformer(
        instance_count=1,
        instance_type="ml.m4.xlarge",
        assemble_with="Line",
        output_path=f"s3://{bucket}/{prefix}/batch_output"
)

INFO:sagemaker:Creating model with name: linear-learner-2023-11-02-20-45-02-334


In [18]:
test_df = pd.read_csv("data/v2a/test.csv")
test_batch = test_df[test_df.columns[1:]]

In [19]:
test_batch.to_csv("data/v2a/batch_input.csv", index = False, header = False)

In [20]:
batch_test_path = sess.upload_data(path="batch_input.csv", key_prefix=prefix + "/batch_input")

## Deploying the Batch Transformer

In [22]:
transformer.transform(batch_test_path, content_type = "text/csv", split_type="Line")
transformer.wait()

INFO:sagemaker:Creating transform job with name: linear-learner-2023-11-02-20-45-22-059


........................................[34mDocker entrypoint called with argument(s): serve[0m
[34mRunning default environment configuration script[0m
[34m[11/02/2023 20:52:01 INFO 139672611194688] Memory profiler is not enabled by the environment variable ENABLE_PROFILER.[0m
  if num_device is 1 and 'dist' not in kvstore:[0m
  if cons['type'] is 'ineq':[0m
  if len(self.X_min) is not 0:[0m
[34m[11/02/2023 20:52:05 INFO 139672611194688] loaded entry point class algorithm.serve.server_config:config_api[0m
[34m[11/02/2023 20:52:05 INFO 139672611194688] loading entry points[0m
[34m[11/02/2023 20:52:05 INFO 139672611194688] loaded request iterator application/json[0m
[34m[11/02/2023 20:52:05 INFO 139672611194688] loaded request iterator application/jsonlines[0m
[34m[11/02/2023 20:52:05 INFO 139672611194688] loaded request iterator application/x-recordio-protobuf[0m
[34m[11/02/2023 20:52:05 INFO 139672611194688] loaded request iterator text/csv[0m
[34m[11/02/2023 20:5

In [23]:
import io
y_file = boto3.client("s3").get_object(Bucket = bucket, Key = f"{prefix}/batch_output/batch_input.csv.out")
y_pred = pd.read_csv(io.BytesIO(y_file["Body"].read()), header = None, names = ["Predicted"])

In [24]:
y_pred["target"] = y_pred.index

In [25]:
y_pred

Unnamed: 0,Predicted,target
"{""predicted_label"":0",score:0.204789876937866},"{""predicted_label"":0"
"{""predicted_label"":0",score:0.124824427068233},"{""predicted_label"":0"
"{""predicted_label"":0",score:0.169004499912261},"{""predicted_label"":0"
"{""predicted_label"":0",score:0.112902536988258},"{""predicted_label"":0"
"{""predicted_label"":0",score:0.126648128032684},"{""predicted_label"":0"
...,...,...
"{""predicted_label"":0",score:0.151689544320106},"{""predicted_label"":0"
"{""predicted_label"":0",score:0.177160590887069},"{""predicted_label"":0"
"{""predicted_label"":0",score:0.142887115478515},"{""predicted_label"":0"
"{""predicted_label"":0",score:0.141783311963081},"{""predicted_label"":0"


In [26]:
predicted_series = y_pred['target'].apply(lambda x: 1 if x == 1 else 0)

In [27]:
from sklearn.metrics import accuracy_score


actual_test_vals = test_df.iloc[:, 0]
accuracy = accuracy_score(actual_test_vals, predicted_series)
print("Accuracy:", accuracy)

Accuracy: 0.789624925612828
Recall: 0.0
F1 Score: 0.0


In [28]:
actual_test_vals.shape

(245338,)