In [2]:
!pip install sagemaker ipywidgets --upgrade --quiet


[0m

In [3]:
! pip install xgboost


Collecting xgboost
  Using cached xgboost-1.6.2-py3-none-manylinux2014_x86_64.whl (255.9 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.6.2
[0m

In [6]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from scipy import stats
import sklearn
from sklearn.metrics import mean_absolute_error 
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import median_absolute_error
from datetime import date
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import sagemaker
from sagemaker import image_uris
from sagemaker.session import Session
from sagemaker.inputs import TrainingInput
from sagemaker.xgboost.estimator import XGBoost
from sagemaker.xgboost.model import XGBoostModel
from sagemaker.serializers import CSVSerializer

In [4]:
import boto3
s3=boto3.resource('s3')

In [7]:
bucket_name='fractionatordata'
try:
    s3.create_bucket(Bucket=bucket_name)
    print("S3 bucket {} was created".format(bucket_name))
except Exception as e:
    print("S3 error: ",e)

S3 bucket fractionatordata was created


In [8]:
df=pd.read_csv('Fractionator.csv')

In [9]:
df['Time']=pd.to_datetime(df['Time'])

In [10]:
df=df[(np.abs(stats.zscore(df["AI_2020_OVERHEAD_C5'S_MOL"])) < 3)]

In [11]:
df.rename(columns = {"AI_2020_OVERHEAD_C5'S_MOL":"AI_top","AI_2021_MIDDLE_C7'S_MOL":"AI_middle","AI_2022_BOTTOM_C3'S_MOL":"AI_bottom",
                          'FIC_2100_PV_FEED_FURNACE_FUEL_SCFH':'Furnace' ,'FIC_2004_PV_TOP_REFLUX_MBBL/D':'Reflux_top',
                    'FI-2005_PV_FEED_FLOW_MBBL/D':'Feed'
 }, inplace = True)

In [12]:
df_Reg=df[['AI_top','AI_middle','AI_bottom','Furnace',]]

In [13]:
df_Reg.head()

Unnamed: 0,AI_top,AI_middle,AI_bottom,Furnace
0,2.0,3.0,4.0,8.5
1,2.0036,3.00954,3.99831,8.50735
2,2.0087,2.99518,3.97746,8.50977
3,2.00385,3.01356,3.97262,8.50838
4,1.99852,3.02963,3.98841,8.50365


In [14]:
df_Reg.describe()

Unnamed: 0,AI_top,AI_middle,AI_bottom,Furnace
count,5729.0,5729.0,5729.0,5729.0
mean,2.129367,3.352078,3.965614,9.041685
std,0.439436,0.23371,0.360089,0.535607
min,0.840555,2.82078,3.06754,7.99997
25%,1.83188,3.17409,3.7735,8.5689
50%,2.12057,3.30672,4.0132,9.01091
75%,2.43936,3.52084,4.21305,9.41585
max,3.29784,3.93095,4.85636,10.6115


In [15]:
df_Reg.isnull().sum()

AI_top       0
AI_middle    0
AI_bottom    0
Furnace      0
dtype: int64

In [16]:
df_Reg.corr()

Unnamed: 0,AI_top,AI_middle,AI_bottom,Furnace
AI_top,1.0,0.538482,-0.553021,0.407673
AI_middle,0.538482,1.0,-0.457908,0.17136
AI_bottom,-0.553021,-0.457908,1.0,-0.687598
Furnace,0.407673,0.17136,-0.687598,1.0


In [17]:
df_Reg=df[['Furnace','AI_top','AI_middle','AI_bottom']]

In [18]:
df_Reg.head()

Unnamed: 0,Furnace,AI_top,AI_middle,AI_bottom
0,8.5,2.0,3.0,4.0
1,8.50735,2.0036,3.00954,3.99831
2,8.50977,2.0087,2.99518,3.97746
3,8.50838,2.00385,3.01356,3.97262
4,8.50365,1.99852,3.02963,3.98841


In [19]:
X=df_Reg.iloc[:,1:].values
y=df_Reg.iloc[:,0].values

In [20]:
X.shape,y.shape

((5729, 3), (5729,))

In [None]:
df_np=df_Reg.to_numpy()
df_np.shape

In [None]:
target=df_np[:,0]
feature=df_np[:,1:]
target.shape,feature.shape

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [31]:
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [32]:
X_train.shape, X_val.shape, X_test.shape

((3838, 3), (945, 3), (946, 3))

In [34]:
y_train.shape, y_val.shape, y_test.shape

((3838,), (945,), (946,))

In [37]:
train_data = np.concatenate((y_train.reshape(-1,1), X_train), axis=1)
test_data = np.concatenate((y_test.reshape(-1,1), X_test), axis=1)
val_data = np.concatenate((y_val.reshape(-1,1), X_val), axis=1)

In [38]:
train_data.shape, test_data.shape, val_data.shape

((3838, 4), (946, 4), (945, 4))

In [39]:
train_data[0]

array([9.63946, 2.63081, 3.44818, 3.23914])

In [43]:
train_data=pd.DataFrame(train_data, index=None,)
test_data=pd.DataFrame(test_data, index=None,)
val_data=pd.DataFrame(val_data, index=None, )

In [46]:
train_data.head()

Unnamed: 0,0,1,2,3
0,9.63946,2.63081,3.44818,3.23914
1,9.91781,2.36694,3.36823,3.80285
2,9.04111,1.51664,3.13586,3.97599
3,9.87153,2.0631,3.04215,3.79857
4,9.68077,1.67002,3.07148,4.19419


In [None]:
y=y.reshape([5719])
print(y.shape)
X=X.reshape([5719,10])
X.shape

In [None]:
split_index = int(len(xy) * 0.8)

split_index

In [None]:
train_data,test_data=np.split(xy,[int(len(xy)*0.8)] )
train_data.shape,test_data.shape

In [None]:
train_data=xy.iloc[:split_index,:]
test_data=xy.iloc[split_index:,:]
#,test_data=np.split(xy,split_index)
print(train_data.shape,test_data.shape)

In [None]:

X_train = X[:split_index]
X_test = X[split_index:]

y_train = y[:split_index]
y_test = y[split_index:]

X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [40]:
import os
prefix="control_furnace"
train_csv_path='s3://{}/{}/{}/{}'.format(bucket_name,prefix,'train','train.csv')
val_csv_path='s3://{}/{}/{}/{}'.format(bucket_name,prefix,'val','val.csv')
test_csv_path='s3://{}/{}/{}/{}'.format(bucket_name,prefix,'test','test.csv')
print(train_csv_path)
print(val_csv_path)
print(test_csv_path)


s3://fractionatordata/control_furnace/train/train.csv
s3://fractionatordata/control_furnace/val/val.csv
s3://fractionatordata/control_furnace/test/test.csv


In [47]:
train_data.to_csv(train_csv_path,index=False,header=False)
val_data.to_csv(val_csv_path,index=False,header=False)
test_data.to_csv(test_csv_path,index=False,header=False)


In [48]:
xgboost_container=image_uris.retrieve('xgboost',boto3.Session().region_name,"1.5-1")
display(xgboost_container)

'683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.5-1'

In [49]:
output_path="s3://{}/{}/{}/".format(bucket_name,prefix,'output')
print(output_path)

s3://fractionatordata/control_furnace/output/


In [50]:
content_type="csv"
train_input=TrainingInput("s3://{}/{}/{}/".format(bucket_name,prefix,'train',content_type=content_type))
val_input=TrainingInput("s3://{}/{}/{}/".format(bucket_name,prefix,'val',content_type=content_type))
test_input=TrainingInput("s3://{}/{}/{}/".format(bucket_name,prefix,'test',content_type=content_type))

In [60]:
hyperparams = {
    "max_depth": "5",
    "eta": "0.01",
    "gamma": "4",
    "min_child_weight": "6",
    "subsample": "0.7",
    "objective": "reg:squarederror",
    "num_round": "50",
    "verbosity": "2",
}

instance_type = "ml.m5.2xlarge"
output_path =output_path
content_type = "csv"

In [61]:
# Open Source distributed script mode


session = Session()
script_path = "train.py"

xgb_script_mode_estimator = XGBoost(
    entry_point=script_path,
    framework_version="1.7-1",  # Note: framework_version is mandatory
    hyperparameters=hyperparams,
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type=instance_type,
    output_path=output_path,
)


INFO:sagemaker.image_uris:Ignoring unnecessary Python version: py3.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: ml.m5.2xlarge.


In [62]:
xgb_script_mode_estimator.fit({"train": train_input, "validation": val_input})


INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2023-05-10-21-04-12-850


Using provided s3_resource
2023-05-10 21:04:13 Starting - Starting the training job...
2023-05-10 21:04:29 Starting - Preparing the instances for training......
2023-05-10 21:05:28 Downloading - Downloading input data...
2023-05-10 21:05:49 Training - Downloading the training image...
2023-05-10 21:06:34 Uploading - Uploading generated training model[34m[2023-05-10 21:06:29.247 ip-10-0-254-44.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2023-05-10 21:06:29.322 ip-10-0-254-44.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2023-05-10:21:06:29:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2023-05-10:21:06:29:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2023-05-10:21:06:29:INFO] Invoking user training script.[0m
[34m[2023-05-10:21:06:29:INFO] Module train does not provide a setup.py. [0m
[34mGenerating setup.py[0m
[34m[2023-05-10:21:06:29:INFO] Generating setup.cfg

In [None]:
estimator.latest_training_job.describe()


In [None]:
from sagemaker.xgboost.model import XGBoostModel
from sagemaker.serializers import CSVSerializer

In [64]:
predictor = xgb_script_mode_estimator.deploy(
    initial_instance_count=1, instance_type="ml.m5.2xlarge",
serializer=CSVSerializer(),)

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2023-05-10-21-08-50-710
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2023-05-10-21-08-50-710
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2023-05-10-21-08-50-710


----!

In [65]:
ENDPOINT_NAME=predictor.endpoint_name
ENDPOINT_NAME

'sagemaker-xgboost-2023-05-10-21-08-50-710'

In [58]:
testing1=X_test[1]
testing2=X_test[2]
value1=y_test[1]
value2=y_test[2]
testing1,testing2,value1,value2
value1

8.83145

In [66]:
prediction=predictor.predict([testing1])
prediction

[['3.962116']]

### lambda handling function
> ** make_prediction using lambda 

In [None]:
ENDPOINT_NAME='sagemaker-xgboost-2023-04-23-20-02-45-272'
runtime = boto3.client('runtime.sagemaker')
def lambda_handler(event, context):
    inputs=event['data']
    result=[]
    for input in inputs:
        serialized_input=','.join(map(str,input))
        
        reponse=runtime.invoke_endpoint(EndpointName=ENDPOINT_NAME,
                                        ContentType='text/csv', 
                                        Body=serialized_input)
        result.append(reponse['Body'].read().decode('utf-8'))
    return result
        

In [None]:
input_jason={"data":[testing1,testing2]}

In [None]:
result=lambda_handler(input_jason,None)
result

In [67]:
predictor.delete_endpoint()

INFO:sagemaker:Deleting endpoint configuration with name: sagemaker-xgboost-2023-05-10-21-08-50-710
INFO:sagemaker:Deleting endpoint with name: sagemaker-xgboost-2023-05-10-21-08-50-710
