In [2]:
import boto3
# Resource-level S3 handle; the next cell uses it to create the data bucket.
s3=boto3.resource('s3')

In [3]:
# Bucket that holds the train/test CSVs and the training output written later on.
bucket_name='fractionatordata'
try:
    s3.create_bucket(Bucket=bucket_name)
    print(f"S3 bucket {bucket_name} was created")
except Exception as err:
    # Best-effort: report the failure and let the notebook keep running.
    print("S3 error: ",err)

S3 bucket fractionatordata was created


In [4]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
from scipy import stats
from sklearn.metrics import mean_absolute_error 
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import median_absolute_error
from datetime import date
from sklearn.preprocessing import MinMaxScaler


In [5]:
# Load Fractionator.csv (expected in the notebook's working directory) into a DataFrame.
df=pd.read_csv('Fractionator.csv')

In [6]:
# Parse the 'Time' column into pandas datetimes (the column is overwritten in place).
df['Time']=pd.to_datetime(df['Time'])

In [7]:
# Drop outliers: keep only rows whose overhead C5 value has |z-score| < 3.
# Note: this rebinds df, and the column is renamed by a later cell, so this
# cell cannot be re-run after the rename without reloading the CSV.
overhead_c5_z=stats.zscore(df["AI_2020_OVERHEAD_C5'S_MOL"])
within_three_sigma=np.abs(overhead_c5_z) < 3
df=df[within_three_sigma]

In [8]:
# Map the verbose source column names to the short aliases used in later cells.
column_aliases = {
    "AI_2020_OVERHEAD_C5'S_MOL": "AI_top",
    "AI_2021_MIDDLE_C7'S_MOL": "AI_middle",
    "AI_2022_BOTTOM_C3'S_MOL": "AI_bottom",
    'FIC_2100_PV_FEED_FURNACE_FUEL_SCFH': 'Furnace',
    'FIC_2004_PV_TOP_REFLUX_MBBL/D': 'Reflux_top',
    'FI-2005_PV_FEED_FLOW_MBBL/D': 'Feed',
}
df.rename(columns=column_aliases, inplace=True)

In [9]:
# Keep four of the renamed columns; only AI_top is used by the cells that follow.
df_uni=df[['AI_top','Furnace','Feed','Reflux_top',]]

In [10]:
# Single-column frame (AI_top) that the lag-window dataset is built from.
df_Xgb=df_uni[['AI_top']]

In [11]:
# Preview the first rows of the single-column frame.
df_Xgb.head()

Unnamed: 0,AI_top
0,2.0
1,2.0036
2,2.0087
3,2.00385
4,1.99852


In [12]:
def df_to_X_y(df, window_size=10):
    """Turn a row-ordered frame into sliding-window samples and next-step labels.

    Each sample is ``window_size`` consecutive rows of ``df``; its label is the
    value in the first column of the row immediately after that window.

    Args:
        df: pandas DataFrame (any object exposing ``to_numpy()``) whose rows
            are already in the order the windows should follow.
        window_size: number of consecutive rows in each sample.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays. For ``n`` rows and ``c`` columns,
        ``X`` has shape ``(n - window_size, window_size, c)`` and ``y`` has
        shape ``(n - window_size,)``. Both are empty when ``n <= window_size``.
    """
    # Use the frame that was passed in. The earlier version read the global
    # df_Xgb here, so the `df` argument was silently ignored.
    df_np = df.to_numpy()
    X = []
    y = []
    for start in range(len(df_np) - window_size):
        stop = start + window_size
        X.append(df_np[start:stop])
        # Label: first column of the row right after the window.
        y.append(df_np[stop][0])
    return np.array(X), np.array(y)

In [13]:
# Build 10-row lag windows from the AI_top frame and show the resulting array shapes.
X,y= df_to_X_y(df_Xgb,window_size=10)
X.shape, y.shape

((5719, 10, 1), (5719,))

In [14]:
# Flatten to a 1-D target vector and a 2-D (samples, features) matrix.
# Sizes are derived from the arrays themselves rather than hard-coded, so the
# cell keeps working when the row count or window size changes.
y=y.reshape(-1)
print(y.shape)
X=X.reshape(len(X),-1)
X.shape

(5719,)


(5719, 10)

In [15]:
# One column per position in the lag window: t0 (oldest) through t9 (most recent).
lag_columns=[f't{step}' for step in range(10)]
X_dataframe=pd.DataFrame(X,columns=lag_columns)
X_dataframe.head()

Unnamed: 0,t0,t1,t2,t3,t4,t5,t6,t7,t8,t9
0,2.0,2.0036,2.0087,2.00385,1.99852,1.9971,2.0026,2.0144,2.01669,2.02368
1,2.0036,2.0087,2.00385,1.99852,1.9971,2.0026,2.0144,2.01669,2.02368,2.01608
2,2.0087,2.00385,1.99852,1.9971,2.0026,2.0144,2.01669,2.02368,2.01608,2.02237
3,2.00385,1.99852,1.9971,2.0026,2.0144,2.01669,2.02368,2.01608,2.02237,2.0202
4,1.99852,1.9971,2.0026,2.0144,2.01669,2.02368,2.01608,2.02237,2.0202,2.01924


In [16]:
# Wrap the labels in a one-column frame; t10 is the value that follows t0..t9.
y_dataframe=pd.DataFrame(y,columns=['t10'])
y_dataframe.head()

Unnamed: 0,t10
0,2.01608
1,2.02237
2,2.0202
3,2.01924
4,2.01335


In [17]:
# Assemble the modelling table: target column t10 first, then the ten lag features.
lag_feature_names=['t0','t1','t2','t3','t4','t5','t6','t7','t8','t9']
xy=pd.concat([y_dataframe[['t10']],X_dataframe[lag_feature_names]],axis=1)
xy.head()



Unnamed: 0,t10,t0,t1,t2,t3,t4,t5,t6,t7,t8,t9
0,2.01608,2.0,2.0036,2.0087,2.00385,1.99852,1.9971,2.0026,2.0144,2.01669,2.02368
1,2.02237,2.0036,2.0087,2.00385,1.99852,1.9971,2.0026,2.0144,2.01669,2.02368,2.01608
2,2.0202,2.0087,2.00385,1.99852,1.9971,2.0026,2.0144,2.01669,2.02368,2.01608,2.02237
3,2.01924,2.00385,1.99852,1.9971,2.0026,2.0144,2.01669,2.02368,2.01608,2.02237,2.0202
4,2.01335,1.99852,1.9971,2.0026,2.0144,2.01669,2.02368,2.01608,2.02237,2.0202,2.01924


In [18]:
# Check the last rows to confirm the windows slide all the way to the end of the data.
xy.tail(5)

Unnamed: 0,t10,t0,t1,t2,t3,t4,t5,t6,t7,t8,t9
5714,1.66446,1.71108,1.71109,1.69975,1.69506,1.68498,1.68894,1.68242,1.68244,1.67338,1.67325
5715,1.66227,1.71109,1.69975,1.69506,1.68498,1.68894,1.68242,1.68244,1.67338,1.67325,1.66446
5716,1.65845,1.69975,1.69506,1.68498,1.68894,1.68242,1.68244,1.67338,1.67325,1.66446,1.66227
5717,1.67002,1.69506,1.68498,1.68894,1.68242,1.68244,1.67338,1.67325,1.66446,1.66227,1.65845
5718,1.67681,1.68498,1.68894,1.68242,1.68244,1.67338,1.67325,1.66446,1.66227,1.65845,1.67002


In [19]:
# Feature view: every column except the first one (t10), i.e. t0..t9.
df_data_features=xy.iloc[:,1:]
df_data_features

Unnamed: 0,t0,t1,t2,t3,t4,t5,t6,t7,t8,t9
0,2.00000,2.00360,2.00870,2.00385,1.99852,1.99710,2.00260,2.01440,2.01669,2.02368
1,2.00360,2.00870,2.00385,1.99852,1.99710,2.00260,2.01440,2.01669,2.02368,2.01608
2,2.00870,2.00385,1.99852,1.99710,2.00260,2.01440,2.01669,2.02368,2.01608,2.02237
3,2.00385,1.99852,1.99710,2.00260,2.01440,2.01669,2.02368,2.01608,2.02237,2.02020
4,1.99852,1.99710,2.00260,2.01440,2.01669,2.02368,2.01608,2.02237,2.02020,2.01924
...,...,...,...,...,...,...,...,...,...,...
5714,1.71108,1.71109,1.69975,1.69506,1.68498,1.68894,1.68242,1.68244,1.67338,1.67325
5715,1.71109,1.69975,1.69506,1.68498,1.68894,1.68242,1.68244,1.67338,1.67325,1.66446
5716,1.69975,1.69506,1.68498,1.68894,1.68242,1.68244,1.67338,1.67325,1.66446,1.66227
5717,1.69506,1.68498,1.68894,1.68242,1.68244,1.67338,1.67325,1.66446,1.66227,1.65845


In [20]:
# Target view: the first column (t10) as a Series renamed to "Targets".
df_data_target=xy.iloc[:,0].rename("Targets")
df_data_target

0       2.01608
1       2.02237
2       2.02020
3       2.01924
4       2.01335
         ...   
5714    1.66446
5715    1.66227
5716    1.65845
5717    1.67002
5718    1.67681
Name: Targets, Length: 5719, dtype: float64

In [23]:
# Row position separating the first 80% of samples (training) from the last 20% (test).
split_index = int(len(xy) * 0.8)

split_index

4575

In [21]:
# Ordered 80/20 split of the modelling table (no shuffling).
# Positional slicing replaces np.split(xy, ...): calling np.split on a
# DataFrame goes through DataFrame.swapaxes, which newer pandas deprecates.
train_rows=int(len(xy)*0.8)
train_data,test_data=xy.iloc[:train_rows],xy.iloc[train_rows:]
train_data.shape,test_data.shape

((4575, 11), (1144, 11))

In [138]:
# Ordered split by position: rows before split_index train the model, the rest test it.
train_data=xy.iloc[:split_index,:]
test_data=xy.iloc[split_index:,:]
print(train_data.shape,test_data.shape)

(4575, 11) (1144, 11)


In [24]:

# Apply the same positional split to the NumPy feature matrix and target vector.
X_train,X_test=X[:split_index],X[split_index:]
y_train,y_test=y[:split_index],y[split_index:]

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4575, 10), (1144, 10), (4575,), (1144,))

In [25]:
import os
# Key prefix under which this experiment's train/test/output objects are stored.
prefix="xgboost_ai_top"
train_csv_path=f's3://{bucket_name}/{prefix}/train/train.csv'
test_csv_path=f's3://{bucket_name}/{prefix}/test/test.csv'
print(train_csv_path)
print(test_csv_path)


s3://fractionatordata/xgboost_ai_top/train/train.csv
s3://fractionatordata/xgboost_ai_top/test/test.csv


In [26]:
# Write both splits straight to S3 without index or header row, so each line is
# just t10 followed by t0..t9.
# NOTE(review): writing to an s3:// URL from pandas presumably relies on s3fs being installed - confirm.
train_data.to_csv(train_csv_path,index=False,header=False)
test_data.to_csv(test_csv_path,index=False,header=False)


In [27]:
import sagemaker
from sagemaker import image_uris
from sagemaker.session import Session
from sagemaker.inputs import TrainingInput

In [172]:
# Look up the XGBoost image URI (version "1.5-1") for the session's region.
# NOTE(review): xgboost_container is not referenced by the estimator cell shown
# below, which sets framework_version="1.7-1" itself - confirm whether this
# lookup is still needed.
xgboost_container=image_uris.retrieve('xgboost',boto3.Session().region_name,"1.5-1")
display(xgboost_container)

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


'683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.5-1'

In [28]:
! pip install xgboost
 

Collecting xgboost
  Downloading xgboost-1.6.2-py3-none-manylinux2014_x86_64.whl (255.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.9/255.9 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: xgboost
Successfully installed xgboost-1.6.2
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [29]:
# S3 location passed to the estimator as its output path.
output_path=f"s3://{bucket_name}/{prefix}/output/"
print(output_path)

s3://fractionatordata/xgboost_ai_top/output/


In [30]:
# Channel inputs point at the S3 prefixes that hold the CSVs written earlier.
# content_type must be an argument of TrainingInput: it was previously passed to
# str.format(), which ignores unused keyword arguments, so it was silently dropped.
content_type="csv"
train_input=TrainingInput("s3://{}/{}/{}/".format(bucket_name,prefix,'train'),content_type=content_type)
test_input=TrainingInput("s3://{}/{}/{}/".format(bucket_name,prefix,'test'),content_type=content_type)

In [31]:
# Hyperparameters handed to the XGBoost estimator (all values passed as strings).
# NOTE(review): the two sample predictions shown later in this notebook are
# identical (2.9409072). Check whether gamma=4 / min_child_weight=6 are too
# restrictive for a target in roughly the 1.6-3.0 range - TODO confirm.
hyperparams = {
    "max_depth": "5",
    "eta": "0.2",
    "gamma": "4",
    "min_child_weight": "6",
    "subsample": "0.7",
    "objective": "reg:squarederror",
    "num_round": "50",
    "verbosity": "2",
}

# Instance type used for the training job.
instance_type = "ml.m5.2xlarge"
# The no-op `output_path = output_path` self-assignment was removed; the
# variable already holds the value set where the output location is built.
content_type = "csv"

In [32]:
# Open Source distributed script mode
# Builds a script-mode XGBoost estimator that runs the local entry point
# train.py with the hyperparameters, instance type and output path set above.
# train.py must exist next to this notebook; its contents are not shown here.
from sagemaker.session import Session
from sagemaker.inputs import TrainingInput
from sagemaker.xgboost.estimator import XGBoost

# NOTE(review): `session` is created but not passed to the estimator below - confirm whether it is needed.
session = Session()
script_path = "train.py"

xgb_script_mode_estimator = XGBoost(
    entry_point=script_path,
    framework_version="1.7-1",  # Note: framework_version is mandatory
    hyperparameters=hyperparams,
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type=instance_type,
    output_path=output_path,
)


In [33]:
# Launch the training job; the test split is supplied as the "validation" channel.
xgb_script_mode_estimator.fit({"train": train_input, "validation": test_input})


INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2023-04-23-19-59-03-609


2023-04-23 19:59:04 Starting - Starting the training job...
2023-04-23 19:59:20 Starting - Preparing the instances for training...
2023-04-23 20:00:01 Downloading - Downloading input data...
2023-04-23 20:00:21 Training - Downloading the training image...
2023-04-23 20:01:12 Uploading - Uploading generated training model[34m[2023-04-23 20:01:04.745 ip-10-0-66-241.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2023-04-23 20:01:04.827 ip-10-0-66-241.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2023-04-23:20:01:05:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2023-04-23:20:01:05:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2023-04-23:20:01:05:INFO] Invoking user training script.[0m
[34m[2023-04-23:20:01:05:INFO] Module train does not provide a setup.py. [0m
[34mGenerating setup.py[0m
[34m[2023-04-23:20:01:05:INFO] Generating setup.cfg[0m
[34m[2023-04-23:20:01:05

In [34]:
from sagemaker.xgboost.model import XGBoostModel
from sagemaker.serializers import CSVSerializer

In [None]:
# Deploy the trained model to a single-instance endpoint; requests are sent as CSV.
predictor = xgb_script_mode_estimator.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.2xlarge",
    serializer=CSVSerializer(),
)

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2023-04-23-20-02-45-272
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2023-04-23-20-02-45-272
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2023-04-23-20-02-45-272


--

In [43]:
# Capture the generated endpoint name so it can be invoked by name later.
ENDPOINT_NAME=predictor.endpoint_name
ENDPOINT_NAME

'sagemaker-xgboost-2023-04-23-20-02-45-272'

In [39]:
# Pick two consecutive test windows and their true next-step values for a smoke test.
testing1,testing2=X_test[1],X_test[2]
value1,value2=y_test[1],y_test[2]
testing1,testing2,value1,value2

(array([3.02232, 3.02497, 3.02772, 3.03216, 3.03337, 3.03509, 3.03924,
        3.03716, 3.03911, 3.03898]),
 array([3.02497, 3.02772, 3.03216, 3.03337, 3.03509, 3.03924, 3.03716,
        3.03911, 3.03898, 3.03417]),
 3.03417,
 3.03501)

In [41]:
# Send both sample windows to the endpoint in one request and show the raw response.
prediction=predictor.predict([testing1,testing2])
prediction

[['2.9409072'], ['2.9409072']]

### Lambda handler function
> **Make predictions using a Lambda-style handler**

In [49]:
# Endpoint name is hard-coded here and must be updated whenever a new endpoint
# is deployed (deployed names carry a timestamp).
ENDPOINT_NAME='sagemaker-xgboost-2023-04-23-20-02-45-272'
# Low-level client used by lambda_handler to invoke the endpoint.
runtime = boto3.client('runtime.sagemaker')
def lambda_handler(event, context):
    """Invoke the endpoint once per input row and collect the raw responses.

    Args:
        event: mapping with a 'data' key holding an iterable of rows; each row
            is an iterable of values that are joined into one CSV line.
        context: unused; present to match the Lambda handler signature.

    Returns:
        List of response bodies decoded as UTF-8 strings, one per input row,
        in the same order as the inputs.
    """
    predictions=[]
    for feature_row in event['data']:
        csv_payload=','.join(str(value) for value in feature_row)
        endpoint_reply=runtime.invoke_endpoint(
            EndpointName=ENDPOINT_NAME,
            ContentType='text/csv',
            Body=csv_payload,
        )
        predictions.append(endpoint_reply['Body'].read().decode('utf-8'))
    return predictions
        

In [45]:
# Event payload in the shape lambda_handler expects: a 'data' key with a list of rows.
input_jason={"data":[testing1,testing2]}

In [48]:
# Call the handler locally (context is unused, so None is passed) and show its output.
result=lambda_handler(input_jason,None)
result

['[2.9409072399139404]', '[2.9409072399139404]']

In [52]:
# Tear down the endpoint once testing is done so it is not left running.
predictor.delete_endpoint()

INFO:sagemaker:Deleting endpoint configuration with name: sagemaker-xgboost-2023-04-23-20-02-45-272
INFO:sagemaker:Deleting endpoint with name: sagemaker-xgboost-2023-04-23-20-02-45-272
