In [1]:
import sagemaker
from sklearn.model_selection import train_test_split
import boto3
import pandas as pd

# S3 bucket that receives the train/test CSVs uploaded later in this notebook.
bucket = 'farmgenbukerforsagemaker'

# High-level SageMaker SDK session plus the low-level boto3 SageMaker client
# (the latter is used for describe/delete calls further down).
sess = sagemaker.Session()
sm_boto3 = boto3.client("sagemaker")

# Region the session's underlying boto session is bound to.
region = sess.boto_session.region_name

print("Using bucket " + bucket)

sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/nikhilmankani/Library/Application Support/sagemaker/config.yaml
Using bucket farmgenbukerforsagemaker


In [2]:
# Load the crop-recommendation dataset from a CSV in the working directory.
df = pd.read_csv("Crop_recommendation2.csv")

In [3]:
# Column names as a plain Python list; the target column is split off in the next cell.
features = df.columns.tolist()
features

['temperature', 'humidity', 'rainfall', 'label']

In [4]:
# The last column is the target: remove it from `features` and keep its name.
label = features.pop()
label

'label'

In [5]:
# Feature matrix (all remaining columns) and target vector (the label column).
x, y = df[features], df[label]

In [6]:
# Hold out 15% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.15,
    random_state=0,
)

In [7]:
# Sanity-check the split sizes: features first, then targets.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(1870, 3)
(330, 3)
(1870,)
(330,)


In [10]:
# Re-attach the label column to each split so the CSVs written afterwards carry
# features and target together (label last).
#
# `assign` returns a brand-new frame. The previous `pd.DataFrame(X_train)` form
# does not copy, so adding a column to it could write through to `X_train` /
# `X_test` (or raise SettingWithCopyWarning, depending on the pandas version)
# and made this cell unsafe to re-run.
trainX = X_train.assign(**{label: y_train})
testX = X_test.assign(**{label: y_test})

In [11]:
# Each frame should now have one extra column (the label).
for combined in (trainX, testX):
    print(combined.shape)

(1870, 4)
(330, 4)


In [12]:
# Persist both splits locally, without the index column, ready for upload.
for frame, csv_name in ((trainX, "train-V-2.csv"), (testX, "test-V-2.csv")):
    frame.to_csv(csv_name, index=False)

In [13]:
# Upload both CSVs to S3; the training job reads its input channels from there.
sk_prefix = "sagemaker/crop_recommendation_classification/sklearncontainer"

# One upload per file, in (train, test) order; each call returns the object's S3 URI.
trainpath, testpath = (
    sess.upload_data(path=csv_name, bucket=bucket, key_prefix=sk_prefix)
    for csv_name in ("train-V-2.csv", "test-V-2.csv")
)

print(trainpath)
print(testpath)

s3://farmgenbukerforsagemaker/sagemaker/crop_recommendation_classification/sklearncontainer/train-V-2.csv
s3://farmgenbukerforsagemaker/sagemaker/crop_recommendation_classification/sklearncontainer/test-V-2.csv


In [17]:
%%writefile script.py

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_curve, auc
import sklearn
import joblib
import boto3
import pathlib
from io import StringIO 
import argparse
import joblib
import os
import numpy as np
import pandas as pd
    
def model_fn(model_dir):
    """Load and return the classifier stored as ``model.joblib`` in ``model_dir``.

    Args:
        model_dir: Directory that contains the ``model.joblib`` file written
            by the training section of this script.

    Returns:
        The object deserialized by ``joblib.load``.
    """
    # NOTE(review): presumably invoked by the SageMaker scikit-learn serving
    # container at endpoint start-up -- confirm against the container docs.
    artifact_path = os.path.join(model_dir, "model.joblib")
    return joblib.load(artifact_path)
    
if __name__ == "__main__":
    # Training entry point: parse arguments, read the train/test CSVs, fit a
    # RandomForest, persist it as model.joblib, then report test-set metrics.

    print("[INFO] Extracting arguments")
    cli = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.
    cli.add_argument("--n_estimators", type=int, default=100)
    cli.add_argument("--random_state", type=int, default=0)

    # Data, model, and output directories; defaults come from the SM_* environment variables.
    cli.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    cli.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    cli.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    cli.add_argument("--train-file", type=str, default="train-V-2.csv")
    cli.add_argument("--test-file", type=str, default="test-V-2.csv")

    # Unrecognised arguments are tolerated and discarded.
    args, _unknown = cli.parse_known_args()

    print("SKLearn Version: ", sklearn.__version__)
    print("Joblib Version: ", joblib.__version__)

    print("[INFO] Reading data")
    print()
    train_csv = os.path.join(args.train, args.train_file)
    train_df = pd.read_csv(train_csv)
    test_csv = os.path.join(args.test, args.test_file)
    test_df = pd.read_csv(test_csv)

    # The last column of the training CSV is the target; all others are features.
    features = list(train_df.columns[:-1])
    label = train_df.columns[-1]

    print("Building training and testing datasets")
    print()
    X_train, y_train = train_df[features], train_df[label]
    X_test, y_test = test_df[features], test_df[label]

    print('Column order: ')
    print(features)
    print()

    print("Label column is: ",label)
    print()

    print("Data Shape: ")
    print()
    print("---- SHAPE OF TRAINING DATA (85%) ----")
    print(X_train.shape)
    print(y_train.shape)
    print()
    print("---- SHAPE OF TESTING DATA (15%) ----")
    print(X_test.shape)
    print(y_test.shape)
    print()

    print("Training RandomForest Model.....")
    print()
    forest_params = dict(
        n_estimators=args.n_estimators,
        random_state=args.random_state,
        verbose=3,   # per-tree progress output in the training log
        n_jobs=-1,   # use every available core
    )
    clf = RandomForestClassifier(**forest_params)
    clf.fit(X_train, y_train)
    print()

    # Persist the fitted model under the name model_fn() expects to load.
    artifact_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(clf, artifact_path)
    print("Model persisted at " + artifact_path)
    print()

    # Evaluate on the held-out test split.
    test_predictions = clf.predict(X_test)
    test_acc = accuracy_score(y_test, test_predictions)
    test_rep = classification_report(y_test, test_predictions)

    print()
    print("---- METRICS RESULTS FOR TESTING DATA ----")
    print()
    print("Total Rows are: ", X_test.shape[0])
    print('[TESTING] Model Accuracy is: ', test_acc)
    print('[TESTING] Testing Report: ')
    print(test_rep)

Overwriting script.py


In [18]:
from sagemaker.sklearn.estimator import SKLearn

# Version of the scikit-learn framework container, reused when the model is deployed.
FRAMEWORK_VERSION = "0.23-1"

# Forwarded to script.py as --n_estimators / --random_state command-line arguments.
rf_hyperparameters = {
    "n_estimators": 100,
    "random_state": 0,
}

# Training job definition: a single ml.m5.large spot instance running script.py.
sklearn_estimator = SKLearn(
    entry_point="script.py",
    framework_version=FRAMEWORK_VERSION,
    role="arn:aws:iam::533267156402:role/service-role/AmazonSageMaker-ExecutionRole-20241111T183834",
    base_job_name="RF-crop-recommendation",
    instance_type="ml.m5.large",
    instance_count=1,
    hyperparameters=rf_hyperparameters,
    use_spot_instances=True,
    max_run=3600,    # seconds
    max_wait=7200,   # seconds
)

In [19]:
# Map each input channel name to the S3 URI uploaded earlier; script.py reads
# them through SM_CHANNEL_TRAIN / SM_CHANNEL_TEST.
input_channels = {"train": trainpath, "test": testpath}

# Launch the training job and block until it finishes.
sklearn_estimator.fit(input_channels, wait=True)

INFO:sagemaker:Creating training-job with name: RF-crop-recommendation-2024-11-13-06-37-10-433


2024-11-13 06:37:17 Starting - Starting the training job...
2024-11-13 06:37:32 Starting - Preparing the instances for training...
2024-11-13 06:38:03 Downloading - Downloading input data...
2024-11-13 06:38:28 Downloading - Downloading the training image...
2024-11-13 06:39:25 Training - Training image download completed. Training in progress.
2024-11-13 06:39:25 Uploading - Uploading generated training model2024-11-13 06:39:18,091 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training
2024-11-13 06:39:18,095 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-11-13 06:39:18,141 sagemaker_sklearn_container.training INFO     Invoking user training script.
2024-11-13 06:39:18,317 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-11-13 06:39:18,330 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-11-13 06:39:18,342 sagemaker-training-toolkit INFO

In [20]:
# Wait for the most recent training job without streaming its logs again.
training_job = sklearn_estimator.latest_training_job
training_job.wait(logs="None")

# Look up where the job stored its model archive in S3.
job_description = sm_boto3.describe_training_job(TrainingJobName=training_job.name)
artifact = job_description["ModelArtifacts"]["S3ModelArtifacts"]

print("Model artifact persisted at " + artifact)


2024-11-13 06:39:37 Starting - Preparing the instances for training
2024-11-13 06:39:37 Downloading - Downloading the training image
2024-11-13 06:39:37 Training - Training image download completed. Training in progress.
2024-11-13 06:39:37 Uploading - Uploading generated training model
2024-11-13 06:39:37 Completed - Training job completed
Model artifact persisted at s3://sagemaker-us-east-1-533267156402/RF-crop-recommendation-2024-11-13-06-37-10-433/output/model.tar.gz


In [21]:
# Show the S3 URI of the trained model archive.
artifact

's3://sagemaker-us-east-1-533267156402/RF-crop-recommendation-2024-11-13-06-37-10-433/output/model.tar.gz'

In [22]:
from sagemaker.sklearn.model import SKLearnModel
from time import gmtime, strftime

# A UTC timestamp suffix gives the model a different name on each run.
name_suffix = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
model_name = "FarmGen-Crop-Recommendation-model-" + name_suffix

# Wrap the trained artifact so it can be deployed; script.py supplies model_fn for loading it.
model = SKLearnModel(
    entry_point="script.py",
    framework_version=FRAMEWORK_VERSION,
    model_data=artifact,
    name=model_name,
    role="arn:aws:iam::533267156402:role/service-role/AmazonSageMaker-ExecutionRole-20241111T183834",
)

In [23]:
# Show the generated, timestamped model name.
model_name

'FarmGen-Crop-Recommendation-model-2024-11-13-06-42-39'

In [25]:
# Fixed endpoint name so client code can call the endpoint without looking it up.
endpoint_name = "FarmGen-Crop-Recommendation-model"
print("EndpointName={}".format(endpoint_name))

# Deploy the model behind a real-time endpoint on a single ml.t2.medium instance.
deploy_settings = {
    "endpoint_name": endpoint_name,
    "instance_type": "ml.t2.medium",
    "initial_instance_count": 1,
}
predictor = model.deploy(**deploy_settings)

EndpointName=FarmGen-Crop-Recommendation-model


INFO:sagemaker:Creating model with name: FarmGen-Crop-Recommendation-model-2024-11-13-06-42-39
INFO:sagemaker:Creating endpoint-config with name FarmGen-Crop-Recommendation-model
INFO:sagemaker:Creating endpoint with name FarmGen-Crop-Recommendation-model


------------!

In [26]:
# Show the name of the deployed endpoint.
endpoint_name

'FarmGen-Crop-Recommendation-model'

In [27]:
# Five test rows (positions 100-104), features only, as nested lists for use as sample payloads.
testX[features].iloc[100:105].to_numpy().tolist()

[[13.70319166, 90.95589386, 106.2944879],
 [24.09874353, 80.57226761, 176.8604109],
 [28.14720892, 83.8001509, 37.44800463],
 [28.65003945, 82.68752542, 98.75084366],
 [27.50527651, 80.79783998, 105.0776992]]

In [None]:
## Redeploy the endpoint

In [29]:
## Call the deployed endpoint from code

import json
import numpy as np

endpoint_name = "FarmGen-Crop-Recommendation-model"

# One sample in the training column order (temperature, humidity, rainfall),
# shaped as a single-row 2-D list.
sample = [24.09874353, 80.57226761, 176.8604109]
payload = np.asarray(sample, dtype=float).reshape(1, -1).tolist()
request_body = json.dumps(payload)

# Call the endpoint through the SageMaker runtime API, independent of the SDK predictor.
sm_runtime = boto3.client("runtime.sagemaker")
response = sm_runtime.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType="application/json",
    Body=request_body,
)

# The response body is a stream holding JSON text.
reply_text = response['Body'].read().decode()
result = json.loads(reply_text)
print(result)

['jute']


In [54]:
# Tear down the endpoint to stop incurring charges.
delete_response = sm_boto3.delete_endpoint(EndpointName=endpoint_name)

# deploy() created an endpoint config with the same name as the endpoint.
# delete_endpoint leaves that config behind, and re-running the deploy cell
# then fails because the config already exists -- so remove it as well.
sm_boto3.delete_endpoint_config(EndpointConfigName=endpoint_name)

# Display the delete_endpoint response, as before.
delete_response

{'ResponseMetadata': {'RequestId': '98b4a741-49ab-48d1-b1f0-7ebe51895562',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '98b4a741-49ab-48d1-b1f0-7ebe51895562',
   'content-type': 'application/x-amz-json-1.1',
   'date': 'Mon, 11 Nov 2024 15:14:20 GMT',
   'content-length': '0'},
  'RetryAttempts': 0}}