# Mobile Classification

Importing libraries

In [1]:
import sagemaker
from sklearn.model_selection import train_test_split
import boto3 #to use created bucket
import pandas as pd

# Shared AWS handles used by the rest of the notebook.
sm_boto3 = boto3.client("sagemaker")
sess = sagemaker.Session()
# Take the region from the active session instead of hard-coding it, so the
# variable cannot disagree with where the jobs and artifacts actually live.
region = sess.boto_region_name
bucket = 'mobclassification'
print('Using Bucket:' + bucket)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
Using Bucket:mobclassification


In [3]:
# Load the dataset and preview the first rows.
# NOTE(review): a later cell writes the training split back to 'train-V-1.csv',
# so after one full run this path no longer holds the original data and a
# re-run splits an already-split file. Confirm, and consider reading the raw
# data from a separate file name.
df = pd.read_csv('train-V-1.csv')
df.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,719,0,2.9,0,7,0,6,0.7,102,6,...,405,1141,841,9,1,2,1,0,0,0
1,1029,1,1.9,1,5,1,5,0.2,171,5,...,264,956,582,15,7,4,1,1,1,0
2,720,1,0.9,1,12,1,14,0.8,165,4,...,366,1234,1086,14,7,3,1,1,0,0
3,696,1,2.4,0,11,1,35,0.2,103,3,...,533,797,2022,14,11,4,1,0,1,1
4,1391,0,1.4,0,2,1,36,0.8,182,1,...,14,1256,3139,14,7,14,1,1,0,2


In [4]:
# (rows, columns) of the loaded frame.
df.shape

(400, 21)

In [5]:
# Share of rows per price_range value (normalize=True yields proportions, not counts).
df['price_range'].value_counts(normalize=True)

price_range
1    0.2775
0    0.2525
3    0.2475
2    0.2225
Name: proportion, dtype: float64

In [6]:
# Column names of the loaded frame, in file order.
df.columns

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')

In [7]:
# Percentage of missing values in each column.
df.isnull().mean() * 100

battery_power    0.0
blue             0.0
clock_speed      0.0
dual_sim         0.0
fc               0.0
four_g           0.0
int_memory       0.0
m_dep            0.0
mobile_wt        0.0
n_cores          0.0
pc               0.0
px_height        0.0
px_width         0.0
ram              0.0
sc_h             0.0
sc_w             0.0
talk_time        0.0
three_g          0.0
touch_screen     0.0
wifi             0.0
price_range      0.0
dtype: float64

In [8]:
# All column names as a plain Python list; the target column is split off next.
features = df.columns.tolist()
features

['battery_power',
 'blue',
 'clock_speed',
 'dual_sim',
 'fc',
 'four_g',
 'int_memory',
 'm_dep',
 'mobile_wt',
 'n_cores',
 'pc',
 'px_height',
 'px_width',
 'ram',
 'sc_h',
 'sc_w',
 'talk_time',
 'three_g',
 'touch_screen',
 'wifi',
 'price_range']

In [9]:
# The last column is the target; every other column is a predictor.
# Both names are rebuilt from df.columns rather than popped from `features`,
# so re-running this cell cannot silently drop a real feature.
label = df.columns[-1]
features = [column for column in df.columns if column != label]
label

'price_range'

In [10]:
# Predictor matrix and target vector taken from the loaded frame.
x, y = df[features], df[label]

In [11]:
# Hold out 20% of the rows for evaluation and train on the other 80%.
# test_size=0.8 would leave only a fifth of the data for fitting the model.
# random_state is written as the integer seed 1 (the bool True equals 1 in Python).
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

In [12]:
# Re-attach the target column to each feature split so every CSV carries
# features followed by the label. assign() returns a new frame, so the
# X_train / X_test splits are never modified in place (wrapping a split with
# pd.DataFrame(...) does not copy it, so a column insert on the wrapper can
# alias the split or emit SettingWithCopyWarning depending on pandas version).
trainX = X_train.assign(**{label: y_train})
testX = X_test.assign(**{label: y_test})

In [13]:
# Persist both splits as CSV without the index column.
# NOTE(review): 'train-V-1.csv' is also the path the data is loaded from at the
# top of the notebook, so this write replaces the input file. Confirm that is
# intended before re-running the notebook.
trainX.to_csv('train-V-1.csv', index=False)
testX.to_csv('test-V-1.csv', index=False)


In [14]:
# Stage both CSV files in S3 under one prefix; the training job reads its
# input channels from these locations.
sk_prefix = "sagemaker/mobclassification/sklearncontainer"
trainpath, testpath = (
    sess.upload_data(path=csv_name, bucket=bucket, key_prefix=sk_prefix)
    for csv_name in ('train-V-1.csv', 'test-V-1.csv')
)

print("Training data uploaded to:", trainpath)
print("Test data uploaded to:", testpath)

Training data uploaded to: s3://mobclassification/sagemaker/mobclassification/sklearncontainer/train-V-1.csv
Test data uploaded to: s3://mobclassification/sagemaker/mobclassification/sklearncontainer/test-V-1.csv


In [21]:
%%writefile script.py

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score
import sklearn
import joblib
import boto3
import pathlib
from io import StringIO
import argparse
import os
import numpy as np
import pandas as pd

def model_fn(model_dir):
    """Load the persisted model from ``model_dir``.

    Args:
        model_dir: Directory that contains ``model.joblib``.

    Returns:
        Whatever object ``joblib.load`` deserializes from that file.
    """
    model_file = os.path.join(model_dir, "model.joblib")
    return joblib.load(model_file)

if __name__ == "__main__":
    # Training entry point: parse arguments, fit a random forest on the training
    # CSV, persist it as model.joblib, then report accuracy on the test CSV.
    print("[INFO] Extracting Arguments")
    parser = argparse.ArgumentParser()

    # Hyperparameters sent by the client are passed as command-line arguments to the script
    parser.add_argument('--n_estimators', type=int, default=100)
    parser.add_argument('--random_state', type=int, default=0)

    # Data, Model, and Output Directories.
    # Defaults come from the SM_* environment variables. .get() is used so a
    # missing variable does not raise KeyError while the parser is being built,
    # which would make the explicit flags unusable outside the container.
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAINING"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train-file", type=str, default="train-V-1.csv")
    parser.add_argument("--test-file", type=str, default="test-V-1.csv")

    args = parser.parse_args()

    # Fail early with a readable message when a location came from neither the
    # command line nor the environment.
    unresolved = [
        flag
        for flag, value in (
            ("--model-dir", args.model_dir),
            ("--train", args.train),
            ("--test", args.test),
        )
        if not value
    ]
    if unresolved:
        parser.error(
            "missing location(s): " + ", ".join(unresolved)
            + " (pass the flag or set the matching SM_* environment variable)"
        )

    print("SkLearn Version: ", sklearn.__version__)
    print("Joblib Version: ", joblib.__version__)

    print("[INFO] Reading data")

    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    # The last column of the training file is the label; all others are features.
    features = list(train_df.columns)
    label = features.pop(-1)

    print("Building Training and Testing Datasets")

    X_train = train_df[features]
    X_test = test_df[features]
    y_train = train_df[label]
    y_test = test_df[label]

    print("Training Model")
    model = RandomForestClassifier(n_estimators=args.n_estimators, random_state=args.random_state)
    model.fit(X_train, y_train)
    print()

    # Persist under the file name that model_fn loads.
    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, model_path)
    print("Model persisted at " + model_path)
    print()

    y_pred_test = model.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred_test)
    test_rep = classification_report(y_test, y_pred_test)

    print("\n---- Metrics Results For Testing Data ----\n")
    print("Total Rows are: ", X_test.shape[0])
    print("[TESTING] Model Accuracy is: ", test_acc)
    print(test_rep)

Overwriting script.py


In [22]:
# Configure the SageMaker scikit-learn training job that will run script.py.
from sagemaker.sklearn import SKLearn

FRAMEWORK_VERSION = "1.2-1"

# NOTE(review): the execution role ARN is hard-coded here; looking it up at
# run time (e.g. sagemaker.get_execution_role()) would avoid embedding the
# account id. Confirm the notebook's role is the one intended.
sklearn_estimator = SKLearn(
    # What to run and with which container version.
    entry_point="script.py",
    framework_version=FRAMEWORK_VERSION,
    base_job_name="RF-custom-sklearn",
    role="arn:aws:iam::637423511388:role/service-role/AmazonSageMaker-ExecutionRole-20240722T104063",
    # Values forwarded to script.py as command-line arguments.
    hyperparameters={"n_estimators": 100, "random_state": 0},
    # Compute: a single instance, on spot capacity, with one-hour limits.
    instance_type="ml.m5.xlarge",
    instance_count=1,
    use_spot_instances=True,
    max_run=3600,
    max_wait=3600,
)
# print("SageMaker SKLearn estimator initialized successfully.")

In [23]:
# Launch the training job with the two S3 inputs as the 'training' and 'test'
# channels. With the default arguments this call blocks and streams the job
# logs until training finishes.
sklearn_estimator.fit(inputs={'training': trainpath, 'test': testpath})


INFO:sagemaker:Creating training-job with name: RF-custom-sklearn-2024-07-22-11-29-18-245


2024-07-22 11:29:18 Starting - Starting the training job...
2024-07-22 11:29:37 Starting - Preparing the instances for training...
2024-07-22 11:30:11 Downloading - Downloading the training image......
2024-07-22 11:31:17 Training - Training image download completed. Training in progress...[34m2024-07-22 11:31:28,260 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2024-07-22 11:31:28,263 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-07-22 11:31:28,266 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2024-07-22 11:31:28,281 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2024-07-22 11:31:28,499 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-07-22 11:31:28,503 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34

In [25]:
# Wait on the most recent training job without streaming its logs, then ask
# the SageMaker API where the job stored its model artifact.
latest_job = sklearn_estimator.latest_training_job
latest_job.wait(logs="None")

job_description = sm_boto3.describe_training_job(TrainingJobName=latest_job.name)
artifact = job_description["ModelArtifacts"]["S3ModelArtifacts"]

print("Model artifact persisted at "+ artifact)


2024-07-22 11:31:45 Starting - Preparing the instances for training
2024-07-22 11:31:45 Downloading - Downloading the training image
2024-07-22 11:31:45 Training - Training image download completed. Training in progress.
2024-07-22 11:31:45 Uploading - Uploading generated training model
2024-07-22 11:31:45 Completed - Training job completed
Model artifact persisted at s3://sagemaker-ap-south-1-637423511388/RF-custom-sklearn-2024-07-22-11-29-18-245/output/model.tar.gz


In [26]:
from sagemaker.sklearn.model import SKLearnModel
from time import gmtime,strftime

# Wrap the trained artifact in a deployable model; the UTC timestamp suffix
# gives each run a distinct model name.
model_timestamp = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
model_name = "Custom-sklearn-model-" + model_timestamp
model = SKLearnModel(
    name=model_name,
    role="arn:aws:iam::637423511388:role/service-role/AmazonSageMaker-ExecutionRole-20240722T104063",
    model_data=artifact,
    entry_point="script.py",
    framework_version=FRAMEWORK_VERSION,
)

In [27]:
# Display the model object to confirm it was constructed.
model

<sagemaker.sklearn.model.SKLearnModel at 0x7f14e6fca380>

In [29]:
# Deploy the model behind a real-time endpoint named with a UTC timestamp.
endpoint_timestamp = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
endpoint_name = "Custom-sklearn-model-" + endpoint_timestamp
print("EndpointName={}".format(endpoint_name))

predictor = model.deploy(
    endpoint_name=endpoint_name,
    instance_type="ml.m4.xlarge",
    initial_instance_count=1,
)

INFO:sagemaker:Creating model with name: Custom-sklearn-model-2024-07-22-11-45-17


EndpointName=Custom-sklearn-model-2024-07-22-11-48-54


INFO:sagemaker:Creating endpoint-config with name Custom-sklearn-model-2024-07-22-11-48-54
INFO:sagemaker:Creating endpoint with name Custom-sklearn-model-2024-07-22-11-48-54


-------!

In [31]:
# First two test rows (features only) as nested Python lists: the payload shape sent to the endpoint.
testX[features].head(2).to_numpy().tolist()

[[1523.0,
  1.0,
  1.8,
  0.0,
  6.0,
  1.0,
  11.0,
  0.1,
  129.0,
  1.0,
  8.0,
  148.0,
  1606.0,
  707.0,
  19.0,
  8.0,
  19.0,
  1.0,
  1.0,
  0.0],
 [1971.0,
  1.0,
  0.5,
  1.0,
  0.0,
  0.0,
  40.0,
  0.3,
  186.0,
  7.0,
  19.0,
  485.0,
  922.0,
  571.0,
  8.0,
  7.0,
  17.0,
  1.0,
  1.0,
  0.0]]

In [32]:
# Send the first two test rows to the endpoint and print the predicted classes.
print(predictor.predict(testX[features].head(2).to_numpy().tolist()))

[0 0]


In [33]:
# Tear down the endpoint so it stops accruing charges.
# NOTE(review): this call names only the endpoint; the endpoint config and
# model that deploy() logged creating are presumably left in place. Confirm
# whether they should be deleted as well.
sm_boto3.delete_endpoint(EndpointName = endpoint_name)

{'ResponseMetadata': {'RequestId': 'd7aafcdd-218c-417b-a7ee-8acd68bb5be3',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'd7aafcdd-218c-417b-a7ee-8acd68bb5be3',
   'content-type': 'application/x-amz-json-1.1',
   'date': 'Mon, 22 Jul 2024 11:59:46 GMT',
   'content-length': '0'},
  'RetryAttempts': 0}}