In [25]:
import sagemaker
from sklearn.model_selection import train_test_split
import boto3
import pandas as pd

# Low-level SageMaker client plus the SDK session used by the rest of the notebook.
sm_boto3 = boto3.client("sagemaker")
sess = sagemaker.Session()

# Region comes from the boto session that backs the SageMaker session.
region = sess.boto_session.region_name

# Bucket that receives the training and test CSVs.
bucket = "mobilebucketsagemaker77"
print(f"Using bucket{bucket}")

Using bucketmobilebucketsagemaker77


In [6]:
# Load the raw training data from a CSV in the working directory and preview the first rows.
df=pd.read_csv("mob_price_classification_train.csv")
df.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,2,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,6,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,6,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,9,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,14,1208,1212,1411,8,2,15,1,1,0,1


In [7]:
# (rows, columns) of the loaded frame.
df.shape

(2000, 21)

In [8]:
# Count missing values in each column.
df.isnull().sum()

battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
price_range      0
dtype: int64

In [9]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

In [10]:
# Class balance check: number of rows per price_range label.
df["price_range"].value_counts()

price_range
1    500
2    500
3    500
0    500
Name: count, dtype: int64

In [11]:
# List the column labels of the frame.
df.columns

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')

In [15]:
# Separate the label column from the feature columns.
y = df["price_range"]
X = df.drop("price_range", axis=1)

In [16]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [17]:
# Report the dimensions of each split: features first, then labels.
for _split in (X_train, X_test, y_train, y_test):
    print(_split.shape)

(1600, 20)
(400, 20)
(1600,)
(400,)


In [20]:
# Re-attach the label to each split so every CSV carries features and target together.
# assign() returns a new frame, so X_train / X_test are never modified. Wrapping them
# in pd.DataFrame(...) does not guarantee an independent copy, so adding a column to
# the wrapper could leak into the feature frames or emit SettingWithCopyWarning.
trainX = X_train.assign(price_range=y_train)

testX = X_test.assign(price_range=y_test)

In [21]:
# Persist both splits locally without the index column; these files are uploaded to S3 next.
trainX.to_csv(path_or_buf="train.csv", index=False)
testX.to_csv(path_or_buf="test.csv", index=False)

# Upload the data to S3. SageMaker reads its training data from S3.

In [None]:
# Key prefix under which both CSVs are stored in the bucket.
sk_prefix = "sagemaker/mobile_price_classification/sklearncontainer"

# upload_data returns the S3 URI of each uploaded file.
trainpath = sess.upload_data(path="train.csv", bucket=bucket, key_prefix=sk_prefix)
testpath = sess.upload_data(path="test.csv", bucket=bucket, key_prefix=sk_prefix)

print(trainpath, testpath, sep="\n")

s3://mobilebucketsagemaker77/sagemaker/mobile_price_classification/sklearncontainer/train.csv
s3://mobilebucketsagemaker77/sagemaker/mobile_price_classification/sklearncontainer/test.csv


# Script used by AWS SageMaker to train the model

In [28]:
%%writefile script.py

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score
import sklearn
import joblib
import boto3
import pathlib
from io import StringIO
import argparse
import os
import pandas as pd
import numpy as np


def model_fn(model_dir):
    """Load the trained model from ``model_dir`` and return it.

    The SageMaker scikit-learn serving container calls this hook to obtain
    the model object, so the loaded estimator must be returned.

    Args:
        model_dir: Directory that contains ``model.joblib``.

    Returns:
        The object deserialised by ``joblib.load`` from
        ``<model_dir>/model.joblib``.
    """
    clf = joblib.load(os.path.join(model_dir, "model.joblib"))
    return clf

def parse_args(argv=None):
    """Parse hyperparameters and data/model locations for the training job.

    Every argument is an optional ``--flag`` so hyperparameters can be passed
    on the command line, while directory locations fall back to the
    ``SM_MODEL_DIR`` / ``SM_CHANNEL_TRAIN`` / ``SM_CHANNEL_TEST`` environment
    variables.

    Args:
        argv: Optional list of command-line tokens. When ``None``, argparse
            reads ``sys.argv[1:]``.

    Returns:
        argparse.Namespace with ``n_estimators``, ``random_state``,
        ``model_dir``, ``train``, ``test``, ``train_file`` and ``test_file``.
        Unrecognised arguments are ignored.
    """
    parser = argparse.ArgumentParser()

    # Hyperparameters
    parser.add_argument("--n_estimators", type=int, default=100)
    parser.add_argument("--random_state", type=int, default=0)

    # Data, model, output directories
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    # File names are plain literals, not environment variable lookups.
    parser.add_argument("--train-file", type=str, default="train.csv")
    parser.add_argument("--test-file", type=str, default="test.csv")

    # parse_known_args tolerates extra arguments instead of exiting on them.
    args, _ = parser.parse_known_args(argv)
    return args


if __name__ == "__main__":

    print("Extracting arguments")
    print()

    args = parse_args()

    print("sklearn version :", sklearn.__version__)
    print("joblib version :", joblib.__version__)
    print("Reading data")
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    print("Building training and testing datasets")
    print()
    # The label column is split off from the features in both files.
    X_train = train_df.drop(columns=['price_range'])
    y_train = train_df["price_range"]

    X_test = test_df.drop(columns=['price_range'])
    y_test = test_df["price_range"]

    print("Data shape")
    print()
    print("Training data")
    print(X_train.shape)
    print(y_train.shape)
    print()
    print("Testing data")
    print(X_test.shape)
    print(y_test.shape)
    print()

    print("Training RandomForest Model")
    model = RandomForestClassifier(
        n_estimators=args.n_estimators,
        random_state=args.random_state,
        verbose=2,
        n_jobs=1,
    )

    model.fit(X_train, y_train)

    print()

    # Persist the fitted model under the name that model_fn loads.
    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, model_path)

    print("Model saved at" + model_path)

    # Evaluate on the held-out test file.
    y_pred_test = model.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred_test)
    test_rep = classification_report(y_test, y_pred_test)

    print()
    print("Metrics results for testing data")
    print()
    print("Model accuracy:", test_acc)
    print("Testing report:", test_rep)


    

Overwriting script.py
