In [55]:
import pandas as pd
import numpy as np
from sagemaker.sklearn.estimator import SKLearn
import boto3
import sagemaker
from sagemaker.predictor import csv_serializer, json_deserializer

In [8]:
# SageMaker session plus the execution role of this notebook; `role` is handed to the
# SKLearn estimator further down. `sess` is not referenced again in the cells shown here.
sess = sagemaker.Session()
role = sagemaker.get_execution_role()

In [9]:
# S3 bucket and key prefix that hold the lyrics dataset.
bucket = 'djk-ml-sagemaker'
key = 'music_lyrics'

# Full S3 URIs of the two CSV splits; train_path is read locally with pandas and is
# also passed to the estimator as the 'train' channel.
train_path = f's3://{bucket}/{key}/train.csv'
test_path = f's3://{bucket}/{key}/test.csv'

#### NOTE: SageMaker's prebuilt scikit-learn container only works with scikit-learn 0.20.0.
#### Pin and version-control your dependencies accordingly!

In [68]:
# joblib is not part of the notebook's import cell, so import it here to keep this
# cell runnable on a fresh kernel (same import the training script uses).
from sklearn.externals import joblib

# Load the locally trained model artifact.
# NOTE: joblib files are pickles and execute code on load; only load trusted files.
xgb = joblib.load('xgb_model.sav')

In [69]:
# Read the training split from the S3 URI built in the path-configuration cell.
train = pd.read_csv(train_path)

In [70]:
# Split by position: the first column is used as the target, the second as the model input.
y_train = train.iloc[:, 0]
X_train = train.iloc[:, 1]

In [72]:
# Smoke test: the loaded model is called with a list holding one raw string.
xgb.predict(['hi my name is dave'])

array([1.])

In [80]:
# Second local check with a full-length lyric; the same text is sent to the hosted endpoint later.
sample_lyric = 'look cause middle rumor look be left face place beetle lip shine fee happy look child wife life street beat till would track train rain come fast train fast race place running will long till bring wasting waste time big town look child train quarter till sixteen get mess leg start drag start drag street go yeah yeah look girl look boy look girl look girl look world look child wife life street beat till would track train rain come fast '
xgb.predict([sample_lyric])

array([0.])

In [85]:
%%writefile 4d_random_forest_custom_train.py

# libraries necessary for name==main stuff
import argparse
import os

# dataframe libraries
import numpy as np
import pandas as pd

# pipeline building libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import Pipeline

# algorithm library
from sklearn.ensemble import RandomForestClassifier

# persistence function
from sklearn.externals import joblib


# inference functions ---------------
def model_fn(model_dir):
    """Load and return the fitted model stored in ``model_dir``.

    The file name has to match the one the ``__main__`` section of this script
    passes to ``joblib.dump`` ("model.joblib").
    """
    model_path = os.path.join(model_dir, "model.joblib")
    return joblib.load(model_path)



if __name__ == '__main__':
    # Training entry point: parse arguments, fit the text-classification
    # pipeline on the training CSV and persist it for model_fn to load.

    parser = argparse.ArgumentParser()

    # Hyperparameters; the defaults reproduce the tuned random forest used so far.
    parser.add_argument('--n-estimators', type=int, default=1200)
    parser.add_argument('--min-samples-leaf', type=int, default=2)

    # Data and model directories (defaults come from the SM_* environment variables).
    parser.add_argument('--model-dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
    parser.add_argument('--train-file', type=str, default='train.csv')

    # parse_known_args tolerates extra arguments instead of exiting on them.
    args, _ = parser.parse_known_args()

    # Fail with a clear message instead of trying to read 'None/train.csv'.
    for flag, value in (('--model-dir', args.model_dir), ('--train', args.train)):
        if value is None:
            parser.error(f'{flag} was not given and its SM_* environment variable is not set')

    train = pd.read_csv(os.path.join(args.train, args.train_file))
    y_train = train['liked']     # target column
    X_train = train.iloc[:, 1]   # second column, passed to the TF-IDF vectorizer

    # train
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    variance_filter = VarianceThreshold(.0005)
    best_random_forest = RandomForestClassifier(bootstrap=True, class_weight=None,
                                                criterion='gini', max_depth=None,
                                                max_features='auto',
                                                max_leaf_nodes=None,
                                                min_impurity_decrease=0.0,
                                                min_impurity_split=None,
                                                min_samples_leaf=args.min_samples_leaf,
                                                min_samples_split=10,
                                                min_weight_fraction_leaf=0.0,
                                                n_estimators=args.n_estimators, n_jobs=None,
                                                oob_score=False, random_state=42,
                                                verbose=0, warm_start=False)
    pipeline_steps = [
        ('tfidf_vectorizer', tfidf_vectorizer),  # term frequency-inverse document frequency vectorizer
        ('variance_filter', variance_filter),    # removes low-variance columns from the dataset
        ('classifier', best_random_forest)
    ]
    model = Pipeline(steps=pipeline_steps)

    model.fit(X_train, y_train)

    # Persist the fitted pipeline under the file name model_fn expects.
    path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, path)

    # No explicit S3 upload here: SageMaker archives the contents of the model
    # directory to S3 when the training job finishes.
    

Writing 4d_random_forest_custom_train.py


In [51]:
# SageMaker Python SDK estimator for the prebuilt scikit-learn container.
# entry_point must name the script written by the %%writefile cell in this
# notebook (4d_random_forest_custom_train.py); otherwise a fresh
# Restart & Run All cannot find the training script.
sklearn_estimator = SKLearn(
    entry_point='4d_random_forest_custom_train.py',
    role=role,
    train_instance_count=1,
    train_instance_type='ml.c5.xlarge',
    framework_version='0.20.0')

In [52]:
# Launch the training job with the S3 training CSV as the 'train' channel.
# Called with default arguments, fit() waits for the job and streams its log,
# as the output below shows, so this cell is not asynchronous.
sklearn_estimator.fit({'train':train_path})

2020-01-17 08:56:30 Starting - Starting the training job...
2020-01-17 08:56:32 Starting - Launching requested ML instances......
2020-01-17 08:57:39 Starting - Preparing the instances for training...
2020-01-17 08:58:18 Downloading - Downloading input data...
2020-01-17 08:58:53 Training - Training image download completed. Training in progress..[34m2020-01-17 08:58:53,389 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2020-01-17 08:58:53,392 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-01-17 08:58:53,400 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2020-01-17 08:58:53,616 sagemaker-containers INFO     Module random_forest_custom_train does not provide a setup.py. [0m
[34mGenerating setup.py[0m
[34m2020-01-17 08:58:53,616 sagemaker-containers INFO     Generating setup.cfg[0m
[34m2020-01-17 08:58:53,616 sagemaker-containers INFO     Generating MA

In [54]:
# Deploy the trained model to a single-instance real-time endpoint.
# NOTE(review): 'reommendation' looks like a typo for 'recommendation'. The same
# literal is used when re-attaching to the endpoint later, so rename both together.
predictor = sklearn_estimator.deploy(
    initial_instance_count = 1,
    instance_type = 'ml.t2.medium',
    endpoint_name = 'nlp-reommendation-v0'
)

Using already existing model: sagemaker-scikit-learn-2020-01-17-08-56-30-066


-----------------------!

In [64]:
# Re-attach to the already deployed endpoint by name (must match the name used in deploy()).
endpoint_name = 'nlp-reommendation-v0'
# NOTE(review): cloud_predictor is reassigned to an SKLearnPredictor in the next cell,
# so this generic predictor is not the one used for the prediction below.
cloud_predictor = sagemaker.predictor.RealTimePredictor(endpoint = endpoint_name)

In [76]:
# Replace the generic predictor with the scikit-learn specific one for the same endpoint;
# depends on endpoint_name from the previous cell.
cloud_predictor = sagemaker.sklearn.model.SKLearnPredictor(endpoint_name=endpoint_name)

In [84]:
# Send the lyric that the local model labelled 0 to the hosted endpoint for comparison.
lyric = 'look cause middle rumor look be left face place beetle lip shine fee happy look child wife life street beat till would track train rain come fast train fast race place running will long till bring wasting waste time big town look child train quarter till sixteen get mess leg start drag start drag street go yeah yeah look girl look boy look girl look girl look world look child wife life street beat till would track train rain come fast '

cloud_predictor.predict([lyric])

array([0.])