In [2]:
# Update and import packages
!pip install -Uqr requirements.txt
!/bin/bash ./local_mode_setup.sh

# Basic packages
import importlib
from time import time
from pathlib import Path
from progress.bar import Bar
import json
import re
import io
from bs4 import BeautifulSoup

# Data science/NLP packages
import numpy as np
import pandas as pd
pd.set_option("display.max_colwidth", None)

# AWS packages
import awswrangler as wr
import sagemaker
from sagemaker.sklearn import SKLearn
from sagemaker.predictor import Predictor
import boto3

# Local modules
import model
import train
# Re-execute the local `model` and `train` modules so that edits made to them
# after the kernel started are picked up without restarting the kernel.
for m in [model, train]:
    importlib.reload(m)

# Bind these names only after the reload so they refer to the reloaded code.
from model import VectorSimilarity, get_fitted_model
from train import combine_dfs

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
awscli 1.19.100 requires botocore==1.20.100, but you have botocore 1.21.11 which is incompatible.
awscli 1.19.100 requires s3transfer<0.5.0,>=0.4.0, but you have s3transfer 0.5.0 which is incompatible.
aiobotocore 1.3.0 requires botocore<1.20.50,>=1.20.49, but you have botocore 1.21.11 which is incompatible.[0m


In [24]:
# Sanity checks on VectorSimilarity
# Three 2-D reference vectors with one label each.
X = np.array([
    [0, 1],
    [1, 0],
    [-1, 0],
])
y = np.array(list('abc'))

# The object returned by fit() is the one predict() is called on.
estimator = VectorSimilarity().fit(X, y)

# A single query row of shape (1, 2).
query_vector = np.array([[1, 2]])
pred, score = estimator.predict(query_vector)

[['a' 'b' 'c']]
[[ 2.  1. -1.]]
Prediction took 0.001055002212524414 seconds


In [6]:
# Basic pipeline setup
# Toy corpus: basic_labels[i] is the label of basic_corpus[i].
basic_labels = ['a', 'b', 'c', 'd']
basic_corpus = [
    "Bees like to make honey",
    "Bears like to eat honey",
    "Bees don't like bears",
    "Humans are walking around the park",
]

pipe = get_fitted_model(basic_corpus, basic_labels, lemmatize='custom')

# Query the fitted pipeline with the very sentences it was fitted on.
pred, score = pipe.predict(basic_corpus)

Training model...
Took 1.6626145839691162 seconds
[['a' 'c' 'b' 'd']
 ['b' 'c' 'a' 'd']
 ['c' 'b' 'a' 'd']
 ['d' 'c' 'b' 'a']]
[[1.         0.27710268 0.27710268 0.        ]
 [1.         0.27710268 0.27710268 0.        ]
 [1.         0.27710268 0.27710268 0.        ]
 [1.         0.         0.         0.        ]]
Prediction took 0.0006358623504638672 seconds


  'stop_words.' % sorted(inconsistent))


In [25]:
# File helper functions
def list_data_objs(
    bucket_name='amplifyobserverinsights-aoinsightslandingbucket29-5vcr471d4nm5',
    bucket_subfolder='data/issues/',
):
    """Return the keys of all S3 objects stored under a prefix.

    The listing is paginated, so prefixes holding more objects than a single
    ``list_objects_v2`` response can carry are returned in full, and an empty
    prefix yields an empty list instead of raising ``KeyError``.

    Args:
        bucket_name: Name of the bucket to list.
        bucket_subfolder: Key prefix to list under.

    Returns:
        list[str]: Object keys (not ``s3://`` URLs), in the order S3 reports
        them. Keys are returned unfiltered, so any placeholder key that S3
        lists for the prefix itself is included.
    """
    # No explicit credentials are passed; boto3 resolves them on its own.
    s3 = boto3.client('s3')
    paginator = s3.get_paginator('list_objects_v2')

    data_obj_names = []
    for page in paginator.paginate(Bucket=bucket_name, Prefix=bucket_subfolder):
        # A page without any objects has no 'Contents' entry at all.
        data_obj_names.extend(obj['Key'] for obj in page.get('Contents', []))

    return data_obj_names


def download_data(
    filename,
    data_obj_names,
    bucket_name='amplifyobserverinsights-aoinsightslandingbucket29-5vcr471d4nm5',
):
    """Download parquet objects from S3, combine them and cache the result as CSV.

    Args:
        filename: Path the combined frame is written to with ``DataFrame.to_csv``.
        data_obj_names: Iterable-with-length of S3 object keys to fetch; every
            object is read as a parquet file.
        bucket_name: Bucket the keys live in.

    Returns:
        The frame produced by ``combine_dfs`` from the per-object frames.
    """
    s3 = boto3.client('s3')
    dfs = []

    # Leaving the `with` block finishes the progress bar, so no explicit
    # bar.finish() call is needed (it would finish the bar a second time).
    with Bar(
        message='Downloading parquets',
        check_tty=False,
        hide_cursor=False,
        max=len(data_obj_names)
    ) as bar:
        for obj_name in data_obj_names:
            obj = s3.get_object(Bucket=bucket_name, Key=obj_name)
            # Buffer the whole body in memory so pandas can read it like a file.
            dfs.append(pd.read_parquet(io.BytesIO(obj['Body'].read())))
            bar.next()

    df = combine_dfs(dfs)
    # Written with the default to_csv settings, i.e. the index is the first column.
    df.to_csv(filename)

    return df

def get_data(filename, force_redownload=False):
    """Load the training frame from a local CSV cache, downloading it if needed.

    Args:
        filename: Path of the CSV cache file.
        force_redownload: When True, ignore an existing cache file and fetch
            the data again (which also rewrites the cache).

    Returns:
        The frame read from the cache, or the one returned by ``download_data``.
    """
    start = time()

    if Path(filename).is_file() and not force_redownload:
        print('Deserializing data from', filename, '...')
        # download_data() writes the frame with its index as the first CSV
        # column; restore it rather than surfacing an extra "Unnamed: 0" column.
        df = pd.read_csv(filename, index_col=0)

    else:
        # The listing can contain a "folder" placeholder key (ending in "/")
        # that holds no parquet data. Filter those out by name instead of
        # blindly dropping the first entry, which would discard a real object
        # whenever no placeholder is present.
        data_obj_names = [
            name for name in list_data_objs() if not name.endswith('/')
        ]
        df = download_data(filename, data_obj_names)

    print('Took', time() - start, 'seconds')
    return df

In [27]:
# Data helper functions
def query_df(df, **kwargs):
    """Return the rows of ``df`` whose columns equal all the given values.

    Args:
        df: Frame to filter.
        **kwargs: ``column=value`` pairs; a row is kept only when every named
            column equals its value. With no pairs, every row is kept.

    Returns:
        The matching rows of ``df``.

    Raises:
        KeyError: If a keyword does not name a column of ``df``.
    """
    # Start from an all-True mask aligned with df so that calling with no
    # filters selects everything instead of evaluating df[True].
    mask = pd.Series(True, index=df.index)
    for key, value in kwargs.items():
        mask = mask & (df[key] == value)

    return df[mask]


In [28]:
# Download and compile parquets
# force_redownload=True bypasses any CSV already cached at this path.
df = get_data(filename='training_data.csv', force_redownload=True)

Downloading parquets |################################| 18/18



Took 2.2039387226104736 seconds


In [30]:
# Train model
train_df = df
corpus_col = 'title_body'
url_col = 'url'
title_col = 'title'

# Each document is labelled with the (url, title) pair taken from its row.
corpus = train_df[corpus_col]
labels = [*zip(train_df[url_col], train_df[title_col])]

pipe = get_fitted_model(corpus, labels, lemmatize='custom')

Training model...


  'stop_words.' % sorted(inconsistent))


Took 73.72894024848938 seconds


In [None]:
# Query the fitted pipeline with one free-text issue description.
pw_mgr_query = [
    'AmplifySignIn component does not work with password managers or native browser autofill',
]
pred, score = pipe.predict(pw_mgr_query)

In [31]:
# Model stats
vectorizer = pipe['tfidfvectorizer']
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement when available, and keep `vocab` a plain list so
# len() and str(vocab) behave the same on either version.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = vectorizer.get_feature_names_out().tolist()
else:
    vocab = vectorizer.get_feature_names()
print('Number of vocab words:', len(vocab))
repo_list = list(set(df['repo']))
print('Available repos:', repo_list)
# Very short tokens are listed separately for inspection.
small_words = [word for word in vocab if len(word) < 3]
print('Small words in vocab:\n', small_words)

# print(pipe[1]._labels)

Number of vocab words: 23750
Available repos: ['amplify-ci-support', 'amplify-ios', 'amplify-adminui', 'aws-appsync-realtime-client-ios', 'amplify-android', 'amplify-cli', 'amplify-js-samples', 'amplify-codegen', 'community', 'aws-sdk-ios', 'amplify-console', 'docs', 'amplify-ui', 'aws-amplify.github.io', 'amplify-js', 'amplify-flutter', 'amplify-observer', 'aws-sdk-android']
Small words in vocab:
 ['ad', 'al', 'au', 'az', 'bk', 'ca', 'cc', 'cd', 'cf', 'ci', 'cm', 'cs', 'cu', 'cv', 'db', 'dm', 'dy', 'ec', 'ed', 'em', 'er', 'es', 'fo', 'gb', 'gc', 'gi', 'gm', 'hl', 'hr', 'hz', 'ic', 'id', 'io', 'ip', 'iv', 'ki', 'km', 'kv', 'l', 'lf', 'lm', 'mb', 'mi', 'os', 'pc', 'pe', 'po', 'pr', 'r', 's', 'sc', 'sd', 'si', 'sl', 'sm', 'sn', 'ss', 'ti', 'tl', 'tn', 'tt', 'vi', 'w', 'x', 'zu']


In [None]:
# Save the vocabulary to disk as the text of its Python repr.
vocab_path = Path('vocab.txt')
with vocab_path.open('w') as vocab_file:
    vocab_file.write(str(vocab))

In [None]:
# Build the tokenizer of the first pipeline step and try it on inflected
# variants of the same words.
tokenizer = pipe[0].build_tokenizer()
print(tokenizer.custom)

sample_text = 'walk walks talk talks talking talked'
token = tokenizer(sample_text)
print(token)

In [None]:
# NOTE(review): js_df, inspect_doc and get_weights are not defined in any cell
# visible in this notebook, so this cell raises NameError on a fresh kernel -
# confirm where they are meant to come from.
# Select the corpus text of the row whose `number` is 8108 (.item() requires
# exactly one matching row) and inspect it with the first pipeline step.
js_issue = query_df(js_df, number=8108)[corpus_col].item()
inspect_doc(pipe[0], js_issue)
get_weights(pipe[0], 'workarounds issued cognito pool', js_issue)

In [None]:
# Train SM estimator
train_data = 's3://amplifyobserverinsights-aoinsightslandingbucket29-5vcr471d4nm5/data/issues/'
execution_role = sagemaker.get_execution_role()

# instance_type='local' selects SageMaker local mode instead of a managed instance.
sklearn_estimator = SKLearn(
    entry_point='train.py',
    framework_version='0.23-1',
    instance_type='local',
    role=execution_role,
    hyperparameters={'n-best': 10},
    dependencies=['.'],
)

# The S3 prefix is supplied to the training job as the 'train' channel.
sklearn_estimator.fit({'train': train_data})

<<<<<<< local


Creating 096xlv7tu0-algo-1-vff22 ... 
Creating 096xlv7tu0-algo-1-vff22 ... done
Attaching to 096xlv7tu0-algo-1-vff22
[36m096xlv7tu0-algo-1-vff22 |[0m 2021-07-30 23:00:59,732 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training
[36m096xlv7tu0-algo-1-vff22 |[0m 2021-07-30 23:00:59,735 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
[36m096xlv7tu0-algo-1-vff22 |[0m 2021-07-30 23:00:59,744 sagemaker_sklearn_container.training INFO     Invoking user training script.
[36m096xlv7tu0-algo-1-vff22 |[0m 2021-07-30 23:00:59,940 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:
[36m096xlv7tu0-algo-1-vff22 |[0m /miniconda3/bin/python -m pip install -r requirements.txt
[36m096xlv7tu0-algo-1-vff22 |[0m Collecting progress
[36m096xlv7tu0-algo-1-vff22 |[0m   Downloading progress-1.6.tar.gz (7.8 kB)
[36m096xlv7tu0-algo-1-vff22 |[0m Collecting awswrangler
[36m096xlv7tu0-algo-1-vff22 |[0m



2021-07-31 00:05:15 Starting - Starting the training job...
2021-07-31 00:05:38 Starting - Launching requested ML instancesProfilerReport-1627689913: InProgress
......
2021-07-31 00:06:39 Starting - Preparing the instances for training......
2021-07-31 00:07:42 Downloading - Downloading input data...
2021-07-31 00:08:10 Training - Downloading the training image..
2021-07-31 00:08:39 Training - Training image download completed. Training in progress.[34m2021-07-31 00:08:27,381 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2021-07-31 00:08:27,383 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-07-31 00:08:27,392 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2021-07-31 00:08:28,100 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:[0m
[34m/miniconda3/bin/python -m pip install -r requirements.txt[0m
[34mCollecting pro

UnexpectedStatusException: Error for Training job sagemaker-scikit-learn-2021-07-31-00-05-12-960: Failed. Reason: AlgorithmError: framework error: 
Traceback (most recent call last):
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_containers/_trainer.py", line 84, in train
    entrypoint()
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_sklearn_container/training.py", line 39, in main
    train(environment.Environment())
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_sklearn_container/training.py", line 35, in train
    runner_type=runner.ProcessRunnerType)
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_training/entry_point.py", line 100, in run
    wait, capture_error
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_training/process.py", line 161, in run
    cwd=environment.code_dir,
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_training/process.py", line 81, in check_error
    raise error_class(return_code=return_code, cmd=" ".join(cmd), output=stderr)
sagemaker_training.errors.ExecuteUserScriptError: ExecuteUserScriptError:
Command "/miniconda3/bin/python 

>>>>>>> remote


Using the short-lived AWS credentials found in session. They might expire while running.


FileNotFoundError: [Errno 2] No such file or directory: 'docker': 'docker'

In [1]:
# Deploy SM predictor
# Requires the import cell (Predictor, sagemaker, boto3) and the training cell
# (sklearn_estimator) to have been run first.
ENDPOINT_NAME = 'issue-similarity-endpoint'
INSTANCE_TYPE = 'ml.m5.4xlarge'

# Client-side handle for the already existing endpoint.
predictor = Predictor(ENDPOINT_NAME)

# Named sm_model rather than `model` so the imported local `model` module is
# not shadowed by this cell.
sm_model = sklearn_estimator.create_model()
session = sm_model.sagemaker_session
role = sagemaker.get_execution_role()
model_name = sm_model.name
container_def = sm_model.prepare_container_def(instance_type=INSTANCE_TYPE)
session.create_model(model_name, role, container_def)

# The endpoint config reuses the model's name.
endpoint_config_name = session.create_endpoint_config(
    name=model_name,
    model_name=model_name,
    initial_instance_count=1,
    instance_type=INSTANCE_TYPE,
)

# Update desired endpoint with new Endpoint Config
client = boto3.client('sagemaker')
client.update_endpoint(EndpointName=ENDPOINT_NAME,
                       EndpointConfigName=endpoint_config_name)

NameError: name 'Predictor' is not defined

In [22]:
# Send one query to the endpoint as a JSON document of the form {"data": [...]}.
data = json.dumps({'data': ['DataStore model subscription fails']})
response = predictor.predict(data)
print(response)

b'[[[["https://github.com/aws-amplify/amplify-cli/issues/3114", "DataStore generates subscriptions for all @model types"], ["https://github.com/aws-amplify/amplify-js/issues/5648", "DataStore keeps searching for non-existent Subscriptions"], ["https://github.com/aws-amplify/amplify-android/issues/1388", "Datastore model subscription fails due to timeout under slow network connection"], ["https://github.com/aws-amplify/amplify-js/issues/5173", "Sync processor does not follow @model subscriptions level"], ["https://github.com/aws-amplify/amplify-android/issues/483", "Subscription fails when following Getting Started doc steps"], ["https://github.com/aws-amplify/amplify-cli/issues/1810", "GraphQL AppSync Subscriptions not generated by AWS Amplify for custom mutations"], ["https://github.com/aws-amplify/amplify-js/issues/4683", "Authed subscriptions not working"], ["https://github.com/aws-amplify/amplify-js/issues/7318", "DataStore fails to initialize subscriptions when @auth directive wit

In [None]:
# with open('extra_tokens.txt', 'w') as f:
#     f.write(str(set(vocab2) - set(vocab)))

In [None]:
# NOTE(review): js_df and inspect_doc are not defined in any cell visible in
# this notebook, so this cell raises NameError on a fresh kernel - confirm
# where they are meant to come from.
# Select the 'bodyCleaned' column of the rows whose `number` is 5782 and
# inspect it with the first pipeline step.
pw_mgr = query_df(js_df, number=5782)['bodyCleaned']
inspect_doc(pipe[0], pw_mgr)

In [None]:
# Ad-hoc query; the bare last expression displays whatever predict() returns.
js_issue = [
    'image file upload fail file size 5 mb',
]
pipe.predict(js_issue)

In [None]:
# Ad-hoc query about request headers.
js_issue = ['additionalHeaders param is never passed to underlying function -- How do I access the current request headers or set them per request']
pred, score = pipe.predict(js_issue)
# NOTE(review): small_df and inspect_doc are not defined in any cell visible in
# this notebook, so the next line raises NameError on a fresh kernel - confirm
# where they are meant to come from.
# Inspect the 'title_body' text of the amplify-console row numbered 1519.
inspect_doc(pipe[0], query_df(small_df, repo='amplify-console', number=1519)['title_body'])