In [114]:
# Update and import packages
!pip install -Uqr requirements.txt

# Basic packages
import importlib
from time import time
from pathlib import Path
from progress.bar import Bar
import json
import re
import io
from bs4 import BeautifulSoup

# Data science/NLP packages
import numpy as np
import pandas as pd
pd.set_option("display.max_colwidth", None)

# AWS packages
import awswrangler as wr
import sagemaker
from sagemaker.sklearn import SKLearn
from sagemaker.predictor import Predictor
import boto3

# Local modules
import model
import train
for m in [model, train]:
    importlib.reload(m)

from model import VectorSimilarity, get_fitted_model
from train import combine_dfs

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
awscli 1.20.2 requires botocore==1.21.2, but you have botocore 1.21.10 which is incompatible.
aiobotocore 1.3.3 requires botocore<1.20.107,>=1.20.106, but you have botocore 1.21.10 which is incompatible.[0m


In [2]:
# Sanity checks on VectorSimilarity
# Three 2-D training vectors, one label per row.
X = np.array(
    [[0, 1],
     [1, 0],
     [-1, 0]])
y = np.array(['a', 'b', 'c'])

estimator = VectorSimilarity()
estimator = estimator.fit(X, y)
# Query with the single row vector [1, 2]. The recorded output ranks the labels
# a, b, c with scores 2, 1, -1, which equal the dot products of the query with
# each training row. predict() also prints the ranking and the elapsed time.
estimator.predict(np.array([1, 2]).reshape(1, -1))

[['a' 'b' 'c']]
[[ 2.  1. -1.]]
Prediction took 0.000789642333984375 seconds


(array([['a', 'b', 'c']], dtype='<U1'), array([[ 2.,  1., -1.]]))

In [3]:
# Basic pipeline setup
# Four-sentence toy corpus with one label per sentence.
basic_corpus = [
    'Bees like to make honey',
    'Bears like to eat honey',
    'Bees don\'t like bears',
    'Humans are walking around the park'
]
basic_labels = ['a', 'b', 'c', 'd']

# Fit on the toy corpus, then predict on the very same sentences. In the
# recorded output each sentence's own label comes first with a score of 1.
pipe = get_fitted_model(basic_corpus, basic_labels, lemmatize='custom')
pred, score = pipe.predict(basic_corpus)

Training model...
Took 1.484374761581421 seconds
[['a' 'c' 'b' 'd']
 ['b' 'c' 'a' 'd']
 ['c' 'b' 'a' 'd']
 ['d' 'c' 'b' 'a']]
[[1.         0.27710268 0.27710268 0.        ]
 [1.         0.27710268 0.27710268 0.        ]
 [1.         0.27710268 0.27710268 0.        ]
 [1.         0.         0.         0.        ]]
Prediction took 0.0005955696105957031 seconds


  'stop_words.' % sorted(inconsistent))


In [62]:
# File helper functions
def list_data_objs(
    bucket_name='amplifyobserverinsights-aoinsightslandingbucket29-5vcr471d4nm5',
    bucket_subfolder='data/issues/',
):
    """List the keys of every S3 object stored under a prefix.

    Parameters
    ----------
    bucket_name : str, optional
        Bucket to list. Defaults to the bucket this notebook reads from.
    bucket_subfolder : str, optional
        Key prefix to list under. Defaults to ``'data/issues/'``.

    Returns
    -------
    list of str
        Object keys (plain keys, not ``s3://`` URIs) in the order S3 returns
        them. Empty when nothing matches the prefix.
    """
    s3 = boto3.client('s3')

    # A single list_objects_v2 call returns at most 1000 keys and has no
    # 'Contents' entry when nothing matches, so walk every page and treat a
    # missing 'Contents' as an empty page.
    paginator = s3.get_paginator('list_objects_v2')
    pages = paginator.paginate(Bucket=bucket_name, Prefix=bucket_subfolder)
    data_obj_names = [
        obj['Key']
        for page in pages
        for obj in page.get('Contents', [])
    ]

    return data_obj_names


def download_data(
    filename,
    data_obj_names,
    bucket_name='amplifyobserverinsights-aoinsightslandingbucket29-5vcr471d4nm5',
):
    """Download parquet objects from S3, combine them and save the result as CSV.

    Parameters
    ----------
    filename : str or path-like
        Destination of the combined CSV.
    data_obj_names : sequence of str
        S3 object keys to download; each one is read as a parquet file.
    bucket_name : str, optional
        Bucket holding the objects. Defaults to the bucket this notebook
        reads from.

    Returns
    -------
    pandas.DataFrame
        Whatever ``combine_dfs`` returns for the downloaded frames.
    """
    s3 = boto3.client('s3')
    dfs = []

    with Bar(
        message='Downloading parquets',
        check_tty=False,
        hide_cursor=False,
        max=len(data_obj_names)
    ) as bar:

        for obj_name in data_obj_names:
            obj = s3.get_object(Bucket=bucket_name, Key=obj_name)
            # Pull the whole body into an in-memory buffer for read_parquet.
            part = pd.read_parquet(io.BytesIO(obj['Body'].read()))
            dfs.append(part)
            bar.next()

        bar.finish()

    df = combine_dfs(dfs)
    # Written with the default to_csv settings, i.e. including the index.
    df.to_csv(filename)

    return df

def get_data(filename, force_redownload=False):
    """Load the combined dataset, from the local CSV when one is available.

    Parameters
    ----------
    filename : str or path-like
        Location of the CSV. It is read when it exists and is written by
        ``download_data`` otherwise.
    force_redownload : bool, optional
        When True, ignore an existing CSV and download again.

    Returns
    -------
    pandas.DataFrame
        The combined data. The elapsed time is printed as a side effect.
    """
    start = time()

    if Path(filename).is_file() and not force_redownload:
        print('Deserializing data from', filename, '...')
        # download_data saves with DataFrame.to_csv(filename), which stores the
        # index as the first column. Read that column back as the index so the
        # cached frame has the same columns as a freshly downloaded one rather
        # than gaining a spurious 'Unnamed: 0' column.
        df = pd.read_csv(filename, index_col=0)
    else:
        data_obj_names = list_data_objs()
        # TODO: the first listed entry is skipped because the listing returns
        # an empty item there; replace with an explicit filter once it is
        # confirmed what that entry is.
        df = download_data(filename, data_obj_names[1:])

    print('Took', time() - start, 'seconds')
    return df

In [6]:
# Data helper functions
def query_df(df, **kwargs):
    """Return the rows of ``df`` whose columns equal all the given values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    **kwargs
        ``column=value`` pairs; a row is kept only if every named column
        equals its value.

    Returns
    -------
    pandas.DataFrame
        The matching rows. With no filters, every row is returned.
    """
    # Start from an all-True boolean mask (not the Python literal True) so the
    # no-filter case selects every row instead of being looked up as a column.
    mask = pd.Series(True, index=df.index)
    for column, value in kwargs.items():
        mask &= (df[column] == value)

    return df[mask]

def inspect_doc(vectorizer, doc, n_best=10):
    """Print the highest-weighted vocabulary terms of a single document.

    Two lines are printed: the ``n_best`` terms with the largest weights, in
    descending order, and their weights.

    Parameters
    ----------
    vectorizer : fitted vectorizer
        Must provide ``transform`` plus either ``get_feature_names_out`` or
        the older ``get_feature_names``.
    doc : str or sequence of str
        The document, or a sequence holding at most one document.
    n_best : int, optional
        How many of the top-weighted terms to print.

    Raises
    ------
    ValueError
        If ``doc`` holds more than one document.
    """
    if isinstance(doc, str):
        doc = [doc]

    if len(doc) > 1:
        raise ValueError('Only one document per call is supported')

    # Newer scikit-learn releases expose get_feature_names_out in place of
    # get_feature_names; use whichever one this vectorizer offers.
    feature_names = getattr(vectorizer, 'get_feature_names_out', None)
    if feature_names is None:
        feature_names = vectorizer.get_feature_names

    vocab = np.array(feature_names(), ndmin=2)
    weights = vectorizer.transform(doc).toarray()
    # Column indices of each row, ordered by descending weight.
    order = np.argsort(weights, axis=1)[:, ::-1]
    words_desc = np.take_along_axis(vocab, order, axis=1)
    weights_desc = np.take_along_axis(weights, order, axis=1)

    print(words_desc[:, :n_best])
    print(weights_desc[:, :n_best])
    
def get_weights(vectorizer, query, doc, n_best=10):
    """Print the weight each query token carries in a document's vector.

    Three lines are printed: the query tokens, their vocabulary column
    indices, and the document weights at those columns. A token that is not
    in the vocabulary is reported with index -1 and weight 0 instead of
    raising.

    Parameters
    ----------
    vectorizer : fitted vectorizer
        Must provide ``build_tokenizer``, ``transform`` and either
        ``get_feature_names_out`` or the older ``get_feature_names``.
    query : str
        Text whose tokens are looked up.
    doc : str or sequence of str
        Document to vectorize; a bare string is wrapped in a list.
    n_best : int, optional
        Number of leading token weights to print (in query-token order).
    """
    if isinstance(doc, str):
        doc = [doc]

    tokenizer = vectorizer.build_tokenizer()
    tokens = tokenizer(query)
    print(tokens)

    # Newer scikit-learn releases expose get_feature_names_out in place of
    # get_feature_names; use whichever one this vectorizer offers.
    feature_names = getattr(vectorizer, 'get_feature_names_out', None)
    if feature_names is None:
        feature_names = vectorizer.get_feature_names

    # One dict lookup per token instead of scanning the vocabulary each time;
    # -1 marks tokens that are not in the vocabulary.
    column_of = {word: col for col, word in enumerate(feature_names())}
    indices = np.array([column_of.get(tok, -1) for tok in tokens], dtype=int)

    weights = vectorizer.transform(doc).toarray().flatten()
    print(indices)

    found = indices >= 0
    token_weights = np.zeros(len(indices), dtype=weights.dtype)
    token_weights[found] = weights[indices[found]]
    print(token_weights[:n_best])
    


In [7]:
# Show the SageMaker execution role, the default boto3 region and a SageMaker
# client, to confirm which AWS context the notebook is running in.
role = sagemaker.get_execution_role()
print(role)
region = boto3.Session().region_name
smclient = boto3.Session().client('sagemaker')
print(region)
print(smclient)

arn:aws:iam::092109498566:role/service-role/AmazonSageMaker-ExecutionRole-20210728T153017
us-west-2
<botocore.client.SageMaker object at 0x7f17559ebc90>


In [63]:
# Download and compile parquets
# `iden` is only consumed by the commented-out print below.
iden = boto3.client('sts').get_caller_identity()
# print(iden)
# force_redownload=True skips any existing training_data.csv and fetches the
# parquets again.
df = get_data('training_data.csv', force_redownload=True)

Downloading parquets |################################| 18/18



Took 1.9556849002838135 seconds


In [75]:
# Train model
# NOTE(review): corpus_col and label_col are not defined in any visible cell,
# so this relies on kernel state and fails on a fresh Restart & Run All.
# Define them in a config cell - TODO confirm the intended column names.
corpus = df[corpus_col]
labels = df[label_col]

# Rebinds `pipe`, replacing the toy pipeline fitted earlier.
pipe = get_fitted_model(corpus, labels, lemmatize='custom')

Training model...


  'stop_words.' % sorted(inconsistent))


Took 76.53842520713806 seconds


In [76]:
# Model stats
# Vocabulary learned by the pipeline's 'tfidfvectorizer' step.
vocab = pipe['tfidfvectorizer'].get_feature_names()
print('Number of vocab words:', len(vocab))
# Distinct repos in the data; built from a set, so the order is arbitrary.
repo_list = list(set(df['repo']))
print('Available repos:', repo_list)
# Vocabulary entries shorter than three characters.
small_words = [word for word in vocab if len(word) < 3]
print('Small words in vocab:\n', small_words)

Number of vocab words: 23734
Available repos: ['amplify-flutter', 'community', 'docs', 'amplify-ios', 'amplify-console', 'amplify-observer', 'amplify-cli', 'aws-appsync-realtime-client-ios', 'aws-sdk-ios', 'amplify-ci-support', 'amplify-codegen', 'amplify-js-samples', 'aws-amplify.github.io', 'aws-sdk-android', 'amplify-ui', 'amplify-js', 'amplify-android', 'amplify-adminui']
Small words in vocab:
 ['ad', 'al', 'au', 'az', 'bk', 'ca', 'cc', 'cd', 'cf', 'ci', 'cm', 'cs', 'cu', 'cv', 'db', 'dm', 'dy', 'ec', 'ed', 'em', 'er', 'es', 'fo', 'gb', 'gc', 'gi', 'gm', 'hl', 'hr', 'hz', 'ic', 'id', 'io', 'ip', 'iv', 'ki', 'km', 'kv', 'l', 'lf', 'lm', 'mb', 'mi', 'os', 'pc', 'pe', 'po', 'pr', 'r', 's', 'sc', 'sd', 'si', 'sl', 'sm', 'sn', 'ss', 'ti', 'tl', 'tn', 'tt', 'vi', 'w', 'x', 'zu']


In [None]:
# Write the string representation of the vocabulary list to vocab.txt.
with open('vocab.txt', 'w') as f:
    f.write(str(vocab))

In [None]:
# Build the tokenizer of the first pipeline step and run it on a few inflected
# word forms to see what tokens come out.
tokenizer = pipe[0].build_tokenizer()
# `custom` is an attribute of the project's tokenizer; presumably it reflects
# the lemmatize='custom' setting - TODO confirm in model.py.
print(tokenizer.custom)
token = tokenizer('walk walks talk talks talking talked')
print(token)

In [None]:
# Look up issue number 8108, print its top-weighted terms, then print the
# weights of selected query tokens within that issue.
# NOTE(review): js_df and corpus_col are not defined in any visible cell.
js_issue = query_df(js_df, number=8108)[corpus_col].item()
inspect_doc(pipe[0], js_issue)
get_weights(pipe[0], 'workarounds issued cognito pool', js_issue)

In [123]:
# Train SM estimator
# S3 prefix passed to the training job as its 'train' channel.
train_data = 's3://amplifyobserverinsights-aoinsightslandingbucket29-5vcr471d4nm5/data/issues/'
# train.py is the entry point; dependencies=['.'] sends the current directory
# along with it (the job log shows requirements.txt being installed).
sklearn_estimator = SKLearn(
    'train.py',
    role=sagemaker.get_execution_role(),
    instance_type='ml.m5.4xlarge',
    framework_version='0.23-1',
    hyperparameters = {'n-best': 10},
    dependencies=['.']
)
# Starts the training job; the captured output below is the streamed job log.
sklearn_estimator.fit({'train': train_data})

2021-07-29 21:32:48 Starting - Starting the training job...
2021-07-29 21:33:03 Starting - Launching requested ML instancesProfilerReport-1627594367: InProgress
......
2021-07-29 21:34:05 Starting - Preparing the instances for training.........
2021-07-29 21:35:45 Downloading - Downloading input data...
2021-07-29 21:36:06 Training - Training image download completed. Training in progress..[34m2021-07-29 21:36:10,019 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2021-07-29 21:36:10,021 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-07-29 21:36:10,029 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2021-07-29 21:36:11,934 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:[0m
[34m/miniconda3/bin/python -m pip install -r requirements.txt[0m
[34mCollecting progress
  Downloading progress-1.6.tar.gz (7.8 kB)[0m
[34mCo

In [124]:
# Deploy SM predictor
# predictor = sklearn_estimator.deploy(instance_type='ml.m5.4xlarge',
#                                      initial_instance_count=1,
# #                                      update_endpoint=True,
#                                     endpoint_name='issue-similarity-endpoint')
# Attach to the endpoint by name rather than deploying a new one.
predictor = Predictor('issue-similarity-endpoint')
# NOTE(review): `model` rebinds the name of the local `model` module imported
# in the first cell; a distinct name would avoid the shadowing.
model = sklearn_estimator.create_model()
session = model.sagemaker_session
role = sagemaker.get_execution_role()
model_name = model.name
container_def = model.prepare_container_def(instance_type='ml.m5.4xlarge')
# Register the newly trained model under its generated name.
session.create_model(model_name, role, container_def)


# The endpoint config reuses the model name as its own name.
endpoint_config_name = session.create_endpoint_config(name=model_name,
                                                      model_name=model_name,
                                                      initial_instance_count=1,
                                                      instance_type='ml.m5.4xlarge')

# Update desired endpoint with new Endpoint Config
client = boto3.client('sagemaker')
client.update_endpoint(EndpointName='issue-similarity-endpoint',
                       EndpointConfigName=endpoint_config_name)

# predictor.update_endpoint(
#     instance_type='ml.m5.4xlarge',
#     model_name = sklearn_estimator.create_model().name,
#     initial_instance_count=1
# )

{'EndpointArn': 'arn:aws:sagemaker:us-west-2:092109498566:endpoint/issue-similarity-endpoint',
 'ResponseMetadata': {'RequestId': '09396730-3c81-4974-8f65-2722bff6c4b5',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '09396730-3c81-4974-8f65-2722bff6c4b5',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '93',
   'date': 'Thu, 29 Jul 2021 21:41:22 GMT'},
  'RetryAttempts': 0}}

In [122]:
# Send a one-document query to the endpoint. The recorded response holds ten
# issue URLs followed by their scores in descending order.
data = ['password managers autofill']
response = predictor.predict(data)
print(response)

[[['https://github.com/aws-amplify/amplify-js/issues/8472'
   'https://github.com/aws-amplify/amplify-adminui/issues/233'
   'https://github.com/aws-amplify/amplify-js/issues/4180'
   'https://github.com/aws-amplify/amplify-js/issues/5254'
   'https://github.com/aws-amplify/aws-sdk-ios/issues/3076'
   'https://github.com/aws-amplify/amplify-js/issues/7919'
   'https://github.com/aws-amplify/amplify-js/issues/3799'
   'https://github.com/aws-amplify/aws-sdk-android/issues/1040'
   'https://github.com/aws-amplify/amplify-ios/issues/90'
   'https://github.com/aws-amplify/amplify-ios/issues/373']]

 [[0.5997545956926212 0.36213294910206784 0.34880661084949227
   0.3262144123647185 0.3211355352512778 0.2898935616415725
   0.2888978803166796 0.27088072301221555 0.27011981892700004
   0.2699086298024099]]]


In [None]:
# with open('extra_tokens.txt', 'w') as f:
#     f.write(str(set(vocab2) - set(vocab)))

In [None]:
# Print the top-weighted terms of issue number 5782's 'bodyCleaned' text.
# NOTE(review): js_df is not defined in any visible cell.
pw_mgr = query_df(js_df, number=5782)['bodyCleaned']
inspect_doc(pipe[0], pw_mgr)

In [79]:
# Query the locally fitted pipeline with an issue-style sentence; predict()
# prints the ranked issue URLs, their scores and the elapsed time.
pw_mgr_query = ['AmplifySignIn component does not work with password managers or native browser autofill']
pred, score = pipe.predict(pw_mgr_query)

[['https://github.com/aws-amplify/amplify-js/issues/8472'
  'https://github.com/aws-amplify/amplify-adminui/issues/233'
  'https://github.com/aws-amplify/amplify-js/issues/5782'
  'https://github.com/aws-amplify/amplify-js/issues/8289'
  'https://github.com/aws-amplify/amplify-js/issues/4748'
  'https://github.com/aws-amplify/amplify-js/issues/3799'
  'https://github.com/aws-amplify/amplify-js/issues/14'
  'https://github.com/aws-amplify/aws-sdk-ios/issues/3076'
  'https://github.com/aws-amplify/aws-sdk-ios/issues/313'
  'https://github.com/aws-amplify/amplify-js/issues/6111']]
[[0.7425792  0.33807384 0.31580219 0.29732867 0.29658787 0.29270244
  0.28858939 0.2834566  0.26672109 0.26536774]]
Prediction took 2.312330722808838 seconds


In [125]:
# Query the locally fitted pipeline. The call is the cell's last expression, so
# the returned (labels, scores) tuple is displayed in addition to what
# predict() prints itself.
js_issue = ['image file upload fail file size 5 mb']
pipe.predict(js_issue)

[['https://github.com/aws-amplify/amplify-cli/issues/7434'
  'https://github.com/aws-amplify/amplify-js/issues/7574'
  'https://github.com/aws-amplify/aws-sdk-ios/issues/963'
  'https://github.com/aws-amplify/amplify-js/issues/3016'
  'https://github.com/aws-amplify/amplify-console/issues/604'
  'https://github.com/aws-amplify/aws-sdk-ios/issues/2214'
  'https://github.com/aws-amplify/amplify-cli/issues/1525'
  'https://github.com/aws-amplify/amplify-cli/issues/3621'
  'https://github.com/aws-amplify/amplify-cli/issues/6008'
  'https://github.com/aws-amplify/amplify-js/issues/5385']]
[[0.55712908 0.53323662 0.52674688 0.47105745 0.45292852 0.45261622
  0.44701542 0.4423523  0.43639532 0.43491068]]
Prediction took 2.4285876750946045 seconds


(array([['https://github.com/aws-amplify/amplify-cli/issues/7434',
         'https://github.com/aws-amplify/amplify-js/issues/7574',
         'https://github.com/aws-amplify/aws-sdk-ios/issues/963',
         'https://github.com/aws-amplify/amplify-js/issues/3016',
         'https://github.com/aws-amplify/amplify-console/issues/604',
         'https://github.com/aws-amplify/aws-sdk-ios/issues/2214',
         'https://github.com/aws-amplify/amplify-cli/issues/1525',
         'https://github.com/aws-amplify/amplify-cli/issues/3621',
         'https://github.com/aws-amplify/amplify-cli/issues/6008',
         'https://github.com/aws-amplify/amplify-js/issues/5385']],
       dtype=object),
 array([[0.55712908, 0.53323662, 0.52674688, 0.47105745, 0.45292852,
         0.45261622, 0.44701542, 0.4423523 , 0.43639532, 0.43491068]]))

In [82]:
# Query the locally fitted pipeline, then print the top-weighted terms of
# amplify-console issue 1519 (third in the recorded ranking).
# NOTE(review): small_df is not defined in any visible cell.
js_issue = ['additionalHeaders param is never passed to underlying function -- How do I access the current request headers or set them per request']
pred, score = pipe.predict(js_issue)
inspect_doc(pipe[0], query_df(small_df, repo='amplify-console', number=1519)['title_body'])

[['https://github.com/aws-amplify/amplify-js/issues/4981'
  'https://github.com/aws-amplify/amplify-js/issues/5576'
  'https://github.com/aws-amplify/amplify-console/issues/1519'
  'https://github.com/aws-amplify/amplify-cli/issues/1770'
  'https://github.com/aws-amplify/amplify-js/issues/3706'
  'https://github.com/aws-amplify/amplify-js/issues/1646'
  'https://github.com/aws-amplify/amplify-cli/issues/5260'
  'https://github.com/aws-amplify/amplify-js/issues/515'
  'https://github.com/aws-amplify/amplify-cli/issues/4822'
  'https://github.com/aws-amplify/aws-sdk-android/issues/375']]
[[0.72054029 0.41931634 0.33497204 0.33299807 0.31423847 0.28113276
  0.27258603 0.26524243 0.26306481 0.26025255]]
Prediction took 2.3095221519470215 seconds
[['header' 'request' 'navegaor' 'server' 'end' 'proxy' 'standard' 'make'
  'cors' 'saw']]
[[0.61793193 0.41295811 0.21754669 0.20195419 0.18303126 0.14329843
  0.13805923 0.13789834 0.1327474  0.12866307]]
