In [1]:
# Update and import packages
!pip install -Uqr requirements.txt

# Basic packages
import importlib
from time import time
from pathlib import Path
from progress.bar import Bar
import json
import re
from bs4 import BeautifulSoup

# Data science/NLP packages
import numpy as np
import pandas as pd
pd.set_option("display.max_colwidth", None)

# AWS packages
import awswrangler as wr
import sagemaker
from sagemaker.sklearn import SKLearn
import boto3

# Local modules
# Import the project modules, then reload each one so that edits made to the
# files on disk are picked up without restarting the kernel. The `from ...`
# imports come last so they bind to the freshly reloaded definitions.
import model
import train

importlib.reload(model)
importlib.reload(train)

from model import VectorSimilarity, get_fitted_model
from train import combine_dfs

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes


In [2]:
# Sanity checks on VectorSimilarity
# Three 2-D training vectors, one label each.
X = np.array([
    [0, 1],
    [1, 0],
    [-1, 0],
])
y = np.array(list('abc'))

estimator = VectorSimilarity()
estimator = estimator.fit(X, y)

# Single query row of shape (1, 2); the cell displays the prediction for it.
estimator.predict(np.array([[1, 2]]))

array([['a', 'b', 'c']], dtype='<U1')

In [3]:
# Basic pipeline setup
# Fit on a four-sentence toy corpus, then score the same sentences against it.
basic_corpus = [
    "Bees like to make honey",
    "Bears like to eat honey",
    "Bees don't like bears",
    "Humans are walking around the park",
]
basic_labels = list('abcd')

pipe = get_fitted_model(basic_corpus, basic_labels)

# predict_score returns a (labels, scores) pair; print both.
pred, score = pipe.predict_score(basic_corpus)
print(pred)
print(score)

[['a' 'b' 'c' 'd']
 ['b' 'a' 'c' 'd']
 ['c' 'b' 'a' 'd']
 ['d' 'c' 'b' 'a']]
[[1.         0.50443175 0.3494023  0.        ]
 [1.         0.50443175 0.3494023  0.        ]
 [1.         0.3494023  0.3494023  0.        ]
 [1.         0.         0.         0.        ]]


In [4]:
# Train and infer on small Data Wrangler dataset
sess = sagemaker.Session()
bucket = sess.default_bucket()

chunksize = 1000
output_content_type = "parquet"
flow_export_id = "30-23-06-49-58efbaf1"
flow_export_name = f"flow-{flow_export_id}"
s3_output_prefix = f"export-{flow_export_name}/output"
s3_output_path = f"s3://{bucket}/{s3_output_prefix}"

# Both readers are asked for chunked output, so `dfs` is an iterator of
# DataFrame chunks rather than a single frame.
if output_content_type.upper() == "CSV":
    dfs = wr.s3.read_csv(s3_output_path, chunksize=chunksize)
elif output_content_type.upper() == "PARQUET":
    dfs = wr.s3.read_parquet(s3_output_path, chunked=chunksize)
else:
    # Fail here with a clear message instead of printing and then crashing on
    # next() with an unrelated TypeError.
    raise ValueError(f"Unexpected output content type {output_content_type}")

# Only the first chunk (at most `chunksize` rows) is used for this small run.
wrangled_df = next(dfs)

X = wrangled_df['bodyText']
y = wrangled_df['url']
pipe.fit(X, y)

# Query with one training row and show its true label for comparison.
pred, score = pipe.predict(X[13:14])
print(list(y[13:14]))

[['https://github.com/aws-amplify/amplify-adminui/issues/12'
  'https://github.com/aws-amplify/amplify-adminui/issues/21'
  'https://github.com/aws-amplify/amplify-adminui/issues/67'
  'https://github.com/aws-amplify/amplify-adminui/issues/82'
  'https://github.com/aws-amplify/amplify-adminui/issues/41'
  'https://github.com/aws-amplify/amplify-adminui/issues/28'
  'https://github.com/aws-amplify/amplify-adminui/issues/85'
  'https://github.com/aws-amplify/amplify-adminui/issues/45'
  'https://github.com/aws-amplify/amplify-adminui/issues/35'
  'https://github.com/aws-amplify/amplify-adminui/issues/33']]
[[1.         0.6949403  0.23125501 0.13250384 0.12011294 0.12004747
  0.11126224 0.08492276 0.08372554 0.07978957]]
['https://github.com/aws-amplify/amplify-adminui/issues/12']


In [6]:
# File helper functions
def list_data_objs():
    """List every object under the raw-data prefix as an ``s3://`` URI.

    Returns:
        list[str]: One ``s3://<bucket>/<key>`` string per object found under
        ``data/`` in the raw-data bucket. Empty when the prefix holds nothing.

    Notes:
        The listing is paginated, so prefixes holding more than the 1000 keys
        returned by a single ``list_objects_v2`` call are listed completely.
    """
    bucket_name = 'githubmachinelearningstack-rawdatabucket79e6ae92-dvgbsz21ce9v'
    bucket_subfolder = 'data/'

    # NOTE(review): this function used to fetch the "SageMakerS3Access" secret
    # and unpack an access/secret key pair, but never passed them to the S3
    # client. That dead lookup is removed; the client keeps using the default
    # credential chain exactly as before. If the default role lacks access,
    # fetch the secret and pass the keys to boto3.client explicitly.
    s3 = boto3.client('s3')
    paginator = s3.get_paginator('list_objects_v2')
    pages = paginator.paginate(Bucket=bucket_name, Prefix=bucket_subfolder)

    # A page has no 'Contents' key when nothing matches the prefix.
    data_obj_names = [
        f"s3://{bucket_name}/{obj['Key']}"
        for page in pages
        for obj in page.get('Contents', [])
    ]

    return data_obj_names


def download_data(filename, data_obj_names):
    """Download each parquet object, combine them, and cache the result as CSV.

    Args:
        filename: Path of the CSV file to write the combined frame to.
        data_obj_names: Iterable of ``s3://`` URIs, each read with
            ``wr.s3.read_parquet``.

    Returns:
        The frame produced by ``combine_dfs`` over all downloaded frames.

    Side effects:
        Writes ``filename`` with ``DataFrame.to_csv`` defaults, so the index is
        stored as the first column of the file.
    """
    dfs = []

    with Bar(
        message='Downloading parquets',
        check_tty=False,
        hide_cursor=False,
        max=len(data_obj_names)
    ) as bar:
        for obj_name in data_obj_names:
            dfs.append(wr.s3.read_parquet(obj_name))
            bar.next()
        # No explicit bar.finish(): leaving the `with` block already finishes
        # the bar, and calling it twice ends the progress line twice.

    df = combine_dfs(dfs)
    df.to_csv(filename)

    return df

def get_data(filename, force_redownload=False):
    """Load the training frame from the local CSV cache, or rebuild it from S3.

    Args:
        filename: Path of the CSV cache file.
        force_redownload: When True, ignore an existing cache file and
            download again (which also rewrites the cache).

    Returns:
        pandas.DataFrame: The cached or freshly downloaded data.

    Prints the elapsed wall-clock time in seconds.
    """
    start = time()

    if Path(filename).is_file() and not force_redownload:
        print('Deserializing data from', filename, '...')
        # download_data() writes the cache with the index as the first column
        # (to_csv default). Read it back as the index so a cached load has the
        # same columns as a fresh download, not an extra "Unnamed: 0" column.
        df = pd.read_csv(filename, index_col=0)
    else:
        df = download_data(filename, list_data_objs())

    print('Took', time() - start, 'seconds')
    return df

In [7]:
# Data helper functions
def query_df(df, **kwargs):
    """Return the rows of ``df`` where every given column equals its value.

    Args:
        df: DataFrame to filter.
        **kwargs: ``column_name=value`` pairs; all must match (logical AND).

    Returns:
        pandas.DataFrame: The matching rows. With no filters, all rows are
        returned.

    Raises:
        KeyError: If a keyword names a column that is not in ``df``.
    """
    # Start from an all-True boolean Series rather than the scalar True, so
    # that calling with no filters selects every row instead of evaluating
    # df[True], which is a column lookup and raises KeyError.
    mask = pd.Series(True, index=df.index)
    for key, value in kwargs.items():
        mask &= (df[key] == value)

    return df[mask]

def compare_vecs():
    """Placeholder: not implemented yet; only prints a notice."""
    print('unimplemented')

def inspect_corpus(vectorizer, df, url):
    """Placeholder: not implemented yet; ignores its arguments and prints a notice."""
    print('unimplemented')
    

In [9]:
# Download and compile parquets
# get_data() reads the local CSV cache when the file exists; pass
# force_redownload=True to rebuild it from S3 instead.
df = get_data('training_data.csv', force_redownload=False)

Deserializing data from training_data.csv ...
Took 3.805649518966675 seconds


In [10]:
# Preprocess training data
corpus_col = 'bodyCleaned'
label_col = 'url'

# Issue-template boilerplate to strip from each body. Bug-report template:
begin_text = r'.*Describe the bug'
mid_text = r'### Expected behavior|### Reproduction steps|\r\n*'
end_text = r'### Code Snippet.*'
# Feature-request template:
begin_text_feat = r'.*Describe the feature you\'d like to request'
mid_text_feat = r'### Describe the solution you\'d like|### Describe alternatives you\'ve considered'
end_text_feat = r'### Additional context.*'

cases = [
    begin_text,
    mid_text,
    end_text,
    begin_text_feat,
    mid_text_feat,
    end_text_feat
]
# One alternation of all cases. DOTALL lets the leading/trailing `.*` span
# line breaks, so everything before a "begin" marker and after an "end" marker
# is removed along with the marker itself.
pat_cases = '(' + '|'.join(cases) + ')'
pat = re.compile(pat_cases, flags=(re.DOTALL | re.M))


def clean_template(text):
    """Return ``text`` with every match of the template pattern removed."""
    return pat.sub('', text)


print('Preprocessing data...')
start = time()

# js_df = query_df(df, repository='amplify-js')
# js_df is the same object as df, so the new column is added to df itself;
# the training cell reads df[corpus_col].
js_df = df
# Treat missing titles/bodies as empty strings: re.sub raises TypeError on a
# NaN body, and a NaN title would make the whole concatenated document NaN.
js_df[corpus_col] = (
    js_df['title'].fillna('')
    + ' '
    + js_df['body'].fillna('').apply(clean_template)
)

print('Took', time() - start, 'seconds')

Preprocessing data...
Took 301.87840962409973 seconds


In [11]:
# Train model
# Fit the pipeline on the cleaned corpus column, labelled by issue URL.
small_df = df
corpus, labels = small_df[corpus_col], small_df[label_col]

pipe = get_fitted_model(corpus, labels, lemmatize='custom')

Training model...
Took 156.8879110813141 seconds


In [12]:
# Model stats
vectorizer = pipe['tfidfvectorizer']
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() when it exists and fall back for older versions.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = list(vectorizer.get_feature_names_out())
else:
    vocab = vectorizer.get_feature_names()
print('Number of vocab words:', len(vocab))
repo_list = list(set(df['repository']))
print('Available repos:', repo_list)
# Tokens shorter than three characters, listed for inspection.
small_words = [word for word in vocab if len(word) < 3]
print('Small words in vocab:\n', small_words)

Number of vocab words: 65345
Available repos: ['aws-sdk-ios', 'amplify-js', 'docs', 'amplify-adminui', 'amplify-js-samples', 'amplify-console', 'amplify-codegen', 'aws-sdk-android', 'amplify-ci-support', 'amplify-ios', 'amplify-flutter', 'amplify-cli', 'amplify-android']
Small words in vocab:
 ['ad', 'al', 'an', 'as', 'at', 'az', 'bk', 'ca', 'cc', 'cd', 'ce', 'cf', 'ci', 'cl', 'cm', 'co', 'cs', 'ct', 'cu', 'cv', 'db', 'dd', 'de', 'dm', 'do', 'dy', 'eb', 'ec', 'ed', 'em', 'er', 'es', 'fo', 'gb', 'gc', 'gi', 'gm', 'go', 'ha', 'hl', 'ho', 'hr', 'i', 'ic', 'id', 'il', 'in', 'io', 'ip', 'iv', 'j', 'km', 'kt', 'kv', 'la', 'le', 'lf', 'lh', 'li', 'lm', 'm1', 'mb', 'mi', 'nd', 'ni', 'nt', 'nv', 'ob', 'or', 'os', 'pa', 'pi', 'po', 'pr', 'r', 're', 'rh', 's', 'sc', 'si', 'sl', 'sm', 'sn', 'so', 'ss', 'te', 'tl', 'tn', 'tt', 'us', 'vi', 'w', 'wa', 'x']


In [2]:
# Launch a SageMaker scikit-learn training job that runs train_tfidf.py
# against the raw-data prefix, exposed to the job as the 'train' channel.
train_data = 's3://githubmachinelearningstack-rawdatabucket79e6ae92-dvgbsz21ce9v/data'
sklearn_estimator = SKLearn(
    entry_point='train_tfidf.py',
    role=sagemaker.get_execution_role(),
    instance_type='ml.m5.4xlarge',
    framework_version='0.23-1',
    hyperparameters={'n-best': 10},
)
sklearn_estimator.fit(inputs={'train': train_data})

2021-07-27 18:48:51 Starting - Starting the training job...
2021-07-27 18:49:14 Starting - Launching requested ML instancesProfilerReport-1627411730: InProgress
......
2021-07-27 18:50:15 Starting - Preparing the instances for training......
2021-07-27 18:51:20 Downloading - Downloading input data...
2021-07-27 18:51:47 Training - Downloading the training image...
2021-07-27 18:52:18 Uploading - Uploading generated training model
2021-07-27 18:52:18 Failed - Training job failed
ProfilerReport-1627411730: Stopping
[34m2021-07-27 18:52:03,674 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2021-07-27 18:52:03,676 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-07-27 18:52:03,685 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2021-07-27 18:52:03,922 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-07-27 18:5

UnexpectedStatusException: Error for Training job sagemaker-scikit-learn-2021-07-27-18-48-50-724: Failed. Reason: AlgorithmError: framework error: 
Traceback (most recent call last):
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_containers/_trainer.py", line 84, in train
    entrypoint()
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_sklearn_container/training.py", line 39, in main
    train(environment.Environment())
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_sklearn_container/training.py", line 35, in train
    runner_type=runner.ProcessRunnerType)
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_training/entry_point.py", line 100, in run
    wait, capture_error
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_training/process.py", line 161, in run
    cwd=environment.code_dir,
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_training/process.py", line 81, in check_error
    raise error_class(return_code=return_code, cmd=" ".join(cmd), output=stderr)
sagemaker_training.errors.ExecuteUserScriptError: ExecuteUserScriptError:
Command "/miniconda3/bin/python 

In [None]:
# Deploy the trained estimator to a single-instance endpoint and send it one
# test query.
# NOTE(review): nothing in this cell deletes the endpoint afterwards - confirm
# it is cleaned up elsewhere.
predictor = sklearn_estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.4xlarge',
)
data = ['password managers autofill']
response = predictor.predict(data)
print(response)

In [13]:
# with open('extra_tokens.txt', 'w') as f:
#     f.write(str(set(vocab2) - set(vocab)))

In [14]:
# Ad-hoc query against the fitted pipeline with a free-text issue description.
js_issue = ['Password managers autofill']
pipe.predict(js_issue)

Inferring on the query: ['Password managers autofill']
[['https://github.com/aws-amplify/amplify-js/issues/8472'
  'https://github.com/aws-amplify/amplify-js/issues/4748'
  'https://github.com/aws-amplify/amplify-js/issues/3799'
  'https://github.com/aws-amplify/amplify-js/issues/8289'
  'https://github.com/aws-amplify/amplify-js/issues/5782'
  'https://github.com/aws-amplify/amplify-js/issues/7919'
  'https://github.com/aws-amplify/amplify-js/issues/3522'
  'https://github.com/aws-amplify/amplify-js/issues/14'
  'https://github.com/aws-amplify/amplify-adminui/issues/233'
  'https://github.com/aws-amplify/amplify-js/issues/2479']]
[[0.61543985 0.45566849 0.40671398 0.39220031 0.30809485 0.29292973
  0.26983688 0.26227048 0.26008897 0.25457087]]
Took 21.20275568962097 seconds


In [15]:
# Ad-hoc query against the fitted pipeline with a free-text issue description.
js_issue = ['image file upload fail file size 5 mb']
pipe.predict(js_issue)

Inferring on the query: ['image file upload fail file size 5 mb']
[['https://github.com/aws-amplify/amplify-cli/issues/1525'
  'https://github.com/aws-amplify/amplify-console/issues/604'
  'https://github.com/aws-amplify/amplify-cli/issues/7434'
  'https://github.com/aws-amplify/amplify-cli/issues/6008'
  'https://github.com/aws-amplify/amplify-js/issues/2977'
  'https://github.com/aws-amplify/amplify-js/issues/125'
  'https://github.com/aws-amplify/docs/issues/3243'
  'https://github.com/aws-amplify/amplify-js/issues/6419'
  'https://github.com/aws-amplify/amplify-js/issues/3016'
  'https://github.com/aws-amplify/docs/issues/2250']]
[[0.47011621 0.41911752 0.41036934 0.39712141 0.32038145 0.31243851
  0.30829788 0.29588254 0.28722115 0.28440392]]
Took 21.234702110290527 seconds


In [16]:
# Ad-hoc query against the fitted pipeline with a free-text issue description.
js_issue = ['user endpoint disappear pinpoint']
pipe.predict(js_issue)

Inferring on the query: ['user endpoint disappear pinpoint']
[['https://github.com/aws-amplify/aws-sdk-ios/issues/1212'
  'https://github.com/aws-amplify/amplify-js/issues/6896'
  'https://github.com/aws-amplify/amplify-js/issues/3819'
  'https://github.com/aws-amplify/amplify-js/issues/4573'
  'https://github.com/aws-amplify/aws-sdk-android/issues/1927'
  'https://github.com/aws-amplify/amplify-adminui/issues/92'
  'https://github.com/aws-amplify/amplify-js/issues/7675'
  'https://github.com/aws-amplify/amplify-js/issues/4712'
  'https://github.com/aws-amplify/amplify-js/issues/4529'
  'https://github.com/aws-amplify/amplify-cli/issues/5204']]
[[0.34888384 0.3452026  0.34273862 0.33471597 0.32414749 0.31828594
  0.31434318 0.30612801 0.30452039 0.29743369]]
Took 21.23546576499939 seconds


In [17]:
# Look up the cleaned text of issue number 8108 (.item() requires exactly one
# matching row), print it, then print it again after an HTML parse.
issue_rows = query_df(js_df, number=8108)
js_issue = issue_rows[corpus_col].item()
print(js_issue)

js_issue_ast = BeautifulSoup(js_issue, 'html.parser')
print(js_issue_ast)

Connect to External Cognito Account **Is your feature request related to a problem? Please describe.**We are building multiple apps in a multi-account enterprise AWS environment and we're trying to consume a central cognito pool that is under a different account other than the amplify apps (each amplify app is under it's own account following best practices). Hence when we try to import the cognito pool for auth the IAM user cannot see that in the Role that is used by amplify. We looked at and successfully established cross account access using the method described [here](https://docs.amplify.aws/cli/usage/iam) however we cannot change roles after we ran amplify init so the end result is that we are provisioning the amplify app under the aws account that holds the central cognito instance. This issue is also mentioned [here](https://github.com/aws-amplify/amplify-cli/issues/7008).**Describe the solution you'd like**A straightforward way or recommended best practices for such scenario. 