In [39]:
# Update and import packages
!pip install -Uqr requirements.txt

# Basic packages
import importlib
from time import time
from pathlib import Path
from progress.bar import Bar
import json
import re
from bs4 import BeautifulSoup

# Data science/NLP packages
import numpy as np
import pandas as pd
pd.set_option("display.max_colwidth", None)

# AWS packages
import awswrangler as wr
import sagemaker
from sagemaker.sklearn import SKLearn
import boto3

# Local modules
import model
import train
for m in [model, train]:
    importlib.reload(m)

from model import VectorSimilarity, get_fitted_model
from train import combine_dfs

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes


In [2]:
# Sanity checks on VectorSimilarity
# Three 2-D training vectors, each paired with its own label.
y = np.array(['a', 'b', 'c'])
X = np.array([[0, 1], [1, 0], [-1, 0]])

# predict() is called on whatever fit() returns.
estimator = VectorSimilarity().fit(X, y)
# Single query vector of shape (1, 2); the result is the cell's output.
estimator.predict(np.array([[1, 2]]))

[['a' 'b' 'c']]
[[ 2.  1. -1.]]
Prediction took 0.000789642333984375 seconds


(array([['a', 'b', 'c']], dtype='<U1'), array([[ 2.,  1., -1.]]))

In [3]:
# Basic pipeline setup
# Toy corpus with one label per sentence.
basic_labels = ['a', 'b', 'c', 'd']
basic_corpus = [
    'Bees like to make honey',
    'Bears like to eat honey',
    'Bees don\'t like bears',
    'Humans are walking around the park'
]

# Fit on the toy corpus, then predict on the very same sentences.
pipe = get_fitted_model(basic_corpus, basic_labels, lemmatize='custom')
basic_prediction = pipe.predict(basic_corpus)
pred, score = basic_prediction

Training model...
Took 1.484374761581421 seconds
[['a' 'c' 'b' 'd']
 ['b' 'c' 'a' 'd']
 ['c' 'b' 'a' 'd']
 ['d' 'c' 'b' 'a']]
[[1.         0.27710268 0.27710268 0.        ]
 [1.         0.27710268 0.27710268 0.        ]
 [1.         0.27710268 0.27710268 0.        ]
 [1.         0.         0.         0.        ]]
Prediction took 0.0005955696105957031 seconds


  'stop_words.' % sorted(inconsistent))


In [4]:
# Train and infer on small Data Wrangler dataset
# NOTE: this cell is currently disabled. The only live statement is the print
# below; the Data Wrangler export / fit / predict code is kept commented out
# for reference.
print('broken after changing IAM users')
# sess = sagemaker.Session()
# bucket = sess.default_bucket()

# chunksize = 1000
# output_content_type = "parquet"
# flow_export_id = f"30-23-06-49-58efbaf1"
# flow_export_name = f"flow-{flow_export_id}"
# s3_output_prefix = f"export-{flow_export_name}/output"
# s3_output_path = f"s3://{bucket}/{s3_output_prefix}"

# dfs=[]
# if output_content_type.upper() == "CSV":
#     dfs = wr.s3.read_csv(s3_output_path, chunksize=chunksize)
# elif output_content_type.upper() == "PARQUET":
#     dfs = wr.s3.read_parquet(s3_output_path, chunked=chunksize)
# else:
#     print(f"Unexpected output content type {output_content_type}") 

# wrangled_df = next(dfs)

# X = wrangled_df['bodyText']
# y = wrangled_df['url']
# pipe.fit(X, y)

# pred, score = pipe.predict(X[13:14])
# print(list(y[13:14]))

broken after changing IAM users


In [62]:
# File helper functions
import io
def list_data_objs(
    bucket_name='amplifyobserverinsights-aoinsightslandingbucket29-5vcr471d4nm5',
    bucket_subfolder='data/issues/',
):
    """List the keys of every S3 object stored under a bucket prefix.

    The listing goes through the ``list_objects_v2`` paginator so that a
    prefix whose keys span several response pages is listed completely, and
    an empty list is returned (instead of raising ``KeyError``) when nothing
    matches the prefix.

    Args:
        bucket_name: Name of the S3 bucket to list.
        bucket_subfolder: Key prefix that restricts the listing.

    Returns:
        list[str]: Object keys in the order S3 returned them. Every listed
        key is included as-is; no filtering is applied here.
    """
    s3 = boto3.client('s3')
    paginator = s3.get_paginator('list_objects_v2')
    pages = paginator.paginate(Bucket=bucket_name, Prefix=bucket_subfolder)

    # A page without any match carries no 'Contents' entry at all.
    return [
        obj['Key']
        for page in pages
        for obj in page.get('Contents', [])
    ]


def download_data(
    filename,
    data_obj_names,
    bucket_name='amplifyobserverinsights-aoinsightslandingbucket29-5vcr471d4nm5',
):
    """Download parquet objects from S3, combine them and save them as CSV.

    Args:
        filename: Path of the CSV file the combined frame is written to.
        data_obj_names: S3 object keys to download; each one is read as a
            parquet file.
        bucket_name: Bucket that holds the objects.

    Returns:
        The object returned by ``combine_dfs`` for the downloaded frames.
    """
    s3 = boto3.client('s3')
    dfs = []

    # Leaving the ``with`` block finishes the progress bar, so no explicit
    # ``bar.finish()`` call is needed.
    with Bar(
        message='Downloading parquets',
        check_tty=False,
        hide_cursor=False,
        max=len(data_obj_names)
    ) as bar:
        for obj_name in data_obj_names:
            obj = s3.get_object(Bucket=bucket_name, Key=obj_name)
            # Read the whole body into memory so it can be wrapped in a
            # file-like buffer for the parquet reader.
            dfs.append(pd.read_parquet(io.BytesIO(obj['Body'].read())))
            bar.next()

    df = combine_dfs(dfs)
    # NOTE(review): the index is written as well, so reading this file back
    # with a plain pd.read_csv yields an extra unnamed column - confirm that
    # this is intended.
    df.to_csv(filename)

    return df

def get_data(filename, force_redownload=False):
    """Load the training data, reusing a local CSV file when one exists.

    Args:
        filename: Path of the local CSV file.
        force_redownload: When True, ignore any existing file and download
            the data again.

    Returns:
        The frame read from ``filename``, or the value returned by
        ``download_data`` after a fresh download.
    """
    start = time()

    if Path(filename).is_file() and not force_redownload:
        print('Deserializing data from', filename, '...')
        df = pd.read_csv(filename)
    else:
        # The listing can contain an empty entry (presumably the zero-byte
        # "folder" placeholder whose key ends in '/'). Skip such keys by name
        # rather than by position: blindly dropping the first key would
        # discard a real data file whenever no placeholder is listed.
        data_obj_names = [
            name for name in list_data_objs() if not name.endswith('/')
        ]
        df = download_data(filename, data_obj_names)

    print('Took', time() - start, 'seconds')
    return df

In [6]:
# Data helper functions
def query_df(df, **kwargs):
    """Return the rows of ``df`` whose columns equal all given values.

    Args:
        df: DataFrame to filter.
        **kwargs: ``column=value`` pairs; a row is kept only when every named
            column equals its value. With no pairs, every row is kept.

    Returns:
        The matching rows of ``df``.

    Raises:
        KeyError: If a keyword names a column that ``df`` does not have.
    """
    # Start from an all-True mask so that calling without filters returns
    # every row instead of attempting the invalid lookup ``df[True]``.
    mask = pd.Series(True, index=df.index)
    for column, value in kwargs.items():
        mask &= (df[column] == value)

    return df[mask]

def compare_vecs():
    """Placeholder helper: it only prints that it is not implemented."""
    status = 'unimplemented'
    print(status)

def inspect_doc(vectorizer, doc, n_best=10):
    """Print the highest-weighted vocabulary terms of a single document.

    Two arrays are printed: the ``n_best`` terms with the largest weights in
    the transformed document, and the corresponding weights.

    Args:
        vectorizer: Fitted vectorizer exposing ``transform`` and either
            ``get_feature_names_out`` or the older ``get_feature_names``.
        doc: One document, given as a string or as a one-element collection.
        n_best: Number of top-weighted terms to print.

    Raises:
        ValueError: If more than one document is passed.
    """
    if isinstance(doc, str):
        doc = [doc]

    if len(doc) > 1:
        raise ValueError('Only one document per call is supported')

    # Newer scikit-learn releases provide get_feature_names_out in place of
    # get_feature_names; use whichever this vectorizer offers.
    get_names = getattr(vectorizer, 'get_feature_names_out', None)
    if get_names is None:
        get_names = vectorizer.get_feature_names

    vocab = np.array(get_names(), ndmin=2)
    weights = vectorizer.transform(doc).toarray()
    # argsort is ascending; reverse along the term axis for descending order.
    weights_desc_args = np.argsort(weights, axis=1)[:, ::-1]
    words_desc = np.take_along_axis(vocab, weights_desc_args, axis=1)
    weights_desc = np.take_along_axis(weights, weights_desc_args, axis=1)

    print(words_desc[:, :n_best])
    print(weights_desc[:, :n_best])
    
def get_weights(vectorizer, query, doc, n_best=10):
    """Print the weight each query token has in a document.

    The query is split with the vectorizer's tokenizer. The tokens, their
    vocabulary indices and their weights in the transformed ``doc`` are
    printed in query order. Tokens that are not part of the vocabulary are
    reported and skipped instead of raising ``IndexError``.

    Args:
        vectorizer: Fitted vectorizer exposing ``build_tokenizer``,
            ``transform`` and either ``get_feature_names_out`` or the older
            ``get_feature_names``.
        query: Text whose tokens are looked up.
        doc: Document (string or collection of strings) to take weights from.
        n_best: Maximum number of weights to print.
    """
    if isinstance(doc, str):
        doc = [doc]

    tokenizer = vectorizer.build_tokenizer()
    tokens = tokenizer(query)
    print(tokens)

    # Newer scikit-learn releases provide get_feature_names_out in place of
    # get_feature_names; use whichever this vectorizer offers.
    get_names = getattr(vectorizer, 'get_feature_names_out', None)
    if get_names is None:
        get_names = vectorizer.get_feature_names

    # term -> first position in the vocabulary, built once for all tokens.
    index_of = {}
    for position, term in enumerate(get_names()):
        index_of.setdefault(term, position)

    missing = [tok for tok in tokens if tok not in index_of]
    if missing:
        print('Tokens not in vocabulary (skipped):', missing)

    # Explicit integer dtype keeps an empty selection usable as an index.
    indices = np.array(
        [index_of[tok] for tok in tokens if tok in index_of], dtype=int
    )
    weights = vectorizer.transform(doc).toarray().flatten()
    print(indices)
    weights_desc = weights[indices]
    print(weights_desc[:n_best])
    


In [7]:
# Show the SageMaker execution role, then the session's region and a
# SageMaker API client.
role = sagemaker.get_execution_role()
print(role)

region, smclient = boto3.Session().region_name, boto3.Session().client('sagemaker')
print(region, smclient, sep='\n')

arn:aws:iam::092109498566:role/service-role/AmazonSageMaker-ExecutionRole-20210728T153017
us-west-2
<botocore.client.SageMaker object at 0x7f17559ebc90>


In [63]:
# Download and compile parquets
# Look up the identity behind the current AWS credentials.
sts_client = boto3.client('sts')
iden = sts_client.get_caller_identity()
# print(iden)
# force_redownload=True: an existing training_data.csv is ignored.
df = get_data(filename='training_data.csv', force_redownload=True)

Downloading parquets |################################| 18/18



Took 1.9556849002838135 seconds


In [74]:
# Preprocess training data
label_col = 'url'
corpus_col = 'title_body'

# Fragments built around the "Describe the bug" headings.
begin_text = r'.*Describe the bug'
# NOTE(review): `\r\n*` matches a carriage return followed by any number of
# newlines - confirm this is the intended line-break pattern.
mid_text = r'### Expected behavior|### Reproduction steps|\r\n*'
end_text = r'### Code Snippet.*'
# Fragments built around the "Describe the feature" headings.
begin_text_feat = r'.*Describe the feature you\'d like to request'
mid_text_feat = r'### Describe the solution you\'d like|### Describe alternatives you\'ve considered'
end_text_feat = r'### Additional context.*'

cases = [
    begin_text, mid_text, end_text,
    begin_text_feat, mid_text_feat, end_text_feat,
]
# One alternation over every fragment, wrapped in a single group.
pat_cases = '({})'.format('|'.join(cases))
pat = re.compile(pat_cases, flags=re.DOTALL | re.MULTILINE)

print('Preprocessing data...')
start = time()

# The repo filter and the template clean-up are disabled, so js_df is simply
# another name for the full frame.
# js_df = query_df(df, repo='amplify-js')
js_df = df
# clean_template = lambda text : re.sub(pat, '', text)
# js_df[corpus_col] = js_df[corpus_col].apply(clean_template)

elapsed = time() - start
print('Took', elapsed, 'seconds')

Preprocessing data...
Took 4.7206878662109375e-05 seconds


In [75]:
# Train model
# small_df is just another name for js_df; no subsampling happens here.
small_df = js_df
corpus, labels = small_df[corpus_col], small_df[label_col]

pipe = get_fitted_model(corpus, labels, lemmatize='custom')

Training model...


  'stop_words.' % sorted(inconsistent))


Took 76.53842520713806 seconds


In [76]:
# Model stats
# Newer scikit-learn releases provide get_feature_names_out in place of
# get_feature_names; use whichever the fitted vectorizer offers.
tfidf = pipe['tfidfvectorizer']
get_names = getattr(tfidf, 'get_feature_names_out', None)
if get_names is None:
    get_names = tfidf.get_feature_names
# Keep vocab a plain list so later uses such as str(vocab) are unchanged.
vocab = list(get_names())
print('Number of vocab words:', len(vocab))
repo_list = list(set(df['repo']))
print('Available repos:', repo_list)
# Very short tokens are listed separately for inspection.
small_words = [word for word in vocab if len(word) < 3]
print('Small words in vocab:\n', small_words)

Number of vocab words: 23734
Available repos: ['amplify-flutter', 'community', 'docs', 'amplify-ios', 'amplify-console', 'amplify-observer', 'amplify-cli', 'aws-appsync-realtime-client-ios', 'aws-sdk-ios', 'amplify-ci-support', 'amplify-codegen', 'amplify-js-samples', 'aws-amplify.github.io', 'aws-sdk-android', 'amplify-ui', 'amplify-js', 'amplify-android', 'amplify-adminui']
Small words in vocab:
 ['ad', 'al', 'au', 'az', 'bk', 'ca', 'cc', 'cd', 'cf', 'ci', 'cm', 'cs', 'cu', 'cv', 'db', 'dm', 'dy', 'ec', 'ed', 'em', 'er', 'es', 'fo', 'gb', 'gc', 'gi', 'gm', 'hl', 'hr', 'hz', 'ic', 'id', 'io', 'ip', 'iv', 'ki', 'km', 'kv', 'l', 'lf', 'lm', 'mb', 'mi', 'os', 'pc', 'pe', 'po', 'pr', 'r', 's', 'sc', 'sd', 'si', 'sl', 'sm', 'sn', 'ss', 'ti', 'tl', 'tn', 'tt', 'vi', 'w', 'x', 'zu']


In [None]:
# Save the string representation of the vocabulary to vocab.txt.
serialized_vocab = str(vocab)
with open('vocab.txt', mode='w') as vocab_file:
    vocab_file.write(serialized_vocab)

In [None]:
# Run the first pipeline step's tokenizer on a few inflected word forms.
tokenizer = pipe[0].build_tokenizer()
print(tokenizer.custom)
sample_text = 'walk walks talk talks talking talked'
token = tokenizer(sample_text)
print(token)

In [None]:
# Look at the term weights of the issue whose number is 8108.
issue_rows = query_df(js_df, number=8108)
js_issue = issue_rows[corpus_col].item()
vectorizer = pipe[0]
inspect_doc(vectorizer, js_issue)
get_weights(vectorizer, 'workarounds issued cognito pool', js_issue)

In [None]:
# Train SM estimator
train_data = 's3://githubmachinelearningstack-rawdatabucket79e6ae92-dvgbsz21ce9v/data'
# Settings for the SKLearn estimator that runs train.py.
estimator_settings = dict(
    role=sagemaker.get_execution_role(),
    instance_type='ml.m5.4xlarge',
    framework_version='0.23-1',
    hyperparameters={'n-best': 10},
)
sklearn_estimator = SKLearn('train.py', **estimator_settings)
sklearn_estimator.fit({'train': train_data})

In [None]:
# Deploy SM predictor
# NOTE(review): nothing in this cell tears the endpoint down again - confirm
# that it is deleted elsewhere.
predictor = sklearn_estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.4xlarge',
)
# Send one sample query and show the response.
data = ['password managers autofill']
response = predictor.predict(data)
print(response)

In [None]:
# with open('extra_tokens.txt', 'w') as f:
#     f.write(str(set(vocab2) - set(vocab)))

In [None]:
# Inspect the 'bodyCleaned' text of the issue whose number is 5782.
matching_rows = query_df(js_df, number=5782)
pw_mgr = matching_rows['bodyCleaned']
inspect_doc(pipe[0], pw_mgr)

In [79]:
# Query the fitted pipeline with one free-text issue description.
pw_mgr_query = [
    'AmplifySignIn component does not work with password managers or native browser autofill'
]
pw_mgr_prediction = pipe.predict(pw_mgr_query)
pred, score = pw_mgr_prediction

[['https://github.com/aws-amplify/amplify-js/issues/8472'
  'https://github.com/aws-amplify/amplify-adminui/issues/233'
  'https://github.com/aws-amplify/amplify-js/issues/5782'
  'https://github.com/aws-amplify/amplify-js/issues/8289'
  'https://github.com/aws-amplify/amplify-js/issues/4748'
  'https://github.com/aws-amplify/amplify-js/issues/3799'
  'https://github.com/aws-amplify/amplify-js/issues/14'
  'https://github.com/aws-amplify/aws-sdk-ios/issues/3076'
  'https://github.com/aws-amplify/aws-sdk-ios/issues/313'
  'https://github.com/aws-amplify/amplify-js/issues/6111']]
[[0.7425792  0.33807384 0.31580219 0.29732867 0.29658787 0.29270244
  0.28858939 0.2834566  0.26672109 0.26536774]]
Prediction took 2.312330722808838 seconds


In [None]:
# Query the fitted pipeline with a short keyword-style description.
query_text = 'image file upload fail file size 5 mb'
js_issue = [query_text]
pipe.predict(js_issue)

In [82]:
# Predict for a headers-related query, then inspect amplify-console issue 1519.
js_issue = [
    'additionalHeaders param is never passed to underlying function -- How do I access the current request headers or set them per request'
]
pred, score = pipe.predict(js_issue)
console_issue_rows = query_df(small_df, repo='amplify-console', number=1519)
inspect_doc(pipe[0], console_issue_rows['title_body'])

[['https://github.com/aws-amplify/amplify-js/issues/4981'
  'https://github.com/aws-amplify/amplify-js/issues/5576'
  'https://github.com/aws-amplify/amplify-console/issues/1519'
  'https://github.com/aws-amplify/amplify-cli/issues/1770'
  'https://github.com/aws-amplify/amplify-js/issues/3706'
  'https://github.com/aws-amplify/amplify-js/issues/1646'
  'https://github.com/aws-amplify/amplify-cli/issues/5260'
  'https://github.com/aws-amplify/amplify-js/issues/515'
  'https://github.com/aws-amplify/amplify-cli/issues/4822'
  'https://github.com/aws-amplify/aws-sdk-android/issues/375']]
[[0.72054029 0.41931634 0.33497204 0.33299807 0.31423847 0.28113276
  0.27258603 0.26524243 0.26306481 0.26025255]]
Prediction took 2.3095221519470215 seconds
[['header' 'request' 'navegaor' 'server' 'end' 'proxy' 'standard' 'make'
  'cors' 'saw']]
[[0.61793193 0.41295811 0.21754669 0.20195419 0.18303126 0.14329843
  0.13805923 0.13789834 0.1327474  0.12866307]]


In [None]:
# Print the text of issue 8108 as stored and after parsing with html.parser.
issue_rows = query_df(js_df, number=8108)
js_issue = issue_rows[corpus_col].item()
print(js_issue)
js_issue_ast = BeautifulSoup(js_issue, features='html.parser')
print(js_issue_ast)