In [38]:
# Update and import packages
!pip install -Uqr requirements.txt

import importlib
from time import time
from progress.bar import Bar
import json
import re

import numpy as np
import pandas as pd
pd.set_option("display.max_colwidth", None)
from sklearn.utils.estimator_checks import check_estimator
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
import nltk
nltk.download(['punkt', 'wordnet'])
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

import awswrangler as wr
import sagemaker
import boto3

import vector_similarity
importlib.reload(vector_similarity)
from vector_similarity import VectorSimilarity

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [2]:
# Sanity checks on VectorSimilarity
# check_estimator(VectorSimilarity())
X = np.array(
    [[0, 1],
     [1, 0],
     [-1, 0]])
y = np.array(['a', 'b', 'c'])

estimator = VectorSimilarity()
estimator = estimator.fit(X, y)
estimator.predict(np.array([1, 2]).reshape(1, -1))

array([['a', 'b', 'c']], dtype='<U1')

In [29]:
# Basic pipeline setup
basic_corpus = [
    'Bees like to make honey',
    'Bears like to eat honey',
    'Bees don\'t like bears',
    'Humans are walking around the park'
]
basic_labels = ['a', 'b', 'c', 'd']

pipe = make_pipeline(
    TfidfVectorizer(),
    VectorSimilarity()
)
pipe.fit(basic_corpus, basic_labels)
print(pipe.predict(basic_corpus))
print(pipe.score(basic_corpus))

[['a' 'b' 'c' 'd']
 ['b' 'a' 'c' 'd']
 ['c' 'b' 'a' 'd']
 ['d' 'c' 'b' 'a']]
[[1.         0.50443175 0.3494023  0.        ]
 [1.         0.50443175 0.3494023  0.        ]
 [1.         0.3494023  0.3494023  0.        ]
 [1.         0.         0.         0.        ]]


In [4]:
# Train and infer on small Data Wrangler dataset
sess = sagemaker.Session()
bucket = sess.default_bucket()

chunksize = 1000
output_content_type = "parquet"
flow_export_id = f"30-23-06-49-58efbaf1"
flow_export_name = f"flow-{flow_export_id}"
s3_output_prefix = f"export-{flow_export_name}/output"
s3_output_path = f"s3://{bucket}/{s3_output_prefix}"

if output_content_type.upper() == "CSV":
    dfs = wr.s3.read_csv(s3_output_path, chunksize=chunksize)
elif output_content_type.upper() == "PARQUET":
    dfs = wr.s3.read_parquet(s3_output_path, chunked=chunksize)
else:
    print(f"Unexpected output content type {output_content_type}") 

wrangled_df = next(dfs)

X = wrangled_df['bodyText']
y = wrangled_df['url']
pipe.fit(X, y)

print(pipe.predict(X[13:14]))
print(pipe.score(X[13:14]))
print(list(y[13:14]))

[['https://github.com/aws-amplify/amplify-adminui/issues/12'
  'https://github.com/aws-amplify/amplify-adminui/issues/21'
  'https://github.com/aws-amplify/amplify-adminui/issues/67'
  'https://github.com/aws-amplify/amplify-adminui/issues/82'
  'https://github.com/aws-amplify/amplify-adminui/issues/41'
  'https://github.com/aws-amplify/amplify-adminui/issues/28'
  'https://github.com/aws-amplify/amplify-adminui/issues/85'
  'https://github.com/aws-amplify/amplify-adminui/issues/45'
  'https://github.com/aws-amplify/amplify-adminui/issues/35'
  'https://github.com/aws-amplify/amplify-adminui/issues/33']]
[[1.         0.6949403  0.23125501 0.13250384 0.12011294 0.12004747
  0.11126224 0.08492276 0.08372554 0.07978957]]
['https://github.com/aws-amplify/amplify-adminui/issues/12']


In [30]:
# Helper functions
def query_df(df, **kwargs):
    query = True
    for key, value in kwargs.items():
        query &= (df[key] == value)
        
    result = df[query]
    return result

def infer(pipe, text):
    print('Inferring on the query:', text)
    start = time()
    if type(text) == str:
        text = list(text)
        
    print(pipe.predict(text))
    print(pipe.score(text))
    print('Took', time() - start, 'seconds')
    
class LemmaTokenizer:
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

def trained_model(df):
    corpus = df[corpus_col]
    labels = df[label_col]

    pipe = make_pipeline(
        TfidfVectorizer(tokenizer=LemmaTokenizer()),
    #     TfidfVectorizer(),
        VectorSimilarity()
    )
    pipe.fit(corpus, labels)
    return pipe

In [6]:
# List all data parquets
secret_name = "SageMakerS3Access"
region_name = "us-west-2"

secrets = boto3.client(
    service_name='secretsmanager',
    region_name=region_name
)

secrets_response = secrets.get_secret_value(SecretId=secret_name)
secrets_dict = json.loads(secrets_response['SecretString'])
(access_key, secret_key), = secrets_dict.items()

bucket_name = 'githubmachinelearningstack-rawdatabucket79e6ae92-dvgbsz21ce9v'
bucket_subfolder = 'data/'

s3 = boto3.client('s3')
data_objects = s3.list_objects_v2(Bucket=bucket_name, Prefix=bucket_subfolder)['Contents']
data_obj_names = [key['Key'] for key in data_objects]

In [7]:
# Download and compile parquets
dfs = []
start_time = time()

with Bar(
    message='Downloading parquets',
    check_tty=False,
    hide_cursor=False,
    max=len(data_obj_names)
) as bar:
    
    for obj_name in data_obj_names:
        full_obj_name = f"s3://{bucket_name}/{obj_name}"
        df = wr.s3.read_parquet(full_obj_name)
        dfs.append(df)
        bar.next()
        
    bar.finish()

print('Took', time() - start_time, 'seconds')

[KDownloading parquets |################################| 426/426ownloading parquets |##                              | 35/426

Took 87.49258422851562 seconds






In [8]:
# Preprocess training data
corpus_col = 'bodyCleaned'
label_col = 'url'

df = pd.concat(
    dfs,
    ignore_index=True
)

# Clear empty values and reset indices
df = df[(not isinstance(df.bodyText, str)) and (df.bodyText != '')]
df = df.reset_index(drop=True)

begin_text = r'.*Describe the bug'
mid_text = r'### Expected behavior|### Reproduction steps|\r\n*'
end_text = r'### Code Snippet.*'
begin_text_feat = r'.*Describe the feature you\'d like to request'
mid_text_feat = r'### Describe the solution you\'d like|### Describe alternatives you\'ve considered'
end_text_feat = r'### Additional context.*'

cases = [
    begin_text,
    mid_text,
    end_text,
    begin_text_feat,
    mid_text_feat,
    end_text_feat
]
pat_cases = '(' + '|'.join(cases) + ')'
pat = re.compile(pat_cases, flags=(re.DOTALL | re.M))

js_df = query_df(df, repository='amplify-js')
clean_template = lambda text : re.sub(pat, '', text)
js_df[corpus_col] = js_df['title'] + ' ' + js_df['body'].apply(clean_template)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [9]:
# Serialize training data
# df.to_csv('training_data.csv')

In [55]:
# Train model
start = time()

small_df = js_df
corpus = small_df[corpus_col]
labels = small_df[label_col]

pipe = make_pipeline(
    TfidfVectorizer(tokenizer=LemmaTokenizer()),
#     TfidfVectorizer(),
    VectorSimilarity()
)
pipe.fit(corpus, labels)

print('Training took', time() - start, 'seconds')

Training took 15.939002275466919 seconds


In [56]:
# Model stats
vocab2 = pipe['tfidfvectorizer'].get_feature_names()
print('Number of vocab words:', len(vocab))
repo_list = list(set(df['repository']))
print('Available repos:', repo_list)

Number of vocab words: 40539
Available repos: ['amplify-console', 'amplify-ci-support', 'amplify-adminui', 'amplify-ios', 'amplify-cli', 'amplify-codegen', 'amplify-js', 'aws-sdk-android', 'amplify-flutter', 'aws-sdk-ios', 'amplify-js-samples', 'docs', 'amplify-android']


In [61]:
with open('extra_tokens.txt', 'w') as f:
    f.write(str(set(vocab2) - set(vocab)))

In [51]:
js_issue = ['Password managers']
infer(pipe, js_issue)

Inferring on the query: ['Password managers']
[['https://github.com/aws-amplify/amplify-js/issues/8289'
  'https://github.com/aws-amplify/amplify-js/issues/8472'
  'https://github.com/aws-amplify/amplify-js/issues/7919'
  'https://github.com/aws-amplify/amplify-js/issues/14'
  'https://github.com/aws-amplify/amplify-js/issues/3522'
  'https://github.com/aws-amplify/amplify-js/issues/5782'
  'https://github.com/aws-amplify/amplify-js/issues/5915'
  'https://github.com/aws-amplify/amplify-js/issues/4170'
  'https://github.com/aws-amplify/amplify-js/issues/2383'
  'https://github.com/aws-amplify/amplify-js/issues/2895']]
[[0.68070084 0.400885   0.26231051 0.23851386 0.22457498 0.21925387
  0.20909575 0.18528949 0.18398587 0.17632967]]
Took 2.92081880569458 seconds


In [45]:
js_issue = ['image file upload fail file size 5 mb']
infer(pipe, js_issue)

Inferring on the query: ['image file upload fail file size 5 mb']
[['https://github.com/aws-amplify/amplify-js/issues/7117'
  'https://github.com/aws-amplify/amplify-js/issues/7574'
  'https://github.com/aws-amplify/amplify-js/issues/6419'
  'https://github.com/aws-amplify/amplify-js/issues/125'
  'https://github.com/aws-amplify/amplify-js/issues/6824'
  'https://github.com/aws-amplify/amplify-js/issues/1673'
  'https://github.com/aws-amplify/amplify-js/issues/2965'
  'https://github.com/aws-amplify/amplify-js/issues/5348'
  'https://github.com/aws-amplify/amplify-js/issues/7625'
  'https://github.com/aws-amplify/amplify-js/issues/2211']]
[[0.34436212 0.29488971 0.27931818 0.27249194 0.25649529 0.24599132
  0.23468639 0.22993672 0.2279019  0.22142294]]
Took 2.990173816680908 seconds


In [44]:
js_issue = ['user endpoint disappear pinpoint']
infer(pipe, js_issue)

Inferring on the query: ['user endpoint disappear pinpoint']
[['https://github.com/aws-amplify/amplify-js/issues/6896'
  'https://github.com/aws-amplify/amplify-js/issues/7675'
  'https://github.com/aws-amplify/amplify-js/issues/4529'
  'https://github.com/aws-amplify/amplify-js/issues/3886'
  'https://github.com/aws-amplify/amplify-js/issues/62'
  'https://github.com/aws-amplify/amplify-js/issues/8385'
  'https://github.com/aws-amplify/amplify-js/issues/3126'
  'https://github.com/aws-amplify/amplify-js/issues/386'
  'https://github.com/aws-amplify/amplify-js/issues/2057'
  'https://github.com/aws-amplify/amplify-js/issues/3819']]
[[0.25109051 0.23307928 0.23134839 0.22407729 0.20790471 0.20510456
  0.20499026 0.19910681 0.19730286 0.19353353]]
Took 2.806058645248413 seconds


In [18]:
print(query_df(js_df, number=8108)[corpus_col])

14739    Connect to External Cognito Account **Is your feature request related to a problem? Please describe.**We are building multiple apps in a multi-account enterprise AWS environment and we're trying to consume a central cognito pool that is under a different account other than the amplify apps (each amplify app is under it's own account following best practices). Hence when we try to import the cognito pool for auth the IAM user cannot see that in the Role that is used by amplify. We looked at and successfully established cross account access using the method described [here](https://docs.amplify.aws/cli/usage/iam) however we cannot change roles after we ran amplify init so the end result is that we are provisioning the amplify app under the aws account that holds the central cognito instance. This issue is also mentioned [here](https://github.com/aws-amplify/amplify-cli/issues/7008).**Describe the solution you'd like**A straightforward way or recommended best practices for such s