In [46]:
# Update and import packages
!pip install -Uqr requirements.txt

# Basic packages
import importlib
from time import time
from pathlib import Path
from progress.bar import Bar
import json
import re
from bs4 import BeautifulSoup

# Data science/NLP packages
import numpy as np
import pandas as pd
pd.set_option("display.max_colwidth", None)

# AWS packages
import awswrangler as wr
import sagemaker
from sagemaker.sklearn import SKLearn
import boto3

# Local modules
import model
import train
for m in [model, train]:
    importlib.reload(m)

from model import VectorSimilarity, get_fitted_model
from train import combine_dfs

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes


In [16]:
# Sanity checks on VectorSimilarity
X = np.array(
    [[0, 1],
     [1, 0],
     [-1, 0]])
y = np.array(['a', 'b', 'c'])

estimator = VectorSimilarity()
estimator = estimator.fit(X, y)
estimator.predict(np.array([1, 2]).reshape(1, -1))

[['a' 'b' 'c']]
[[ 2.  1. -1.]]
Prediction took 0.0004839897155761719 seconds


(array([['a', 'b', 'c']], dtype='<U1'), array([[ 2.,  1., -1.]]))

In [17]:
# Basic pipeline setup
basic_corpus = [
    'Bees like to make honey',
    'Bears like to eat honey',
    'Bees don\'t like bears',
    'Humans are walking around the park'
]
basic_labels = ['a', 'b', 'c', 'd']

pipe = get_fitted_model(basic_corpus, basic_labels, lemmatize='custom')
pred, score = pipe.predict(basic_corpus)

Training model...
Took 0.005005359649658203 seconds
[['a' 'c' 'b' 'd']
 ['b' 'c' 'a' 'd']
 ['c' 'b' 'a' 'd']
 ['d' 'c' 'b' 'a']]
[[1.         0.27710268 0.27710268 0.        ]
 [1.         0.27710268 0.27710268 0.        ]
 [1.         0.27710268 0.27710268 0.        ]
 [1.         0.         0.         0.        ]]
Prediction took 0.0006279945373535156 seconds


In [18]:
# Train and infer on small Data Wrangler dataset
sess = sagemaker.Session()
bucket = sess.default_bucket()

chunksize = 1000
output_content_type = "parquet"
flow_export_id = f"30-23-06-49-58efbaf1"
flow_export_name = f"flow-{flow_export_id}"
s3_output_prefix = f"export-{flow_export_name}/output"
s3_output_path = f"s3://{bucket}/{s3_output_prefix}"

dfs=[]
if output_content_type.upper() == "CSV":
    dfs = wr.s3.read_csv(s3_output_path, chunksize=chunksize)
elif output_content_type.upper() == "PARQUET":
    dfs = wr.s3.read_parquet(s3_output_path, chunked=chunksize)
else:
    print(f"Unexpected output content type {output_content_type}") 

wrangled_df = next(dfs)

X = wrangled_df['bodyText']
y = wrangled_df['url']
pipe.fit(X, y)

pred, score = pipe.predict(X[13:14])
print(list(y[13:14]))

[['https://github.com/aws-amplify/amplify-adminui/issues/12'
  'https://github.com/aws-amplify/amplify-adminui/issues/21'
  'https://github.com/aws-amplify/amplify-adminui/issues/67'
  'https://github.com/aws-amplify/amplify-adminui/issues/41'
  'https://github.com/aws-amplify/amplify-adminui/issues/27'
  'https://github.com/aws-amplify/amplify-adminui/issues/9'
  'https://github.com/aws-amplify/amplify-adminui/issues/7'
  'https://github.com/aws-amplify/amplify-adminui/issues/33'
  'https://github.com/aws-amplify/amplify-adminui/issues/104'
  'https://github.com/aws-amplify/amplify-adminui/issues/61']]
[[1.         0.86734452 0.28261051 0.15772424 0.11092404 0.10355965
  0.0885474  0.07604359 0.07187816 0.07161721]]
Prediction took 0.0007164478302001953 seconds
['https://github.com/aws-amplify/amplify-adminui/issues/12']


In [19]:
# File helper functions
def list_data_objs():
    secret_name = "SageMakerS3Access"
    region_name = "us-west-2"
    bucket_name = 'githubmachinelearningstack-rawdatabucket79e6ae92-dvgbsz21ce9v'
    bucket_subfolder = 'data/'
    
    secrets = boto3.client(
        service_name='secretsmanager',
        region_name=region_name
    )

    secrets_response = secrets.get_secret_value(SecretId=secret_name)
    secrets_dict = json.loads(secrets_response['SecretString'])
    (access_key, secret_key), = secrets_dict.items()

    s3 = boto3.client('s3')
    data_objects = s3.list_objects_v2(Bucket=bucket_name, Prefix=bucket_subfolder)['Contents']
    data_obj_names = [f"s3://{bucket_name}/{key['Key']}" for key in data_objects]
    
    return data_obj_names


def download_data(filename, data_obj_names):
    dfs = []
    
    with Bar(
        message='Downloading parquets',
        check_tty=False,
        hide_cursor=False,
        max=len(data_obj_names)
    ) as bar:

        for obj_name in data_obj_names:
            df = wr.s3.read_parquet(obj_name)
            dfs.append(df)
            bar.next()

        bar.finish()
        
    df = combine_dfs(dfs)
    df.to_csv(filename)

    return df

def get_data(filename, force_redownload=False):
    start = time()
    data = Path(filename)
    
    if data.is_file() and not force_redownload:
        print('Deserializing data from', filename, '...')
        df = pd.read_csv(filename)
        
    else:
        data_obj_names = list_data_objs()
        df = download_data(filename, data_obj_names)
        
    print('Took', time() - start, 'seconds')    
    return df

In [20]:
# Data helper functions
def query_df(df, **kwargs):
    query = True
    for key, value in kwargs.items():
        query &= (df[key] == value)
        
    result = df[query]
    return result

def compare_vecs():
    print('unimplemented')

def inspect_doc(vectorizer, doc, n_best=10):
    if type(doc) == str:
        doc = [doc]
        
    if len(doc) > 1:
        raise ValueError('Only one document per call is supported')

    vocab = np.array(vectorizer.get_feature_names(), ndmin=2)
    weights = vectorizer.transform(doc).toarray()
    weights_desc_args = np.flip(weights.argsort())
    words_desc = np.take_along_axis(vocab, weights_desc_args, axis=1)
    weights_desc = np.take_along_axis(weights, weights_desc_args, axis=1)
    
    print(words_desc[:, :n_best])
    print(weights_desc[:, :n_best])
    
def get_weights(vectorizer, query, doc, n_best=10):
    if type(doc) == str:
        doc = [doc]
        
    tokenizer = vectorizer.build_tokenizer()
    tokens = tokenizer(query)
    print(tokens)
    vocab = np.array(vectorizer.get_feature_names(), ndmin=2)
    indices = []
    for tok in tokens:
        indices.append(np.argwhere(vocab == tok)[0, 1])

    indices = np.array(indices)
    weights = vectorizer.transform(doc).toarray().flatten()
    print(indices)
    weights_desc = np.take_along_axis(weights, indices, axis=0)
    print(weights_desc[:n_best])
    


In [21]:
# Download and compile parquets
df = get_data('training_data.csv', force_redownload=False)

Deserializing data from training_data.csv ...
Took 3.7892115116119385 seconds


In [22]:
# Preprocess training data
corpus_col = 'bodyCleaned'
label_col = 'url'

begin_text = r'.*Describe the bug'
mid_text = r'### Expected behavior|### Reproduction steps|\r\n*'
end_text = r'### Code Snippet.*'
begin_text_feat = r'.*Describe the feature you\'d like to request'
mid_text_feat = r'### Describe the solution you\'d like|### Describe alternatives you\'ve considered'
end_text_feat = r'### Additional context.*'

cases = [
    begin_text,
    mid_text,
    end_text,
    begin_text_feat,
    mid_text_feat,
    end_text_feat
]
pat_cases = '(' + '|'.join(cases) + ')'
pat = re.compile(pat_cases, flags=(re.DOTALL | re.M))

print('Preprocessing data...')
start = time()

js_df = query_df(df, repository='amplify-js')
# js_df = df
clean_template = lambda text : re.sub(pat, '', text)
js_df[corpus_col] = js_df['title'] + ' ' + js_df['body'].apply(clean_template)

print('Took', time() - start, 'seconds')

Preprocessing data...
Took 53.61823582649231 seconds


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [47]:
# Train model
small_df = js_df
corpus = small_df[corpus_col]
labels = small_df[label_col]

pipe = get_fitted_model(corpus, labels, lemmatize='custom')

Training model...


  'stop_words.' % sorted(inconsistent))


Took 39.47605586051941 seconds


In [48]:
# Model stats
vocab = pipe['tfidfvectorizer'].get_feature_names()
print('Number of vocab words:', len(vocab))
repo_list = list(set(df['repository']))
print('Available repos:', repo_list)
small_words = [word for word in vocab if len(word) < 3]
print('Small words in vocab:\n', small_words)

Number of vocab words: 19361
Available repos: ['aws-sdk-ios', 'amplify-console', 'amplify-cli', 'docs', 'amplify-ci-support', 'aws-sdk-android', 'amplify-js', 'amplify-flutter', 'amplify-codegen', 'amplify-ios', 'amplify-adminui', 'amplify-js-samples', 'amplify-android']
Small words in vocab:
 ['ad', 'au', 'cc', 'cs', 'cv', 'dm', 'dy', 'eb', 'ec', 'ed', 'em', 'gb', 'gc', 'gm', 'hl', 'ho', 'id', 'io', 'iv', 'j', 'km', 'lb', 'li', 'lm', 'mi', 'nd', 'pa', 'pr', 'r', 'rg', 's', 'sc', 'sd', 'sl', 'sm', 'sn', 'te', 'tl', 'tn', 'w', 'x', 'xx']


In [49]:
with open('vocab.txt', 'w') as f:
    f.write(str(vocab))

In [50]:
tokenizer = pipe[0].build_tokenizer()
print(tokenizer.custom)
token = tokenizer('walk walks talk talks talking talked')
print(token)

True
['walk', 'walk', 'talk', 'talk', 'talk', 'talk']


In [None]:
js_issue = query_df(js_df, number=8108)[corpus_col].item()
inspect_doc(pipe[0], js_issue)
get_weights(pipe[0], 'workarounds issued cognito pool', js_issue)

In [None]:
# Train SM estimator
train_data = 's3://githubmachinelearningstack-rawdatabucket79e6ae92-dvgbsz21ce9v/data'
sklearn_estimator = SKLearn('train.py',
                            role=sagemaker.get_execution_role(),
                            instance_type='ml.m5.4xlarge',
                            framework_version='0.23-1',
                            hyperparameters = {'n-best': 10})
sklearn_estimator.fit({'train': train_data})

In [None]:
# Deploy SM predictor
predictor = sklearn_estimator.deploy(instance_type='ml.m5.4xlarge',
                                     initial_instance_count=1)
data = ['password managers autofill']
response = predictor.predict(data)
print(response)

In [None]:
# with open('extra_tokens.txt', 'w') as f:
#     f.write(str(set(vocab2) - set(vocab)))

In [32]:
pw_mgr = query_df(js_df, number=5782)['bodyCleaned']
inspect_doc(pipe[0], pw_mgr)

[['chromium' 'form' 'element' 'password' 'understands' 'autofill' 'sign'
  'don' 'ini' 'manager']]
[[0.34380183 0.30088241 0.25141548 0.24555867 0.22225179 0.19976392
  0.19838702 0.18681306 0.17469726 0.17326228]]


In [33]:
pw_mgr_query = ['Password manager autofill']
pipe.predict(pw_mgr_query)

[['https://github.com/aws-amplify/amplify-js/issues/8472'
  'https://github.com/aws-amplify/amplify-js/issues/8289'
  'https://github.com/aws-amplify/amplify-js/issues/4748'
  'https://github.com/aws-amplify/amplify-js/issues/3799'
  'https://github.com/aws-amplify/amplify-js/issues/5782'
  'https://github.com/aws-amplify/amplify-js/issues/7919'
  'https://github.com/aws-amplify/amplify-js/issues/2479'
  'https://github.com/aws-amplify/amplify-js/issues/3522'
  'https://github.com/aws-amplify/amplify-js/issues/7957'
  'https://github.com/aws-amplify/amplify-js/issues/14']]
[[0.67230476 0.52779768 0.39072771 0.32529087 0.32522022 0.22567634
  0.22110873 0.19190107 0.18802274 0.18619061]]
Prediction took 0.4359712600708008 seconds


(array([['https://github.com/aws-amplify/amplify-js/issues/8472',
         'https://github.com/aws-amplify/amplify-js/issues/8289',
         'https://github.com/aws-amplify/amplify-js/issues/4748',
         'https://github.com/aws-amplify/amplify-js/issues/3799',
         'https://github.com/aws-amplify/amplify-js/issues/5782',
         'https://github.com/aws-amplify/amplify-js/issues/7919',
         'https://github.com/aws-amplify/amplify-js/issues/2479',
         'https://github.com/aws-amplify/amplify-js/issues/3522',
         'https://github.com/aws-amplify/amplify-js/issues/7957',
         'https://github.com/aws-amplify/amplify-js/issues/14']],
       dtype=object),
 array([[0.67230476, 0.52779768, 0.39072771, 0.32529087, 0.32522022,
         0.22567634, 0.22110873, 0.19190107, 0.18802274, 0.18619061]]))

In [None]:
js_issue = ['image file upload fail file size 5 mb']
pipe.predict(js_issue)

In [36]:
js_issue = ['additionalheaders param customize request headers']
pipe.predict(js_issue)

[['https://github.com/aws-amplify/amplify-js/issues/4981'
  'https://github.com/aws-amplify/amplify-js/issues/5576'
  'https://github.com/aws-amplify/amplify-js/issues/2075'
  'https://github.com/aws-amplify/amplify-js/issues/5053'
  'https://github.com/aws-amplify/amplify-js/issues/6157'
  'https://github.com/aws-amplify/amplify-js/issues/5296'
  'https://github.com/aws-amplify/amplify-js/issues/2035'
  'https://github.com/aws-amplify/amplify-js/issues/3087'
  'https://github.com/aws-amplify/amplify-js/issues/7906'
  'https://github.com/aws-amplify/amplify-js/issues/515']]
[[0.56658218 0.3766714  0.21331179 0.20978914 0.18709521 0.18661539
  0.18101199 0.17879506 0.17523136 0.17286493]]
Prediction took 0.436556339263916 seconds


(array([['https://github.com/aws-amplify/amplify-js/issues/4981',
         'https://github.com/aws-amplify/amplify-js/issues/5576',
         'https://github.com/aws-amplify/amplify-js/issues/2075',
         'https://github.com/aws-amplify/amplify-js/issues/5053',
         'https://github.com/aws-amplify/amplify-js/issues/6157',
         'https://github.com/aws-amplify/amplify-js/issues/5296',
         'https://github.com/aws-amplify/amplify-js/issues/2035',
         'https://github.com/aws-amplify/amplify-js/issues/3087',
         'https://github.com/aws-amplify/amplify-js/issues/7906',
         'https://github.com/aws-amplify/amplify-js/issues/515']],
       dtype=object),
 array([[0.56658218, 0.3766714 , 0.21331179, 0.20978914, 0.18709521,
         0.18661539, 0.18101199, 0.17879506, 0.17523136, 0.17286493]]))

In [None]:
js_issue = query_df(js_df, number=8108)[corpus_col].item()
print(js_issue)
js_issue_ast = BeautifulSoup(js_issue, 'html.parser')
print(js_issue_ast)