In [1]:
# Update and import packages
!pip install -Uqr requirements.txt

import importlib
from time import time
from pathlib import Path
from progress.bar import Bar
import json
import re
import markdown
from bs4 import BeautifulSoup

import numpy as np
import pandas as pd
pd.set_option("display.max_colwidth", None)
from sklearn.utils.estimator_checks import check_estimator
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
import nltk
nltk.download(
    [
        'punkt',
        'wordnet',
        'tagsets',
        'averaged_perceptron_tagger'
    ], quiet=True
)
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

import awswrangler as wr
import sagemaker
import boto3

import vector_similarity
importlib.reload(vector_similarity)
from vector_similarity import VectorSimilarity

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes


In [2]:
# Smoke test for VectorSimilarity: fit three 2-D points, one label per point.
# The full scikit-learn contract check stays disabled:
# check_estimator(VectorSimilarity())
X = np.array([[0, 1], [1, 0], [-1, 0]])
y = np.array(['a', 'b', 'c'])

# fit() hands back the estimator that is kept for the prediction below
estimator = VectorSimilarity().fit(X, y)

# One query row of shape (1, 2); the returned labels are the cell's output
estimator.predict(np.array([[1, 2]]))

array([['a', 'b', 'c']], dtype='<U1')

In [3]:
# Toy end-to-end run: TF-IDF features feeding VectorSimilarity, with four
# short sentences and one label per sentence.
basic_labels = list('abcd')
basic_corpus = [
    'Bees like to make honey',
    'Bears like to eat honey',
    'Bees don\'t like bears',
    'Humans are walking around the park',
]

pipe = make_pipeline(TfidfVectorizer(), VectorSimilarity())

# The pipeline object has no predict_score(), so attach one by hand: transform
# with step 0, then call the final step's predict_score(). The lambda looks up
# the global name `pipe` each time it is called.
pipe.predict_score = lambda x : pipe[1].predict_score(pipe[0].transform(x))

pipe.fit(basic_corpus, basic_labels)

# Query with the training sentences themselves and print both results
pred, score = pipe.predict_score(basic_corpus)
print(pred)
print(score)

[['a' 'b' 'c' 'd']
 ['b' 'a' 'c' 'd']
 ['c' 'b' 'a' 'd']
 ['d' 'c' 'b' 'a']]
[[1.         0.50443175 0.3494023  0.        ]
 [1.         0.50443175 0.3494023  0.        ]
 [1.         0.3494023  0.3494023  0.        ]
 [1.         0.         0.         0.        ]]


In [None]:
# Train and infer on small Data Wrangler dataset
sess = sagemaker.Session()
bucket = sess.default_bucket()

# Location of the Data Wrangler flow export inside the session's default bucket
chunksize = 1000
output_content_type = "parquet"
flow_export_id = "30-23-06-49-58efbaf1"
flow_export_name = f"flow-{flow_export_id}"
s3_output_prefix = f"export-{flow_export_name}/output"
s3_output_path = f"s3://{bucket}/{s3_output_prefix}"

# Both readers are asked for chunks, so `dfs` is iterated rather than used as
# a single frame.
if output_content_type.upper() == "CSV":
    dfs = wr.s3.read_csv(s3_output_path, chunksize=chunksize)
elif output_content_type.upper() == "PARQUET":
    dfs = wr.s3.read_parquet(s3_output_path, chunked=chunksize)
else:
    # Fail here with a clear message instead of hitting a NameError on `dfs`
    # a few lines further down.
    raise ValueError(f"Unexpected output content type {output_content_type}")

# Only the first chunk is used for this small experiment
wrangled_df = next(dfs)

X = wrangled_df['bodyText']
y = wrangled_df['url']
pipe.fit(X, y)

# Query with a single training row (position 13) and show its true label
pred, score = pipe.predict_score(X[13:14])
print(pred)
print(score)
print(list(y[13:14]))

In [None]:
# List all data parquets
secret_name = "SageMakerS3Access"
region_name = "us-west-2"

secrets = boto3.client(
    service_name='secretsmanager',
    region_name=region_name
)

secrets_response = secrets.get_secret_value(SecretId=secret_name)
secrets_dict = json.loads(secrets_response['SecretString'])
# The unpacking requires the secret to hold exactly one key/value pair.
# NOTE(review): access_key / secret_key are not passed to the S3 client below;
# confirm whether they were meant to be.
(access_key, secret_key), = secrets_dict.items()

bucket_name = 'githubmachinelearningstack-rawdatabucket79e6ae92-dvgbsz21ce9v'
bucket_subfolder = 'data/'

s3 = boto3.client('s3')
# A single list_objects_v2 call returns at most 1,000 keys and leaves out
# 'Contents' when nothing matches, so walk every page and tolerate empty ones.
paginator = s3.get_paginator('list_objects_v2')
data_objects = [
    obj
    for page in paginator.paginate(Bucket=bucket_name, Prefix=bucket_subfolder)
    for obj in page.get('Contents', [])
]
data_obj_names = [key['Key'] for key in data_objects]

In [None]:
# File helper functions
def list_data_objs():
    """List every object under the raw-data prefix as an ``s3://`` URI.

    Returns:
        list[str]: one ``s3://<bucket>/<key>`` string per object found under
        ``data/``; an empty list when the prefix holds no objects.

    Raises:
        ValueError: if the ``SageMakerS3Access`` secret does not contain
            exactly one key/value pair (raised by the tuple unpacking).
    """
    secret_name = "SageMakerS3Access"
    region_name = "us-west-2"
    bucket_name = 'githubmachinelearningstack-rawdatabucket79e6ae92-dvgbsz21ce9v'
    bucket_subfolder = 'data/'

    secrets = boto3.client(
        service_name='secretsmanager',
        region_name=region_name
    )

    secrets_response = secrets.get_secret_value(SecretId=secret_name)
    secrets_dict = json.loads(secrets_response['SecretString'])
    # NOTE(review): the credentials are fetched but not handed to the S3
    # client below; confirm whether they were meant to be.
    (access_key, secret_key), = secrets_dict.items()

    s3 = boto3.client('s3')
    # list_objects_v2 returns at most 1,000 keys per call and omits 'Contents'
    # for an empty result, so iterate over all pages and tolerate empty ones.
    paginator = s3.get_paginator('list_objects_v2')
    pages = paginator.paginate(Bucket=bucket_name, Prefix=bucket_subfolder)
    data_obj_names = [
        f"s3://{bucket_name}/{obj['Key']}"
        for page in pages
        for obj in page.get('Contents', [])
    ]

    return data_obj_names

def combine_dfs(dfs):
    """Concatenate frames and drop rows that have no usable ``bodyText``.

    Args:
        dfs: iterable of DataFrames, each with a ``bodyText`` column.

    Returns:
        pandas.DataFrame: all rows whose ``bodyText`` is neither missing
        (None/NaN) nor the empty string, with a fresh 0..n-1 index.

    Raises:
        ValueError: propagated from ``pd.concat`` when ``dfs`` is empty.
    """
    df = pd.concat(dfs, ignore_index=True)

    # The previous filter, `(not isinstance(df.bodyText, str)) and (...)`,
    # tested the Series object itself and so reduced to `df.bodyText != ''`,
    # which keeps None/NaN. Build an explicit element-wise mask instead.
    body = df['bodyText']
    has_text = body.notna() & (body != '')

    return df[has_text].reset_index(drop=True)

def download_data(filename, data_obj_names):
    """Read each parquet object and merge the results with ``combine_dfs``.

    Args:
        filename: accepted but not used by this function.
        data_obj_names: paths passed one by one to ``wr.s3.read_parquet``.

    Returns:
        The frame produced by ``combine_dfs`` from all downloaded parts.
    """
    frames = []
    progress = Bar(
        message='Downloading parquets',
        check_tty=False,
        hide_cursor=False,
        max=len(data_obj_names),
    )

    with progress as bar:
        for s3_path in data_obj_names:
            frames.append(wr.s3.read_parquet(s3_path))
            # advance the progress bar once per downloaded object
            bar.next()

        bar.finish()

    return combine_dfs(frames)

def get_data(filename, force_redownload=False):
    """Load the training data, using ``filename`` as a local CSV cache.

    Args:
        filename: path of the CSV cache file.
        force_redownload: when True, ignore an existing cache file and
            download again.

    Returns:
        pandas.DataFrame: the cached frame if ``filename`` exists (and no
        re-download is forced), otherwise the freshly downloaded frame.
    """
    start = time()
    data = Path(filename)

    if data.is_file() and not force_redownload:
        print('Deserializing data from', filename, '...')
        df = pd.read_csv(filename)

    else:
        data_obj_names = list_data_objs()
        df = download_data(filename, data_obj_names)
        # Write the download to disk; without this the cache branch above
        # can never be taken on a later run.
        data.parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(data, index=False)

    print('Took', time() - start, 'seconds')
    return df

In [None]:
# Data helper functions
def query_df(df, **kwargs):
    """Select the rows of ``df`` where every given column equals its value.

    Args:
        df: DataFrame to filter.
        **kwargs: ``column=value`` equality conditions, combined with AND.

    Returns:
        pandas.DataFrame: the matching rows; all rows when no condition is
        given.

    Raises:
        KeyError: if a keyword does not name a column of ``df``.
    """
    # Start from an all-True boolean mask rather than the scalar True, so a
    # call without conditions indexes with a mask instead of `df[True]`.
    mask = pd.Series(True, index=df.index, dtype=bool)
    for column, value in kwargs.items():
        mask &= (df[column] == value)

    return df[mask]

def compare_vecs():
    """Placeholder with no logic yet; it only prints a notice."""
    notice = 'unimplemented'
    print(notice)

In [None]:
# Model helper functions
def infer(pipe, text, show_score=False):
    """Print the pipeline's prediction for a query and how long it took.

    Args:
        pipe: object exposing ``predict`` (and ``score`` when
            ``show_score`` is True).
        text: a single query string or a sequence of query strings.
        show_score: when True, also print ``pipe.score(text)``.
    """
    print('Inferring on the query:', text)
    start = time()
    # A bare string is one document: wrap it. list(text) would split it into
    # single characters and run inference on each character.
    if isinstance(text, str):
        text = [text]

    print(pipe.predict(text))

    if show_score:
        # NOTE(review): this calls score() with the queries only; confirm this
        # is the intended call rather than VectorSimilarity.predict_score().
        print(pipe.score(text))
    print('Took', time() - start, 'seconds')
    
class LemmaTokenizer:
    """Callable tokenizer that lemmatizes every token with WordNet.

    With ``custom=False`` the document is split by ``word_tokenize``. With
    ``custom=True`` tokens come from a regex, are limited to length >= 3, and
    only those whose POS tag starts with 'V' or 'N' are kept.
    """

    def __init__(self, custom=False):
        self.custom = custom
        self.wnl = WordNetLemmatizer()

    def _custom_tokens(self, doc):
        """Return regex tokens of length >= 3 whose POS tag begins with V or N."""
        # Runs of two or more word characters, then drop anything under 3 chars
        candidates = [
            word
            for word in re.findall(r"(?u)\b\w\w+\b", doc)
            if len(word) >= 3
        ]

        # Keep only tokens whose tag starts with 'V' or 'N'
        tagged = nltk.pos_tag(candidates)
        return [word for word, tag in tagged if tag[0] in ('V', 'N')]

    def __call__(self, doc):
        tokens = self._custom_tokens(doc) if self.custom else word_tokenize(doc)
        return list(map(self.wnl.lemmatize, tokens))

def get_trained_model(corpus, labels, lemmatize='default'):
    """Build and fit a TF-IDF + VectorSimilarity pipeline.

    Args:
        corpus: documents passed to ``pipe.fit``.
        labels: labels passed to ``pipe.fit``.
        lemmatize: case-insensitive choice of 'default' (LemmaTokenizer),
            'custom' (LemmaTokenizer(custom=True)) or 'none' (no tokenizer
            argument).

    Returns:
        The fitted pipeline.

    Raises:
        ValueError: if ``lemmatize`` is not one of the three options.
    """
    print('Training model...')
    start = time()
    mode = lemmatize.lower()

    # One vectorizer factory per lemmatization mode; built lazily so only the
    # selected one is ever constructed.
    vectorizer_factories = {
        'default': lambda: TfidfVectorizer(tokenizer=LemmaTokenizer()),
        'custom': lambda: TfidfVectorizer(
            tokenizer=LemmaTokenizer(custom=True)
        ),
        'none': lambda: TfidfVectorizer(),
    }
    if mode not in vectorizer_factories:
        raise ValueError('lemmatize must be {default, custom, none}')
    vectorizer = vectorizer_factories[mode]()

    # Assemble the two-step pipeline and train it
    pipe = make_pipeline(vectorizer, VectorSimilarity())
    pipe.fit(corpus, labels)

    print('Took', time() - start, 'seconds')

    return pipe

def inspect_corpus():
    """Placeholder with no logic yet; it only prints a notice."""
    notice = 'unimplemented'
    print(notice)


In [None]:
# Download and compile parquets
# get_data() reads 'training_data.csv' when that file exists (and no
# re-download is forced); otherwise it lists and downloads the parquet objects.
df = get_data('training_data.csv', force_redownload=False)

In [None]:
# Preprocess training data
corpus_col = 'bodyCleaned'
label_col = 'url'

# Issue-template boilerplate to strip from bug reports ...
begin_text = r'.*Describe the bug'
# NOTE(review): `\r\n*` removes a carriage return plus any following newlines
# without leaving a space, which joins the words on either side; confirm that
# this is intended.
mid_text = r'### Expected behavior|### Reproduction steps|\r\n*'
end_text = r'### Code Snippet.*'
# ... and from feature requests
begin_text_feat = r'.*Describe the feature you\'d like to request'
mid_text_feat = r'### Describe the solution you\'d like|### Describe alternatives you\'ve considered'
end_text_feat = r'### Additional context.*'

cases = [
    begin_text,
    mid_text,
    end_text,
    begin_text_feat,
    mid_text_feat,
    end_text_feat
]
# One alternation of all cases; DOTALL lets the leading/trailing `.*` parts
# run across line breaks.
pat_cases = '(' + '|'.join(cases) + ')'
pat = re.compile(pat_cases, flags=(re.DOTALL | re.M))

print('Preprocessing data...')
start = time()

# Work on an explicit copy so adding a column below does not trigger pandas'
# SettingWithCopyWarning on a filtered frame.
js_df = query_df(df, repository='amplify-js').copy()


def clean_template(text):
    """Remove every issue-template fragment matched by `pat` from `text`."""
    return re.sub(pat, '', text)


# A missing body (NaN) would make re.sub raise TypeError; treat it as empty
js_df[corpus_col] = js_df['title'] + ' ' + js_df['body'].fillna('').apply(clean_template)

print('Took', time() - start, 'seconds')

In [None]:
# Train the model on the amplify-js issues, using the custom lemmatizer.
# get_trained_model() builds the TF-IDF + VectorSimilarity pipeline and prints
# its own timing.
small_df = js_df
corpus, labels = small_df[corpus_col], small_df[label_col]

pipe = get_trained_model(corpus, labels, lemmatize='custom')

In [None]:
# Model stats
vectorizer = pipe['tfidfvectorizer']
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older versions.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = list(vectorizer.get_feature_names_out())
else:
    vocab = vectorizer.get_feature_names()
print('Number of vocab words:', len(vocab))

repo_list = list(set(df['repository']))
print('Available repos:', repo_list)

# Tokens shorter than 3 characters that still made it into the vocabulary
small_words = [word for word in vocab if len(word) < 3]
print('Small words in vocab:\n', small_words)

In [None]:
# with open('extra_tokens.txt', 'w') as f:
#     f.write(str(set(vocab2) - set(vocab)))

In [None]:
# Free-text query against the trained pipeline; show_score=True makes infer()
# also print pipe.score(...) for it.
js_issue = ['Password managers']
infer(pipe, js_issue, show_score=True)

In [None]:
# Free-text query against the trained pipeline; show_score=True makes infer()
# also print pipe.score(...) for it.
js_issue = ['image file upload fail file size 5 mb']
infer(pipe, js_issue, show_score=True)

In [None]:
# Free-text query against the trained pipeline; show_score=True makes infer()
# also print pipe.score(...) for it.
js_issue = ['user endpoint disappear pinpoint']
infer(pipe, js_issue, show_score=True)

In [None]:
# Look up the cleaned text of issue number 8108 in js_df; .item() requires the
# selection to contain exactly one value.
js_issue = query_df(js_df, number=8108)[corpus_col].item()
print(js_issue)
# Parse that text with BeautifulSoup's built-in 'html.parser' and print the result
js_issue_ast = BeautifulSoup(js_issue, 'html.parser')
print(js_issue_ast)