In [2]:
import importlib
from time import time
from progress.bar import Bar
import json

import numpy as np
import pandas as pd
from sklearn.utils.estimator_checks import check_estimator
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline

import awswrangler as wr
import sagemaker
import boto3

import vector_similarity
importlib.reload(vector_similarity)
from vector_similarity import VectorSimilarity

In [3]:
# Sanity checks on VectorSimilarity

# check_estimator(VectorSimilarity())
X = np.array(
    [[0, 1],
     [1, 0],
     [-1, 0]])
y = np.array(['a', 'b', 'c'])

estimator = VectorSimilarity()
estimator = estimator.fit(X, y)
estimator.predict(np.array([1, 2]).reshape(1, -1))

array([['a', 'b', 'c']], dtype='<U1')

In [4]:
# Basic pipeline setup

basic_corpus = [
    'Bees like to make honey',
    'Bears like to eat honey',
    'Bees don\'t like bears',
    'Humans are walking around the park'
]
basic_labels = ['a', 'b', 'c', 'd']

pipe = make_pipeline(TfidfVectorizer(), VectorSimilarity())
pipe.fit(basic_corpus, basic_labels)
print(pipe.predict(basic_corpus))
print(pipe.score(basic_corpus))

[['a' 'b' 'c' 'd']
 ['b' 'a' 'c' 'd']
 ['c' 'b' 'a' 'd']
 ['d' 'c' 'b' 'a']]
[[1.         0.50443175 0.3494023  0.        ]
 [1.         0.50443175 0.3494023  0.        ]
 [1.         0.3494023  0.3494023  0.        ]
 [1.         0.         0.         0.        ]]


In [5]:
sess = sagemaker.Session()
bucket = sess.default_bucket()
chunksize = 1000
output_content_type = "parquet"
flow_export_id = f"30-23-06-49-58efbaf1"
flow_export_name = f"flow-{flow_export_id}"
s3_output_prefix = f"export-{flow_export_name}/output"
s3_output_path = f"s3://{bucket}/{s3_output_prefix}"

if output_content_type.upper() == "CSV":
    dfs = wr.s3.read_csv(s3_output_path, chunksize=chunksize)
elif output_content_type.upper() == "PARQUET":
    dfs = wr.s3.read_parquet(s3_output_path, chunked=chunksize)
else:
    print(f"Unexpected output content type {output_content_type}") 

df = next(dfs)
# df

In [6]:
X = df['bodyText']
y = df['url']
pipe.fit(X, y)
print(pipe.predict(X[13:14]))
print(pipe.score(X[13:14]))
print(list(y[13:14]))

[['https://github.com/aws-amplify/amplify-adminui/issues/12'
  'https://github.com/aws-amplify/amplify-adminui/issues/21'
  'https://github.com/aws-amplify/amplify-adminui/issues/67'
  'https://github.com/aws-amplify/amplify-adminui/issues/82'
  'https://github.com/aws-amplify/amplify-adminui/issues/41'
  'https://github.com/aws-amplify/amplify-adminui/issues/28'
  'https://github.com/aws-amplify/amplify-adminui/issues/85'
  'https://github.com/aws-amplify/amplify-adminui/issues/45'
  'https://github.com/aws-amplify/amplify-adminui/issues/35'
  'https://github.com/aws-amplify/amplify-adminui/issues/33']]
[[1.         0.6949403  0.23125501 0.13250384 0.12011294 0.12004747
  0.11126224 0.08492276 0.08372554 0.07978957]]
['https://github.com/aws-amplify/amplify-adminui/issues/12']


In [None]:
# Download raw parquets
secret_name = "SageMakerS3Access"
region_name = "us-west-2"

client = boto3.client(
    service_name='secretsmanager',
    region_name=region_name
)

secrets_response = client.get_secret_value(
    SecretId=secret_name
)
secrets_dict = json.loads(secrets_response['SecretString'])
(access_key, secret_key), = secrets_dict.items()

# session._session.set_credentials(access_key, secret_key)

bucket_name = 'githubmachinelearningstack-rawdatabucket79e6ae92-dvgbsz21ce9v'
bucket_subfolder = 'data/'

s3 = boto3.client('s3')
data_objects = s3.list_objects_v2(Bucket=bucket_name, Prefix=bucket_subfolder)['Contents']
data_obj_names = [key['Key'] for key in data_objects]
dfs = []

start_time = time()
with Bar(message='Downloading parquets', check_tty=False, hide_cursor=False, max=len(data_obj_names)) as bar:
    for obj_name in data_obj_names:
        full_obj_name = f"s3://{bucket_name}/{obj_name}"
        df = wr.s3.read_parquet(full_obj_name)
        dfs.append(df)
        bar.next()
    bar.finish()
print('Took ', time() - start_time, ' seconds')

[KDownloading parquets |####                            | 63/426Downloading parquets |                                | 12/426

In [31]:
df.to_csv('training_data.csv')

In [11]:
start_time = time()
df = pd.concat(dfs)
print(f"{time() - start_time} seconds")

df = df[(not isinstance(df.bodyText, str)) and (df.bodyText != '')]

for index, row in df.iterrows():
    if not isinstance(row['bodyText'], str) or row['bodyText'] == '':
        print(row['url'])

0.9949593544006348 seconds


In [12]:
start = time()
pipe = make_pipeline(TfidfVectorizer(), VectorSimilarity())
small_df = df
corpus = small_df['bodyText']
labels = small_df['url']
pipe.fit(corpus, labels)
print(time() - start)

13.218099117279053


In [13]:
start = time()
string = """
Attempting to view content, it appears the UI never finishes loading, the select table dropdown is in a disabled state, and the main window says "fetching items...".

The browser console reports the following issue:

Error: parsing failed: Syntax Error: Unexpected <EOF>.

GraphQL request:1:1
1 |
  | ^
    at app (main.ee81f266.chunk.js:1)
    at 455.6c1c5071.chunk.js:2
    at g (455.6c1c5071.chunk.js:2)
    at 455.6c1c5071.chunk.js:2
    at dispatch (455.6c1c5071.chunk.js:2)
    at 455.6c1c5071.chunk.js:2
    at 455.6c1c5071.chunk.js:2
    at p (455.6c1c5071.chunk.js:2)
    at v (455.6c1c5071.chunk.js:2)
    at h (455.6c1c5071.chunk.js:2)
additionally, there are network response issues:

from app/env/details - error: "Cannot read property 'defaultAuthentication' of undefined"
from app/env/getModels - error: "ResourceNotFoundException: API models not found."
"""
sentence = [string]
print(pipe.predict(sentence))
print(pipe.score(sentence))
print(time() - start)

[['https://github.com/aws-amplify/amplify-adminui/issues/12'
  'https://github.com/aws-amplify/amplify-adminui/issues/21'
  'https://github.com/aws-amplify/amplify-adminui/issues/169'
  'https://github.com/aws-amplify/docs/issues/1936'
  'https://github.com/aws-amplify/amplify-js/issues/3621'
  'https://github.com/aws-amplify/amplify-console/issues/1369'
  'https://github.com/aws-amplify/amplify-js/issues/5354'
  'https://github.com/aws-amplify/amplify-js/issues/7154'
  'https://github.com/aws-amplify/amplify-console/issues/84'
  'https://github.com/aws-amplify/amplify-js/issues/4776']]
[[1.         0.5703742  0.53427687 0.32200108 0.31433721 0.31284222
  0.29470813 0.27291282 0.23395244 0.23275672]]
29.898285627365112


In [26]:
print(len(pipe['vectorsimilarity']._Vectors[1]))

84838


In [63]:
repo_list = list(set(df['repository']))
print(repo_list)

['amplify-android', 'amplify-codegen', 'amplify-js-samples', 'amplify-ios', 'amplify-adminui', 'amplify-js', 'aws-sdk-android', 'docs', 'amplify-console', 'amplify-flutter', 'amplify-ci-support', 'amplify-cli', 'aws-sdk-ios']


In [44]:
set(df['repository'])
pd.set_option("display.max_colwidth", None)
query = (df['repository'] == 'amplify-js') & (df['number'] == 8485)
js_issue = str(df[query]['body'])
# print(js_issue)

In [112]:
import re
begin_text = r'.*Describe the bug'
mid_text = r'### Expected behavior|### Reproduction steps|\r\n*'
end_text = r'### Code Snippet.*'
pat_cases = '({}|{}|{})'.format(begin_text, mid_text, end_text)
pat = re.compile(pat_cases, flags=(re.DOTALL | re.M))
# print(type(pat))
cleaned_issue = re.sub(pat, '', js_issue)
# cleaned_issue = re.sub('\r\n*', '', cleaned_issue)
# cleaned_issue = cleaned_issue.rstrip()
print(repr(cleaned_issue))
print(cleaned_issue)

"AWS Support asked me to raise this here as they can't find a solution and will escalate to you.I am building a Typescript/Next.js/React/Redux app and I am trying to pass the user object via the props from the _app.tsx pageI get a ```The user is not authenticated``` error when calling ```currentAuthenticatedUser()```. I am not using federated login.The user object is returned with the logged in user.It happens as soon as you navigate to the base URL of the app."
AWS Support asked me to raise this here as they can't find a solution and will escalate to you.I am building a Typescript/Next.js/React/Redux app and I am trying to pass the user object via the props from the _app.tsx pageI get a ```The user is not authenticated``` error when calling ```currentAuthenticatedUser()```. I am not using federated login.The user object is returned with the logged in user.It happens as soon as you navigate to the base URL of the app.


In [108]:
string = 'hellohellotesthellohello'
string = re.sub('hello*', '', string)
print(string)

test


In [104]:
string = '\r\nhello\r\ntest\r\ndonuts'
string = string.splitlines()
print(repr(string))
print(string)

['', 'hello', 'test', 'donuts']
['', 'hello', 'test', 'donuts']


In [115]:
query = (df['repository'] == 'amplify-cli') & (df['number'] == 6609)
cli_issue = str(df[query]['body'])

print(repr(cli_issue))

'33    Assume I have the following model:\r\n\r\n```graphql\r\ntype Employee\r\n@model \r\n@key(name: "byInstitute", fields: ["instituteId"], queryField: "listEmployeesByInstitute")\r\n{\r\n  id: ID!\r\n  instituteId: ID @assert(condition: ".length() > 1") \r\n  institute: Institute @connection(keyField: "instituteId")\r\n  userId: ID @assert(condition: ".length() > 1") \r\n  user: User @connection(keyField: "userId")\r\n}\r\n```\r\n\r\nThe question is if I want to have list of employees of an institute sorted by their names, how can I create the sort key?\r\n\r\nI mean something like this:\r\n\r\n```graphql\r\n@key(name: "byInstitute", fields: ["instituteId", "user.name"], queryField: "listEmployeesByInstitute")\r\n```\r\nIs it possible?\nName: body, dtype: string'
