In [2]:
import os

# so I don't 
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="-1"  #prevent access to GPU for inference

import pandas as pd
pd.options.display.max_rows = 150
pd.options.display.max_colwidth = 500
import json
from mdparse import transform_pre_rules, compose
from pathlib import Path



In [6]:
import torch
assert not torch.cuda.is_available()

# Look at Data For Kubeflow/Kubeflow

#### Get Kubeflow Data

 See the query in [GCP BigQuery Console](https://console.cloud.google.com/bigquery?sq=1073071082706:92b4ec67dbf5441ba95eb5b9d77e8993)

In [17]:
df = pd.read_csv(f'https://storage.googleapis.com/issue_label_bot/kubeflow_issues/000000000000.csv')
# filter for kubeflow/kubeflow
kfdf = df[df.repo.apply(lambda x: x.split('/')[1] =='kubeflow')]

Flatten list of labels

In [5]:
# unpack the lists of labels and flatten
def unpack_list(x):
    "convert list as string into list."
    if x == '':
        return 'no_labels'
    else:
        return json.loads(x)

#flatten lists
labels = []
label_series = kfdf.labels.apply(lambda x: unpack_list(x))
for x in label_series:
    labels.extend(x)

Top 10 / Bottom 10 labels

In [6]:
label_counts = pd.DataFrame({'labels': labels}).labels.value_counts()
display(label_counts.head(10))
display(label_counts.tail(10))

priority/p1           534
priority/p2           148
area/jupyter          142
platform/gcp          128
area/kfctl            114
release/0.3.0          98
community/question     96
area/0.4.0             90
area/bootstrap         83
priority/p0            62
Name: labels, dtype: int64

area/horovod              1
area/centraldashbosard    1
area/design               1
area/chainer              1
cloud/azure               1
area/kubebench            1
area/0.2.0                1
prirority/p1              1
p1-important              1
platform/minikf           1
Name: labels, dtype: int64

In [7]:
#Borrowed this from nb 2
def process_dict(dfdict, _):
    """process the data, but allow failure."""
    t = compose(transform_pre_rules)
    title = dfdict['title']
    body = dfdict['body']
    try:
        text = 'xxxfldtitle '+ t(title) + ' xxxfldbody ' + t(body)
    except:
        return None
    return {'url': dfdict['url'], 'text':text}

In [212]:
processed_issue_texts = [process_dict(x, 0) for x in kfdf.to_dict(orient='rows')]
processed_issue_texts[:2]

[{'url': '"https://github.com/kubeflow/kubeflow/issues/574"',
  'text': "xxxfldtitle tfjobs ui doesn't work behind iap; react app needs support iap? xxxfldbody tfjobs ui is deployed on dev.kubeflow.org. \\ r \\ r the ui shows up behind iap but its doesn't work \\ r - no tfjobs are listed \\ r - creating a job via the ui doesn't work. \\ r \\ r looking at the developer console we see requests to \\ r \\ r \\ r *URL* xxxlnkhb accounts.google.com xxxlnkhe \\ r \\ r which suggests to me the request is hitting the loadbalancer and being directed to do auth verification to sign in and its getting rejected. \\ r \\ r so i think one of two things is happening \\ r \\ r 1. the request is coming from the server running in k8s and incorrectly being redirected to the external loadbalncer and thus hitting iap when it shouldn't be \\ r 1. the request is coming from the client and the client needs to be updated to support iap. \\ r \\ r xxxatmention do you know where the request is coming from? \\ r 

## Read in Model

Notes: you can export a lightweight learner for inference per https://docs.fast.ai/tutorial.inference.html


In [9]:
from fastai.text.models import AWD_LSTM
from fastai.text import TextLMDataBunch as lmdb, load_data
from fastai.text.learner import language_model_learner
from fastai.basic_train import load_learner
path = Path('lang_model_onecycle/')

In [10]:
def pass_through(x):
    return x

You don't have to execute the below cell anymore

In [11]:
# data_lm = load_data(path, bs=128)

# learn = language_model_learner(data=data_lm,
#                                arch=AWD_LSTM,
#                                pretrained=False)

# learn.load('bestmodel')

# learn.export()

In [12]:
learn = load_learner(path)

## Inference

In [213]:
learn.model.reset() # so the hidden states reset between predictions
_ = learn.model.eval() # turn off dropout, etc. only need to do this after loading model.

#### Notes

Fastai encoder produces a tuple of two lists `raw_output` and `output`.  see [this reference](https://github.com/fastai/fastai/blob/master/fastai/text/models/awd_lstm.py#L123)  `raw_output` are the hidden states emitted for each element of the sequence without dropout.  Because you are turning off dropout during inference with `.eval()`, it really doesn't matter which one you get as they will both be the same (if they are not, this is a bug). 

In [14]:
ex = processed_issue_texts[0]['text']
print(ex)

xxxfldtitle v1alpha2 implement condition update xxxfldbody we should update the conditions according to the status. \ r \ r / cc xxxatmention 


In [15]:
ex_numericalized_x,  ex_numericalized_y = learn.data.one_item(ex)
ex_numericalized_x

tensor([[    2,    22, 35652,   454,  1619,   173,    23,    64,    66,   173,
             9,  2127,  1099,    13,     9,   357,    10,    50,   696,    50,
           696,    37,  1075,   118]])

The next two output tensors should be the same, this is testing that the model state is being reset correctly between predictions

In [16]:
encoder = learn.model[0]
rep = encoder.forward(ex_numericalized_x)[-1][-1]
print(rep)
print(rep.shape)

tensor([[[-0.0129,  0.0362,  0.0007,  ..., -0.0754, -0.0074,  0.0045],
         [-0.0251,  0.0263,  0.0664,  ..., -0.0272,  0.0092,  0.0330],
         [ 0.0580,  0.0300,  0.0196,  ..., -0.0416,  0.0290,  0.0129],
         ...,
         [-0.0111,  0.0130,  0.0432,  ..., -0.0640,  0.1140,  0.0357],
         [-0.0105, -0.0146,  0.0293,  ..., -0.1969,  0.2049,  0.0006],
         [-0.0057,  0.0225,  0.0220,  ..., -0.1356, -0.0231, -0.0011]]],
       grad_fn=<TransposeBackward0>)
torch.Size([1, 24, 400])


In [17]:
learn.model.reset()
rep = encoder.forward(ex_numericalized_x)[-1][-1]
print(rep)
print(rep.shape)

tensor([[[-0.0129,  0.0362,  0.0007,  ..., -0.0754, -0.0074,  0.0045],
         [-0.0251,  0.0263,  0.0664,  ..., -0.0272,  0.0092,  0.0330],
         [ 0.0580,  0.0300,  0.0196,  ..., -0.0416,  0.0290,  0.0129],
         ...,
         [-0.0111,  0.0130,  0.0432,  ..., -0.0640,  0.1140,  0.0357],
         [-0.0105, -0.0146,  0.0293,  ..., -0.1969,  0.2049,  0.0006],
         [-0.0057,  0.0225,  0.0220,  ..., -0.1356, -0.0231, -0.0011]]],
       grad_fn=<TransposeBackward0>)
torch.Size([1, 24, 400])


## Get Representations

Numericalized data

In [214]:
from tqdm import tqdm_notebook

In [215]:
# index into [0] b/c we don't care about the y value.
num_x = []

for x in tqdm_notebook(processed_issue_texts, total=len(processed_issue_texts)):
    num_x.extend(learn.data.one_item(x)[0])


HBox(children=(IntProgress(value=0, max=1384), HTML(value='')))




In [216]:
reps=[]
for x in tqdm_notebook(num_x, total=len(num_x)):
    reps.extend(encoder.forward(x[None, :])[-1][-1])

HBox(children=(IntProgress(value=0, max=1384), HTML(value='')))




In [9]:
from typing import List
class IssueRepresentation:
    
    def __init__(self, tensor:torch.tensor) -> torch.tensor:
        self.tensor=tensor
    
    @property
    def mean(self):
        return torch.mean(self.tensor, 0)
    
    @property
    def max(self):
        return torch.max(self.tensor, 0)[0]
    
    @property
    def last(self):
        return self.tensor[-1,:]
    
    @property
    def concat(self):
        return torch.cat([self.mean, self.max, self.last])

class IssueRepresentation_List:
    def __init__(self, irl=List[torch.tensor]):
        self.irl = [IssueRepresentation(x) for x in irl]
    
    @property
    def mean(self):
        return torch.stack([x.mean for x in self.irl])
    
    @property
    def max(self):
        return torch.stack([x.max for x in self.irl])
    
    @property
    def last(self):
        return torch.stack([x.last for x in self.irl])
    
    @property
    def concat(self):
        return torch.stack([x.concat for x in self.irl])
    

In [219]:
irl = IssueRepresentation_List(reps)

In [220]:
import pickle as pkl

with open('irl.pkl', 'wb') as f:
    pkl.dump(irl, f)

# See if Naive One Shot Learning Works

In [218]:
from IPython.display import display, Markdown, HTML

In [10]:
import pickle as pkl

with open('irl.pkl', 'rb') as f:
    irl = pkl.load(f)

In [58]:
## == True converts it into a 0/1 indices array
candidates_to_label = torch.tensor((kfdf.labels == '[]').values) == True

print(f'{candidates_to_label.sum()} issues w/o labels out of {len(kfdf)} total issues.')

542 issues w/o labels out of 1384 total issues.


In [68]:
no_label_reps = irl.concat[candidates_to_label]
label_reps = irl.concat[~candidates_to_label]

assert (no_label_reps.shape[0] + label_reps.shape[0]) == len(kfdf)

In [110]:
label_mask = kfdf.labels != '[]'

labeled_df = kfdf[label_mask].reset_index(drop=True)
no_label_df = kfdf[~label_mask].reset_index(drop=True)

assert len(labeled_df) + len(no_label_df) == len(kfdf)

In [263]:
class oneshotlabeler:
    def __init__(self, vecs, refdf):
        assert vecs.shape[0] == len(refdf)
        self.vecs = vecs
        self.refdf = refdf.reset_index(drop=True)
        self.cs = CosineSimilarity()
    
    def query(self, vec):
        assert vec.ndim == 1
        sims = cs.forward(vec.unsqueeze(0), self.vecs)
        idxs = sims.argsort(descending=True)
        ranked_sims = sims[idxs]
        
        closest_idx = idxs[0].item()
        ref_issue = self.refdf.iloc[closest_idx]
        
        msg = []
        msg.append(f'\n## Prediction:\n')
        msg.append(f'**Predicted Labels**: {json.loads(ref_issue.labels)}\n')
        msg.append(f'**Cosine similarity (0-1)**: {ranked_sims[0]:.2f}\n')
        msg.append(f'**Closest Issue URL**: {json.loads(ref_issue.url)}\n')
        msg.append(f'**Closest Issue Title**: {ref_issue.title}\n')
        msg.append(f'**Closest Issue Body**:\n {ref_issue.body[:600]}')
        display(Markdown('\n'.join(msg)))
        
    def random_prediction(self, no_label_df, no_label_vec):
        assert len(no_label_df) == no_label_vec.shape[0]
        sample = no_label_df.sample(1)
        idx = sample.index.values[0]
        
        msg = []
        msg.append(f'\n## Un-Labeled Target Issue To Predict:\n')
        msg.append(f'**Title:** {sample.title.values[0]}\n')
        msg.append(f'**Body:**\n {sample.body.values[0][:600]}\n')
        msg.append(f'**URL:** {sample.url.values[0]}')
        display(Markdown('\n'.join(msg)))
        
        self.query(no_label_vec[idx, :])

In [264]:
assert len(no_label_df) == no_label_reps.shape[0]

In [265]:
ol = oneshotlabeler(vecs=label_reps, 
                    refdf = labeled_df)

In [279]:
ol.random_prediction(no_label_df=no_label_df,
                     no_label_vec=no_label_reps)


## Un-Labeled Target Issue To Predict:

**Title:** gke deploy test failed because of insufficient quota

**Body:**
 http://testing-argo.kubeflow.org/workflows/kubeflow-test-infra/kubeflow-presubmit-kubeflow-gke-deploy-840-6c8e8a7-1599-d299?tab=workflow&nodeid=kubeflow-presubmit-kubeflow-gke-deploy-840-6c8e8a7-1599-d299-213896030\r \r in  840 \r \r    \r error:  gcloud.deployment-manager.deployments.create  error in operation  operation-1526938804121-56cbe25036da9-15fe77cd-70641b75 : errors:\r - code: resource_error\r  location: /deployments/z40-6c8e8a7-1599-d299/resources/cpu-pool-v1\r  message: \ {\\\ resourcetype\\\ :\\\ container.v1.nodepool\\\ ,\\\ resourceerrorcode\\\ :\\\ 403\\\ \\\r    ,\\\ resourcee

**URL:** "https://github.com/kubeflow/kubeflow/issues/842"


## Prediction:

**Predicted Labels**: ['area/build-release', 'area/testing']

**Cosine similarity (0-1)**: 0.97

**Closest Issue URL**: https://github.com/kubeflow/kubeflow/issues/822

**Closest Issue Title**: gke deploy test failed because another process was deleting

**Closest Issue Body**:
 https://k8s-gubernator.appspot.com/build/kubernetes-jenkins/pr-logs/pull/kubeflow_kubeflow/814/kubeflow-presubmit/1558/\r \r    \r error:  gcloud.deployment-manager.deployments.create  error in operation  operation-1526578740627-56c6a4f8e8639-3b63db3e-5d7cb2dd : e\r rrors:\r - code: resource_error\r  location: /deployments/z14-a015e87-1558-6462/resources/cpu-pool-v1\r  message: \ {\\\ resourcetype\\\ :\\\ container.v1.nodepool\\\ ,\\\ resourceerrorcode\\\ :\\\ 400\\\ \\\r    ,\\\ resourceerrormessage\\\ :{\\\ code\\\ :400,\\\ message\\\ :\\\ operation operation-1526578878193-71024dcd\\\r    \\

In [248]:
sample = no_label_df.sample(1)
sample

Unnamed: 0,url,repo,title,body,num_labels,labels
527,"""https://github.com/kubeflow/kubeflow/issues/1542""",kubeflow/kubeflow,add jsonnet tests for all libsonnet files,many libsonnet files do not have tests. tests should be added for the following:\r \r kubeflow/core/ambassador.libsonnet\r kubeflow/core/centraldashboard.libsonnet\r kubeflow/core/cert-manager.libsonnet\r kubeflow/core/cloud-endpoints.libsonnet\r kubeflow/core/echo-server.libsonnet\r kubeflow/core/google-cloud-filestore-pv.libsonnet\r kubeflow/core/jupyterhub.libsonnet\r kubeflow/core/metric-collector.libsonnet\r kubeflow/core/prometheus.libsonnet\r kubeflow/core/spartakus.libsonnet\r kubefl...,0,[]


In [258]:
sample.index.values[0]

527

In [239]:

msg = []
msg.append(f'\n## Un-Labeled Target Issue To Predict:\n')
msg.append(f'**Title:** {sample.title.values[0]}')
msg.append(f'**Body:**\n {sample.body.values[0][:600]}')
msg.append(f'**URL:** {sample.url.values[0]}')
msg

['\n## Un-Labeled Target Issue To Predict:\n',
 '**Title:** feast  feature store  and kubeflow',
 "**Body:**\n at  gojek  https://www.gojek.io/  we've recently open sourced a software project called  feast  https://github.com/gojek/feast , an internal feature store for managing, storing, and discovering features for machine learning. the software was jointly developed by gojek and google, and the first release is currently running in production at gojek. we are open sourcing the software because we've seen many teams face the same challenges with features as we have, and we'd love to get feedback from the community both on what we have built, and what we are planning to build.\\r \\r feast is meant to be",
 '**URL:** "https://github.com/kubeflow/kubeflow/issues/2141"']

In [257]:
ol.query(no_label_reps[417, :])


## Prediction:

**Predicted Labels**: ['area/0.4.0', 'area/inference', 'priority/p1']

**Cosine similarity (0-1)**: 0.92

**Closest Issue URL**: https://github.com/kubeflow/kubeflow/issues/1036

**Closest Issue Title**: tfserving supports collection of metrics with prometheus

**Closest Issue Body**:
 we'd like tfserving to support exporting relevant metrics in a k8s/prometheus compatible manner.\r \r here are some relevant ussues tracking the features in tf serving\r \r tensorflow/serving 462 - server metrics\r     cfegly's  comment  https://github.com/tensorflow/serving/issues/462 issuecomment-367594255  is a good summary of what we would like\r tensorflow/serving 800 - collect servable metrics\r \r

In [122]:
avg_vec = irl.concat
avg_vec.shape

torch.Size([1384, 1200])

In [123]:
from torch.nn.modules.distance import CosineSimilarity
cs = CosineSimilarity()

In [133]:
tst = cs.forward(avg_vec[544, :].unsqueeze(0), avg_vec)

In [134]:
idxs = tst.argsort(descending=True)
idxs

tensor([ 544,  963,  352,  ...,  772, 1020,  971])

In [135]:
tst[idxs]

tensor([1.0000, 0.9303, 0.9193,  ..., 0.7091, 0.7025, 0.7025],
       grad_fn=<IndexBackward>)

In [136]:
kfdf.reset_index(drop=True, inplace=True) # make sure index is 0 based

In [139]:
kfdf.iloc[963]

url                                                                                                                                                                                                                                                                                                                                                                                                                                                                            "https://github.com/kubeflow/kubeflow/issues/1412"
repo                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  

In [77]:
kfdf.iloc

Index(['url', 'repo', 'title', 'body', 'num_labels', 'labels'], dtype='object')

In [81]:
idx = 963
print(f'closest issue: {json.loads(kfdf.iloc[idx].url)}\n')
print(f'Title: {kfdf.iloc[idx].title}\n')

print(f'cos)

closest issue: https://github.com/kubeflow/kubeflow/issues/1412

Title: how to spawn the jupyter container as a root user



In [21]:
kfdf[kfdf.labels == '[]'].tail()

Unnamed: 0,url,repo,title,body,num_labels,labels
540,"""https://github.com/kubeflow/kubeflow/issues/1285""",kubeflow/kubeflow,how can we change the tensorflow image in kubeflow?,i have created a model file and it was built in tensorflow 1.8. when i try to serve the model and i apply \ ks apply cloud -c serve\ i see that it is using tensorflow 1.7 image gcr.io/kubeflow-images-public/tensorflow-serving-1.7 . how can change the tensorflow-serving image ?,0,[]
541,"""https://github.com/kubeflow/kubeflow/issues/200""",kubeflow/kubeflow,"error server is unable to handle tensorflow.org/v1alpha1, kind=tfjob",more information:\r \r kubectl get pods --namespace=${namespace}\r name ready status restarts age\r ambassador-1695609295-7d7ml 2/2 running 0 23m\r ambassador-1695609295-dblhh 2/2 running 0 23m\r ambassador-1695609295-m0bzw 2/2 running 0 23m\r tf-hub-0 1/1 running 0 23m\r tf-job-dashboard-4279538329-vsxl8 1/1 running 0 ...,0,[]
542,"""https://github.com/kubeflow/kubeflow/issues/1768""",kubeflow/kubeflow,current master version fails when applying platform in gcp,"apply platform fails with the below error:\r - code: resource_error\r location: /deployments/kubeflow-tpu/resources/kubeflow-tpu\r message: '{\ resourcetype\ :\ container.v1.cluster\ ,\ resourceerrorcode\ :\ 400\ ,\ resourceerrormessage\ :{\ code\ :400,\ message\ :\ cluster.logging_service\r was \\\ logging.googleapis.com/kubernetes\\\ but must be one of \\\ \\\ , \\\ none\\\ , \\\ logging.googleapis.com\\\ .\ ,\ status\ :\ invalid_argument\ ,\ statusmessage\ :\ bad\r request\ ,...",0,[]
543,"""https://github.com/kubeflow/kubeflow/issues/3219""",kubeflow/kubeflow,set port name to http-xx for all services,istio relies on port name to distinguish http vs tcp services this affects some authz features .\r we should make sure the port name is http-xx for all services.\r see doc https://istio.io/help/faq/traffic-management/ naming-port-convention .\r \r cc @kunmingg \r \r,0,[]
544,"""https://github.com/kubeflow/kubeflow/issues/2522""",kubeflow/kubeflow,error when using pytorch in notebooks,"hey, \r \r when using pytorch via fastai in my notebook i get the error:\r \r runtimeerror: traceback most recent call last :\r file \ /opt/conda/lib/python3.6/site-packages/torch/utils/data/dataloader.py\ , line 138, in _worker_loop\r samples = collate_fn dataset i for i in batch_indices \r file \ /opt/conda/lib/python3.6/site-packages/fastai/torch_core.py\ , line 117, in data_collate\r return torch.utils.data.dataloader.default_collate to_data batch \r file \ /opt...",0,[]


## Notes

- Some labels have a fairly high N.  Do we really need few shot for these?
- Do you really want to maintain local models for each repo?  If you do should be a seperate service with API endpoint to keep dependencies clean.
- First lets see if few shot can even work?
- Looks like we might be able to get pretty far on keyword matching and BPE

In [None]:
class MultiBatchEncoder(nn.Module):
    "Create an encoder over `module` that can process a full sentence."
    def __init__(self, bptt:int, max_len:int, module:nn.Module, pad_idx:int=1):
        super().__init__()
        self.max_len,self.bptt,self.module,self.pad_idx = max_len,bptt,module,pad_idx

    def concat(self, arrs:Collection[Tensor])->Tensor:
        "Concatenate the `arrs` along the batch dimension."
        return [torch.cat([l[si] for l in arrs], dim=1) for si in range_of(arrs[0])]
    
    def reset(self): 
        if hasattr(self.module, 'reset'): self.module.reset()

    def forward(self, input:LongTensor)->Tuple[Tensor,Tensor]:
        bs,sl = input.size()
        self.reset()
        raw_outputs,outputs,masks = [],[],[]
        for i in range(0, sl, self.bptt):
            # encoder emits raw, output
            r, o = self.module(input[:,i: min(i+self.bptt, sl)])
            if i>(sl-self.max_len):
                masks.append(input[:,i: min(i+self.bptt, sl)] == self.pad_idx)
                raw_outputs.append(r)
                outputs.append(o)
        return self.concat(raw_outputs),self.concat(outputs),torch.cat(masks,dim=1)

In [299]:
learn.data.vocab.stoi['xxpad']

1

In [296]:
learn.data.bptt

70