In [1]:
from mlapp import GitHubApp
from IPython import display

In [2]:
app_id = 27079
#key_file_path = '/Users/hamelsmu/.ssh/hamel-python-app.2019-03-15.private-key.pem'
key_file_path = '/hamel-python-app.2019-03-15.private-key.pem'

## Get the app installation

The test installation automatically fetches the first installation the app is found on.  This app is only installed in one place, so this just fetches the installation made on the public `hamelsmu/example-github-app` repo. 

In [3]:
ghapp = GitHubApp(pem_path=key_file_path, app_id=app_id)
install = ghapp.get_test_installation()

## Interact With Issues

#### Create a new issue

In [4]:
issue = install.create_issue(owner='hamelsmu',
                             repository='example-github-app',
                             title='test issue', 
                             body='this is automatically created')

#### Comment on an issue

In [5]:
issue.create_comment('Wooo!  Its time to do some machine learning!')

<IssueComment [ml-auto-labeler[bot]]>

#### Add a label to an issue

In [6]:
issue.add_labels('AI-is-taking-over')

[<ShortLabel [AI-is-taking-over]>]

#### See the issue here

In [7]:
print(issue.html_url)

display.Markdown(f'[{issue.html_url}]({issue.html_url})')

https://github.com/hamelsmu/example-github-app/issues/13


[https://github.com/hamelsmu/example-github-app/issues/13](https://github.com/hamelsmu/example-github-app/issues/13)

## Extract Data For Training ML Models

`Issue, Label` pairs

Get a list of issues which you can use to train models

In [8]:
issues = GitHubApp.unpack_issues(client=install, 
                                 owner='kubeflow',
                                 repo='kubeflow')

100%|██████████| 489/489 [02:20<00:00,  2.85it/s]


In [9]:
print(f'there are {len(issues)} issues with labels')

there are 372 issues with labels


In [10]:
print(issues[0].title)
print('\n==Issue Body==')
print(issues[0].body)
print('\n==Labels==')
print(issues[0].labels)
print('\n==Labels==')
print(issues[0].url)

Kubeflow UIs does not display correctly on Firefox

==Issue Body==
All Kubeflow UIs are broken when browsing with Firefox. On the other hand, they display correctly when browsing with Chrome.

This might be critical for 0.5, see screenshots below. I do not know if this a long-standing issue we haven't covered, let me know if you can reproduce it.

@avdaredevil /cc @jlewi /cc @richardsliu 

Platform (GKE)
**Kubernetes version**:
```
Server Version: version.Info{Major:"1", Minor:"11+", GitVersion:"v1.11.7-gke.12", GitCommit:"06f08e60069231bd21bdf673cf0595aac80b99f6", GitTr
eeState:"clean", BuildDate:"2019-02-25T20:37:10Z", GoVersion:"go1.10.8b4", Compiler:"gc", Platform:"linux/amd64"}
```
Kubeflow
**Branch**: master
**Commit**: https://github.com/kubeflow/kubeflow/commit/0969d745b817bbee7a9dbc3e2a1a7cd72c0dc469

Client
====
**OS**: Ubuntu 18.04.1 LTS
**Mozilla Firefox**: 66.0.1
**Google Chrome** 73.0.3683.86

Screenshots

**[Chrome central dashboard]**
![Chrome-c

# Build Model

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from textacy.preprocess import preprocess_text
from sklearn.ensemble import RandomForestClassifier

def textacy_cleaner(text: str) -> str:
    """
    Defines the default function for cleaning text.
    This function operates over a list.
    """
    return preprocess_text(text,
                           fix_unicode=True,
                           lowercase=True,
                           transliterate=True,
                           no_urls=True,
                           no_emails=True,
                           no_phone_numbers=True,
                           no_numbers=True,
                           no_currency_symbols=True,
                           no_punct=True,
                           no_contractions=False,
                           no_accents=True)

#### Get Raw data and labels

In [13]:
raw_titles = []
raw_bodies = []

for issue in issues:
    raw_titles.append(textacy_cleaner(issue.title))
    raw_bodies.append(textacy_cleaner(issue.body))

#### See top labels in Kubeflow

In [15]:
from collections import Counter
import numpy as np
c = Counter()

for label in labels:
    c.update(label)
    
# only model on labels that occur at least 10 times
labels_to_keep = [x[0] for x in c.most_common() if x[1]>=10]

labels_to_keep

['priority/p2',
 'priority/p1',
 'area/jupyter',
 'community/question',
 'area/testing',
 'platform/gcp',
 'area/kfctl',
 'area/inference',
 'area/bootstrap',
 'cla: yes',
 'area/0.4.0',
 'help wanted',
 'cuj/multi-user',
 'area/build-release',
 'area/front-end',
 'area/0.5.0',
 'cuj/build-train-deploy',
 'community/discussion']

In [77]:
labels_to_keep = ['area/jupyter',
 'area/kfctl',
 'area/testing',
 'area/inference',
 'area/bootstrap',
 'area/build-release',
 'area/front-end']

In [78]:
labels = [[x for x in x if x in labels_to_keep] for x in labels]

#### Filter out records that do not contain one of the top labels in the dataset

In [79]:
mask = [bool(x) for x in labels]

In [80]:
print(f'There are {sum(mask)} issues out of the original {len(issues)} that have one of the top 10 labels')

There are 200 issues out of the original 372 that have one of the top 10 labels


In [81]:
filtered_bodies = np.array(raw_bodies)[mask]
filtered_titles = np.array(raw_titles)[mask]
filtered_labels = np.array(labels)[mask]

In [82]:
assert len(filtered_bodies) == len(filtered_titles) == sum(mask) == len(filtered_labels)

#### Train / Test Split

In [83]:
# text_train, text_val, labels_train, labels_val = train_test_split(filtered_text, filtered_labels, test_size=.15)

b_t, b_v, t_t, t_v, l_t, l_v = train_test_split(filtered_bodies, 
                                                filtered_titles, 
                                                filtered_labels, 
                                                test_size=.15)


In [84]:
assert len(b_t) == len(t_t) == len(l_t)
assert len(b_v) == len(t_v) == len(l_v)

#### Apply TFIDF Transformation to titles

In [85]:
tfidf = TfidfVectorizer(min_df=2)

Xtitle_train = tfidf.fit_transform(t_t)
Xtitle_test = tfidf.transform(t_v)

In [86]:
mlb = MultiLabelBinarizer()

ytrain = mlb.fit_transform(l_t)
ytest = mlb.transform(l_v)

In [87]:
assert ytrain.shape[0] == Xtitle_train.shape[0]
assert ytest.shape[0] == Xtitle_test.shape[0]

# Can we use transfer learning? Let's try using the weights from the Issue Summarizer!

In [88]:
import pandas as pd
import numpy as np
import dill as dpickle
from keras.models import load_model
import seq2seq_utils as utils

In [89]:
seq2seq_Model = load_model('/ds/hamel/Seq2Seq_Tutorial/notebooks/seq2seq_model_tutorial.h5')

In [90]:
encoder = utils.extract_encoder_model(seq2seq_Model)

In [91]:
_, text_proc = utils.load_text_processor('/ds/hamel/Seq2Seq_Tutorial/notebooks/body_pp.dpkl')

Size of vocabulary for /ds/hamel/Seq2Seq_Tutorial/notebooks/body_pp.dpkl: 8,002


In [92]:
Xbody_train = encoder.predict(text_proc.transform(b_t.tolist()), batch_size=10)
Xbody_test = encoder.predict(text_proc.transform(b_v.tolist()), batch_size=10)

#### Train Model

In [93]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, BatchNormalization, Concatenate
from tensorflow.keras import Model, optimizers
from tensorflow.keras.callbacks import ModelCheckpoint
import pandas as pd

In [100]:
body_inp = Input(shape=(Xbody_train.shape[1],), name='body-inp')
title_inp = Input(shape=(Xtitle_train.shape[1],), name='title-inp')

concat = Concatenate(name='concat')([body_inp, title_inp])
h1 = Dense(50)(concat)
out = Dense(ytrain.shape[1], activation='sigmoid')(concat)

model = Model([body_inp, title_inp], out)
adam = optimizers.Adam(lr = .01)
model.compile(optimizer=adam,
              loss='categorical_crossentropy',
              metrics=['accuracy'])

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
body-inp (InputLayer)           (None, 300)          0                                            
__________________________________________________________________________________________________
title-inp (InputLayer)          (None, 200)          0                                            
__________________________________________________________________________________________________
concat (Concatenate)            (None, 500)          0           body-inp[0][0]                   
                                                                 title-inp[0][0]                  
__________________________________________________________________________________________________
dense_16 (Dense)                (None, 7)            3507        concat[0][0]                     
Total para

In [101]:
mc = ModelCheckpoint(filepath='transfer_learning_best_model.hdf5', save_best_only=True)

model.fit(x=[Xbody_train, Xtitle_train],
          y=ytrain, 
          validation_data=([Xbody_test, Xtitle_test], ytest), 
          epochs=20, 
          callbacks=[mc])

Train on 170 samples, validate on 30 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f1097831e48>

In [351]:
# model.load_weights('best_model.hdf5')

In [104]:
test_predictions = model.predict([Xbody_test, Xtitle_test])

In [128]:
sum(np.argmax(ytest, axis=1) == np.argmax(test_predictions, axis=1))

12

In [130]:
12 / 30

0.4

# Show Test Results

In [107]:
threshold = .1
ground_truths = []
predictions = []

for i, data in enumerate(zip(mlb.inverse_transform(test_predictions >= threshold), mlb.inverse_transform(ytest))):
    pred, ground_truth = data
    ground_truths.append([ground_truth])
    predictions.append([pred])

In [108]:
pd.DataFrame({'ground_truth':ground_truths, 'prediction': predictions})

Unnamed: 0,ground_truth,prediction
0,"[(area/kfctl,)]",[()]
1,"[(area/front-end,)]",[()]
2,"[(area/build-release,)]",[()]
3,"[(area/jupyter,)]",[()]
4,"[(area/jupyter, area/testing)]",[()]
5,"[(area/build-release,)]",[()]
6,"[(area/front-end,)]",[()]
7,"[(area/jupyter,)]",[()]
8,"[(area/jupyter,)]",[()]
9,"[(area/inference,)]",[()]


## Conclusion:  

This simple model sucks, need to find something more compelling in this situation.