In [1]:
%load_ext autoreload
%autoreload 2

### references:

* https://pysnacks.com/bert-text-classification-with-fine-tuning/
* https://colab.research.google.com/drive/14b2rbIgwhQ1BI-zkyiMjQv-jV85xj9tf#scrollTo=5qSd2lLwJ7lH

## Load raw data from JSON

In [2]:
import itertools
import json
import logging
import os
import sys
import random
from pathlib import Path

In [3]:
from Levenshtein import ratio
from colorama import Fore, Style

In [4]:
logger = logging.getLogger()
logger.level = logging.DEBUG
stream_handler = logging.StreamHandler(sys.stdout)
logger.addHandler(stream_handler)

In [5]:
from pyframe.ds_android import get_input_for_BERT

raw_data = get_input_for_BERT()

[31m5 [33m47 [0m https://developer.android.com/reference/android/widget/ArrayAdapter
[31m9 [33m21 [0m https://stackoverflow.com/questions/6442054
[31m3 [33m22 [0m https://github.com/nostra13/Android-Universal-Image-Loader/issues/462
[31m22 [33m211 [0m https://www.raywenderlich.com/155-android-listview-tutorial-with-kotlin
[31m21 [33m59 [0m https://guides.codepath.com/android/Using-an-ArrayAdapter-with-ListView
[31m6 [33m33 [0m https://github.com/realm/realm-java/issues/776
[31m9 [33m15 [0m https://developer.android.com/training/volley/request
[31m14 [33m65 [0m https://stackoverflow.com/questions/28504524
[31m20 [33m59 [0m https://medium.com/@JasonCromer/android-asynctask-http-request-tutorial-6b429d833e28
[31m5 [33m97 [0m https://www.twilio.com/blog/5-ways-to-make-http-requests-in-java
[31m17 [33m33 [0m https://developer.android.com/guide/navigation/navigation-custom-back
[31m6 [33m55 [0m https://stackoverflow.com/questions/10108774
[31m5 [33m470 

[31m5 [33m57 [0m https://github.com/signalapp/Signal-Android/issues/3376
[31m22 [33m104 [0m https://developer.android.com/reference/org/json/JSONObject
[31m8 [33m31 [0m https://guides.codepath.com/android/converting-json-to-models
[31m5 [33m34 [0m https://developer.android.com/guide/topics/media-apps/volume-and-earphones
[31m4 [33m40 [0m https://developer.android.com/training/gestures/scale
[31m6 [33m32 [0m https://stackoverflow.com/questions/10630373


In [6]:
print('Sample entry from data:')
print(json.dumps(raw_data[0], indent=4, sort_keys=True))

Sample entry from data:
{
    "category_index": 0,
    "question": "Explanation of the getView() method of an ArrayAdapter",
    "source": "https://developer.android.com/reference/android/widget/ArrayAdapter",
    "text": "public class ArrayAdapter extends BaseAdapter implements Filterable, ThemedSpinnerAdapter",
    "weights": 0
}


In [7]:
from collections import Counter, defaultdict

cnt = Counter([d['category_index'] for d in raw_data])

total = sum(cnt.values())

labels_cnt = [cnt[0] / float(total), cnt[1] / float(total)]
print('label distribution')
print('')
print('not-relevant -- {:.0f}%'.format(labels_cnt[0] * 100))
print('RELEVANT ------ {:.0f}%'.format(labels_cnt[1] * 100))

label distribution

not-relevant -- 88%
RELEVANT ------ 12%


## Language Model

In [8]:
import os
import contextlib
import tensorflow as tf
import os
import codecs
import numpy as np
import math
import json

import numpy as np
import pandas as pd

from collections import defaultdict, Counter
from tqdm import tqdm

# sklearn libs
import sklearn
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import precision_recall_fscore_support

# Tensorflow Imports
import tensorflow as tf
from tensorflow.python import keras
import tensorflow.keras.backend as K
from tensorflow.keras.optimizers import Adam

# Keras-bert imports
from keras_radam import RAdam
from keras_bert import Tokenizer
from keras_bert import get_custom_objects
from keras_bert import load_trained_model_from_checkpoint

os.environ['TF_KERAS'] = '1'

# Bert Model Constants
SEQ_LEN = 128
BATCH_SIZE = 32 # I need caching because a smaller batch size causes OOM errors
EPOCHS = 3
LR = 2e-5

pretrained_path = 'uncased_L-12_H-768_A-12'
config_path = os.path.join(pretrained_path, 'bert_config.json')
checkpoint_path = os.path.join(pretrained_path, 'bert_model.ckpt')
vocab_path = os.path.join(pretrained_path, 'vocab.txt')

Falling back to TensorFlow client; we recommended you install the Cloud TPU client directly with pip install cloud-tpu-client.
PyTorch version 1.5.1 available.
TensorFlow version 2.2.0 available.


#### Checking for GPUs

https://stackoverflow.com/questions/57062456/function-call-stack-keras-scratch-graph-error/63123354#63123354

In [9]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 943933419010226311
, name: "/device:XLA_CPU:0"
device_type: "XLA_CPU"
memory_limit: 17179869184
locality {
}
incarnation: 16626297845566709475
physical_device_desc: "device: XLA_CPU device"
, name: "/device:XLA_GPU:0"
device_type: "XLA_GPU"
memory_limit: 17179869184
locality {
}
incarnation: 3293032123992464670
physical_device_desc: "device: XLA_GPU device"
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 11772001024
locality {
  bus_id: 2
  numa_node: 1
  links {
  }
}
incarnation: 17392693508065729308
physical_device_desc: "device: 0, name: Tesla P100-PCIE-12GB, pci bus id: 0000:83:00.0, compute capability: 6.0"
]


In [10]:
gpus = tf.config.experimental.list_physical_devices('GPU')

gpu = next(iter(gpus))

try:
    tf.config.experimental.set_visible_devices(gpu, 'GPU')
    tf.config.experimental.set_memory_growth(gpu, True)
    
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
except RuntimeError as e:
    # Visible devices must be set before GPUs have been initialized
    print(e)
    
tf.test.is_gpu_available(
    cuda_only=False, min_cuda_compute_capability=None
)

1 Physical GPUs, 1 Logical GPU
From <ipython-input-10-dd4623d1ba75>:16: is_gpu_available (from tensorflow.python.framework.test_util) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


True

### Dataset procedures 

In [11]:
def undersample_df(df, n_times=4):
    class_0,class_1 = df.category_index.value_counts()
    c0 = df[df['category_index'] == 0]
    c1 = df[df['category_index'] == 1]
    df_0 = c0.sample(int(n_times * class_1))
    
    undersampled_df = pd.concat([df_0, c1],axis=0)
    return undersampled_df

In [12]:
CORPUS = raw_data

In [13]:
def get_ds_synthetic_data(min_w=3):
    short_task = {
        "bugzilla": """How to query bugs using the custom fields with the Bugzilla REST API?""",
        "databases": """Which technology should be adopted for the database layer abstraction: Object/Relational Mapping (ORM) or a Java Database Connectivity API (JDBC)?""",
        "gpmdpu": """Can I bind the cmd key to the GPMDPU shortcuts?""",
        "lucene": """How does Lucene compute similarity scores for the BM25 similarity?""",
        "networking": """Which technology should be adopted for the notification system, Server-Sent Events (SSE) or WebSockets?""",
    }

    with open('relevance_corpus.json') as ipf:
        aux = json.load(ipf)
        raw_data = defaultdict(list)
        for d in aux:
            if d['task'] == 'yargs':
                continue

            raw_data['text'].append(d['text'])
            raw_data['question'].append(short_task[d['task']])
            raw_data['source'].append(d['source'])
            raw_data['category_index'].append(1 if d['weight'] > min_w else 0)
            raw_data['weights'].append(d['weight'] if d['weight'] > min_w else 0)

        data = pd.DataFrame.from_dict(raw_data)
        data = undersample_df(data, n_times=1)
        data = data.sample(frac=1).reset_index(drop=True)
        
    return data

#### JSON to dataframes

In [14]:
def get_class_weights(y, smooth_factor=0, upper_bound=5.0):
    """
    Returns the weights for each class based on the frequencies of the samples
    :param smooth_factor: factor that smooths extremely uneven weights
    :param y: list of true labels (the labels must be hashable)
    :return: dictionary with the weight for each class
    """
    counter = Counter(y)

    if smooth_factor > 0:
        p = max(counter.values()) * smooth_factor
        for k in counter.keys():
            counter[k] += p

    majority = max(counter.values())

    clazz = {cls: float(majority / count) for cls, count in counter.items()}
    result = {}
    for key, value in clazz.items():
        if value > upper_bound:
            value = upper_bound
        
        result[key] = value
    return result

In [15]:
def add_raw_data(result, data):
    result['text'].append(data['text'])
    result['question'].append(data['question'])
    result['source'].append(data['source'])
    result['category_index'].append(data['category_index'])
    result['weights'].append(data['weights'])
  

In [16]:
def get_train_val_test(task_uid, size=0.9, undersample=False, aug=True):
    if not isinstance(task_uid, list):
        task_uid = [task_uid]
        
    train_data_raw = defaultdict(list)
    test_data_raw = defaultdict(list)
    
    for _data in tqdm(CORPUS):
        if _data['question'] in task_uid:
            add_raw_data(test_data_raw, _data)
        else:
            add_raw_data(train_data_raw, _data)
    
    train_val = pd.DataFrame.from_dict(train_data_raw)
    test = pd.DataFrame.from_dict(test_data_raw)
    
    # https://stackoverflow.com/questions/29576430/shuffle-dataframe-rows
    #  randomize rows....    
    train_val = train_val.sample(frac=1).reset_index(drop=True)
    test = test.sample(frac=1).reset_index(drop=True)
    
    if undersample:
        train_val = undersample_df(train_val)
        train_val = train_val.sample(frac=1).reset_index(drop=True)
        
    if aug:
        train_val = pd.concat([train_val, get_ds_synthetic_data()],axis=0)
        train_val = train_val.sample(frac=1).reset_index(drop=True)
    
    weights = get_class_weights(train_val['category_index'].tolist())
    
    train, val = train_test_split(
        train_val, 
        stratify=train_val['category_index'].tolist(), 
        train_size=size
    )
    
    return train, val, test, weights

### BERT model procedures

In [17]:
token_dict = {}
with codecs.open(vocab_path, 'r', 'utf8') as reader:
    for line in reader:
        token = line.strip()
        token_dict[token] = len(token_dict)

tokenizer = Tokenizer(token_dict)

https://stackoverflow.com/questions/44477489/keras-difference-between-categorical-accuracy-and-sparse-categorical-accuracy

In [18]:
# FIXME: global variable that is referenced inside the train/test functions...
model = None


# with tf.device("GPU:0"):
#     model = load_trained_model_from_checkpoint(
#       config_path,
#       checkpoint_path,
#       training=True,
#       trainable=True,
#       seq_len=SEQ_LEN,
#     )
#     inputs = model.inputs[:2]
#     dense = model.get_layer('NSP-Dense').output
#     outputs = keras.layers.Dense(units=2, activation='softmax', name="probs")(dense)
#     model = keras.models.Model(inputs, outputs)

#     optimizer = Adam(lr=LR)

#     model.compile(
#       optimizer=optimizer,
#       loss='sparse_categorical_crossentropy',
#       metrics=['sparse_categorical_accuracy'],
#     )

# print(model.summary())

### Encode data according to model input

In [19]:
def encode_data(df, tokenizer, over_sampling=1, testing=False):
    relevant = 1
    indices, segments, labels, metadata = [], [], [], []
    
    for index, row in df.iterrows():
        _ids, _segments = tokenizer.encode(
            first=row["text"], 
            second=row["question"], 
            max_len=SEQ_LEN
        )
        
        label = row["category_index"]
        if label == relevant:
            for _ in range(over_sampling):
                indices.append(_ids)
                segments.append(_segments)
                labels.append(label)
                metadata.append((row['weights'], row['text'], row["question"]))
        else:
            indices.append(_ids)
            segments.append(_segments)
            labels.append(label)
            metadata.append((row['weights'], row['text'], row["question"]))
        
    # zip data into single list, shuffle everything and decompress
    items = list(zip(indices, segments, labels, metadata))
    np.random.shuffle(items)
    indices, segments, labels, metadata = zip(*items)
    indices = np.array(indices)
    
    # checks if array size is equals to batch size. If it's not, remove the last n entries to make it divisable
    mod = indices.shape[0] % BATCH_SIZE
    if mod > 0 and not testing:
        indices, segments, labels, metadata = indices[:-mod], segments[:-mod], labels[:-mod], metadata[:-mod]
    
    X, y = [indices, np.array(segments)], np.array(labels)
    
#     logger.info("-" * 20)
#     logger.info("Data ::")    
#     logger.info(X[0].shape)
#     logger.info(X[1].shape)
#     logger.info(y.shape)
    
    return X, y, metadata

## Metrics aggregators

In [20]:
from sklearn.metrics import classification_report

recommendation_metrics = defaultdict(list)
prediction_metrics = defaultdict(list)

classification_report_lst = []
log_examples_lst = []

In [21]:
def aggregate_macro_metrics(store_at, precision, recall, fscore):   
    store_at['precision'].append(precision)
    store_at['recall'].append(recall)
    store_at['fscore'].append(fscore)

def aggregate_recommendation_metrics(store_at, k, precision_at_k, pyramid_precision_at_k):
    store_at['k'].append(k)
    store_at['precision'].append(precision_at_k)
    store_at['∆ precision'].append(pyramid_precision_at_k)

## Model testing & evaluation metrics

In [22]:
def log_examples(task_title, source, text, pweights, y_predict, y_probs, k=10):
    # get the predicted prob at every index
    idx_probs = [(idx, y_predict[idx], y_probs[idx]) for idx, _ in enumerate(y_predict)]
    
    # filter probs for all indexes predicted as relevant  
    idx_probs = list(filter(lambda k: k[1] == 1, idx_probs))
    
    most_probable = sorted(idx_probs, key=lambda i: i[2], reverse=True)
    
    result = [idx for idx, _, _ in most_probable][:k]
    
    for idx in result:
        log_examples_lst.append((
            source, 
            task_title,
            pweights[idx],
            y_predict[idx],
            y_probs[idx],
            text[idx]
        ))

In [23]:
def _precision_at_k(y_test, y_predict, y_prob, k=10):
    # get the predicted prob at every index
    idx_probs = [(idx, y_predict[idx], y_prob[idx]) for idx, _ in enumerate(y_test)]
    
    # filter probs for all indexes predicted as relevant  
    idx_probs = list(filter(lambda k: k[1] == 1, idx_probs))
    
    most_probable = sorted(idx_probs, key=lambda i: i[2], reverse=True)
    result = [y_test[idx] * y_predict[idx] for idx, _, _ in most_probable]   
    y_predict = [y for _, y, _ in most_probable]
    
    result = result[:k]
    y_predict = y_predict[:k]
    ratio = sum(result) / float(len(y_predict) + 0.00001)
    return ratio

In [24]:
def _pyramid_score(y_optimal, y_predicted, y_prob, k=10):

    # create reference table for weights 
#     y_predicted = [i for i in y_optimal]

    # get the predicted prob at every index
    idx_probs = [(idx, y_optimal[idx], y_predicted[idx], y_prob[idx]) for idx, _ in enumerate(y_optimal)]
    
    # filter probs for all indexes predicted as relevant  
    idx_probs = list(filter(lambda aux: aux[2] == 1, idx_probs))

    # sort
    most_probable = sorted(idx_probs, key=lambda i: i[3], reverse=True)

    # compute predicted and optimal score up until K
    predicted_score = [w for _, w, _, _ in most_probable][:k]
    optimal_score = sorted(y_optimal, reverse=True)[:k]
    
    ratio = sum(predicted_score) / float(sum(optimal_score) + 0.00001)
    return ratio

In [25]:
def test_model(source, df_test, model, tokenizer):
    
    test_x, test_y, metadata = encode_data(df_test, tokenizer, testing=True)
    
    logger.info(Fore.YELLOW + str(len(test_x)) + Style.RESET_ALL)
    
    text = [m[1] for m in metadata]
    pweights = [m[0] for m in metadata]
    task_title = metadata[0][2]

    predicts = model.predict(test_x, verbose=True)
    
    y_probs = predicts[:, 1]
    y_predict = predicts.argmax(axis=-1)

    accuracy = accuracy_score(test_y, y_predict)
    macro_f1 = f1_score(test_y, y_predict, average='macro')
    
    classification_report_lst.append(classification_report(test_y, y_predict))

    logger.info("-" * 20)    
    
    logger.info("Y")
    logger.info("[0s] {} [1s] {}".format(
        len(list(filter(lambda k: k== 0, test_y))),
        len(list(filter(lambda k: k== 1, test_y)))
    ))
    
        
    logger.info("predicted")
    logger.info("[0s] {} [1s] {}".format(
        len(list(filter(lambda k: k== 0, y_predict))),
        len(list(filter(lambda k: k== 1, y_predict)))
    ))
    
    logger.info("-" * 20)
    
    logger.info("Accuracy: {:.4f}".format(accuracy))
    logger.info("macro_f1: {:.4f}".format(macro_f1))

    precision, recall, fscore, _ = precision_recall_fscore_support(test_y, y_predict, average='macro')
    
    aggregate_macro_metrics(prediction_metrics, precision, recall, fscore)
    
    logger.info("Precision: {:.4f}".format(precision))
    logger.info("Recall: {:.4f}".format(recall))
    logger.info("F1: {:.4f}".format(fscore))
    
    logger.info("-" * 20)
    
    for k in [3, 5, 10]:
        p_at_k = _precision_at_k(test_y, y_predict, y_probs, k=k)
        score_at_k = _pyramid_score(pweights, y_predict, y_probs, k=k)
                                     
        aggregate_recommendation_metrics(recommendation_metrics, k, p_at_k, score_at_k)
        
        logger.info("")
        logger.info("Precision_at_{}: {:.4f}".format(k, p_at_k))
        logger.info("Pyramid_at_{}: {:.4f}".format(k, score_at_k))
    logger.info("-" * 20)
    
    log_examples(task_title, source, text, pweights, y_predict, y_probs, k=5)

# Evaluation

In [26]:
file_log = 'nnet-bert-w2v-new.ans'
print("Log at {}".format(file_log))

fh = logging.FileHandler(file_log)
fh.setLevel(logging.DEBUG)
logger.addHandler(fh)

Log at nnet-bert-w2v-new.ans


#### select all tasks for 10-fold cross validation

In [27]:
all_tasks = list(set([d['question'] for d in raw_data]))
random.shuffle(all_tasks)

# all_tasks = all_tasks[:10]
# logger.info('\n'.join(all_tasks))

In [28]:
from sklearn.model_selection import KFold

n_splits = 10
kf = KFold(n_splits=n_splits, random_state=20210820, shuffle=True)
np_tasks_arr = np.array(all_tasks)

#### Test input by pertinent artifacts for test task

## TODO: cache splits in case of out-of-memory errors

## TODO: remove break after for full eval

In [29]:
idx_split = 0
for train_index, test_index in kf.split(np_tasks_arr):    
    test_tasks_lst = np_tasks_arr[test_index].tolist()
    
    logger.info("")
    logger.info(Fore.RED + f"Fold {idx_split}" + Style.RESET_ALL)
    logger.info('\n'.join(test_tasks_lst))
    
    df_train, df_val, df_test, weights = get_train_val_test(test_tasks_lst, undersample=True) 
    
    print('-' * 10)
    print('train')
    print(df_train.category_index.value_counts())    
    print('val')
    print(df_val.category_index.value_counts())    
    print('test')
    print(df_test.category_index.value_counts())    
    print('weights')
    print(str(weights))
    print('-' * 10)
    
    train_x, train_y, _ = encode_data(df_train, tokenizer, over_sampling=1)
    val_x, val_y, _ = encode_data(df_val, tokenizer)
    
    with tf.device("GPU:0"):
        model = load_trained_model_from_checkpoint(
          config_path,
          checkpoint_path,
          training=True,
          trainable=True,
          seq_len=SEQ_LEN,
        )
        
        inputs = model.inputs[:2]
        dense = model.get_layer('NSP-Dense').output
        outputs = keras.layers.Dense(units=2, activation='softmax', name="probs")(dense)
        model = keras.models.Model(inputs, outputs)

        optimizer = Adam(lr=LR)

        
        model.compile(
          optimizer=optimizer,
          loss='sparse_categorical_crossentropy',
          metrics=['sparse_categorical_accuracy'],
        )
        
        
        logger.info("")
        logger.info(Fore.RED + f"Training model" + Style.RESET_ALL)
        history = model.fit(
            train_x,
            train_y,
            epochs=EPOCHS,
            batch_size=BATCH_SIZE,
            class_weight=weights,
            validation_data=(val_x, val_y)
        )
        
        logger.info("")
        logger.info(Fore.RED + f"Testing model" + Style.RESET_ALL)
        for source in df_test["source"].unique():
            df_source = df_test[df_test["source"] == source]   

            logger.info(source)
            test_model(source, df_source, model, tokenizer)
            
    idx_split += 1
    break


[31mFold 0[0m
Dagger 2 doesn't implement some of the component methods in Android project with custom annotation processor
how  to set Screenshot frame size
Quick Actions don't get displayed on Android 7.0
Hide MarkerView when nothing selected
Support for GoogleApiClient and new FusedLocationProviderApi


100%|██████████| 7918/7918 [00:00<00:00, 390638.22it/s]


----------
train
0    3274
1     941
Name: category_index, dtype: int64
val
0    364
1    105
Name: category_index, dtype: int64
test
0    806
1    120
Name: category_index, dtype: int64
weights
{0: 1.0, 1: 3.478011472275335}
----------

[31mTraining model[0m
Epoch 1/3
Epoch 2/3
Epoch 3/3

[31mTesting model[0m
https://developer.android.com/training/location/retrieve-current
[33m2[0m
--------------------
Y
[0s] 27 [1s] 9
predicted
[0s] 24 [1s] 12
--------------------
Accuracy: 0.6389
macro_f1: 0.5630
Precision: 0.5625
Recall: 0.5741
F1: 0.5630
--------------------

Precision_at_3: 0.3333
Pyramid_at_3: 0.3333

Precision_at_5: 0.4000
Pyramid_at_5: 0.4000

Precision_at_10: 0.3000
Pyramid_at_10: 0.3333
--------------------
https://guides.codepath.com/android/dependency-injection-with-dagger-2
[33m2[0m
--------------------
Y
[0s] 97 [1s] 24
predicted
[0s] 13 [1s] 108
--------------------
Accuracy: 0.2562
macro_f1: 0.2500
Precision: 0.4818
Recall: 0.4890
F1: 0.2500
-------------------

In [30]:
# logger.info("")
# logger.info(Fore.RED + f"Testing model" + Style.RESET_ALL)
# for source in df_test["source"].unique():
#     df_source = df_test[df_test["source"] == source]   
    
#     logger.info(source)
#     test_model(source, df_source, model, tokenizer)

## Metric reports

In [31]:
def avg_recommendation_metric_for(data, k=3, filter_outliers=True):
    __precision = []
    __pyramid = []
    
    total_len = len(data['k'])
    
    for idx in range(total_len):
        
        __value = data['k'][idx]
        if __value  == k:
            if filter_outliers:            
                if data['precision'][idx] > 0.:
                    __precision.append(data['precision'][idx])
                if data['∆ precision'][idx] > 0.:
                    __pyramid.append(data['∆ precision'][idx])
            else:
                __precision.append(data['precision'][idx])
                __pyramid.append(data['∆ precision'][idx])
                

    return np.mean(__precision), np.mean(__pyramid)

In [32]:
def avg_macro_metric_for(data):
    __precision = data['precision']
    __recall = data['recall']
    __fscore = data['fscore']

    return np.mean(__precision), np.mean(__recall), np.mean(__fscore)

### Precision at k

In [33]:
filter_outliers = False

In [34]:
_precision, __pyramid_score = avg_recommendation_metric_for(
    recommendation_metrics, 
    k=3, 
    filter_outliers=filter_outliers
)

logger.info(Fore.YELLOW + "k=3" + Style.RESET_ALL)
logger.info("precision: " + Fore.RED + "{:.3f}".format(_precision) + Style.RESET_ALL)
logger.info("pyramid:   " + Fore.RED + "{:.3f}".format(__pyramid_score) + Style.RESET_ALL)

[33mk=3[0m
precision: [31m0.424[0m
pyramid:   [31m0.379[0m


In [35]:
_precision, __pyramid_score = avg_recommendation_metric_for(
    recommendation_metrics, 
    k=5, 
    filter_outliers=filter_outliers
)

logger.info(Fore.YELLOW + "k=5" + Style.RESET_ALL)
logger.info("precision: " + Fore.RED + "{:.3f}".format(_precision) + Style.RESET_ALL)
logger.info("pyramid:   " + Fore.RED + "{:.3f}".format(__pyramid_score) + Style.RESET_ALL)

[33mk=5[0m
precision: [31m0.423[0m
pyramid:   [31m0.420[0m


In [36]:
_precision, __pyramid_score = avg_recommendation_metric_for(
    recommendation_metrics, 
    k=10, 
    filter_outliers=filter_outliers
)

logger.info(Fore.YELLOW + "k=10" + Style.RESET_ALL)
logger.info("precision: " + Fore.RED + "{:.3f}".format(_precision) + Style.RESET_ALL)
logger.info("pyramid:   " + Fore.RED + "{:.3f}".format(__pyramid_score) + Style.RESET_ALL)

[33mk=10[0m
precision: [31m0.350[0m
pyramid:   [31m0.433[0m


### Overall classification scores

In [37]:
_precision, _recall, _f1score = avg_macro_metric_for(prediction_metrics)

logger.info("")
logger.info(Fore.YELLOW + "Model metrics" + Style.RESET_ALL)
logger.info("precision: " + Fore.RED + "{:.3f}".format(_precision) + Style.RESET_ALL)
logger.info("recall:    " + Fore.RED + "{:.3f}".format(_recall) + Style.RESET_ALL)
logger.info("f1-score:  " + Fore.RED + "{:.3f}".format(_f1score) + Style.RESET_ALL)


[33mModel metrics[0m
precision: [31m0.563[0m
recall:    [31m0.600[0m
f1-score:  [31m0.484[0m


## Output examples

In [38]:
def examples_per_source_type(source_type='misc'):
    _sources = list(set([x[0] for x in log_examples_lst]))

    _template = "[w={}]" + Fore.RED + "[y={}]" + Fore.YELLOW + "[p={:.4f}]" + Style.RESET_ALL + " {}"

    for s in _sources:
        examples_in_source = []
        if source_type == 'api' and ('docs.oracle' in s or 'developer.android' in s):
            examples_in_source = list(filter(lambda k: k[0] == s, log_examples_lst))
            task_title = examples_in_source[0][1]
        elif source_type == 'so' and ('stackoverflow.com' in s):
            examples_in_source = list(filter(lambda k: k[0] == s, log_examples_lst))
            task_title = examples_in_source[0][1]            
        elif source_type == 'git' and ('github.com' in s):
            examples_in_source = list(filter(lambda k: k[0] == s, log_examples_lst))
            task_title = examples_in_source[0][1]
        elif source_type == 'misc' and 'github.com' not in s and 'docs.oracle' not in s and 'developer.android' not in s and 'stackoverflow.com' not in s:
            examples_in_source = list(filter(lambda k: k[0] == s, log_examples_lst))
            task_title = examples_in_source[0][1]
        
            
        if not examples_in_source:
            continue
            

        logger.info('')
        logger.info(Fore.RED + f"{task_title}" + Style.RESET_ALL)    
        logger.info(s)
        logger.info('')


        for _, _, pweights, y_predict, y_probs, text in examples_in_source:
            logger.info(_template.format(pweights, y_predict, y_probs, text))
            logger.info('')
        logger.info('-' * 20)

In [39]:
logger.info(Fore.RED + "API" + Style.RESET_ALL)
examples_per_source_type(source_type='api')

[31mAPI[0m

[31mSupport for GoogleApiClient and new FusedLocationProviderApi[0m
https://developer.android.com/training/location/retrieve-current

[w=0][31m[y=1][33m[p=0.8268][0m When your app is connected to these you can use the fused location provider's getLastLocation ( ) method to retrieve the device location.

[w=1][31m[y=1][33m[p=0.7919][0m In your activity's onCreate ( ) method, create an instance of the Fused Location Provider Client as the following code snippet shows.

[w=0][31m[y=1][33m[p=0.7852][0m This lesson shows you how to make a single request for the location of a device using the getLastLocation ( ) method in the fused location provider.

[w=1][31m[y=1][33m[p=0.7811][0m Specifically, use the fused location provider to retrieve the device's last known location.

[w=0][31m[y=1][33m[p=0.7725][0m To access the fused location provider, your app's development project must include Google Play services.

--------------------

[31mQuick Actions don't get d

In [40]:
logger.info(Fore.RED + "GIT" + Style.RESET_ALL)
examples_per_source_type(source_type='git')

[31mGIT[0m


In [41]:
logger.info(Fore.RED + "Stack Overflow" + Style.RESET_ALL)
examples_per_source_type(source_type='so')

[31mStack Overflow[0m

[31mDagger 2 doesn't implement some of the component methods in Android project with custom annotation processor[0m
https://stackoverflow.com/questions/57235136

[w=0][31m[y=1][33m[p=0.8536][0m The usual workaround for this is to use full-qualified names for generated classes, if they are used by other annotation processors.

[w=0][31m[y=1][33m[p=0.8459][0m There may be a more elegant way to solve this, but the simplest and most reliable solution is to do two passes with javac -- once to run just your annotation processor, and the second to do everything it normally does.

[w=2][31m[y=1][33m[p=0.8376][0m But the easiest option is to generate java code directly and your generated java classes will be picked up by javac automatically, launching second round of annotation processing, where dagger will process them.

[w=2][31m[y=1][33m[p=0.8376][0m Javac annotation processor uses rounds instead of defining processors order.

[w=0][31m[y=1][33m[p=0.8

In [42]:
logger.info(Fore.RED + "Miscellaneous" + Style.RESET_ALL)
examples_per_source_type(source_type='misc')

[31mMiscellaneous[0m

[31mDagger 2 doesn't implement some of the component methods in Android project with custom annotation processor[0m
https://guides.codepath.com/android/dependency-injection-with-dagger-2

[w=0][31m[y=1][33m[p=0.8613][0m For parent components, you would need to expose to the downstream component by specifying the type and a method:

[w=1][31m[y=1][33m[p=0.8602][0m You can define any number of custom scope annotations in your application by declaring them as a public @interface:

[w=1][31m[y=1][33m[p=0.8588][0m You will define it both where you provide the singletons ( @Provides annotation ), and where you inject them ( @Inject annotations ):

[w=0][31m[y=1][33m[p=0.8579][0m The methods that will actually expose available return types should also be annotated with the @Provides annotation.

[w=1][31m[y=1][33m[p=0.8572][0m You can use a class prefixed with Dagger ( i.e. DaggerTwitterApiComponent.java ) that will be responsible for instantiating an 