In [1]:
import sys,os
sys.path.append(os.getcwd())

import itertools
import json
import logging
import random
from pathlib import Path

from Levenshtein import ratio
from colorama import Fore, Style

# Send every record (DEBUG and above) of the root logger to stdout so that it
# shows up in the notebook cell output.
logger = logging.getLogger()
# Use the documented setter instead of assigning the `level` attribute directly.
logger.setLevel(logging.DEBUG)

# Reuse an already-attached stdout handler when this cell is re-executed;
# adding a fresh handler on every run would print each message several times.
stream_handler = next(
    (
        handler for handler in logger.handlers
        if isinstance(handler, logging.StreamHandler)
        and getattr(handler, 'stream', None) is sys.stdout
    ),
    None,
)
if stream_handler is None:
    stream_handler = logging.StreamHandler(sys.stdout)
    logger.addHandler(stream_handler)

from ds_android import get_input_for_BERT

# Load the corpus; each element is a JSON-serialisable record (see the sample
# printed below).
raw_data = get_input_for_BERT()

# Show the first record, pretty-printed with sorted keys, as a sanity check.
print(
    'Sample entry from data:',
    json.dumps(raw_data[0], indent=4, sort_keys=True),
    sep='\n',
)

[31m3 [33m17 [0m https://stackoverflow.com/questions/8712652
[31m8 [33m59 [0m https://dzone.com/articles/android-rotate-and-scale
[31m20 [33m145 [0m https://developer.android.com/training/dependency-injection/hilt-android
[31m4 [33m8 [0m https://stackoverflow.com/questions/30648172
[31m4 [33m81 [0m https://github.com/google/dagger/issues/1991
[31m9 [33m48 [0m https://prog.world/a-practical-guide-to-using-hilt-with-kotlin
[31m6 [33m33 [0m https://github.com/realm/realm-java/issues/776
[31m39 [33m129 [0m https://developer.android.com/training/permissions/requesting
[31m14 [33m21 [0m https://stackoverflow.com/questions/5233543
[31m4 [33m34 [0m https://github.com/morenoh149/react-native-contacts/issues/516
[31m27 [33m63 [0m https://guides.codepath.com/android/Understanding-App-Permissions
[31m9 [33m161 [0m https://www.avg.com/en/signal/guide-to-android-app-permissions-how-to-use-them-smartly
[31m5 [33m470 [0m https://developer.android.com/reference/an

[31m4 [33m40 [0m https://developer.android.com/training/gestures/scale
[31m6 [33m32 [0m https://stackoverflow.com/questions/10630373
[31m20 [33m196 [0m https://developer.android.com/training/dependency-injection/dagger-android
[31m6 [33m44 [0m https://stackoverflow.com/questions/57235136
[31m24 [33m121 [0m https://guides.codepath.com/android/dependency-injection-with-dagger-2
Sample entry from data:
{
    "category_index": 1,
    "question": "Android: rotate canvas around the center of the screen",
    "source": "https://stackoverflow.com/questions/8712652",
    "text": "You have to rotate the canvas first and then draw whatever you want.",
    "weights": 1
}


In [2]:
# Results of a previous execution with the same configuration are reloaded
# from disk when available.

# Final results are the average of 3 distinct runs of this script.
# Reason: avoid cherry-picking results from a run in which the BERT training
# procedure happened to be exceptionally good.
NUMBER_OF_RUNS = 3
config_output = 'output/bert_ds_android_base.json'
# config_output = 'output/bert_ds_android_fe.json' # for frame-elements filter
# config_output = 'output/bert_ds_android_fa.json' # for frame-association filters

fold_results = {}

if os.path.isfile(config_output):
    logger.info(Fore.YELLOW + "Loading data from cache" + Style.RESET_ALL)
    with open(config_output) as cached_file:
        fold_results = json.load(cached_file)

# Make sure the accumulator used by the cross-validation cell always exists.
fold_results.setdefault('venn_diagram_set', [])

[33mLoading data from cache[0m


In [3]:
import os
import contextlib
import tensorflow as tf
import os
import codecs
import numpy as np
import math
import json

import numpy as np
import pandas as pd

from collections import defaultdict, Counter
from tqdm import tqdm

# Switch to True only when running on a Colab TPU runtime.
USE_TPU = False
# NOTE(review): presumably read by a Keras-based dependency imported further
# down; it is set before those project imports run -- confirm which package
# actually consumes it.
os.environ['TF_KERAS'] = '1'

# @title Initialize TPU Strategy
if USE_TPU:
    # Colab exposes the TPU worker address through this environment variable.
    TPU_WORKER = 'grpc://' + os.environ['COLAB_TPU_ADDR']
    # `tf.contrib` was removed in TensorFlow 2; these are the TF2 equivalents
    # of the former contrib resolver / initialise / strategy calls.
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(TPU_WORKER)
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    # TF >= 2.3; older 2.x releases expose it as
    # tf.distribute.experimental.TPUStrategy.
    strategy = tf.distribute.TPUStrategy(resolver)
    
from TFBertForTaskTextClassification import TFBertForTaskTextClassification
from TFBertForTaskTextClassification import TFBertForAndroidTaskTextClassification
from TFBertForTaskTextClassification import TFBertForSyntheticTaskTextClassification 

from metrics import MetricsAggregator

Falling back to TensorFlow client; we recommended you install the Cloud TPU client directly with pip install cloud-tpu-client.


  _dtype_to_storage = {data_type(0).dtype: data_type for data_type in _storages}


# BERT

In [4]:
# Init the model
# Location of the pre-trained BERT checkpoint. The default is the path used
# originally; set the BERT_MODEL_PATH environment variable to run the notebook
# on another machine without editing this cell.
BERT_MODEL_PATH = os.environ.get(
    'BERT_MODEL_PATH', '/home/msarthur/scratch/bert-base-uncased'
)
model = TFBertForAndroidTaskTextClassification(model_id=BERT_MODEL_PATH)

# Configure filters. All other values are as default
model.target_output = 10
model.use_frame_filtering = False
model.match_frame_from_task = False
model.n_undersampling = 4

# Load tokenizer
# NOTE(review): local_files_only=True presumably prevents any download attempt
# -- confirm against TFBertForTaskTextClassification.
model.tokenizer(local_files_only=True)

In [5]:
# @title 10-fold cross validation WIP
CORPUS = raw_data

# Unique task titles. They are sorted before the seeded shuffle so that the
# resulting order (and therefore the folds) is reproducible across runs.
all_tasks = sorted({d['question'] for d in raw_data})
rseed = 20210343
random.seed(rseed)
random.shuffle(all_tasks)

from sklearn.model_selection import KFold


# Mirror the log output into a file. An already-attached handler for the same
# file is reused so that re-running this cell does not duplicate every line.
log_path = 'LOG-bert_ds_android.ans'
file_handler = next(
    (
        handler for handler in logger.handlers
        if isinstance(handler, logging.FileHandler)
        and handler.baseFilename == os.path.abspath(log_path)
    ),
    None,
)
if file_handler is None:
    file_handler = logging.FileHandler(log_path)
    file_handler.setLevel(logging.DEBUG)
    logger.addHandler(file_handler)


n_splits = 10
# The tasks were already shuffled above with `rseed`, so KFold itself does not
# shuffle. Passing random_state without shuffle=True had no effect and raises
# ValueError on scikit-learn >= 0.24, hence it is omitted; the folds are the
# same contiguous slices as before.
kf = KFold(n_splits=n_splits)
np_tasks_arr = np.array(all_tasks)


# Cross-validation driver: for each fold, split the corpus by task, fine-tune
# the model, test it per source document and aggregate the metrics.
# NOTE: this is still WIP -- only the first task of the test split is used and
# the loop stops after the first fold (see the `break` at the bottom).
idx_split = 0
for train_index, test_index in kf.split(np_tasks_arr):

    # The fold id is used as a string key while the fold is processed
    # (presumably because `fold_results` is loaded from JSON, whose keys are
    # strings -- confirm); it is converted back to int at the end of the body.
    idx_split = str(idx_split)
    eval_fold = True
    # Several runs per fold (NUMBER_OF_RUNS) to avoid reporting peak results in a given fold.
    # The skip logic below is currently disabled, so the fold is always evaluated.
    # if idx_split in fold_results and fold_results[idx_split]['run_cnt'] >= NUMBER_OF_RUNS:
    #     logger.info(Fore.RED + f"Fold {idx_split} FULLY TESTED" + Style.RESET_ALL)
    #     eval_fold = False


    if eval_fold:
        model.metrics.reset_aggregators()

        # Only the first task of the test split is kept.
        test_tasks_lst = np_tasks_arr[test_index[:1]].tolist() # TODO: Arthur -- a single task for testing, just for the sake of it...

        logger.info("")
        logger.info(Fore.RED + f"Fold {idx_split}" + Style.RESET_ALL)
        logger.info('\n'.join(test_tasks_lst))

        # <------------------------------------------------------------------------- INPUT
        # `train_index` is not used here: the model derives the train/validation
        # data from the corpus and the list of held-out test tasks.
        df_train, df_val, df_test, weights = model.get_train_val_test(
            CORPUS, test_tasks_lst
        )
        

        # Log the class distribution of the train/test frames and the class weights.
        logger.info('-' * 10)
        logger.info(Fore.RED + 'train'+ Style.RESET_ALL)
        logger.info(str(df_train.category_index.value_counts()))
        logger.info("")

        logger.info(Fore.RED + 'test'+ Style.RESET_ALL)
        logger.info(str(df_test.category_index.value_counts()))
        logger.info("")

        logger.info(Fore.RED + 'weights'+ Style.RESET_ALL)
        logger.info(str(weights))
        logger.info('-' * 10)
        
        # <------------------------------------------------------------------------- TRAIN

        # Encode X_train
        train_encodings = model.encode(df_train)
        train_labels = df_train['category_index'].tolist()

        # Encode X_valid
        val_encodings = model.encode(df_val)
        val_labels = df_val['category_index'].tolist()


        # https://huggingface.co/transformers/custom_datasets.html
        train_dataset = tf.data.Dataset.from_tensor_slices((
            dict(train_encodings),
            train_labels
        ))

        val_dataset = tf.data.Dataset.from_tensor_slices((
            dict(val_encodings),
            val_labels
        ))


        # NOTE(review): local_files_only=False here, whereas the tokenizer was
        # loaded with local_files_only=True -- confirm this is intentional.
        fine_tunned_keras_model = model.build(
            train_dataset, val_dataset, weights, 
            checkpoint_filepath='best_model',
            local_files_only=False
        )
        
        
        # Only relevant when the frame-association filter is enabled.
        if model.match_frame_from_task:
            __frame_pairs = model.fn_frame_pairs.get_most_common_frame_relationships(df_train)
            model.sentence_task_frame_pairs = __frame_pairs

        # <------------------------------------------------------------------------- TEST
        logger.info("")
        logger.info(Fore.RED + f"Testing model" + Style.RESET_ALL)
        # Test one source document at a time.
        for source in df_test["source"].unique():
            df_source = df_test[df_test["source"] == source]   
            logger.info(source)
            
            model.test(source, df_source, fine_tunned_keras_model)
                        

        # <------------------------------------------------------------------------- METRICS   
        
        prediction_metrics, api_metrics, so_metrics, git_metrics, misc_metrics = model.get_evaluation_metrics()
        
        MetricsAggregator.add_idx_fold_results(
            idx_split, fold_results, prediction_metrics,
            api_metrics, so_metrics, git_metrics, misc_metrics
        )

        # Merge this fold's entries into the accumulated set and de-duplicate
        # (going through set() does not preserve the list order).
        fold_results['venn_diagram_set'] += model.metrics.venn_diagram_set
        fold_results['venn_diagram_set'] = list(set(fold_results['venn_diagram_set']))


        _precision, _recall, _f1score = MetricsAggregator.avg_macro_metric_for(prediction_metrics)

        logger.info("")
        logger.info(Fore.YELLOW + "Model metrics" + Style.RESET_ALL)
        logger.info("precision: " + Fore.RED + "{:.3f}".format(_precision) + Style.RESET_ALL)
        logger.info("recall:    " + Fore.RED + "{:.3f}".format(_recall) + Style.RESET_ALL)
        logger.info("f1-score:  " + Fore.RED + "{:.3f}".format(_f1score) + Style.RESET_ALL)


    # Back to an integer to advance the fold counter.
    idx_split = int(idx_split)
    idx_split += 1
    logger.info(f"next {idx_split}")
    # WIP: stop after the first fold; remove to run all `n_splits` folds.
    break
    


[31mFold 0[0m
how can i get the value of text view in recyclerview item?


100%|██████████| 7931/7931 [00:00<00:00, 1017621.37it/s]

----------
[31mtrain[0m
0    1105
1     276
Name: category_index, dtype: int64

[31mtest[0m
0    478
1      1
Name: category_index, dtype: int64

[31mweights[0m
{1: 4.0, 0: 1.0}
----------



All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at /home/msarthur/scratch/bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10

Epoch 00001: val_loss improved from inf to 0.62522, saving model to best_model
Epoch 2/10

Epoch 00002: val_loss improved from 0.62522 to 0.60401, saving model to best_model
Epoch 3/10

Epoch 00003: val_loss improved from 0.60401 to 0.57332, saving model to best_model
Epoch 4/10

Epoch 00004: val_loss improved from 0.57332 to 0.55954, saving model to best_model
Epoch 5/10

Epoch 00005: val_loss did not improve from 0.55954
Epoch 6/10

Epoch 00006: val_loss did not improve from 0.55954
Epoch 7/10

Epoch 00007: val_loss did not improve from 0.55954
Epoch 8/10
Restoring model weights from the end of the best epoch.

Epoch 00008: val_loss did not improve from 0.55954
Epoch 00008: early stopping

[31mTesting model[0m
https://guides.codepath.com/android/using-the-recyclerview
https://developer.android.com/codelabs/basic-android-kotlin-training-recyclerview-scrollable-list
https://stackoverflow.com/questions/37096547
--------------------
Y
[0s] 16 [1s] 1
predicted
[0s] 11 [1s] 6

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


# Python Tasks

In [9]:
import pandas as pd

def get_dict_for_task(input_task):
    """Collect the sentences of every JSON file of a task into a DataFrame.

    All ``*.json`` files found (recursively) under ``experiment/<input_task>``
    are read; each file must hold a list of records with the keys ``text``,
    ``description``, ``task`` and ``source``.

    Args:
        input_task: name of the sub-folder of ``experiment`` to read.

    Returns:
        A ``pandas.DataFrame`` with the columns ``text``, ``question``,
        ``task``, ``source``, ``source_type``, ``category_index`` and
        ``weights`` (one row per record). The columns are present even when
        no record is found, so callers can always index them.

    Raises:
        KeyError: when a record lacks one of the expected keys.
    """
    columns = [
        'text', 'question', 'task', 'source',
        'source_type', 'category_index', 'weights',
    ]
    input_folder = f'experiment/{input_task}'
    encoding = "UTF-8" if os.name != "nt" else "mbcs"
    result = []
    # rglob() yields files in a filesystem-dependent order; sorting keeps the
    # row order reproducible across machines.
    for task_path in sorted(Path(input_folder).rglob('*.json')):
        with open(task_path, encoding=encoding) as fi:
            data = json.load(fi)
        for entry in data:
            result.append(dict(
                text=entry['text'],
                question=entry['description'],
                task=entry['task'],
                source=entry['source'],
                # These records are unlabelled, so constant placeholder values
                # are used for the source type, the label and the weight.
                source_type='misc',
                category_index=0,
                weights=0
            ))
    return pd.DataFrame(result, columns=columns)


In [8]:
def eval_model(model, test_data, max_pred_values=10):
    """Predict labels for a dataset, keeping at most the top-ranked positives.

    Args:
        model: object whose ``predict`` result exposes a ``logits`` attribute.
        test_data: dataset supporting ``.batch``; it is evaluated in batches of 1.
        max_pred_values: number of highest-ranked entries (by the probability
            in column 1 of the softmax output) allowed to keep their prediction.

    Returns:
        ``(y_predict, y_probs)``: the arg-max label per entry, with every entry
        ranked below ``max_pred_values`` forced to 0, and the column-1
        probability per entry.
    """
    logits = model.predict(test_data.batch(1)).logits

    # Turn the logits into per-class probabilities.
    probabilities = tf.nn.softmax(logits, axis=1).numpy()
    y_predict = probabilities.argmax(axis=-1)
    y_probs = probabilities[:, 1]

    # Rank the entries by descending probability (ties keep their original
    # order) and reset every prediction that falls outside the top ranks.
    ranked = sorted(enumerate(y_probs), key=lambda pair: pair[1], reverse=True)
    for rank, (idx, _) in enumerate(ranked, start=1):
        if rank > max_pred_values:
            y_predict[idx] = 0

    return y_predict, y_probs

In [10]:
def log_examples(task_title, text, pweights, y_predict, y_probs, k=10):
    """Print the ``k`` most probable entries that were predicted as relevant.

    Args:
        task_title: unused; kept for interface compatibility.
        text: sequence of sentences, indexed like ``y_predict``.
        pweights: unused; kept for interface compatibility.
        y_predict: predicted label per entry (1 means relevant).
        y_probs: probability per entry, used for ranking.
        k: maximum number of entries printed.

    Each printed line is the tuple ``(index, probability rounded to 3
    decimals, sentence)``, in descending order of probability.
    """
    # Indexes of all entries predicted as relevant.
    relevant = [idx for idx in range(len(y_predict)) if y_predict[idx] == 1]

    # Most probable first; the sort is stable, so ties keep their index order.
    relevant.sort(key=lambda idx: y_probs[idx], reverse=True)

    for idx in relevant[:k]:
        print((idx, round(y_probs[idx], 3), text[idx]))

In [11]:
def test_data(df_data, model, keras_model):
    """Run the fine-tuned model on every source of ``df_data`` and print its top hits.

    Args:
        df_data: DataFrame with the columns ``source``, ``text``, ``question``,
            ``category_index`` and ``weights``.
        model: wrapper object providing ``encode`` for a DataFrame.
        keras_model: trained model handed to ``eval_model``.

    For each distinct source, the task title and the source are logged and the
    most probable sentences are printed, followed by a dashed separator.
    """
    separator = "-" * 20

    for source in df_data["source"].unique():
        df_source = df_data[df_data["source"] == source]

        # Build the dataset for this source only.
        encodings = model.encode(df_source)
        labels = df_source['category_index'].tolist()
        dataset = tf.data.Dataset.from_tensor_slices((dict(encodings), labels))

        sentences = df_source['text'].tolist()
        # All rows of a source are taken to share the task title; the first is used.
        title = df_source['question'].tolist()[0]
        sentence_weights = df_source['weights'].tolist()

        logger.info(title)
        logger.info(source)
        print()

        y_predict, y_probs = eval_model(keras_model, dataset)
        log_examples(title, sentences, sentence_weights, y_predict, y_probs, k=10)

        # Blank line, dashed rule, blank line.
        print(f"\n{separator}\n")
            

# Distances

In [12]:
# Apply the model fine-tuned above to the sentences of the 'distance' task.
input_task = 'distance'
df_data = get_dict_for_task(input_task)

test_data(df_data=df_data, model=model, keras_model=fine_tunned_keras_model)

Given a string representing rendezvous address and a list of suggested picnic addresses you must write an algorithm using the geopy module to find the picnic address with the closest distance to the rendezvous address.
python - Obtaining latitude and longitude of multiple locations using Geopy - Stack Overflow

(20, 0.791, 'I have been trying to use Geopy from python to achieve this.')
(36, 0.784, "In you code it should be something like this - import csv from time import sleep from geopy.geocoders import Nominatim with open ('D: / location_to_lat_lon / tolocate.csv', ` r' ) as fp:")
(44, 0.778, 'Browse other questions tagged python csv geopy or ask your own question.')
(63, 0.776, 'Getting SSL Error on AWS EC2 Instance When I Try to Reach GeoPy API')
(30, 0.773, 'You are forgetting that location can be None at times due to various reasons including the geocoding service not having geo spatial data for the given address.')
(35, 0.762, 'In that case you have to ignore such kind of addre

Welcome to GeoPy’s documentation! — GeoPy 2.2.0 documentation

(151, 0.806, 'Return type None, geopy.location.Location or a list of them, if exactly_one = False.')
(490, 0.806, 'Return type geopy.location.Location or a list of them, if exactly_one = False.')
(684, 0.804, 'geopy attempts to follow semantic versioning, however some breaking changes are still being made in minor releases, such as:')
(691, 0.803, "For example, geopy.point.Point instances previously did coordinate values normalization, though it's not documented, and it was completely wrong for the latitudes outside the -LSB- -90 ; 90 -RSB- range.")
(644, 0.803, 'Bases: geopy.adapters.BaseSyncAdapter The fallback adapter which uses urllib from the Python standard library, see urllib.request.urlopen ( ).')
(34, 0.801, 'Or, if you are ready to wait, you can try geopy.extra.rate _ limiter.')
(30, 0.8, 'These services provide APIs, which anyone could implement, and geopy is just a library which provides these implementations fo

# NYTimes

In [13]:
# Apply the model fine-tuned above to the sentences of the 'nytimes' task.
input_task = 'nytimes'
df_data = get_dict_for_task(input_task)

test_data(df_data=df_data, model=model, keras_model=fine_tunned_keras_model)

Given a string representing the url for NY Times Today's, you must write a python script using the BeautifulSoup and requests modules to scrap all the headlines of that page.
Beautiful Soup: Build a Web Scraper With Python – Real Python

(317, 0.808, 'To get the URL instead, you need to extract the value of one of the HTML attributes instead of discarding it.')
(319, 0.802, "The specific URL that you're looking for is the value of the href attribute of the second <a> tag at the bottom the HTML of a single job posting:")
(240, 0.801, 'find_all ( ) on a Beautiful Soup object, which returns an iterable containing all the HTML for all the job listings displayed on that page.')
(323, 0.801, "Then you extracted the href attribute, which contains the URL, using -LSB- `` href'' -RSB- and printed it to your console.")
(331, 0.8, "Then you're directly extracting the URL using the square-bracket notation and addressing the href attribute ( -LSB- `` href'' -RSB- ).")
(338, 0.799, 'Therefore, you c

How to extract HTTP response body from a Python requests call? - Stack Overflow

(52, 0.774, 'Correct way to try/except using Python requests module ?')
(79, 0.762, 'Question feed To subscribe to this RSS feed, copy and paste this URL into your RSS reader.')
(12, 0.739, 'How to extract HTTP response body from a Python requests call ?')
(20, 0.728, 'This should indeed print lots of content, but instead prints nothing.')
(47, 0.728, 'how to get response data from a python post request')
(30, 0.698, 'You can try this method:')
(17, 0.686, "I'm using the Python requests library.")
(54, 0.664, 'How to upload file with python requests ?')
(82, 0.614, 'Follow this question to receive notifications')
(48, 0.609, "Using headers with the Python requests library's get method")

--------------------

Given a string representing the url for NY Times Today's, you must write a python script using the BeautifulSoup and requests modules to scrap all the headlines of that page.
BeautifulSoup tutorial: S

# Titanic

In [14]:
# Apply the model fine-tuned above to the sentences of the 'titanic' task.
input_task = 'titanic'
df_data = get_dict_for_task(input_task)

test_data(df_data=df_data, model=model, keras_model=fine_tunned_keras_model)

Given a string representing a url for the titanic dataset (in csv format), you must write an algorithm using the pandas and seaborn modules to create a barchart with the aggregate average of the passengers'  fare according to the following constraints
python - 'Could not interpret input' error with Seaborn when plotting groupbys - Stack Overflow

(67, 0.815, 'Plotting a graph between 2 variables using pandas')
(21, 0.814, 'python pandas grouping aggregate seaborn')
(70, 0.814, 'Pandas Key Error: 0 while plotting a seaborn boxplot')
(54, 0.801, 'Browse other questions tagged python pandas grouping aggregate seaborn or ask your own question.')
(19, 0.793, 'I can plot it from Pandas no problem ...')
(20, 0.786, 'But why do I get this error when I try it in seaborn ?')
(12, 0.785, "` Could not interpret input' error with Seaborn when plotting groupbys")
(28, 0.742, "You'll obtain the same result.")
(33, 0.722, 'This way you could feed it directly to seaborn.')
(75, 0.694, 'ValueError: Coul

pandas.DataFrame.sort_values — pandas 1.3.5 documentation

(23, 0.765, '© Copyright 2008-2021, the pandas development team.')
(7, 0.712, 'If this is a list of bools, must match the length of the by.')
(0, 0.659, 'Sort by the values along either axis.')
(18, 0.644, 'It will be applied to each column in by independently.')
(3, 0.597, "if axis is 1 or ` columns' then by may contain column levels and/or index labels.")
(2, 0.576, "if axis is 0 or ` index' then by may contain index levels and/or column labels.")
(11, 0.502, 'mergesort and stable are the only stable algorithms.')

--------------------

Given a string representing a url for the titanic dataset (in csv format), you must write an algorithm using the pandas and seaborn modules to create a barchart with the aggregate average of the passengers'  fare according to the following constraints
pandas.core.groupby.DataFrameGroupBy.aggregate — pandas 1.3.5 documentation

(32, 0.765, '© Copyright 2008-2021, the pandas development team.')
