In [1]:
from sklearn.metrics import precision_score, recall_score, f1_score,accuracy_score
from sklearn.metrics import confusion_matrix
import time
from utils.io import info, ok, warning, html
import os
from utils.data_item import DataItem
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
import myversions.pigeonXT as pixt
import pickle
import warnings
from math import ceil
import re
import matplotlib.pyplot as plt
import spacy
from sklearn.neural_network import MLPClassifier
from utils.classifier import Classifier
from utils.term_highlighter import TermHighlighter
from joblib import load,dump

from sklearn.model_selection import cross_validate

import logging

class HRSystem(object):
    """Interactive labeling session: data sets, classifiers and annotation loop.

    NOTE: the class-level statements below execute once, when the class body
    is evaluated: ``VOCAB`` reads a vocabulary file through a relative path
    and ``nlp`` loads a spaCy pipeline, so both resources must be available
    at that moment.
    """
    # Number of randomly drawn unlabeled articles that `_retrain` temporarily
    # adds as negative examples when fewer than 10 articles are labeled.
    EXPANSION=10
#     LABELING_THRESHOLD=0.90
    # Captions offered to the annotator; `_move_suggestions_to_labeled`
    # compares the collected annotations against these two values.
    RELEVANT_LABEL='Relevant'
    IRRELEVANT_LABEL='Irrelevant'
#     CACHE_PATH='./cache/'
#     LABELED_PATH=CACHE_PATH+'labeled_data_not_incanada.p'       ## <<< MODIFIED
#     UNLABELED_PATH=CACHE_PATH+'unlabeled_data.p'   ## <<< MODIFIED
#     VECTOR_TYPE=DataItem.TYPE_BOW
    # One vocabulary entry per line of the file (not referenced elsewhere in
    # this class; presumably consumed by other modules -- TODO confirm).
    VOCAB = np.array(open('../04. Model of DP/precomputed/vocab_with_dp.txt').read().splitlines())
    # spaCy pipeline with the textcat, parser and ner components disabled.
    nlp = spacy.load('en_core_web_sm', disable=['textcat', 'parser','ner'])
#     MODELS_FILE = 'cache/models.joblib'
#     HIGHLIGHTHER_FILE = 'cache/highlighter.joblib'
    
    
    def _load_models(self):
        """Restore the models from disk, or build and train them when absent.

        The presence of ``self.models_path`` decides the branch; the
        highlighter file is asserted to be present/absent consistently with it.
        """
        if os.path.isfile(self.models_path):
            logging.debug('Loading models from disk. No need for training.')
            assert os.path.isfile(self.highlighter_path)
            self._models_fromdisk()
            return

        logging.debug('Creating the model from scratch. Need training.')
        assert not os.path.isfile(self.highlighter_path)

        # (hidden layer sizes, vector type) for each classifier, in order.
        layouts = (((20,), DataItem.TYPE_BOW), ((100,), DataItem.TYPE_GLOVE300))
        self.classifiers = [
            Classifier(
                MLPClassifier(early_stopping=True,
                              n_iter_no_change=20,
                              max_iter=1500,
                              hidden_layer_sizes=hidden_layers,
                              solver='adam'),
                type_=vector_type,
            )
            for hidden_layers, vector_type in layouts
        ]
        self.term_highlighter = TermHighlighter()

        logging.debug('Starting training of models.')
        self._retrain(partial=False)
        logging.debug('Finished training of models.')
            
    def _load_data(self,from_scratch):
        """Populate ``self.labeled_data`` and ``self.unlabeled_data``.

        Labeled data (only when ``from_scratch`` is false) comes from the
        session pickle when it exists, otherwise from ``labeled_data.csv``
        (``id;label`` rows after a header line, label ``R`` or ``I``).
        Unlabeled data comes from the session pickle when it exists, otherwise
        from the two article folders, skipping items without a vector and
        items that are already labeled. The unlabeled list is then shuffled.

        When ``from_scratch`` is true the user is prompted for document URLs
        until at least one of them matches an unlabeled article; the matching
        articles are moved to the labeled set and marked relevant.

        Args:
            from_scratch: build the labeled set interactively instead of
                loading it.
        """
        # Defined up front: the unlabeled-data filter below reads it, and in
        # from_scratch mode nothing else assigns it before that point.
        self.labeled_data = []

        ################
        # LABELED DATA #
        ################
        if not from_scratch:
            if not os.path.isfile(self.labeled_path):
                logging.debug('Computing labeled data from "labeled_data.csv"')

                #############################
                # LABELED OVER THREE ROUNDS #
                #############################
                with open('labeled_data.csv') as reader:
                    rows = reader.read().splitlines()[1:]  # skip the header row
                for line in rows:
                    id_,label = line.split(';')
                    item = DataItem(id_)
                    if label=='R':
                        item.set_relevant()
                    else:
                        item.set_irrelevant()
                        assert label=='I'
                    if item.has_vector():
                        self.labeled_data.append(item)
            else:
                #############################################
                # RETRIEVING FROM DISK INSTEAD OF COMPUTING #
                #############################################
                logging.debug(f'Retrieving labeled data from disk ({self.labeled_path})')
                # NOTE: pickle executes code on load; only open session files
                # produced by this system.
                with open(self.labeled_path,'rb') as reader:
                    self.labeled_data = pickle.load(reader)

        ##################
        # UNLABELED DATA #
        ##################
        GM1 = '/home/ec2-user/SageMaker/data/GM_all_1945_1956/'
        GM2 = '/home/ec2-user/SageMaker/data/GM_all_1957-1967/'
        if not os.path.isfile(self.unlabeled_path):
            logging.debug(f'Computing unlabeled data from {GM1} and {GM2}')
            candidates = [DataItem(GM1+file_) for file_ in os.listdir(GM1)]
            candidates += [DataItem(GM2+file_) for file_ in os.listdir(GM2)]
            labeled_ids = {item.id_ for item in self.labeled_data}
            self.unlabeled_data = [item for item in candidates
                                   if item.has_vector() and item.id_ not in labeled_ids]
        else:
            #############################################
            # RETRIEVING FROM DISK INSTEAD OF COMPUTING #
            #############################################
            logging.debug(f'Retrieving unlabeled data from disk ({self.unlabeled_path})')
            with open(self.unlabeled_path,'rb') as reader:
                self.unlabeled_data = pickle.load(reader)

        self.rangen.shuffle(self.unlabeled_data)

        if from_scratch:
            logging.debug('Creating labeled data from scratch')
            self.labeled_data=[]
            while True:
                seed = input('Please insert URLs to relevant document (; sep) (e.g., https://www.proquest.com/docview/1288605023/...)')

                matches = re.findall('docview/([0-9]*)/',seed)
                logging.debug(f'User inputs: {seed}')
                if len(matches)<1:
                    logging.debug('User input does not look like valid URL.')
                    warning('Invalid URLs, please try again.')
                    continue

                ids = set(matches)
                positions = [idx for idx,item in enumerate(self.unlabeled_data) if item.id_ in ids]
                if len(positions)<1:
                    logging.debug('User query had something that look like valid documents but not present in database.')
                    warning('Documents not found in database (The Globe and Mail 1936 onwards), please try again.')
                    continue

                # Walk the positions backwards so deleting one entry does not
                # shift the positions that are still pending.
                for position in reversed(positions):
                    self.labeled_data.append(self.unlabeled_data[position])
                    self.labeled_data[-1].set_relevant()
                    del(self.unlabeled_data[position])
                logging.debug(f"Valid input, documents found: {','.join([item.id_ for item in self.labeled_data])}")
                break
    
    ################################################################
    #                         Constructor                          #
    ################################################################
    def __init__(self,from_scratch=False, session_name='default',labeling_batch=10):
        """Start (or resume) a labeling session.

        Creates the session folders, points the root logger at the session
        log file, restores the iteration counter, loads the data, preloads the
        vectors of the labeled articles, loads or trains the models and
        finally saves the session and prints its status.

        Args:
            from_scratch: forwarded to `_load_data`; when true the labeled set
                is seeded interactively from user-supplied document URLs.
            session_name: folder name under ``sessions/`` that holds this
                session's data, log and models.
            labeling_batch: maximum number of suggestions offered per loop.
        """
        self.session_name = session_name
        print('Starting system...'+' '*80, end='\r')

        session_dir = f'sessions/{session_name}'
        is_new_session = not os.path.exists(session_dir)
        # makedirs(exist_ok=True) also creates a missing ``sessions/`` parent
        # and restores a sub-folder missing from an existing session.
        for subdir in ('data', 'log', 'models'):
            os.makedirs(f'{session_dir}/{subdir}', exist_ok=True)

        self.labeled_path = f'sessions/{session_name}/data/labeled_data.p'
        self.unlabeled_path = f'sessions/{session_name}/data/unlabeled_data.p'
        self.models_path = f'sessions/{session_name}/models/models.joblib'
        self.highlighter_path = f'sessions/{session_name}/models/highlighter.joblib'
        self.iteration_counter_path = f'sessions/{session_name}/data/iteration_counter.p'

        logging.basicConfig(filename=f'sessions/{session_name}/log/system.log',
                            format='%(asctime)s [%(levelname)s] %(message)s' ,
                            encoding='utf-8',
                            datefmt='%Y-%m-%d %H:%M:%S',
                            force=True,
                            level=logging.DEBUG)
        # Emitted only after the file handler exists, so the message actually
        # reaches the session log.
        if is_new_session:
            logging.debug('Session directories do not exist, creating (data, log, models).')

        if os.path.isfile(self.iteration_counter_path):
            with open(self.iteration_counter_path,'rb') as reader:
                self.iteration_count = pickle.load(reader)
            logging.debug(f'Iteration counter retrieved from disk, iteration count={self.iteration_count}')
        else:
            self.iteration_count = 0

        logging.debug(f'System starting. Session name={session_name}. Iteration count={self.iteration_count}')
        self.remaining_relevant=None
        self.rangen = np.random.default_rng(2022)
        self.suggestions=[]
        self.annotations = pd.DataFrame([],columns=['label'])
        self.labeling_batch=labeling_batch

        print('Loading data...'+' '*80, end='\r')
        self._load_data(from_scratch)
        logging.debug(f"len(labeled_data)={len(self.labeled_data)} {self._labeled_data_str()}")
        logging.debug(f"len(unlabeled_data)={len(self.unlabeled_data)} {self._unlabeled_data_str()}")

        ################################
        # PRELOAD LABELED DATA VECTORS #
        ################################
        for item in self.labeled_data:
            item.preload_vector(type_=DataItem.TYPE_GLOVE300)
            item.preload_vector(type_=DataItem.TYPE_BOW)

        print('Loading/training models...'+' '*80, end='\r')
        self._load_models()
        print('System started. Saving session and computing status...')
        self.save()
        self.status()

        logging.debug('System started succesfully.')
        
    ################################################################
    #                   save/load models                           #
    ################################################################  
    def _models_todisk(self):
        """Write the classifiers and the term highlighter to their joblib files."""
        targets = (self.models_path, self.highlighter_path)
        for target in targets:
            already_there = os.path.isfile(target)
            action = 'Overwriting' if already_there else 'Creating'
            logging.debug(f'{action} file {target}')

        dump(self.classifiers, self.models_path)
        dump(self.term_highlighter, self.highlighter_path)
                                     
    def _models_fromdisk(self):
        """Read the term highlighter and the classifiers back from their joblib files."""
        highlighter_file = self.highlighter_path
        models_file = self.models_path
        logging.debug(f'Retrieving {highlighter_file} and {models_file} from disk')
        self.term_highlighter = load(highlighter_file)
        self.classifiers = load(models_file)
        
    ################################################################
    #                          Re-Train                            #
    ################################################################  
    def _retrain(self, partial=True): 
        expanded=False
        if len(self.labeled_data)<10:

            expanded=True            
            self.labeled_data += [DataItem(item.id_) for item in self.rangen.choice(
                                                                                   self.unlabeled_data,
                                                                                   size=HRSystem.EXPANSION,
                                                                                   replace=False)]
                                     
            logging.debug(f'Only {len(self.labeled_data)} articles labeled. Expanding labeled data'\
                         f' (adding {HRSystem.EXPANSION} randomly selected negative examples)'\
                         f" ({','.join([str(item.id_) for item in self.labeled_data[-HRSystem.EXPANSION:]])})")
                                     
            for item in self.labeled_data[-HRSystem.EXPANSION:]:
                item.set_irrelevant()
        logging.debug(f'Training each of the {len(self.classifiers)} classifiers.')
        for clf in self.classifiers:
#             info(f'Training: {str(clf)}')
            logging.debug(f'Training model {str(clf)}')
            if not partial:
                clf.fit(self.labeled_data)
            else:
                clf.fit(self.labeled_data)
            
        logging.debug('Training term highlighter.')
        self.term_highlighter.fit(self.labeled_data)
        logging.debug('Done training.')
                     
        
        if expanded:
                logging.debug('Removing randomly selected negatives examples (expansion)'\
                             f" ({','.join([str(item.id_) for item in self.labeled_data[-HRSystem.EXPANSION:]])})")
                                     
                
                self.labeled_data = self.labeled_data[:-HRSystem.EXPANSION]
                logging.debug(f'Size of labeled data after removing expansion={len(self.labeled_data)}')
    
    def _move_suggestions_to_labeled(self): 
        assert len(self.suggestions)>0

        logging.debug(f'There are {len(self.suggestions)} suggestions labeled that need to be moved to labeled_data. Re-train required.')
        logging.info(f'len(suggestions)={len(self.suggestions)} {self._suggestions_str()}')
        logging.info(f'Annotations: '+','.join(self.annotations["label"]))
        logging.debug('Moving from suggestions --to--> labeled data (for latter training)')
        need_retrain=True
        
        relevant_count=0
            
        for item,label in zip(self.suggestions, self.annotations["label"]):
            if label==HRSystem.RELEVANT_LABEL:
                item.set_relevant()
                relevant_count+=1
            else:
                item.set_irrelevant()
                assert label==HRSystem.IRRELEVANT_LABEL
        logging.info(f'From the {len(self.suggestions)} suggestions {relevant_count} where found relevant.'\
                     f' ({relevant_count/len(self.suggestions):5.4f})')
        
        self.labeled_data = self.labeled_data+self.suggestions
        logging.debug(f"new len(labeled_data)={len(self.labeled_data)} {self._labeled_data_str()}")
        logging.debug(f"new len(unlabeled_data)={len(self.unlabeled_data)} {self._unlabeled_data_str()}")
                                     
        del(self.annotations)
    
        print('Re-training models using new suggestions ',end='\r')
        logging.debug(f'Re-training model ussing new {len(self.suggestions)} suggestions.')
        self._retrain()
    ################################################################ 
    #                        SAVE LISTS                            #   
    ################################################################
    def save(self):
        """Persist the session state to disk.

        Pending annotated suggestions are first moved into the labeled set
        (which retrains the models). Then the labeled data, the unlabeled
        data and the iteration counter are pickled to their session paths and
        the models are written with `_models_todisk`.
        """
        if len(self.suggestions)>0:
            logging.debug("Attemping to save system's state but there are labeled suggestions to be stored first. Re-train required.")
            self._move_suggestions_to_labeled()
        else:
            logging.debug("Saving system's state. NO suggestions pending to be moved to labeled data.")

        payloads = [
            (self.labeled_path, self.labeled_data),
            (self.unlabeled_path, self.unlabeled_data),
            (self.iteration_counter_path, self.iteration_count),
        ]
        for file_, _ in payloads:
            if os.path.isfile(file_):
                logging.debug(f'Overwriting file {file_}')
            else:
                logging.debug(f'Creating file {file_}')

        # Context managers guarantee each file is flushed and closed before
        # the next one is written.
        for file_, payload in payloads:
            with open(file_, 'wb') as writer:
                pickle.dump(payload, writer)
        self._models_todisk()
    
    def _relevant_count(self):
        """Return how many labeled articles carry ``DataItem.REL_LABEL``."""
        return sum(1 for item in self.labeled_data if item.label==DataItem.REL_LABEL)
        
    def _labeled_count(self):
        return len(self.labeled_data)
    def _unlabeled_count(self):
        return len(self.unlabeled_data)
    
    
    def _filter_and_sort_candidates(self,candidate_args):

                                     
        # First Model
        
        yhat1 = self.classifiers[0].predict([self.unlabeled_data[arg] for arg in candidate_args])
        candidate_args = np.array(candidate_args)[yhat1>0.5]
        logging.debug(f'Number of suggestions 1st model: {np.sum(yhat1>0.5)} (discarded: {yhat1<=0.5})')
        
                         
        # Second Model
        yhat1 = yhat1[yhat1>0.5]
        candidate_args = np.array(candidate_args)[yhat1>0.5]
        yhat1 = yhat1[yhat1>0.5]
        yhat2 = self.classifiers[1].predict([self.unlabeled_data[arg] for arg in candidate_args])   
        logging.debug(f'Number of suggestions 2nd model: {np.sum(yhat2>0.5)} (discarded: {yhat2<=0.5})')
                                     
        # Third Model
        mask = (yhat2>0.5) #& (yhat3>0.5)
        yhat1 = yhat1[mask]
        yhat2 = yhat2[mask]
        candidate_args = np.array(candidate_args)[mask]
        yhat4 = self.term_highlighter.predict([self.unlabeled_data[arg] for arg in candidate_args])
        logging.debug(f'Number of suggestions 3rd model: {np.sum(yhat4>0.5)} (discarded: {yhat4<=0.5})')
                                     
        # Average
        yhat = np.average(np.vstack([yhat1,yhat2,yhat4]), axis=0)                                     
        yhat = yhat[yhat4>0.5]
        candidate_args = np.array(candidate_args)[yhat4>0.5]
        candidate_args = np.array(candidate_args)[np.argsort(yhat)[::-1]]
               
        # Info for debugging
        end = min(len(candidate_args),self.labeling_batch)
        confidence_levels = [f'{yhat[arg]:4.3f}' for arg in np.argsort(yhat)[::-1]]
        logging.debug(f"Confidence levels for suggestions: {','.join(confidence_levels[:10])} ")
                      
        return candidate_args, yhat
                                     
    def _compute_suggestions(self):
#         cap = min(50000,len(self.unlabeled_data))
#         logging.debug(f'Computing suggestions. Calculating predictions for {cap} unlabeled articles.')
# #         info('Re-trained. Computing suggestions...')
        
#         candidate_args = self.rangen.choice(range(len(self.unlabeled_data)), size=cap, replace=False )
# #         info(f'Making predictions over {len(candidate_args)} with model: {self.classifiers[0]}')
#         logging.debug(f"Predicting with " +str(self.classifiers[0]).replace('\n','') )
#         print('Predicting with first model',end='\r')
#         yhat1 = self.classifiers[0].predict([self.unlabeled_data[arg] for arg in candidate_args])
# #         print(f'yhat1.shape={yhat1.shape}')
#         logging.debug(f'Only {np.sum(yhat1>0.5)} articles suggested found with first classifier ({len(candidate_args)-np.sum(yhat1>0.5)} discarded)')
        
# #         info(f'Descarding {len(candidate_args)-np.sum(yhat1>0.5)} articles.')
#         print(f'Predicting with second model ({len(candidate_args)-np.sum(yhat1>0.5)} discarded)',end='\r')
#         candidate_args = np.array(candidate_args)[yhat1>0.5]
#         yhat1 = yhat1[yhat1>0.5]
        
# #         info(f'Making predictions over {len(candidate_args)} with model: {self.classifiers[1]}')
#         logging.debug(f"Predicting with " +str(self.classifiers[1]).replace('\n','') )
#         yhat2 = self.classifiers[1].predict([self.unlabeled_data[arg] for arg in candidate_args])
#         logging.debug(f'Only {np.sum(yhat2>0.5)} articles suggested found with second classifier ({len(candidate_args)-np.sum(yhat2>0.5)} discarded)')
                   
# #         yhat3 = self.classifiers[2].predict([self.unlabeled_data[arg] for arg in candidate_args])
        
        
# #         info(f'Descarding {len(candidate_args)-np.sum(yhat2>0.5)} articles.')
#         mask = (yhat2>0.5) #& (yhat3>0.5)
        
#         yhat1 = yhat1[mask]
#         yhat2 = yhat2[mask]
# #         yhat3 = yhat3[mask]
# #         print(f'yhat1.shape={yhat1.shape}')
# #         print(f'yhat2.shape={yhat2.shape}')
# #         print(f'yhat3.shape={yhat3.shape}')
#         print(f'Predicting with last model ({len(candidate_args)-np.sum(yhat1>0.5)} discarded)',end='\r')
#         candidate_args = np.array(candidate_args)[mask]
        
#         logging.debug(f'Predicting with term highlighter ({str(self.term_highlighter.model)})')

# #         info(f'Making predictions over {len(candidate_args)} with model: {self.term_highlighter}')
#         yhat4 = self.term_highlighter.predict([self.unlabeled_data[arg] for arg in candidate_args])
#         logging.debug(f'Only {np.sum(yhat4>0.5)} articles suggested found with second classifier ({len(candidate_args)-np.sum(yhat4>0.5)} discarded)')

# #         print(f'yhat4.shape={yhat4.shape}')
#         yhat = np.average(np.vstack([yhat1,yhat2,yhat4]), axis=0)
# #         yhat = np.average(np.vstack([yhat1,yhat2,yhat4]), axis=0)
        
#         candidate_args = np.array(candidate_args)[np.argsort(yhat)[::-1]]
        cap = min(50000,len(self.unlabeled_data))
                                     
        self.estimated=False
        if cap!=len(self.unlabeled_data):
                  self.estimated=True
                                     
        logging.debug(f'Computing batch of suggestions, batch_size={cap}')
                      
        candidate_args = self.rangen.choice(range(len(self.unlabeled_data)), size=cap, replace=False )  
        candidate_args, _ = self._filter_and_sort_candidates(candidate_args)
                                     

                  
        self.remaining_relevant = len(candidate_args) #np.sum(yhat4>0.5) #np.sum(yhat>0.5)
        logging.debug(f'Relevant found in the {cap} articles analyzed: {self.remaining_relevant} ')
        if self.estimated:
            self.remaining_relevant = int((self.remaining_relevant/cap)*len(self.unlabeled_data))
            logging.debug(f'Estimated relevant in the remaining {len(self.unlabeled_data)} articles: {self.remaining_relevant} ')
#             self.remaining_relevant = estimated_relevants

        # Remove most promising from unlabeled 
        #   and add to suggestions
        
        ################
        ## UNTIL HERE ##
        ################
        
        self.suggestions = []
        self.annotations = []
        
        if self.remaining_relevant==0:
            logging.debug('No good candidates (predictions<0.5).')
            warning('There are no good candidates provided by the model. '\
            'This could happend at the beginin and at the end of the labeling process')
            
        end = min(len(candidate_args),self.labeling_batch)
        best_ten_args = candidate_args[:end]

#         best_ten_args = np.argsort(yhat)[-start:][::-1]

        for arg in best_ten_args:
            self.suggestions.append(self.unlabeled_data[arg])
    
        for arg in sorted(best_ten_args,reverse=True):
            del(self.unlabeled_data[arg])
        
        logging.debug('Moving the suggestions made by the model from unlabeled_data --to--> suggestions')              
        logging.debug(f"new len(unlabeled_data)={len(self.unlabeled_data)} {self._unlabeled_data_str()}")
        logging.debug(f'new len(suggestions)={len(self.suggestions)} {self._suggestions_str()}')

        for classifier in self.classifiers:
            for item in self.suggestions:
                item.preload_vector(type_=classifier.vector_type)
                item.preload_vector(type_=classifier.vector_type)
        

#         info(f'Moving {len(self.suggestions)} unlabeled suggestions'\
#              f' from unlabeled data ({len(self.unlabeled_data)} - {len(self.suggestions)})')
    ################################################################ 
    #                             LOOP                             #   
    ################################################################  
    def loop(self, finish_function=None):
        """Run one annotation round.

        Annotated suggestions from the previous round are moved to the
        labeled set, a new batch of suggestions is computed, the status is
        printed and the annotation widget is opened on the new batch.

        Args:
            finish_function: passed to ``pixt.annotate`` as
                ``final_process_fn``.
        """
        logging.debug(f'Starting loop no {self.iteration_count}')

        ####################################################
        # MOVING FROM SUGGESTIONS (ANNOTATIONS) TO LABELED #
        ####################################################
        if len(self.suggestions)>0:
            self._move_suggestions_to_labeled()

        print('Calculating new suggestions',end='\r')
        self._compute_suggestions()

        # Terms are only highlighted once the highlighter has been trained.
        if self.term_highlighter.trained:
            logging.debug(f"Highlighting with {','.join(self.term_highlighter.sorted_terms()[:10])},...")
            highlighter = self.term_highlighter
        else:
            highlighter = None

        text_for_label = []
        for suggestion in self.suggestions:
            text_for_label.append(suggestion.get_htmldocview(highlighter=highlighter))

        print()
        self.status()

        label_options = [HRSystem.RELEVANT_LABEL, HRSystem.IRRELEVANT_LABEL]
        self.annotations = pixt.annotate(text_for_label,
                                         options=label_options,
                                         stop_at_last_example=False,
                                         display_fn=html,
                                         final_process_fn=finish_function)
        self.iteration_count+=1
                          
    def review_labeled(self, how_many=20):
        """Open the annotation widget on the most recently labeled articles.

        Args:
            how_many: number of articles, counted from the end of
                ``self.labeled_data``, to review. Fewer are shown when fewer
                are labeled; zero or a negative value shows none.
        """
        highlighter = None
        if self.term_highlighter.trained:
            highlighter = self.term_highlighter

        # ``[-0:]`` would select the whole list, so non-positive requests are
        # handled explicitly.
        recent = self.labeled_data[-how_many:] if how_many>0 else []

        df = pd.DataFrame(
                       {
                        'example': [item.get_htmldocview(highlighter=highlighter)
                                    for item in recent],
                        # Sized from the actual slice: it may be shorter than
                        # how_many, and all columns must have equal length.
                        'changed': [True]*len(recent),
                        'label': [HRSystem.RELEVANT_LABEL if item.label==DataItem.REL_LABEL else HRSystem.IRRELEVANT_LABEL
                                  for item in recent]
                       }
                      )
        # NOTE(review): this overwrites self.annotations, which
        # _move_suggestions_to_labeled pairs with self.suggestions; confirm the
        # two workflows are never interleaved.
        self.annotations = pixt.annotate(
                                         df,
                                         options=[HRSystem.RELEVANT_LABEL, HRSystem.IRRELEVANT_LABEL],
                                         stop_at_last_example=False,
                                         display_fn=html,
                                        )

        # TODO: write the corrected labels back to self.labeled_data.
        # TODO: report the number of changes.

        # If there are changes the models need retraining. Do it here?
        
        
    ################################################################ 
    #                            EXPORT                            #   
    ################################################################  
    def export(self):
        if len(self.suggestions)>0:
            logging.debug('Attemping to export system\'s state but there are labeled suggestions to be stored first. Re-train required.')
            self._move_suggestions_to_labeled()
        else:
            logging.debug('Exporting... (NO suggestions pending to be moved to labeled data)')
                      
        filename = f'sessions/{self.session_name}/data/exported_data_'+time.strftime("%Y-%m-%d_%H-%M")+'.csv'
        with open(filename, 'w') as writer:
            writer.write('URL,relevant_or_suggested,confidence\n')
            for item in self.labeled_data:
                if item.is_relevant():
                    writer.write(f'https://proquest.com/docview/{item.id_},relevant,1\n')
            
            
        
            # MAKE PREDICTIONS AND STORE SUGGESTIONS....
            batch= 20000
            suggestions = []
            for i in range(0, len(self.unlabeled_data),batch):
                ini = i
                fin = min(ini+batch,len(self.unlabeled_data))
                args = list(range(ini,fin))
                candidate_args, yhat = self._filter_and_sort_candidates(candidate_args)
                suggestions = list(zip([self.unlabeled_data[arg] for arg in candidate_args], yhat))
            suggestions = sorted(suggestions, key=lambda x: x[1], reverse=True)
            for item,confidence in suggestions:
                writer.write(f'https://proquest.com/docview/{item.id_},suggested,{confidence:4.3f}\n')
        # System call to send file
        
    ################################################################ 
    #                            STATUS                            #   
    ################################################################ 
    def _center(str_,width=100):
        width_aux =width-2
        out =  '#'+' '*(int((width_aux-len(str_))/2))+str_+' '*(int((width_aux-len(str_))/2))+'#'
        if len(out)!=width:
            out = out[:-1]+' #'
        return out
    def _left(str_,width=100):
        output =  '# '+str_
        remaining = width-len(output)
        return output+' '*(remaining-1)+'#'
    def status(self):
        """Print a boxed status report and log per-model scores.

        The report shows the size of the labeled and unlabeled sets and, for
        each classifier and the term highlighter, the 3-fold cross-validation
        metrics returned by ``cross_validate_on`` plus the confusion matrix of
        the model's predictions on the labeled data.
        """
        width = 100
        print('#'*width)

        print(HRSystem._center('~~~~~~~~~~~~~~~~~~'))
        print(HRSystem._center('~ System Status: ~'))
        print(HRSystem._center('~~~~~~~~~~~~~~~~~~'))
        print(HRSystem._left(f'Number of labeled articles:   {len(self.labeled_data):10,}    -    {self._relevant_count():10,} relevant '\
        f'   {len(self.labeled_data)-self._relevant_count():10,} irrelevants'))
        if self.remaining_relevant is None:
            # No suggestions have been computed yet in this session.
            print(HRSystem._left(f'Number of unlabeled articles: {len(self.unlabeled_data):10,}    -'+' '*10+' N/A suggestions '))
        else:
            print(HRSystem._left(f'Number of unlabeled articles:   {len(self.unlabeled_data):10,}  -    {self.remaining_relevant:10,} suggestions '\
            f'{len(self.unlabeled_data)-self.remaining_relevant:10,} irrelevants'))
            if self.estimated:
                print(HRSystem._left(' '*60+'(ESTIMATED)'))
        print(HRSystem._center(''))

        ytrue = DataItem.get_y(self.labeled_data)
        for model in self.classifiers + [self.term_highlighter]:
            print(HRSystem._left(str(model.model).replace('\n','').replace('  ','')))
            print(HRSystem._left('~'*len(str(model.model).replace('\n','').replace('  ',''))))

            # Threshold the scores into 0/1 predictions.
            yhat = model.predict(self.labeled_data)>0.5
            yhat = yhat.astype('int')
            scores = model.cross_validate_on(self.labeled_data,cv=3)
            metrics = [
                       scores['train_accuracy'],
                       scores['train_precision'],
                       scores['train_recall'],
                       scores['train_f1'],
                       ]
            performance = f'{str(model)};train_accuracy:{metrics[0]};train_precision:{metrics[1]};'
            performance += f'train_recall:{metrics[2]};train_f1:{metrics[3]};'
            logging.info(performance)
            print(HRSystem._left('accuracy   precision      recall      f1-score'))
            print(HRSystem._left('  '+'      '.join(f'{metric:5.4f}' for metric in metrics) +' (train)'))
            metrics = [
                       scores['test_accuracy'],
                       scores['test_precision'],
                       scores['test_recall'],
                       scores['test_f1'],
                       ]
            performance = f'{str(model)};test_accuracy:{metrics[0]};test_precision:{metrics[1]};'
            performance +=f'test_recall:{metrics[2]};test_f1:{metrics[3]};'
            logging.info(performance)
            print(HRSystem._left('  '+'      '.join(f'{metric:5.4f}' for metric in metrics)+' (test )'))
            # Fixing the label set keeps the matrix 2x2 when only one class is
            # present, so the four-way unpacking cannot fail. Assumes
            # DataItem.get_y encodes labels as 0/1 like the thresholded
            # predictions -- TODO confirm.
            tn, fp, fn, tp = (confusion_matrix(ytrue,yhat,labels=[0, 1])).ravel()
            print(HRSystem._center('~~~~~~~~~~~~~~~~~~~~~~~~~~'))
            print(HRSystem._center('~~~~~Confusion Matrix~~~~~'))
            print(HRSystem._center('~~~~~~~~~~~~~~~~~~~~~~~~~~'))
            print(HRSystem._center(f'TN = {tn:6,}    FP = {fp:6,}'))
            print(HRSystem._center(f'FN = {fn:6,}    TP = {tp:6,}'))
            print(HRSystem._center(' '))
        print('#'*width)
    def _labeled_data_str(self):
        str_ = '<'+','.join([item.id_ for item in self.labeled_data[:2]])
        str_ +=', ... ,'
        str_ +=','.join([item.id_ for item in self.labeled_data[-2:]])+'>'
        return str_
    def _unlabeled_data_str(self):
        str_ = '<'+','.join([item.id_ for item in self.unlabeled_data[:2]])
        str_ +=', ... ,'
        str_ +=','.join([item.id_ for item in self.unlabeled_data[-2:]])+'>'
        return str_
    def _suggestions_str(self):
        str_ = '<'+','.join([item.id_ for item in self.suggestions])+'>'
        return str_

# End-of-cell marker: emits the "Done" message once the definitions above
# have been evaluated.
ok('Done')

2022-04-25 20:36:29.266402 [  OK   ] Done


In [2]:
from IPython.display import display, clear_output
from ipywidgets import (
        Button,
        Dropdown,
        HTML,
        HBox,
        VBox,
        IntSlider,
        FloatSlider,
        Textarea,
        Output,
        ToggleButton
)
# Holder for the active HRSystem instance: on_click_init appends the new
# system and drops the previous one, the other handlers read data[0].
data = []
# Arguments passed to HRSystem(...) by on_click_init.
session_name = 'test'
from_scratch = False

def disable_buttons():
    """Grey out every toolbar button."""
    for widget in buttons:
        setattr(widget, 'disabled', True)
def enable_buttons():
    """Make every toolbar button clickable again."""
    for widget in buttons:
        setattr(widget, 'disabled', False)
        
def on_click_init(button=None):
    """INIT / RE-INIT handler: build a new HRSystem and make it the active one.

    Args:
        button: the widget that fired the event (unused).
    """
    clear_output(wait=False)
    disable_buttons()
    display(HBox(buttons))
    try:
        system = HRSystem(from_scratch=from_scratch, session_name=session_name)
    except Exception:
        # Do not leave the toolbar frozen: INIT is always offered again, and
        # the other actions too when an earlier system is still available.
        if data:
            enable_buttons()
        else:
            buttons[0].disabled = False
        raise
    buttons[0].description='RE-INIT'
    # Keep a single active system: the new one replaces the previous one.
    data.append(system)
    if len(data)==2:
        del(data[0])
    enable_buttons()    ## THIS BUTTON SHOULD BE ENABLED FROM THE annotation.py library!!! (PIXT)
def on_click_loop(button=None):
    """LOOP handler: start one annotation round on the active system.

    The buttons are re-enabled by the annotation widget, which receives
    ``enable_buttons`` as its finish callback.

    Args:
        button: the widget that fired the event (unused).
    """
    clear_output(wait=False)
    disable_buttons()
    display(HBox(buttons))
    system = data[0]
    try:
        system.loop(finish_function=enable_buttons)
    except Exception:
        # The finish callback will never run after a failure, so unlock the
        # toolbar here before propagating the error.
        enable_buttons()
        raise
def on_click_save(button=None):
    """SAVE handler: persist the active system's state.

    Args:
        button: the widget that fired the event (unused).
    """
    clear_output(wait=False)
    disable_buttons()
    display(HBox(buttons))
    system = data[0]
    try:
        system.save()
    finally:
        # Unlock the toolbar even when saving fails.
        enable_buttons()
def on_click_export(button=None):
    """EXPORT handler: write the active system's results to a CSV file.

    Args:
        button: the widget that fired the event (unused).
    """
    clear_output(wait=False)
    disable_buttons()
    display(HBox(buttons))
    system = data[0]
    try:
        system.export()
    finally:
        # Unlock the toolbar even when the export fails.
        enable_buttons()
def on_click_review(button=None):
    """REVIEW handler: placeholder, performs no action yet.

    Args:
        button: the widget that fired the event (unused).
    """
    return None
# Toolbar definition: one button per action, wired to its handler by position.
descriptions = ['INIT', 'LOOP', 'SAVE', 'EXPORT', 'REVIEW']
on_click_functions = [on_click_init, on_click_loop, on_click_save, on_click_export,on_click_review]
buttons = [Button() for _ in descriptions]

for idx, button in enumerate(buttons):
    button.description = descriptions[idx]
    button.on_click(on_click_functions[idx])
    # Only INIT (the first button) starts enabled; the other four need an
    # initialised system.
    button.disabled = idx != 0


display(HBox(buttons))


HBox(children=(Button(description='RE-INIT', disabled=True, style=ButtonStyle()), Button(description='LOOP', d…

Calculating new suggestions