In [76]:
################################################################################
#                                    IMPORTS                                   #                                        
################################################################################
import os
import numpy as np
import datetime
from whoosh import index
from whoosh.qparser import MultifieldParser
from bs4 import BeautifulSoup
import pandas as pd
from IPython.display import clear_output
from sklearn.naive_bayes import MultinomialNB
import spacy
import myversions.pigeonXT as pixt
from lxml import etree
import pickle
from sklearn.model_selection import cross_validate
from IPython.core.display import display, HTML
import re
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
# display(HTML('<h1>Hello, world!</h1>'))
# from ipywidgets import Output

from sklearn.model_selection import train_test_split
# from sklearn.metrics import f1, precision, recall, accuracy
################################################################################
#                                       AUX                                    #                                        
################################################################################

def info(str_):
    """Print ``str_`` as a timestamped log line carrying an ANSI-coloured INFO tag."""
    timestamp = datetime.datetime.now()
    tag = '[ \033[1;94mINFO\x1b[0m  ]'
    print(f'{timestamp} {tag} {str_}')
def ok(str_):
    """Print ``str_`` as a timestamped log line carrying an ANSI-coloured OK tag."""
    timestamp = datetime.datetime.now()
    tag = '[  \033[1;92mOK\x1b[0m   ]'
    print(f'{timestamp} {tag} {str_}')
def warning(str_):
    """Print ``str_`` as a timestamped log line carrying an ANSI-coloured WARNING tag."""
    timestamp = datetime.datetime.now()
    tag = '[\x1b[1;31mWARNING\x1b[0m]'
    print(f'{timestamp} {tag} {str_}')
def html(str_=''):
    """Render ``str_`` as HTML in the notebook output area (empty string by default)."""
    rendered = HTML(str_)
    display(rendered)
# info('Starting Script...')

## ## ##

################################################################################
#                                   DataItem                                   #                                        
################################################################################
class DataItem(object):
    """One newspaper article, identified by its XML file name and its source.

    Instance attributes set by ``__init__``:
      * ``file_id``    -- file name of the article (e.g. ``'1140932582.xml'``).
      * ``source``     -- ``'Toronto Star'`` selects ``TS_data_path``; any other
                          value selects ``GM_data_path``.
      * ``label``      -- annotation label, ``'Undefined'`` until assigned.
      * ``vector``     -- cache filled by ``get_vector()``; ``None`` until then.
      * ``bow_vector`` -- cache filled by ``get_bow_vector()``; ``None`` until then.
      * ``prediction`` -- slot for a model prediction, ``None`` until assigned.

    Class attributes hold the shared tokenizer, the vocabulary (one word per
    line of ``vocab/vocab_filtered.txt``) and the data/vector directories.
    """

    nlp = spacy.load('en_core_web_sm', disable=['tagger','parser','lemmatizer','textcat', 'ner'])
    # Read the vocabulary with a context manager so the file handle is closed.
    with open('vocab/vocab_filtered.txt', 'r') as _vocab_file:
        vocab = _vocab_file.read().splitlines()
    del _vocab_file  # do not leave the closed handle behind as a class attribute
    word2idx = {word: idx for idx, word in enumerate(vocab)}
    tokenizer = nlp.tokenizer
    TS_data_path = '/home/ec2-user/SageMaker/data/Toronto_Star_Publication_with_query/'
    GM_data_path = '/home/ec2-user/SageMaker/data/The_Globe_and_Mail_with_DP_filter_by_article_type/'
    vectors_path = '/home/ec2-user/SageMaker/mariano/notebooks/03. High Recall Retrieval System/vectors/'

    def __init__(self, file_id, source):
        """Create an unlabeled item with empty vector caches."""
        self.file_id = file_id
        self.source = source
        self.label = 'Undefined'
        self.vector = None
        self.bow_vector = None
        self.prediction = None

    def __eq__(self, other):
        """Two items are equal when both ``file_id`` and ``source`` match."""
        if not isinstance(other, DataItem):
            return NotImplemented
        return self.file_id == other.file_id and self.source == other.source

    def __hash__(self):
        """Hash on the same key as ``__eq__`` so items stay usable in sets/dicts."""
        return hash((self.file_id, self.source))

    def __str__(self):
        return f'{self.file_id},{self.source}'

    def filename(self):
        """Return the full path of the article's XML file for its source."""
        if self.source == 'Toronto Star':
            return DataItem.TS_data_path + self.file_id
        return DataItem.GM_data_path + self.file_id

    def _artifact_path(self, extension):
        """Return ``vectors_path`` + file id without its last 4 characters + ``extension``."""
        # NOTE(review): [:-4] presumably strips a '.xml' suffix -- confirm all ids end in it.
        return DataItem.vectors_path + self.file_id[:-4] + extension

    def get_vector(self):
        """Return the pickled ``.vector`` for this item, loading it on first use.

        This replaces the former ``vector()`` method, which could never be
        called on an instance because ``__init__`` assigns the attribute
        ``self.vector`` (the attribute shadows the method).

        Raises:
            OSError: if the ``.vector`` file cannot be opened.
        """
        if self.vector is None:
            # SECURITY: pickle.load can execute arbitrary code; only load
            # vector files produced by this project.
            with open(self._artifact_path('.vector'), 'rb') as handle:
                self.vector = pickle.load(handle)
        return self.vector

    def get_bow_vector(self):
        """Return the bag-of-words count vector, loading or computing it once.

        A pickled ``.bow_vector`` file is used when it exists; otherwise the
        counts are computed from the article's title and text.

        Returns:
            numpy array of length ``len(vocab) + 1``; the last slot counts
            tokens that are not in the vocabulary.
        """
        if self.bow_vector is None:
            bow_vector_file = self._artifact_path('.bow_vector')
            if os.path.isfile(bow_vector_file):
                # SECURITY: see get_vector() -- trusted files only.
                with open(bow_vector_file, 'rb') as handle:
                    self.bow_vector = pickle.load(handle)
            else:
                self.bow_vector = self._compute_bow_vector()
        return self.bow_vector

    def _compute_bow_vector(self):
        """Count vocabulary hits over the lower-cased tokens of title + text."""
        unknown_idx = len(DataItem.vocab)
        counts = np.zeros(shape=(unknown_idx + 1,))
        # The original code used `text`/`title` here without ever defining them.
        text, title = self._title_and_text()
        if text is not None and title is not None:
            for token in DataItem.tokenizer(title + ' ' + text):
                word = token.text.replace('\n', '').lower()
                counts[DataItem.word2idx.get(word, unknown_idx)] += 1
        return counts

    def _parse_root(self):
        """Parse the article's XML file and return its root element."""
        return etree.parse(self.filename()).getroot()

    @staticmethod
    def _element_text(root, path):
        """Return ``.text`` of the first element matching ``path``, or None if absent."""
        node = root.find(path)
        return node.text if node is not None else None

    @staticmethod
    def _body_markup(root):
        """Return the text of ``HiddenText`` if present, else of ``Text``, else None."""
        for path in ('.//HiddenText', './/Text'):
            node = root.find(path)
            if node is not None:
                return node.text
        return None

    def _title_and_text(self):
        """Return ``(text, title)``; text has its markup stripped, either may be None."""
        root = self._parse_root()
        text = DataItem._body_markup(root)
        title = DataItem._element_text(root, './/Title')
        if text is not None:
            # The parser is selected with `features`; `parser=` is not that argument.
            text = BeautifulSoup(text, features='html.parser').get_text()
        return text, title

    def get_docview_html(self, keywords=None):
        """Build the HTML snippet shown to the annotator for this article.

        Args:
            keywords: optional iterable of strings. Each non-empty one is
                wrapped in ``<mark>`` wherever it occurs in the body
                (case-insensitive, matched literally).

        Returns:
            HTML string with title, date, publisher, ProQuest URL and the
            body as stored in the XML. Missing fields render as ``'None'``.
        """
        root = self._parse_root()
        text = DataItem._body_markup(root)
        title = DataItem._element_text(root, './/Title')
        date = DataItem._element_text(root, './/NumericDate')
        publisher = DataItem._element_text(root, './/PublisherName')
        if text is not None:
            for keyword in keywords or ():
                if not keyword:
                    # An empty pattern matches at every position.
                    continue
                # Escape so query syntax such as '(' or '*' cannot break the regex.
                text = re.sub(f'({re.escape(keyword)})', r'<mark>\1</mark>', text, flags=re.IGNORECASE)
        url = f'https://proquest.com/docview/{self.file_id[:-4]}'
        url = f'<a href="{url}">{url}</a>'
        return  '<html><hr style="border-color:black">'\
                '<u>TITLE</u>: &emsp;&emsp;{}<br>'\
                '<u>DATE</u>: &emsp;&emsp;{}<br>'\
                '<u>PUBLISHER</u>: &emsp;{}<br>'\
                '<u>URL</u>:&emsp;&emsp;&emsp;{}<hr>'\
                '{}<hr style="border-color:black"></html>'.format(
                    str(title),
                    date,
                    publisher,
                    url,
                    str(text))


################################################################################
#                                    SYSTEM                                    #                                        
################################################################################
class HRSystem():
    """Interactive retrieval loop: search, annotate a batch, train, re-rank.

    Typical use: ``setup()`` once, annotate the displayed batch, then call
    ``loop()`` after each annotated batch.

    State kept on the instance:
      * ``unlabeled_data`` / ``labeled_data`` / ``for_labeling`` -- lists of DataItem.
      * ``annotations`` -- object returned by ``pixt.annotate`` for the current batch.
      * ``model``       -- MultinomialNB classifier trained on ``labeled_data``.
      * ``metrics``     -- one dict per ``loop()`` call.
      * ``query``       -- last search string entered by the user.
      * ``X`` / ``y``   -- training matrix and 0/1 targets of the last training pass.
    """

    annotation_batch_size = 10
    # Number of unlabeled items vectorised and scored at once.
    prediction_batch_size = 20000
    metric_names = ['fit_time', 'test_accuracy', 'test_precision', 'test_recall', 'test_f1']
    index_path = '/home/ec2-user/SageMaker/mariano/notebooks/03. High Recall Retrieval System/index'

    def __init__(self):
        """Open the Whoosh index at ``index_path`` and reset the iteration state."""
        self.iteration_no = 0
        self.metrics = []
        self.annotation_batch_size = 10
        info('Loading index...')
        self.ix = index.open_dir(HRSystem.index_path)
        info(f'Index Loaded ({self.ix.doc_count():,} files found).')

    def _search(self, limit=10):
        """Prompt for a query until it yields hits; return ``[(file_id, source), ...]``.

        The accepted query string is stored in ``self.query``.

        Args:
            limit: maximum number of hits passed to the searcher (None = no limit).
        """
        parser = MultifieldParser(['title','body'], schema=self.ix.schema)
        query = input("Enter Search Terms: ")
        # Context manager closes the searcher even if parsing/searching raises.
        with self.ix.searcher() as searcher:
            while True:
                results = searcher.search(parser.parse(query), limit=limit)
                hits = [(hit['file_id'], hit['source']) for hit in results]
                if hits:
                    break
                query = input("No hits found. Enter New Search Terms: ")
        self.query = query
        return hits

    def _annotate(self):
        """Show the items in ``for_labeling`` in the annotation widget."""
        # split() without an argument drops the empty strings that repeated
        # spaces would produce; an empty keyword would match everywhere.
        keywords = self.query.split()
        self.annotations = pixt.annotate(
            [data_item.get_docview_html(keywords=keywords) for data_item in self.for_labeling],
            options = ['Relevant', 'Irrelevant'],
            stop_at_last_example=False,
            display_fn=html,
        )

    def _report_sizes(self):
        """Log the sizes of the three item pools."""
        info(f'Unlabeled data:         {len(self.unlabeled_data):8,}')
        info(f'Labeled data:           {len(self.labeled_data):8}')
        info(f'Just labeled:           {len(self.for_labeling):8}')

    def setup(self):
        """Run the initial search and present the first annotation batch.

        The first ``annotation_batch_size`` hits go to ``for_labeling``; the
        remaining hits become ``unlabeled_data``.
        """
        results = self._search(limit=None)
        info(f'Search completed, number of hits: {len(results):,}.')
        first_batch = self.annotation_batch_size
        self.for_labeling = [DataItem(file_id, source) for file_id, source in results[:first_batch]]
        self.unlabeled_data = [DataItem(file_id, source) for file_id, source in results[first_batch:]]
        self.labeled_data = []
        self.model = MultinomialNB()
        self._annotate()
        self._report_sizes()

    def loop(self):
        """Record the last batch's labels, pick the next batch, and display it.

        With both classes present among the labeled items the model is
        retrained and the unlabeled items with the highest 'Relevant'
        probability are selected; otherwise a new search supplies the batch.
        """
        self.iteration_no += 1
        metrics = {}
        self.metrics.append(metrics)

        for item, label in zip(self.for_labeling, self.annotations['label']):
            item.label = label
            self.labeled_data.append(item)
        self.for_labeling = []

        # Anything not labeled 'Relevant' counts as class 0, as in training.
        y = np.array([1 if item.label == 'Relevant' else 0 for item in self.labeled_data])
        if len(np.unique(y)) < 2:
            # Labels are strings, so the original `label == 0` test was never
            # true and always reported missing negatives.
            if not y.any():
                warning('Positive examples missing to build a predictive model')
            else:
                warning('Negative examples missing to build a predictive model')
            warning('Using results from search engine to retrieve examples for labeling...')
            self._refill_from_search()
        else:
            self._refill_from_model(y, metrics)

        self._annotate()
        self._report_sizes()

    def _refill_from_search(self):
        """Fill ``for_labeling`` from a fresh search, skipping already-labeled items.

        New hits that are neither labeled nor queued are added to
        ``unlabeled_data``; queued items are removed from it.
        """
        results = self._search(limit=None)
        seen = {(item.file_id, item.source) for item in self.labeled_data}
        queued = set()
        for key in results:
            if len(self.for_labeling) >= self.annotation_batch_size:
                break
            if key not in seen and key not in queued:
                self.for_labeling.append(DataItem(*key))
                queued.add(key)

        known = {(item.file_id, item.source) for item in self.unlabeled_data}
        for key in results:
            if key not in seen and key not in known and key not in queued:
                self.unlabeled_data.append(DataItem(*key))
                known.add(key)

        # One pass instead of repeated `del` inside a while loop.
        self.unlabeled_data[:] = [
            item for item in self.unlabeled_data
            if (item.file_id, item.source) not in queued
        ]

    def _refill_from_model(self, y, metrics):
        """Retrain on all labeled items and queue the top-ranked unlabeled ones.

        Args:
            y: 0/1 target array aligned with ``self.labeled_data``.
            metrics: this iteration's dict; receives ``'remaining_positives'``.
        """
        info('Using predictive model to search example for labeling')
        X = np.vstack([item.get_bow_vector() for item in self.labeled_data])
        self.X = X
        self.y = y
        info('Training model on latest data...')
        # X holds every labeled item, so refit from scratch: partial_fit would
        # add the earlier examples to the model's counts again each iteration.
        self.model.fit(X, y)

        self.status()

        info('Computing predictions using newly trained model...')
        scores = self._score_unlabeled()
        no_of_positives = int(np.count_nonzero(scores > 0.5))
        info(f'Number of unlabeled instances predicted as \'postive\': {no_of_positives:,} ' )
        metrics['remaining_positives'] = no_of_positives

        info('Sorting elements by relevance...')
        # Rank by probability, not by the hard 0/1 label; the stable sort
        # keeps the existing list order among ties.
        top = np.argsort(-scores, kind='stable')[:self.annotation_batch_size]
        self.for_labeling.extend(self.unlabeled_data[int(pos)] for pos in top)
        chosen = {int(pos) for pos in top}
        self.unlabeled_data[:] = [
            item for pos, item in enumerate(self.unlabeled_data) if pos not in chosen
        ]

    def _score_unlabeled(self):
        """Return P(class 1) for every unlabeled item, computed in batches.

        Also stores the hard 0/1 prediction on each item's ``prediction``.
        """
        total = len(self.unlabeled_data)
        scores = np.zeros(total)
        positive_col = list(self.model.classes_).index(1)
        step = HRSystem.prediction_batch_size
        # range(0, total, step) never yields an empty batch, unlike
        # int(1 + total/step) which did when total was a multiple of step.
        for start in range(0, total, step):
            batch = self.unlabeled_data[start:start + step]
            X_batch = np.vstack([item.get_bow_vector() for item in batch])
            proba = self.model.predict_proba(X_batch)[:, positive_col]
            scores[start:start + len(batch)] = proba
            for item, probability in zip(batch, proba):
                item.prediction = int(probability > 0.5)
        return scores

    def status(self):
        """Print 3-fold cross-validation scores for the current training data.

        Scores are stored in this iteration's metrics dict under
        ``metric_names``. When either class has fewer than 3 examples the
        scores are skipped and stored as None (shown as NaN).
        """
        metrics = self.metrics[self.iteration_no-1]
        X = self.X
        y = np.asarray(self.y)
        info('Model trained. Showing status of model...')
        negatives = int(np.sum(y == 0))
        positives = int(np.sum(y == 1))
        least_populated_class_count = min(negatives, positives)
        if least_populated_class_count >= 3:
            scores = cross_validate(MultinomialNB(),X,y,cv=3, scoring=['accuracy', 'precision', 'recall', 'f1'])
            for metric in HRSystem.metric_names:
                metrics[metric] = scores[metric]
        else:
            warning(f'Not computing scores due to lack of exmpales.')
            warning(f'Least populated_class_count: {least_populated_class_count}')
            warning(f'min({negatives},'\
                    f'{positives})='\
                    f'{least_populated_class_count}')
            for metric in HRSystem.metric_names:
                metrics[metric] = None

        fold_labels = ['fold 1', 'fold 2', 'fold 3']
        metrics_df = pd.DataFrame(np.nan,
                                  index=fold_labels + ['average'],
                                  columns=HRSystem.metric_names)
        # Fill by column name rather than by dict position.
        for metric in HRSystem.metric_names:
            if metrics[metric] is not None:
                metrics_df.loc[fold_labels, metric] = metrics[metric]
        # Average AFTER filling; the original averaged the all-zero table.
        metrics_df.loc['average'] = metrics_df.loc[fold_labels].mean(axis=0)
        print(metrics_df)

    def export(self):
        """Placeholder; not implemented."""
        pass
# Open the search index, then run the initial search (setup() prompts for the
# query via input()) and display the first annotation batch.
system = HRSystem()
system.setup()

2022-02-10 21:41:33.348775 [ [1;94mINFO[0m  ] Loading index...
2022-02-10 21:41:36.147059 [ [1;94mINFO[0m  ] Index Loaded (192,427 files found).
Enter Search Terms: dp OR displace
2022-02-10 21:41:47.267877 [ [1;94mINFO[0m  ] Search completed, number of hits: 46,805.


HTML(value='0 of 10 Examples annotated.')

VBox(children=(HBox(children=(Button(description='Relevant', style=ButtonStyle()), Button(description='Irrelev…

Output()

2022-02-10 21:41:47.702138 [ [1;94mINFO[0m  ] Unlabeled data:           46,795
2022-02-10 21:41:47.702247 [ [1;94mINFO[0m  ] Labeled data:                  0
2022-02-10 21:41:47.702264 [ [1;94mINFO[0m  ] Just labeled:                 10


In [65]:
# Tally the labeled items per annotation: prints the 'Irrelevant' count, then the 'Relevant' count.
for wanted_label in ('Irrelevant', 'Relevant'):
    print(sum(1 for item in system.labeled_data if item.label == wanted_label))

1
9


In [74]:
# Run one iteration: HRSystem.loop() records the labels of the batch just
# annotated, then selects and displays the next batch.
system.loop()

2022-02-10 21:33:52.955773 [ [1;94mINFO[0m  ] Using predictive model to search example for labeling
2022-02-10 21:33:52.979175 [ [1;94mINFO[0m  ] Training model on latest data...
2022-02-10 21:33:52.981436 [ [1;94mINFO[0m  ] Model trained. Showing status of model...
         fit_time  test_accuracy  test_precision  test_recall  test_f1
fold 1        NaN            NaN             NaN          NaN      NaN
fold 2        NaN            NaN             NaN          NaN      NaN
fold 3        NaN            NaN             NaN          NaN      NaN
average       NaN            NaN             NaN          NaN      NaN
2022-02-10 21:33:53.002182 [ [1;94mINFO[0m  ] Computing predictions using newly trained model...
2022-02-10 21:34:38.056813 [ [1;94mINFO[0m  ] Number of unlabeled instances predicted as 'postive': 46,597 
2022-02-10 21:34:38.057042 [ [1;94mINFO[0m  ] Sorting elements by relevance...


HTML(value='0 of 10 Examples annotated.')

VBox(children=(HBox(children=(Button(description='Relevant', style=ButtonStyle()), Button(description='Irrelev…

Output()

2022-02-10 21:34:38.213406 [ [1;94mINFO[0m  ] Unlabeled data:           46,587
2022-02-10 21:34:38.213486 [ [1;94mINFO[0m  ] Labeled data:                 20
2022-02-10 21:34:38.213521 [ [1;94mINFO[0m  ] Just labeled:                 10


In [7]:
# Training matrix and targets that HRSystem.loop() stored on the instance
# during its last model-training pass.
X = system.X
y = system.y

In [9]:
# Scratch check of the keys cross_validate returns for this scoring list
# (HRSystem.status() drops 'score_time' and keeps the others).
scores = cross_validate(MultinomialNB(),X,y,cv=3, scoring=['accuracy', 'precision', 'recall', 'f1'])
scores.keys()

dict_keys(['fit_time', 'score_time', 'test_accuracy', 'test_precision', 'test_recall', 'test_f1'])

In [26]:


    
# NOTE(review): this cell calls a module-level loop(...) and bare variables that
# no visible cell defines; in the visible code this logic lives in
# HRSystem.loop(). Presumably left over from an earlier version of the
# notebook -- confirm before re-running.
unlabeled_data, labeled_data, for_labeling, annotations, model = loop(
                                                                      unlabeled_data,
                                                                      labeled_data, 
                                                                      for_labeling, 
                                                                      annotations, 
                                                                      model
                                                                     )

2022-02-08 16:14:18.184120 [ INFO  ] Using predictive model to search example for labeling
2022-02-08 16:14:18.278396 [ INFO  ] Score average: 0.750
2022-02-08 16:14:18.278565 [ INFO  ] BATCH INFO: from        0 to   20,000
2022-02-08 16:14:20.011948 [ INFO  ] BATCH INFO: from   20,000 to   40,000
2022-02-08 16:14:21.202807 [ INFO  ] BATCH INFO: from   40,000 to   46,537


HTML(value='0 of 10 Examples annotated.')

HBox(children=(Button(description='Relevant', style=ButtonStyle()), Button(description='Irrelevant', style=But…

Output()

2022-02-08 16:14:21.777174 [ INFO  ] Unlabeled data:           46,527
2022-02-08 16:14:21.777237 [ INFO  ] Labeled data:                 80
2022-02-08 16:14:21.777271 [ INFO  ] Just labeled:                 10
Annotation done.


In [None]:
def status()

In [2]:
# NOTE(review): `for_l` is not defined in any visible cell. If it is meant to
# be a DataItem, `bow_vector` is a cached attribute there and the accessor
# method is get_bow_vector().
for_l.bow_vector()

NameError: name 'for_labeling' is not defined

In [12]:
# Element-wise check of which annotation rows are labeled 'Relevant'.
# NOTE(review): relies on a global `annotations` that no visible cell defines.
annotations['label']=='Relevant'

0     True
1    False
2     True
3     True
4    False
5     True
6    False
7     True
8    False
9     True
Name: label, dtype: bool

In [3]:
# NOTE(review): scratch inspection of a global `annotations` that no visible
# cell defines; the saved output shows (file id, label) pairs.
list(annotations)

[('1140932582.xml', 'Relevant'),
 ('1444798819.xml', 'Relevant'),
 ('1412331290.xml', 'Relevant'),
 ('1237819981.xml', 'Relevant'),
 ('1400851797.xml', 'Relevant'),
 ('1151322272.xml', 'Relevant'),
 ('1237696430.xml', 'Relevant'),
 ('1444894211.xml', 'Relevant'),
 ('1146079007.xml', 'Relevant'),
 ('1151111154.xml', 'Relevant')]

In [11]:
# NOTE(review): scratch lookup on a global `for_labeling` that no visible cell
# defines (HRSystem keeps its pending batch in self.for_labeling).
for_labeling['file_id']

'1140932582.xml'

In [8]:
# Minimal pigeonXT demo: annotate two example strings with two label options.
# NOTE(review): this imports `pigeonXT`, whereas the first cell imports
# `myversions.pigeonXT` under the same alias `pixt`; running this cell
# rebinds `pixt`.
import pigeonXT as pixt
a = pixt.annotate(
    [str('<html>Hello world<br/></html>'), 'Hello world2'],
    options=['programming', 'not programming']
)

# Display the annotation table returned by annotate().
a

HTML(value='0 of 2 Examples annotated.')

HBox(children=(Button(description='programming', style=ButtonStyle()), Button(description='not programming', s…

Output()

Unnamed: 0,example,changed,label
0,<html>Hello world<br/></html>,False,
1,Hello world2,False,


Annotation done.


In [9]:
# Re-display the annotation table `a`; the saved output shows both rows labeled.
a

Unnamed: 0,example,changed,label
0,<html>Hello world<br/></html>,True,programming
1,Hello world2,True,programming


In [29]:
# NOTE(review): `jupyter lab` is a shell command, not Python, which is why this
# cell produced the SyntaxError recorded below; run it from a terminal instead.
jupyter lab

SyntaxError: invalid syntax (2598950515.py, line 1)