In [262]:
import requests
import pandas as pd
from random_word import Wordnik
import time
import math
import numpy as np
import spacy

In [None]:
def generate_search_strings(num_items: int, counter: int):
    '''Generate random words to use as search strings for lexica.

    Words are requested from the Wordnik service in batches of 10, so the
    number of requests is ``num_items`` rounded up to whole batches. A batch
    whose request raises is reported and skipped, which means fewer words than
    requested may be returned.

    Args:
        num_items: Number of search terms wanted (rounded up to a full batch).
        counter: Not used by this function; kept so existing callers still work.

    Returns:
        list: Every word returned by the batches that succeeded.
    '''
    batch_size = 10
    num_calls = math.ceil(num_items / batch_size)
    output = []
    wordnik_service = Wordnik()
    for _ in range(num_calls):
        try:
            # Ask for one batch of random words.
            res = wordnik_service.get_random_words(includePartOfSpeech="noun,verb,adverb", hasDictionaryDef=True, limit=batch_size)
            #TODO: Check if adding a duplicate search term
            output.extend(res)
        except Exception as e:
            # Best effort: report the failed batch and carry on with the next one.
            print(e)
    # Report what was actually collected, not what was asked for.
    print('Generated ', len(output), ' search terms.')
    return output

def _save_counter(counter, path='./counter.txt'):
    '''Write the resume position to disk so a later run can continue from it.'''
    with open(path, 'w') as f:
        f.write(str(counter))


def lexica_search(terms: list, counter: int):
    '''Search lexica for each term and collect the returned prompts.

    Uses lexica's locked-down and rate-limited api. Terms are processed in
    order starting at index ``counter``. The first request that fails (network
    error, non-JSON body, or a response without an ``images`` key) stops the
    run: the current position is written to ``./counter.txt``, the function
    sleeps for 35 seconds and returns what it has gathered so far.

    Args:
        terms: Search terms; spaces are replaced by ``+`` before querying.
        counter: Index of the first term to search.

    Returns:
        tuple: ``(prompts, counter)`` where ``prompts`` is a DataFrame indexed
        by image id with columns ``search_string``, ``source`` and ``prompt``,
        and ``counter`` is the index of the next term still to be searched.
    '''
    search_base = 'https://lexica.art/api/v1/search?q='
    prompts = pd.DataFrame(columns=['search_string', 'source', 'prompt'])
    print('Starting counter is: ', counter)
    for i in range(counter, len(terms)):
        query = terms[i].replace(' ', '+')
        try:
            # A timeout keeps a stalled connection from hanging the whole run;
            # it surfaces as an exception and takes the save-and-return path.
            d = requests.get(url=(search_base + query), timeout=30)
            data = d.json()
            obj = data['images']
        except Exception as e:
            print(e)
            print('Writing counter to file: ', counter)
            # Persist progress before backing off so an interrupt during the
            # sleep does not lose the position.
            _save_counter(counter)
            time.sleep(35)
            return prompts, counter
        for item in obj:
            # Keyed by image id, so an image seen again overwrites its row.
            prompts.loc[item['id']] = [query, item['src'], item['prompt']]
        counter += 1
        print('Commited prompts for term ', counter, ' out of ', len(terms))
        time.sleep(.5)
    _save_counter(counter)
    return prompts, counter

### Creating a database of prompts for training

In [None]:
# Load the search terms (one per line) and the position saved by a previous run.
with open('./1000-most-common.txt') as file:
    lines = file.readlines()
    common = [line.rstrip() for line in lines]
with open('./counter.txt') as file:
    lines = file.readlines()
    counter = int(lines[0])

# lexica_search returns early when a request fails, so keep calling it until
# every term has been consumed. On completion it returns counter == len(common);
# comparing with `<` (instead of `!= len(common) - 1`) lets the loop terminate.
while counter < len(common):
    print("Starting the procedure again with counter: ", counter)
    res_common, counter = lexica_search(terms = common, counter = counter)
    # Each pass is checkpointed to its own file, named after the position reached.
    filename = './prompts-with-common-' + str(counter) + '.json'
    res_common.to_json(filename, orient='split')


#res.to_json('./common-df.json',orient='split')
# NOTE(review): nothing visible in this cell writes './common-df.json' (the
# export above is commented out) — confirm the file exists before running.
res = pd.read_json('./common-df.json', orient='split')
master = pd.read_json('./master-prompts.json', orient='split')

# Stack the previously collected prompts with the common-word results.
full = pd.concat([master,res])
# NOTE(review): this writes CSV to './full-prompts.txt', while the tokenizing
# cell reads './full-prompts.json' — confirm where that JSON file comes from.
full.to_csv('./full-prompts.txt')

In [267]:
# Tokenizing prompts w/spacy
# Reload the combined prompt table from its JSON export (orient='split').
# NOTE(review): the visible cells only write './full-prompts.txt' (CSV) — confirm
# that './full-prompts.json' is produced elsewhere.
full = pd.read_json('./full-prompts.json', orient='split')
# spaCy's small English pipeline; tokenize() below calls it on each prompt.
nlp = spacy.load('en_core_web_sm')
def tokenize(prompt):
    '''Run the module-level ``nlp`` pipeline on a prompt and describe its entities.

    Despite the name, this walks ``doc.ents`` (entity spans), not individual
    tokens. Returns a list with one dict per entity holding its text, its
    character offsets, the entity label under ``pos``, and the placeholder
    fields ``label`` (None) and ``is_weak_label`` (False) that later cells fill in.
    '''
    parsed = nlp(prompt)
    return [
        {
            'token': entity.text,
            'char_start': entity.start_char,
            'char_end': entity.end_char,
            'label': None,
            'is_weak_label': False,
            'pos': entity.label_,
        }
        for entity in parsed.ents
    ]

# Store tokenize()'s list of entity dicts for every prompt in a 'tokens' column.
full['tokens'] = full['prompt'].apply(tokenize)

In [309]:
#Adding GT for the prompts based off of labels from Label Studio
import json

# Read the Label Studio export; the context manager closes the file afterwards.
with open('./gt.json') as f:
    gt_file = json.load(f)
#print('Example entry: ', gt_file[58])

# Keep only the first annotation set of each task, alongside the task's data.
# Tasks without any annotation are skipped instead of raising IndexError.
filtered = []
for x in gt_file:
    if not x.get('annotations'):
        continue
    filtered.append({'annotations': x['annotations'][0]['result'],'data': x['data']})
#print(filtered[58])

# Index the annotated tasks by the id stored under 'Unnamed: 0' so each
# DataFrame row is matched with one dict lookup instead of a scan over all
# tasks. When an id occurs more than once the last task wins, as before.
gt_by_id = {item['data']['Unnamed: 0']: item for item in filtered}

# Build the column in one assignment; rows without a match get None. This
# avoids chained assignment (full['gt_raw'][i] = ...) with a positional
# integer on a label-indexed Series.
full['gt_raw'] = [gt_by_id.get(row_id) for row_id in full.index]

found annotation match
df row:  search_string                                            provokers
source           https://lexica-serve-encoded-images.sharif.wor...
prompt           a special operations member that looks like co...
tokens           [{'token': 'colin farrell', 'char_start': 44, ...
Name: 0158539f-d69b-4328-b26c-0d3e22795c1c, dtype: object
annotations row:  {'annotations': [{'value': {'start': 168, 'end': 192, 'text': 'cornelis van poelenburgh', 'labels': ['Artist']}, 'id': '-PCrj-bI4Z', 'from_name': 'label', 'to_name': 'text', 'type': 'labels', 'origin': 'manual'}, {'value': {'start': 197, 'end': 208, 'text': 'dosso dossi', 'labels': ['Artist']}, 'id': 'y01Q-cujWT', 'from_name': 'label', 'to_name': 'text', 'type': 'labels', 'origin': 'manual'}], 'data': {'Unnamed: 0': '0158539f-d69b-4328-b26c-0d3e22795c1c', 'search_string': 'provokers', 'source': 'https://lexica-serve-encoded-images.sharif.workers.dev/md/0158539f-d69b-4328-b26c-0d3e22795c1c', 'prompt': 'a special opera

In [311]:
#Re-format annotations to work with Spacy
for i in range(full.shape[0]):
    for item in full.iloc[i]['tokens']:
        try:
            annotations = full.iloc[i]['gt_raw']['annotations']
            #print(annotations)
            for gt in annotations:
                l = gt['value']
                #print(l['start'])
                #print(l['end'])
                #print(abs(int(item['char_start']) - int(l['start'])) <= 3)
                #print(item['char_end'])
                if (abs(int(item['char_start']) - int(l['start'])) <= 3) or (abs(int(item['char_end']) - int(l['end'])) <= 3) :
                    #print('Found an artist: ')
                    #print(l['text'])
                    item['label'] = 'artist'
                else:
                    item['label'] = 'other'
        except TypeError as e:
            pass  
            #print('No annotations')
full_labeled = full[full['gt_raw'].notna()]
full = full.drop(['gt_raw'], axis=1)
full_labeled = full_labeled.drop(['gt_raw'], axis=1)

## Weak Labeling with BART Large MNLI

In [188]:
'''Experiment: Can BART serve as a source of weak labeling for the end model?'''
from transformers import pipeline
# Zero-shot classification pipeline using the facebook/bart-large-mnli model.
classifier = pipeline("zero-shot-classification",
                      model="facebook/bart-large-mnli")

In [199]:
# Try the zero-shot classifier on the entities of one sample row (position 222).
temp = full.iloc[222]['tokens']
print(temp)
labels = ['artist', 'other']
threshold = 0.80
for item in temp:
    # Only entities that are still unlabeled and tagged PERSON are candidates.
    if item['label'] is None and item['pos'] == 'PERSON':
        res = classifier(item['token'], labels)
        # The pipeline orders labels/scores by likelihood, so position 0 is the
        # winning label, which may be 'other'. Look up the 'artist' score explicitly.
        artist_score = res['scores'][res['labels'].index('artist')]
        print(res['sequence'],' : ', artist_score)
        if artist_score > threshold:
            item['label'] = 'artist'
            item['is_weak_label'] = True
            #print(item)
            # The earlier attempt to append the item to df['gt_raw'][222] was
            # removed: `df` is not defined in this notebook, the 'gt_raw' column
            # has been dropped from `full`, and list.append returns None.

#Omitting adding these weak labels to the training set for now, since the model was able to get strong scores to start.
'''for i in range(full.shape[0]):
    temp = full.iloc[i]['tokens']
    for item in temp:
        if item['label'] is None and item['pos'] == 'PERSON':
            res = classifier(item['token'], labels)
            artist_score = res['scores'][res['labels'].index('artist')]
            print(res['sequence'],' : ', artist_score)
            if (artist_score > threshold):
                item['label'] = 'artist'
                item['is_weak_label'] = True'''

#full.to_json('./full-checkpoint.json', orient = 'split')

[{'token': 'james jean', 'char_start': 162, 'char_end': 172, 'label': 'artist', 'is_weak_label': True, 'pos': 'PERSON'}]


In [337]:
# Convert dataframe to Docs for training
import spacy
from spacy.tokens import DocBin


def _to_spans(tokens):
    '''Turn one row's entity dicts into (char_start, char_end, label) tuples.

    Entities that never received a label are marked 'other'; the dict itself
    is updated, as the original loop did.
    '''
    spans = []
    for item in tokens:
        if item['label'] is None:
            item['label'] = 'other'
        spans.append((item['char_start'], item['char_end'], item['label']))
    return spans


#take full_labeled and parse out all the tokens and assign labels
# assign() builds the column on a new frame. The previous code used chained
# assignment with a positional integer (full_labeled['token_convert'][i] = ...)
# on a label-indexed, filtered frame, which newer pandas rejects or ignores.
full_labeled = full_labeled.assign(token_convert=full_labeled['tokens'].apply(_to_spans))
#print(full_labeled['token_convert'][0])

# One (prompt text, entity spans) pair per labeled row.
training_data = list(zip(full_labeled['prompt'], full_labeled['token_convert']))
#print(training_data[-10:-5])

[('data science ', []), ('extreme long shot view of a face in agony coming out of the ground by kentaro miura, hyper-detailed', [(70, 83, 'artist')]), ('lies we tell ourselves', []), ('Taco Bell if society was perfect.', [(0, 9, 'other')]), ('why do we die?', [])]


In [339]:
# Blank English pipeline: only its tokenizer is needed to build the Docs.
nlp = spacy.blank("en")
# the DocBin will store the example documents
db = DocBin()
for text, annotations in training_data:
    doc = nlp(text)
    # Map every (start, end, label) triple to a span on this doc.
    candidate_spans = [
        doc.char_span(start, end, label=label)
        for start, end, label in annotations
    ]
    # char_span returned None for some triples; those are left out.
    ents = [span for span in candidate_spans if span is not None]
    doc.ents = ents
    db.add(doc)
# Serialize all collected docs to disk.
db.to_disk("./train.spacy")