In [1]:
import requests
import pandas as pd
from random_word import Wordnik
import time
import math
import numpy as np
np.set_printoptions(edgeitems=25, linewidth=100000)

In [None]:
def generate_search_strings(num_items: int, counter: int):
    """Collect random words from Wordnik to use as search terms.

    Words are requested in batches of ``batch_size``; enough batches are
    requested to cover ``num_items``, so the result can hold more than
    ``num_items`` words (up to the next multiple of the batch size), or fewer
    when a request fails or returns words that were already collected.

    Args:
        num_items: Number of search terms wanted (rounded up to whole batches).
        counter: Not used by this function; kept so existing calls still work.

    Returns:
        list: Unique words, in the order they were first received.
    """
    batch_size = 10
    num_calls = math.ceil(num_items / batch_size)
    output = []
    seen = set()  # words already collected, so duplicates are skipped
    wordnik_service = Wordnik()
    for _ in range(num_calls):
        try:
            # Ask for one batch of random nouns/verbs/adverbs that have a dictionary definition
            res = wordnik_service.get_random_words(includePartOfSpeech="noun,verb,adverb", hasDictionaryDef=True, limit=batch_size)
        except Exception as e:
            # Best effort: a failed batch is reported and skipped
            print(e)
            continue
        if not res:
            print('Wordnik returned no words for this batch.')
            continue
        for word in res:
            if word not in seen:
                seen.add(word)
                output.append(word)
    # Report what was actually collected, not what was requested
    print('Generated ', len(output), ' search terms.')
    return output

def _save_counter(counter, path='./counter.txt'):
    """Write the resume position to disk as text so a later run can pick it up."""
    with open(path, 'w') as f:
        f.write(str(counter))


def lexica_search(terms: list, counter: int):
    """Query the Lexica search API for each term and collect the returned prompts.

    Processing starts at ``terms[counter]``. After every successful term the
    counter is advanced by one. If a request or its JSON decoding fails, the
    current counter is written to ``./counter.txt``, the function sleeps for
    35 seconds and returns what has been collected so far, so the caller can
    resume from the same term.

    Args:
        terms: Search terms; spaces are replaced with ``+`` before querying.
        counter: Index of the first term to process.

    Returns:
        tuple: ``(prompts, counter)`` where ``prompts`` is a DataFrame indexed
        by image id with columns ``search_string``, ``source`` and ``prompt``,
        and ``counter`` is the index of the next term still to be processed.
    """
    search_base = 'https://lexica.art/api/v1/search?q='
    prompts = pd.DataFrame(columns=['search_string', 'source', 'prompt'])
    print('Starting counter is: ', counter)
    for i in range(counter, len(terms)):
        query = terms[i].replace(' ', '+')
        try:
            # A timeout keeps a stalled connection from hanging the run forever;
            # it surfaces as an exception and is handled like any other failure.
            d = requests.get(url=(search_base + query), timeout=30)
            data = d.json()
            obj = data['images']
        except Exception as e:
            print(e)
            print('Writing counter to file: ', counter)
            # Persist before backing off so an interrupt during the sleep loses nothing
            _save_counter(counter)
            time.sleep(35)
            return prompts, counter
        for item in obj:
            # Keyed by image id, so an image returned for several terms is stored once
            prompts.loc[item['id']] = [query, item['src'], item['prompt']]
        counter += 1
        print('Commited prompts for term ', counter, ' out of ', len(terms))
        time.sleep(.5)
    # The counter is an int and must be converted before writing
    _save_counter(counter)
    return prompts, counter



In [None]:
# Load the search vocabulary: one term per line.
with open('./1000-most-common.txt') as file:
    common = [line.rstrip() for line in file]

# Resume from the position saved by a previous run; start at 0 on a fresh run.
try:
    with open('./counter.txt') as file:
        saved = file.read().strip()
    counter = int(saved) if saved else 0
except FileNotFoundError:
    counter = 0

# lexica_search returns the index of the next unprocessed term, so the work is
# done once counter reaches len(common). (Comparing against len(common) - 1
# would never be satisfied after a complete pass.)
while counter < len(common):
    print("Starting the procedure again with counter: ", counter)
    res_common, counter = lexica_search(terms = common, counter = counter)
    if res_common.empty:
        # Nothing was fetched, so the counter did not move; writing now would
        # replace the previous batch's file (same counter in its name) with an
        # empty table.
        continue
    filename = './prompts-with-common-' + str(counter) + '.json'
    res_common.to_json(filename, orient='split')


In [None]:
# The two string literals below are disabled code, kept for reference: they
# rebuild `res` by concatenating every 'prompts-with-common*' JSON file found in
# the working directory.
'''import re
import sys
import os'''

'''dir_list = os.listdir(os.getcwd())
search_term = 'prompts-with-common'
frames = []
for item in dir_list:
    if re.search(search_term,item):
        temp_df = pd.read_json(item, orient='split')
        frames.append(temp_df)
res = pd.concat(frames)'''

#res.to_json('./common-df.json',orient='split')
# Load the previously merged common-word results and the master prompt table.
res = pd.read_json('./common-df.json', orient='split')
master = pd.read_json('./master-prompts.json', orient='split')

# Stack both tables into one frame.
full = pd.concat([master,res])
print('Combined prompt table shape: ', full.shape)
full.to_csv('./full-prompts.txt')
# The tokenizing cell reloads './full-prompts.json' (orient='split'), so write
# that file here as well; otherwise a fresh run has nothing to read.
full.to_json('./full-prompts.json', orient='split')


In [55]:
#Tokenizing prompts
import spacy
import pandas as pd
# spaCy English pipeline used by tokenize() for named-entity recognition.
nlp = spacy.load('en_core_web_sm')
# Merged prompt table, restored from its 'split'-oriented JSON file.
full = pd.read_json(path_or_buf='./full-prompts.json', orient='split')
def tokenize(prompt):
    """Run the spaCy pipeline on one prompt and describe each named entity.

    Returns a list with one dict per entity found by ``nlp``: the entity text
    (``token``), its character span (``char_start``/``char_end``), an empty
    ``label`` with ``is_weak_label`` set to False, and the spaCy entity label
    stored under ``pos``.
    """
    return [
        {
            'token': entity.text,
            'char_start': entity.start_char,
            'char_end': entity.end_char,
            'label': None,
            'is_weak_label': False,
            'pos': entity.label_,
        }
        for entity in nlp(prompt).ents
    ]

# Tokenize every prompt, store the entity lists in a new column, then preview.
prompt_entities = full['prompt'].apply(tokenize)
full['tokens'] = prompt_entities

print(full.head())

                                     search_string  \
0158539f-d69b-4328-b26c-0d3e22795c1c     provokers   
05fee663-7db8-4119-a455-e760bdefd759     provokers   
24b2541b-3bfa-4d7e-983c-680667e0823a     provokers   
3d62e1aa-faa4-4a41-9871-855451deb310     provokers   
41073b1a-8bc3-417e-88e7-633f4123f404     provokers   

                                                                                 source  \
0158539f-d69b-4328-b26c-0d3e22795c1c  https://lexica-serve-encoded-images.sharif.wor...   
05fee663-7db8-4119-a455-e760bdefd759  https://lexica-serve-encoded-images.sharif.wor...   
24b2541b-3bfa-4d7e-983c-680667e0823a  https://lexica-serve-encoded-images.sharif.wor...   
3d62e1aa-faa4-4a41-9871-855451deb310  https://lexica-serve-encoded-images.sharif.wor...   
41073b1a-8bc3-417e-88e7-633f4123f404  https://lexica-serve-encoded-images.sharif.wor...   

                                                                                 prompt  \
0158539f-d69b-4328-b26c-0d3e22795c1c 

In [57]:
#Adding GT for the prompts based off of labels from label studio
import json

# Read the exported annotations; the `with` block closes the file afterwards.
with open('./gt.json') as f:
    gt_file = json.load(f)
#print('Example entry: ', gt_file[58])

# Keep only the first annotation's results plus the task data for each entry.
# Entries with no annotations are skipped instead of raising IndexError.
filtered = [
    {'annotations': x['annotations'][0]['result'], 'data': x['data']}
    for x in gt_file
    if x['annotations']
]
#print(filtered[58])


# Sanity check: find the annotation entry whose 'Unnamed: 0' value equals the
# index label of the first row of `full`, and print both side by side.
test = full.index[0]
for item in filtered:
    if item['data']['Unnamed: 0'] == test:
        print('found annoation match')
        print('df row: ', full.loc[test])
        print('annotations row: ', item)


found annoation match
df row:  search_string                                            provokers
source           https://lexica-serve-encoded-images.sharif.wor...
prompt           a special operations member that looks like co...
tokens           [{'token': 'colin farrell', 'char_start': 44, ...
Name: 0158539f-d69b-4328-b26c-0d3e22795c1c, dtype: object
annotations row:  {'annotations': [{'value': {'start': 168, 'end': 192, 'text': 'cornelis van poelenburgh', 'labels': ['Artist']}, 'id': '-PCrj-bI4Z', 'from_name': 'label', 'to_name': 'text', 'type': 'labels', 'origin': 'manual'}, {'value': {'start': 197, 'end': 208, 'text': 'dosso dossi', 'labels': ['Artist']}, 'id': 'y01Q-cujWT', 'from_name': 'label', 'to_name': 'text', 'type': 'labels', 'origin': 'manual'}], 'data': {'Unnamed: 0': '0158539f-d69b-4328-b26c-0d3e22795c1c', 'search_string': 'provokers', 'source': 'https://lexica-serve-encoded-images.sharif.workers.dev/md/0158539f-d69b-4328-b26c-0d3e22795c1c', 'prompt': 'a special operat

In [58]:
# Attach the matching annotation entry (or None) to every row of `full`.
# A single dict lookup per row replaces the nested scan over `filtered`, and
# assigning the whole column at once avoids the chained assignment
# full['gt_raw'][i] = ..., which writes through a temporary Series and is not
# guaranteed to update `full`.
# If several entries share an id the last one wins, as in the original loop.
gt_by_id = {item['data']['Unnamed: 0']: item for item in filtered}
full['gt_raw'] = [gt_by_id.get(prompt_id) for prompt_id in full.index]



In [59]:
# Mark a token as 'artist' when its character span lines up with an annotated
# span: either the start offsets or the end offsets may differ by at most
# `tolerance` characters.
# NOTE(review): every annotation is treated as an artist label regardless of
# its 'labels' value -- confirm the export contains no other label types.
tolerance = 3
for tokens, gt_raw in zip(full['tokens'], full['gt_raw']):
    # Rows without ground truth hold None; skip them explicitly rather than
    # relying on a swallowed TypeError.
    if not isinstance(gt_raw, dict):
        continue
    annotations = gt_raw['annotations']  # looked up once per row, not once per token
    for item in tokens:
        for gt in annotations:
            span = gt['value']
            start_close = abs(int(item['char_start']) - int(span['start'])) <= tolerance
            end_close = abs(int(item['char_end']) - int(span['end'])) <= tolerance
            if start_close or end_close:
                # The token dicts are the objects stored in the frame, so this
                # updates `full` in place.
                item['label'] = 'artist'
#full = full.drop(['gt_raw'], axis=1)

## Weak Labeling with BART Large MNLI

In [60]:
# Show the labelled tokens of one example prompt.
full.loc['03d93c89-8639-440b-b627-e779ddb22d36', 'tokens']

[{'token': '8 k',
  'char_start': 167,
  'char_end': 170,
  'label': None,
  'is_weak_label': False,
  'pos': 'QUANTITY'},
 {'token': 'marc hill',
  'char_start': 234,
  'char_end': 243,
  'label': 'artist',
  'is_weak_label': False,
  'pos': 'PERSON'},
 {'token': 'johannes wessermark',
  'char_start': 260,
  'char_end': 279,
  'label': 'artist',
  'is_weak_label': False,
  'pos': 'ORG'},
 {'token': 'winter',
  'char_start': 287,
  'char_end': 293,
  'label': None,
  'is_weak_label': False,
  'pos': 'DATE'}]

In [43]:
from transformers import pipeline

# Zero-shot classification pipeline backed by the facebook/bart-large-mnli model.
classifier = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
)

In [73]:
# Weak labelling: for every still-unlabelled PERSON entity, ask the zero-shot
# classifier whether the text is an 'artist' and accept the label only when the
# 'artist' score exceeds the threshold.
labels = ['artist', 'other']
threshold = 0.90
artist_scores = {}  # token text -> 'artist' score, so repeated names are classified once
for i in range(full.shape[0]):
    temp = full.iloc[i]['tokens']
    for item in temp:
        if item['label'] is None and item['pos'] == 'PERSON':
            text = item['token']
            if text not in artist_scores:
                res = classifier(text, labels)
                # The pipeline returns labels ordered by score, so scores[0]
                # belongs to whichever label ranked first (possibly 'other').
                # Pair labels with scores and read the 'artist' score explicitly.
                artist_scores[text] = dict(zip(res['labels'], res['scores']))['artist']
            artist_score = artist_scores[text]
            print(text,' : ', artist_score)
            if (artist_score > threshold):
                item['label'] = 'artist'
                item['is_weak_label'] = True

# Checkpoint the frame, including the weak labels, to disk.
full.to_json('./full-checkpoint.json', orient = 'split')

colin farrell  :  0.8999399542808533
brad pitt  :  0.8906015753746033
greg rutkowski  :  0.8513946533203125
ismail  :  0.8555510640144348
kilian eng  :  0.7126706838607788
jake parker  :  0.8989964127540588
John Collier  :  0.8607586026191711
James jean  :  0.8960735201835632
Brian  :  0.5786029100418091
ross tran  :  0.8816928863525391
Ruan Jia  :  0.5014441013336182
charlie bowater  :  0.8331677317619324
ross tran  :  0.8816928863525391
zootopia stile  :  0.5239210724830627
octane render  :  0.6758478283882141
greg rutkowski  :  0.8513946533203125
tuomas korpi  :  0.7030742168426514
greg rutkowski  :  0.8513946533203125
greg rutkowski  :  0.8513946533203125
eddie mendoza  :  0.6862586140632629
greg rutkowski  :  0.8513946533203125
bill sienkiewicz  :  0.8794418573379517
gaston bussiere  :  0.7389631271362305
j. c. leyendecker  :  0.6466609239578247
hyper  :  0.843937873840332
Bayard Wu  :  0.8346425294876099
Greg Rutkowski  :  0.8997286558151245
greg rutkowski  :  0.8513946533203125


In [None]:
# Disabled scratch code kept as a single string literal (nothing here runs):
# a one-off Lexica search for 'in the style of' loaded into a DataFrame, and a
# 50-word sample request to Wordnik.
'''search_base='https://lexica.art/api/v1/search?q='
query='in the style of'
query = query.replace(' ', '+')
d = requests.get(url=(search_base + query))
data = d.json()
obj = data['images']
images = pd.DataFrame(columns=['source','prompt'])
for item in obj:
    row = [item['src'],item['prompt']]
    images.loc[item['id']] = row

images.head()

wordnik_service = Wordnik()
res = wordnik_service.get_random_words(includePartOfSpeech ="noun,verb,adverb",hasDictionaryDef=True, limit=50)

type(res)'''

In [72]:
# Print the prompt text of every annotated entry, one per line.
annotated_prompts = (entry['data']['prompt'] for entry in filtered)
for prompt_text in annotated_prompts:
    print(prompt_text)

two skinny old people draped in silky gold, green and pink, wearing gas masks, standing inside a dystopian abandoned hospital room, ayami kojima, greg hildebrandt, mark ryden, hauntingly surreal, highly detailed painting by, james jean and jenny saville, soft light 4 k 
red light from above defines the shape of her shadow on the floor below 
death under humans
boy on ground waving to a pretty girl on the second floor, illustration concept art anime key visual trending pixiv fanbox by wlop and greg rutkowski and makoto shinkai and studio ghibli 
cthulhu cross section scientific illustration biology book, highly detailed 
the worst birthday ever. cartoon 
Polish meme
a gorgeous female photo, professionally retouched, soft lighting, realistic, smooth face, full body shot, torso, dress, perfect eyes, wide angle, sharp focus on eyes, 8 k high definition, insanely detailed, intricate, elegant, art by marc hill and artgerm and johannes wessermark, snowy winter 
female character study of cute 