In [58]:
#import csv
import urllib
from urllib.parse import quote

#language identifier 1
!pip -q install fasttext
import fasttext
from huggingface_hub import hf_hub_download

model_path = hf_hub_download(repo_id="facebook/fasttext-language-identification", filename="model.bin")
model_ft = fasttext.load_model(model_path)

#language identifier 2
!pip -q install langid
from langid.langid import LanguageIdentifier, model
identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True) #instantiate identifier

import requests
import re
#testing Open Library API
# Keep the HTTP response separate from the parsed body so a failed request is
# reported as an HTTP error instead of a confusing JSON/KeyError further down.
response = requests.get('https://openlibrary.org/search.json?q=subject:("dogs"+OR+"cats")+subject:("Juvenile fiction"+OR+"Juvenile literature")&fields=subject', timeout=30)
response.raise_for_status()
r = response.json()
# One subject list per returned doc; a doc without a 'subject' field contributes an empty list.
subs = [d.get('subject', []) for d in r['docs']]
print(subs[0])

['Gold discoveries', 'Wolfdogs', 'Classic Literature', 'American Adventure stories', 'Open Library Staff Picks', 'Juvenile fiction', 'Fiction', 'Gold mines and mining', 'Historical Fiction', 'Wolves', 'Human-animal relationships', 'Juvenile Literature', 'Dogs', 'Thriller', 'Accelerated Reader', 'Mensch', 'Survival', 'Children: Grades 4-6', 'Children: Grades 3-4', "Children's fiction", 'Wolves, fiction', 'Gold mines and mining, fiction', 'American fiction (fictional works by one author)', 'Dogs, fiction', 'Canada, fiction', 'Indians of North America', 'Large type books', 'Loups', 'Romans, nouvelles, etc. pour la jeunesse', 'Chiens', 'Pôle Nord', 'Romans', 'Adventure stories', 'Nature, fiction', 'Fiction, action & adventure', 'Fiction, historical, general', 'Human-animal relationships, fiction', 'History', 'Wolfdogs -- Fiction', 'Canada, Northern -- Fiction', 'Wilderness survival, fiction', 'Alaska, fiction', 'Yukon, fiction', 'California, fiction', 'Dogs, juvenile literature', 'Adventur

In [None]:
def fast_eng(word, margin=0.15):
    """Return True when the fastText model considers ``word`` to be English.

    The text counts as English when '__label__eng_Latn' is the top prediction,
    or when it is the runner-up and its probability is within ``margin`` of the
    top prediction (the model alone is not reliable enough on short tags, so
    near-misses are accepted).

    Args:
        word: the text to classify.
        margin: largest allowed gap between the top probability and the English
            runner-up probability. Defaults to 0.15, the value that was
            previously hard-coded.

    Returns:
        bool: whether the text is accepted as English.
    """
    english_label = '__label__eng_Latn'

    # fastText's predict() raises ValueError on input containing a newline,
    # so flatten the text to a single line first.
    single_line = word.replace('\n', ' ')
    labels, probabilities = model_ft.predict(single_line, k = 5)

    # Nothing predicted (e.g. empty input): not English.
    if len(labels) == 0:
        return False
    if labels[0] == english_label:
        return True
    return bool(
        len(labels) > 1
        and labels[1] == english_label
        and probabilities[0] - probabilities[1] <= margin
    )

def lang_eng(word):
    """Return True when langid's classification of ``word`` is English ('en')."""
    # classify() returns a sequence whose first item is the language code.
    classification = identifier.classify(word)
    return 'en' == classification[0]

In [64]:
#discipline tags is a list
#diversity tags is a list
#k is the number of items to return

#finds results that match ANY of the first list of tags and ANY of the second list of tags
def search_recs(discipline_tags, diversity_tags, k, timeout=30):
  """Search Open Library for works matching ANY discipline tag AND ANY diversity tag.

  Args:
    discipline_tags: non-empty list of subject strings; a work must match at least one.
    diversity_tags: non-empty list of subject strings; a work must match at least one.
    k: maximum number of results to request (sent as ``limit``).
    timeout: seconds to wait for the HTTP request before giving up.

  Returns:
    The parsed JSON response (fields requested: author_name, title, isbn, subject).

  Raises:
    ValueError: if either tag list is empty (the query would contain ``subject:()``).
    requests.HTTPError: if the server answers with an error status.
  """
  if not discipline_tags or not diversity_tags:
    raise ValueError("discipline_tags and diversity_tags must each contain at least one tag")

  def any_of(tags):
    # Percent-encode each tag (as UTF-8), wrap it in double quotes for exact
    # string matching, and OR the alternatives together.
    return '+OR+'.join(f'"{urllib.parse.quote(tag.encode("utf-8"))}"' for tag in tags)

  # Build the URL once so that the URL printed for debugging is exactly the one requested.
  url = (
    'https://openlibrary.org/search.json'
    f'?q=subject:({any_of(discipline_tags)})+subject:({any_of(diversity_tags)})'
    f'&fields=author_name,title,isbn,subject&limit={k}'
  )
  print(url)

  response = requests.get(url, timeout=timeout)
  response.raise_for_status()
  return response.json()

In [65]:
# Smoke test: request at most 2 works that match any of the two discipline tags
# and any of the two diversity tags, then print the raw parsed response.
print(search_recs(['social themes', 'comics & graphic novels'], ['race relations', 'americans'], 2))

https://openlibrary.org/search.json?q=subject:("social%20themes"+OR+"comics%20%26%20graphic%20novels")+subject:("race%20relations"+OR+"americans")&fields=subject&limit=2
{'numFound': 890, 'start': 0, 'numFoundExact': True, 'docs': [{'author_name': ['Kahlil Gibran'], 'isbn': ['9780099505204', '8415676433', '1794497390', '9772477831', '2290313939', '9781072828785', '9798653309106', '1793261636', '9388118456', '9781677406852', '8486344069', '9781086370157', '9781529045857', '2909611108', '2203602147', '9781734114218', '9780434290673', '9788121604376', '9781707256938', '2890740552', '9780099416937', '1513263226', '9789657141427', '9781070110813', '9781841936161', '2851088343', '9781090842220', '1677406852', '9798665054773', '3257239602', '2742731954', '9781549641572', '3530268003', '9781940849911', '9781093216264', '9789879186183', '9781072870630', '9780141194677', '9798472862769', '1678628182', '1515439860', '9781782012306', '9780316308458', '1082769509', '9781076112798', '9798589937541',

In [60]:
#book = get_books(syllabus); takes in a list of ISBNs
def get_tags(books, timeout=30):
  """Fetch the Open Library subject list for each ISBN.

  Args:
    books: iterable of ISBNs (ints or strings).
    timeout: seconds to wait for each HTTP request before giving up.

  Returns:
    A list with one entry per ISBN, in input order. Each entry is the subject
    list of the first matching record, or an empty list when the ISBN has no
    match or the record carries no subjects (so positions stay aligned with
    ``books``).

  Raises:
    requests.HTTPError: if the server answers with an error status.
  """
  subjects_per_book = []
  for isbn in books:
    response = requests.get(f'https://openlibrary.org/search.json?q=isbn:{isbn}&fields=subject', timeout=timeout)
    response.raise_for_status()
    docs = response.json().get('docs', [])
    # Only the first hit is used; unknown ISBNs and subject-less records yield [].
    subjects_per_book.append(docs[0].get('subject', []) if docs else [])
  return subjects_per_book

In [61]:
# Fetch the subject lists for two sample ISBNs; `lst` holds one list of subjects per ISBN.
lst = get_tags([9780192832696, 9780451015594])
print(lst)

[['Married people, fiction', 'American fiction (fictional works by one author)', 'Fiction, psychological', 'Long island (n.y.), fiction', 'Fiction', 'Rich people', 'Mistresses', 'Married women', 'Traffic accidents', 'First loves', 'Revenge', 'American Manuscripts', 'Facsimiles', 'Manuscripts', 'Antiheroes, l fiction', 'Man-woman relationships, fiction', 'Upper class', 'American literature', 'Young adult fiction, comics & graphic novels, classic adaptation', 'Young adult fiction, social themes, class differences', 'Young adult fiction, classics', 'Fiction, historical,  New york (n.y.)', 'fiction"', 'Comic books, strips', 'Love', 'YOUNG ADULT FICTION', 'Comics & Graphic Novels', 'Classic Adaptation', 'Social Themes', 'Class Differences', 'Historical', 'Literary', 'Man-woman relationship', 'Rich', 'Criticism and interpretation', 'Drama', 'Modern fiction', 'Classics', 'Literature', 'Open Library Staff Picks', 'Wealth', 'Readers', 'Economic conditions', 'American fiction', 'Social life and 

In [None]:
#takes in a list
def clean_tags(l):
  """Normalize a list of raw subject tags into short, lowercase English tags.

  Steps, in order: keep tags that look English, lowercase, cut at 'fiction'
  markers, cut at punctuation delimiters, drop tags with colons, drop
  uninformative tags, drop award mentions, drop tags containing digits,
  trim whitespace, drop empty strings.

  Args:
    l: list of raw tag strings.

  Returns:
    A new list of cleaned tags. Duplicates are kept (the list is not unique).
  """
  # Substrings that mark a tag as uninformative.
  uninformative = ("reading level", "translations", "staff", "language materials")

  # A tag is kept when EITHER detector says it is English.
  # fastText: https://aclanthology.org/E17-2068/
  # langid:   http://www.aclweb.org/anthology/P12-3005
  # Detection runs before lowercasing because of proper nouns.
  cleaned = [s for s in l if fast_eng(s) or lang_eng(s)]

  #lowercase
  cleaned = [s.lower() for s in cleaned]

  # Cut at mentions of "fiction" first, so the comma split below does not
  # strip the pertinent part of e.g. "dogs, fiction".
  for marker in ('in fiction', ', fiction', 'fiction, '):
    cleaned = [s.split(marker)[0] for s in cleaned]

  # Keep only what precedes a comma, "--", an opening bracket, a slash or a quote.
  for delimiter in (',', '--', '(', '[', '{', '/', '"'):
    cleaned = [s.split(delimiter)[0] for s in cleaned]
  cleaned = [s for s in cleaned if ":" not in s] #remove anything with colons
  cleaned = [s.split(' - ')[0] for s in cleaned] #keep only what precedes a spaced dash

  # Drop a tag if it contains ANY of the uninformative substrings.
  # (Chaining string literals with `and` before `not in` would only test the last one.)
  cleaned = [s for s in cleaned if not any(term in s for term in uninformative)]
  cleaned = [s for s in cleaned if not s.startswith("award")] #remove award mentions

  #remove dewey system stuff until further notice (any tag containing a digit)
  cleaned = [s for s in cleaned if not re.search(r'\d', s)]

  #NOTE: ampersands in tags are known to cause problems and are not handled here

  #remove surrounding whitespace, then drop tags that ended up empty
  cleaned = [s.strip(' \t\n\r') for s in cleaned]
  return [s for s in cleaned if s] #list, nonunique

In [None]:
# Clean each subject list in `lst` (one list per ISBN) and print the cleaned lists.
print([clean_tags(l) for l in lst])

[['social themes', 'comics & graphic novels', 'modern fiction', 'psychological fiction', 'manners and customs', 'long island', 'love stories', 'married people', 'fiction', 'new york', 'historical', 'upper class', 'wives', 'economic conditions', 'american fiction', 'drama', 'classic adaptation', 'american literature', 'fictional works publication type', 'married women', 'readers', 'fictional works', 'young adult', 'long now manual for civilization', 'first loves', 'wealth', 'love', 'criticism and interpretation', 'moral conditions', 'comic books', 'novel', 'classics', 'large type books', 'facsimiles', 'literature', 'man-woman relationships', 'social life and customs', 'man-woman relationship'], ['sexual behavior', 'americans', 'bisexuals', 'classic literature', 'homosexuality', 'general', 'men', 'bohemianism', 'classics', 'american fiction', 'blacks', 'lgbtq novels before stonewall', 'sexual orientation', 'race relations', 'fiction', 'american literature']]


In [None]:
# Rebuild the mapping: each key is lowercased, has '-' replaced by '_' and has
# leading/trailing '.' removed; each value (a list of tags) is run through clean_tags.
new_tags = {
    name.lower().replace('-', '_').strip('.'): clean_tags(raw_tags)
    for name, raw_tags in tags.items()
}

# From here on `tags` refers to the normalized mapping.
tags = new_tags

In [None]:
from collections import Counter

# For every key in `tags`, map each distinct tag to its share of that key's
# tag list (a fraction between 0 and 1: occurrences / total number of tags).
result = {}

for key, values in tags.items():
    occurrences = Counter(values)
    n_tags = sum(occurrences.values())
    result[key] = {word: (count / n_tags) for word, count in occurrences.items()}

result