In [174]:
import csv

#language identifier 1
#pip install fasttext
import fasttext
from huggingface_hub import hf_hub_download

# Download the fastText language-identification weights from the Hugging Face Hub,
# then load them into a fastText model used for English filtering.
model_path = hf_hub_download(
    repo_id="facebook/fasttext-language-identification",
    filename="model.bin",
)
model_ft = fasttext.load_model(model_path)

#language identifier 2
#pip install langid
from langid.langid import LanguageIdentifier, model
# Build the langid classifier from the bundled model string (norm_probs enabled);
# it is the second, independent English check used when cleaning tags.
identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True) #instantiate identifier

import requests
import re
# Smoke-test the Open Library search API: fetch only the `subject` field for
# juvenile books about dogs or cats and inspect the first result's subjects.
response = requests.get(
    'https://openlibrary.org/search.json?q=subject:("dogs"+OR+"cats")+subject:("Juvenile fiction"+OR+"Juvenile literature")&fields=subject',
    timeout=30,  # requests waits forever by default; fail fast instead of hanging the kernel
)
response.raise_for_status()  # surface HTTP errors here rather than as a confusing JSON/KeyError later
r = response.json()
# One list of subject strings per returned doc; docs lacking the field yield an empty list.
subs = [d.get('subject', []) for d in r['docs']]
subs[0]

['Gold discoveries',
 'Wolfdogs',
 'Classic Literature',
 'American Adventure stories',
 'Open Library Staff Picks',
 'Juvenile fiction',
 'Fiction',
 'Gold mines and mining',
 'Historical Fiction',
 'Wolves',
 'Human-animal relationships',
 'Juvenile Literature',
 'Dogs',
 'Thriller',
 'Accelerated Reader',
 'Mensch',
 'Survival',
 'Children: Grades 4-6',
 'Children: Grades 3-4',
 "Children's fiction",
 'Wolves, fiction',
 'Gold mines and mining, fiction',
 'American fiction (fictional works by one author)',
 'Dogs, fiction',
 'Canada, fiction',
 'Indians of North America',
 'Large type books',
 'Loups',
 'Romans, nouvelles, etc. pour la jeunesse',
 'Chiens',
 'Pôle Nord',
 'Romans',
 'Adventure stories',
 'Nature, fiction',
 'Fiction, action & adventure',
 'Fiction, historical, general',
 'Human-animal relationships, fiction',
 'History',
 'Wolfdogs -- Fiction',
 'Canada, Northern -- Fiction',
 'Wilderness survival, fiction',
 'Alaska, fiction',
 'Yukon, fiction',
 'California, ficti

In [175]:
#book = get_books(syllabus); takes in a list of ISBNs
def get_tags(books, timeout=30):
  """Look up the Open Library subject tags for each ISBN.

  Args:
    books: iterable of ISBNs (ints or strings).
    timeout: seconds to wait for each HTTP request before giving up.

  Returns:
    A list with one entry per ISBN, in input order. Each entry is the
    `subject` list of the first search hit, or an empty list when the
    search returns no docs or the first doc has no `subject` field.

  Raises:
    requests.HTTPError: if Open Library answers with an error status.
  """
  tags = []
  for isbn in books:
    response = requests.get(
        'https://openlibrary.org/search.json',
        params={'q': f'isbn:{isbn}', 'fields': 'subject'},
        timeout=timeout,
    )
    response.raise_for_status()
    docs = response.json().get('docs') or []
    # Keep output aligned with the input ISBNs: unknown books get [] instead
    # of aborting the whole batch with an IndexError/KeyError.
    tags.append(docs[0].get('subject', []) if docs else [])
  return tags

In [190]:
# Fetch the raw subject tags for two sample ISBNs; `lst` holds one list of
# subject strings per ISBN, in the same order as the input.
lst = get_tags([9780192832696, 9780451015594])
print(lst)

[['Married people, fiction', 'American fiction (fictional works by one author)', 'Fiction, psychological', 'Long island (n.y.), fiction', 'Fiction', 'Rich people', 'Mistresses', 'Married women', 'Traffic accidents', 'First loves', 'Revenge', 'American Manuscripts', 'Facsimiles', 'Manuscripts', 'Antiheroes, l fiction', 'Man-woman relationships, fiction', 'Upper class', 'American literature', 'Young adult fiction, comics & graphic novels, classic adaptation', 'Young adult fiction, social themes, class differences', 'Young adult fiction, classics', 'Fiction, historical,  New york (n.y.)', 'fiction"', 'Comic books, strips', 'Love', 'YOUNG ADULT FICTION', 'Comics & Graphic Novels', 'Classic Adaptation', 'Social Themes', 'Class Differences', 'Historical', 'Literary', 'Man-woman relationship', 'Rich', 'Criticism and interpretation', 'Drama', 'Modern fiction', 'Classics', 'Literature', 'Open Library Staff Picks', 'Wealth', 'Readers', 'Economic conditions', 'American fiction', 'Social life and 

In [196]:
#takes in a list of lists
def clean_tags(tags):
  """Normalise raw subject tags into sets of short, lowercase English labels.

  For every tag the pipeline is: lowercase -> keep only if BOTH language
  identifiers (fastText `model_ft` and langid `identifier`) call it English
  -> cut at "fiction" markers -> cut at the first separator character ->
  drop uninformative tags -> strip whitespace -> drop empties.

  Args:
    tags: list of lists of subject strings (one inner list per book).

  Returns:
    The same `tags` list object. It is modified IN PLACE: each inner list is
    replaced by a set of cleaned, de-duplicated tags.
  """
  # "Fiction" markers; only the text BEFORE the first match is kept. The \b
  # word boundaries stop the markers from matching inside other words
  # ("britain fiction" contains "in fiction", "nonfiction, " contains "fiction, ").
  fiction_markers = (
      re.compile(r'\bin fiction'),
      re.compile(r', fiction'),
      re.compile(r'\bfiction, '),
  )
  # Everything from the first occurrence of any of these onwards is discarded:
  # comma, double dash, opening parenthesis/bracket/brace, slash, double quote.
  separators = (',', '--', '(', '[', '{', '/', '"')
  # A tag still containing any of these after truncation is dropped entirely
  # (colon-style tags, reading levels, translations, staff picks, language materials).
  uninformative = (':', 'reading level', 'translations', 'staff', 'language materials')

  def is_english(text):
    # Both identifiers must agree; langid only runs if fastText already said English.
    return (model_ft.predict(text)[0][0] == '__label__eng_Latn'
            and identifier.classify(text)[0] == 'en')

  for idx, raw_tags in enumerate(tags):
    cleaned = set()
    for tag in raw_tags:
      tag = tag.lower()

      # Blank tags would be discarded at the end anyway; skip them before
      # they reach the language models.
      if not tag.strip(' \t\n\r'):
        continue
      if not is_english(tag):
        continue

      # Cut at "fiction" markers first so the comma split below does not
      # see the commas that belong to those markers.
      for marker in fiction_markers:
        tag = marker.split(tag, maxsplit=1)[0]

      for sep in separators:
        tag = tag.split(sep, 1)[0]

      if any(token in tag for token in uninformative):
        continue

      tag = tag.strip(' \t\n\r')
      if tag:
        cleaned.add(tag)

    # Make unique and update the caller's list in place.
    tags[idx] = cleaned

  return tags


In [197]:
# NOTE: clean_tags replaces each inner list of `lst` with a set in place and
# returns that same list, so `lst` itself holds the cleaned tags afterwards.
clean_tags(lst)

[{'813',
  'american literature',
  'classic adaptation',
  'classics',
  'comic books',
  'comics & graphic novels',
  'criticism and interpretation',
  'drama',
  'economic conditions',
  'facsimiles',
  'fiction',
  'fictional works',
  'fictional works publication type',
  'first loves',
  'historical',
  'large type books',
  'literature',
  'long island',
  'long now manual for civilization',
  'love',
  'love stories',
  'man-woman relationship',
  'man-woman relationships',
  'manners and customs',
  'married people',
  'married women',
  'modern fiction',
  'moral conditions',
  'novel',
  'psychological fiction',
  'readers',
  'social life and customs',
  'social themes',
  'wealth',
  'wives'},
 {'813',
  'american literature',
  'blacks',
  'bohemianism',
  'classic literature',
  'classics',
  'fiction',
  'general',
  'homosexuality',
  'lgbtq novels before stonewall',
  'race relations',
  'sexual behavior',
  'sexual orientation'}]