In [90]:
import csv

#pip install fasttext
import fasttext
from huggingface_hub import hf_hub_download

model_path = hf_hub_download(repo_id="facebook/fasttext-language-identification", filename="model.bin")
model = fasttext.load_model(model_path)

import requests
import re
#testing Open Library API
r = requests.get('https://openlibrary.org/search.json?q=subject:("dogs"+OR+"cats")+subject:("Juvenile fiction"+OR+"Juvenile literature")&fields=subject')
r = r.json()
subs = [d['subject'] for d in r['docs']] #gets the list, AKA value from k:v in subject:list dictionary
subs[0]

['Gold discoveries',
 'Wolfdogs',
 'Classic Literature',
 'American Adventure stories',
 'Open Library Staff Picks',
 'Juvenile fiction',
 'Fiction',
 'Gold mines and mining',
 'Historical Fiction',
 'Wolves',
 'Human-animal relationships',
 'Juvenile Literature',
 'Dogs',
 'Thriller',
 'Accelerated Reader',
 'Mensch',
 'Survival',
 'Children: Grades 4-6',
 'Children: Grades 3-4',
 "Children's fiction",
 'Wolves, fiction',
 'Gold mines and mining, fiction',
 'American fiction (fictional works by one author)',
 'Dogs, fiction',
 'Canada, fiction',
 'Indians of North America',
 'Large type books',
 'Loups',
 'Romans, nouvelles, etc. pour la jeunesse',
 'Chiens',
 'Pôle Nord',
 'Romans',
 'Adventure stories',
 'Nature, fiction',
 'Fiction, action & adventure',
 'Fiction, historical, general',
 'Human-animal relationships, fiction',
 'History',
 'Wolfdogs -- Fiction',
 'Canada, Northern -- Fiction',
 'Wilderness survival, fiction',
 'Alaska, fiction',
 'Yukon, fiction',
 'California, ficti

In [123]:
#book = get_books(syllabus); takes in a list of ISBNs
def get_tags(books):
  r = [requests.get(f'https://openlibrary.org/search.json?q=isbn:{isbn}&fields=subject').json()['docs'][0]['subject'] for isbn in books] #just give me the list of subjects plz
  return r

In [138]:
lst = get_tags([9780060173227, 9780451015594])
print(lst)

[['fiction', 'fiction classics', 'contemporary fiction', 'racial segregation', 'mob mentality', 'Southern Gothic', 'southern life', 'racial injustice', 'class', 'courage', 'compassion', 'gender roles', 'laws', 'loss of innocence', 'rape trials', 'domestic fiction', 'legal stories', 'Bildungsromans', 'Race relations', 'Lawyers', 'Girls', 'Prejudices', 'Fathers and daughters', 'Trials (Rape)', 'Great Depression', 'African Americans', 'Father-daughter relationship,', 'Toleranz', 'Kind', 'American fiction (fictional works by one author)', 'Fiction, coming of age', 'Fiction, family life', 'Fiction, legal', 'Fathers and daughters, fiction', 'Southern states, fiction', 'Lawyers, fiction', 'African americans, fiction', 'Social life and customs', 'Manners and customs', 'Padres e hijas', 'Novela', 'Relaciones raciales', 'Procesos por violación', 'Spanish language materials', 'Father-daughter relationship', 'New York Times reviewed', 'American literature', 'Fiction, family life, general', 'Large 

In [143]:
#takes in a list of lists
def clean_tags(tags):
  for idx, l in enumerate(tags): #index, list of lists

    #lowercase
    l = [s.lower() for s in l]

    #language
    l = [s for s in l if model.predict(s)[0][0] == '__label__eng_Latn'] #if english

    #remove mentions of "fiction" to prevent stripped pertinent information due to commas later on
    l = [s.split('in fiction')[0] for s in l] #remove any mention of 'fiction'
    l = [s.split(', fiction')[0] for s in l] #remove any mention of 'fiction'
    l = [s.split('fiction, ')[0] for s in l] #remove any mention of 'fiction'

    #clean for extraness
    l = [s.split(',')[0]  for s in l] #remove anything after a comma
    l = [s.split('--')[0]  for s in l] #remove anything with the --
    l = [s.split('(')[0]  for s in l] #remove parenthesis and anything within it
    l = [s.split('/')[0]  for s in l] #remove parenthesis and anything within it
    l = [s for s in l if ":" not in s] #remove anything with parentheses

    #remove whitespace
    l = [s.strip(' \t\n\r') for s in l]

    #remove empty string
    l = [s for s in l if bool(s) != False]

    #make unique, update list
    tags[idx] = set(l)

  return tags


In [144]:
clean_tags(lst)

[{'american literature',
  'class',
  'classics',
  'compassion',
  'contemporary fiction',
  'courage',
  'domestic fiction',
  'families',
  'father-daughter relationship',
  'fathers and daughters',
  'fiction',
  'fiction classics',
  'girls',
  'great depression',
  'large type books',
  'laws',
  'lawyers',
  'legal stories',
  'literary',
  'loss of innocence',
  'manners and customs',
  'mob mentality',
  'race relations',
  'racial injustice',
  'racial segregation',
  'rape trials',
  'social life and customs',
  'southern gothic',
  'southern life',
  'southern states',
  'spanish language materials',
  'trials'},
 {'american literature',
  'blacks',
  'bohemianism',
  'classic literature',
  'classics',
  'fiction',
  'general',
  'homosexuality',
  'lgbtq novels before stonewall',
  'literary',
  'paris',
  'psychology',
  'race relations',
  'sexual behavior',
  'sexual orientation'}]