In [612]:
import json
import re
import requests
import csv
import pandas as pd
import time
from ast import literal_eval
import urllib
from urllib.parse import quote

with open('./top-level-lcc.json', 'r') as reader:
    basic = json.load(reader)

with open('./v2_merged_lcc.json', 'r') as reader: #using the version with a lot of detail for black history and lgbtq history
    detail = json.load(reader)

syllabus = pd.read_csv('./edstud_lgbt_med.csv')

In [305]:
def split_lcc(call_number):
    if call_number == None:
        return None
    #specifically for how OL formats their LCC call numbers
    out = call_number.replace('-', '') #remove hyphen
    out = re.sub(r'(?<=[A-Z])0+', '', out) #Remove leading zeros after the letters segment
    
    # Adjust regex match for letter and number segments, ensuring float conversion for fractional part
    match = re.match(r'([A-Z]+)(\d+(\.\d+)?)', out)
    
    if match:
        return (match.group(1), float(match.group(2)))
    else:
        return None

def lookup_meaning(code): #takes in tuple (call number)
    l = []

    try:
        d = detail[code[0]]
        for i in d:
            if code[1] >= i['start'] and code[1] <= i['stop']:
                l.append(i['subject'])
    except:
        pass

    return l
    #returns a list of definitions for the code

In [306]:
ex1 = 'HV-1568.00000000.B376 2016' #The Minority Body by Elizabeth Barnes
ex2 = 'DAW1008.00000000.B37 1987' #A guide to Central Europe by Richard Bassett
ex3 = 'E--185.86.00000000.H739 2001' #Salvation: Black People and Love by bell hooks
ex4 = 'PL-0788.40000000.G4 E5 2000' #Genji Monogatari by Murasaki Shikibu
ex5 = 'DS-0032.80000000' #Orientalism by Edward Said

print(lookup_meaning(split_lcc(ex1)))
print(lookup_meaning(split_lcc(ex2)))
print(lookup_meaning(split_lcc(ex3)))
print(lookup_meaning(split_lcc(ex4)))
print(lookup_meaning(split_lcc(ex5)))

['Special classes', 'People with disabilities', 'Protection, assistance and relief', 'Social pathology.  Social and public welfare.']
['General', 'History of Central Europe']
['Afro-Americans', 'Status and development since emancipation', 'United States', 'Elements in the population', 'United States. Elements in the population. African Americans. Social conditions. Social life and customs.']
['Languages of Eastern Asia, Africa, Oceania', 'Individual authors and works', 'Japanese language and literature', 'Japanese literature']
['History', 'History of Asia']


Search by LCCN or ISBN on OpenLibrary:

In [None]:
def searchby_lccn(lccn, fields = 'author_name,subject,lcc,title', limit = 5): 
    r = []
    response = requests.get(f'https://openlibrary.org/search.json?q=lcc:{lccn}&fields={fields}&limit={limit}').json()
    return response['docs']

def searchby_isbn(isbn, field = 'lcc', limit = 1):
    time.sleep(2) #being polite
    response = requests.get(f'https://openlibrary.org/search.json?q=isbn:{isbn}&fields={field}&limit={limit}').json()

    if bool(response['docs']): #falsy
        #print(response['docs'], isbn) #error checking

        if bool(response['docs'][0].get(f'{field}')): #if there is an lcc
            return response['docs'][0].get(f'{field}')[0] #string, first lcc returned
    else:
        return '' #nothing returned

def reformat_openlibrary_lccn(syllabus): #doesn't account for specific subclasses
    lccn_tup = []

    for isbn in syllabus['isbn']: #get the lccn
        val = split_lcc(searchby_isbn(isbn)) #after querying Open library, split them into tuples

        if val is not None:
            lccn_tup.append(val)
            
    return lccn_tup

In [308]:
#queries
#1. Direct call number: HV3181*
print('Search by call number:', searchby_lccn('HV3181\*', 'title, lcc', 1))

#2. Anything under a broad category: H* includes H, HA, HB, etc.
print('Search by class:', searchby_lccn('H\*', 'title', 1))

#3. Anything under a specific category: H-- includes only H, HA- includes only HA
print('Search by Subclass:', searchby_lccn('H--\*', 'title', 1))
#books might differ slightly in exact code, so i limit to only title here, you can look at the lcc return list for more details

#4. A range of outputs: HQ TO HV
print('Search range:', searchby_lccn('[HQ TO HV]', 'title', 1)) #can also do this with numbers

#5. A string of call numbers is not allowed to my understanding
#search_lccn('[HJ6799, HJ8001]', 'title,lcc', 2)

#this could be made simpler; right now i return a tuple for simplification when looking up keys in the dictionary
print("Search with Example 2:", search_lccn(split_lcc(ex2)[0] + str(split_lcc(ex2)[1]), 'subject', 1))

Search by call number: [{'lcc': ['HV-3181.00000000.S64', 'HV-3181.00000000'], 'title': 'Black Empowerment'}]
Search by class: [{'title': 'The Roman Breviary'}]
Search by Subclass: [{'title': 'The Roman Breviary'}]
Search range: [{'title': 'The Canterbury Tales'}]
Search with Example 2: [{'subject': ['Guidebooks', 'Travel & holiday guides', 'Travel Guides', 'Travel', 'Travel - Foreign', 'Central Europe', 'Europe - General']}]


In [309]:
lccn_tup = reformat_openlibrary_lccn(syllabus)

In [None]:
'''for k, v in prefixes:
    d = detail[k]
    for i in d:
        if code[1] >= i['start'] and code[1] <= i['stop']:
            l.append(i['subject'])

Find common parents in order to make the most efficient search

In [310]:
def get_all_parents(lccn, lcc_data):
    init = lccn[0] + str(lccn[1])
    all_parents = {init} #let itself be a "parent" just in case!

    try:
        d = lcc_data[lccn[0]] #key is first element in list

        for i in d: #for each dictionary in the list
            if lccn[1] >= i['start'] and lccn[1] <= i['stop']: #check if a subset

                #if this is the deepest node; if itself is the only parent, until now, then overwrite
                if len(i['parents']) >= len(all_parents): 
                    all_parents = i['parents']
        return all_parents
    except:
        return None


def find_most_recent_common_parent(tupes, lcc_data):
    node_parent_sets = [get_all_parents(t, lcc_data) for t in tupes]
    
    prefixes = {}
    inter = {}
    #get all parents for each prefix
    for t in tupes:
        val = get_all_parents(t, lcc_data)
        if val != None:
            if t[0] not in prefixes.keys():
                prefixes[t[0]] = [val] #make a list with all the floats
            else:
                prefixes[t[0]].append(val) #make a list with all the floats

    for k,v in prefixes.items():
        inter[k] = list(set(v[0]).intersection(*map(set, v[1:])))[0] #make it a string

    return inter

In [311]:
mrcp = find_most_recent_common_parent(lccn_tup, detail)
print(mrcp)

{'HQ': 'HQ1-2044', 'LB': 'LB5-3640', 'LC': 'LC8-6691', 'BF': 'BF1-990', 'PZ': 'PZ1-90'}


Our area of desired diversity (perspective of interest in other words) is LGBTQ Studies. Our overarching discipline area is Education.

Could this level of granularity be useful? Unsure

In [416]:
with open('./updated_lgbtq_lcc.json', 'r') as reader: #everything about lgbtq studies, specifically
    diversity = json.load(reader) #in practice, we call also ONLY load in the relevant subclasses

diversity_subset = {k: v for k,v in diversity.items() if k in mrcp.keys()}

In [417]:
def find_diversity_topics(div_dict):
    #vals = []
    topics = []
    for k,v in mrcp.items():
        try:
            entries = div_dict[k]
            #vals.append({k: [entry for entry in entries if v in entry['parents']]}) #gives a lot of topics underneath the parent node
            topics.append([entry['subject'] for entry in entries if v in entry['parents']])

        except:
            pass
    return topics

topics = find_diversity_topics(diversity_subset)
print(topics)

[['Human sexuality. Sex. Sexual orientation.', 'Kinsey, Alfred.', 'Bisexuality. General works.', 'Bisexuality. By region or country, A-Z.', 'Homosexuality. Lesbianism. Periodicals. Serials.', 'Homosexuality. Lesbianism. Congresses.', 'Homosexuality. Lesbianism. Societies.', 'Homosexuality. Lesbianism. Dictionaries.', 'Homosexuality. Lesbianism. Computer networks. Electronic information resources (including the Internet and digital libraries).', 'Gay and lesbian studies.', 'Homosexuality. Lesbianism. Biography (Collective).', 'Homosexuality. Lesbianism. Travel.', 'Homosexuality. Lesbianism. Gay parents.', 'Lesbians. Biography. Collective.', 'Lesbians. Biography. Individual, A-Z.', 'Lesbians. General works.', 'Lesbians. Sex instruction.', 'Lesbian mothers.', 'Middle-aged lesbians. Older lesbians.', 'Lesbians. By region or country, A-Z.', 'Gay men. Biography. Collective.', 'Gay men. Biography. Individual, A-Z.', 'Kameny, Frank.', 'Gay men. General works.', 'Gay men. Sex instruction.', 'Ga

Getting the distribution of key words

In [602]:
from collections import Counter
import nltk
from nltk.corpus import stopwords
 
nltk.download('stopwords') #to remove uninformative words
stop_words = set(stopwords.words('english'))
lcc_stop = open("lcc_stop_words.txt", "r").read().split("\n") #stop words for the library of congress classification system

[nltk_data] Downloading package stopwords to C:\Users\Debbie
[nltk_data]     Olorunisola\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [603]:

def get_prop_occurrences(topics_lst, lcc_stop, kind = 'by phrase', top_n = 15): #splits by phrase/full subject
    all_tags = []

    if type(topics_lst[0]) == str:
        topics_lst = [topics_lst] #kinda nasty srry, but its how it functions

    if kind == 'by phrase':
        for i in topics_lst:
            tags = '. '.join(i).split('. ')
            tags = [x.lower().split('.')[0] for x in tags]
            tags = [x for x in tags if not any(sub in x for sub in lcc_stop)] #get rid of common but uninformative loc terms
            tags = [' '.join([word for word in x.split(' ') if word not in stop_words]) for x in tags] #get rid of stop words
            tags = [x.lstrip().rstrip() for x in tags] #remove leading and trailing ws

            all_tags += tags

    else: #if by words
        for i in topics_lst:
            tags = ' '.join(i).split() #split by words
            tags = [x.lower().split('.')[0].split(',')[0].split(')')[0] for x in tags] #pruning for commas, periods, and parenthesis
            tags = [x for x in tags if x not in lcc_stop and x not in stop_words]
            tags = [x.lstrip().rstrip() for x in tags] 

            all_tags += tags

    #make proportions
    prop = Counter(all_tags) 
    prop = dict(prop.most_common(top_n))
    total = sum(prop.values())
    prop = {k: v/total for k, v in prop.items()}
    return prop

In [604]:
prop_div = get_prop_occurrences(topics, lcc_stop)
print(prop_div)
print(get_prop_occurrences(topics, lcc_stop, kind = 'by words'))

{'homosexuality': 0.15714285714285714, 'lesbianism': 0.15714285714285714, 'biography': 0.11428571428571428, 'gay lesbian culture': 0.1, 'lesbians': 0.07142857142857142, 'gay men': 0.07142857142857142, 'transvestism': 0.05714285714285714, 'transsexualism': 0.05714285714285714, 'homophobia': 0.04285714285714286, 'bisexuality': 0.02857142857142857, 'sex instruction': 0.02857142857142857, 'same-sex relationships': 0.02857142857142857, 'heterosexism': 0.02857142857142857, 'gay rights movement': 0.02857142857142857, 'gay liberation movement': 0.02857142857142857}
{'gay': 0.25806451612903225, 'homosexuality': 0.08870967741935484, 'lesbianism': 0.08870967741935484, 'lesbian': 0.07258064516129033, 'biography': 0.07258064516129033, 'lesbians': 0.06451612903225806, 'men': 0.06451612903225806, 'culture': 0.056451612903225805, 'movement': 0.04838709677419355, 'same-sex': 0.04032258064516129, 'transvestism': 0.03225806451612903, 'transsexualism': 0.03225806451612903, 'marriage': 0.03225806451612903,

Get the same distribution of topics for books on the syllabus

In [605]:
topics_syll = []
for i in lccn_tup:
   topics_syll = [x.lstrip() for x in tags3]

print(topics_syll)

prop_syll = get_prop_occurrences(topics_syll, lcc_stop)
print(prop_syll)

['Sex role', 'The Family', 'Marriage', 'Women', 'Sex role', 'The Family', 'Marriage', 'Women', 'Early childhood education', 'Theory and practice of education', 'Theory and practice of education', 'Elementary or public school education', 'Preschool education', 'Nursery schools', 'Theory and practice of education', 'Early childhood education', 'Theory and practice of education', 'Community and the school', 'Home and school', 'Special aspects of education', 'Social aspects of education', 'School management and discipline', 'Theory and practice of education', 'School administration and organization', 'Consciousness', 'Cognition', 'Psychology', 'Fiction and juvenile belles lettres', 'Juvenile belles lettres', 'Multicultural education (General)', 'Special aspects of education', 'Types of education', 'The Family', 'Marriage', 'Women', 'Children', 'Child development', 'The family', 'Marriage', 'Home']
{'family': 0.16, 'marriage': 0.16, 'women': 0.12, 'sex role': 0.08, 'early childhood educatio

Looking up books in the range with a set list of important topics

In [625]:
def search_subjects(lcc, topics, field = 'subject', limit = 1, exact_string_matching = False):
    if type(topics) == str:
        time.sleep(2) #being polite
        response = requests.get(f'https://openlibrary.org/search.json?q=lcc:{lcc}&subject={topics}&fields={field}&limit={limit}').json()

        if bool(response['docs']): #falsy
            return response['docs']
        else:
            return '' #nothing returned

    elif type(topics) == list:    
        q = f'https://openlibrary.org/search.json?q=lcc:{lcc}&fields={field}&limit={limit}'

        if exact_string_matching: #for cases where a single word is used
            
            topics = list(map(lambda x: urllib.parse.quote(x.encode("utf-8")), topics)) #encode tags
            topics = list(map(lambda x: f"\"{x}\"", topics)) #exact string matching

            #topics = '+OR+'.join(topics) #comma (,) and pipe (|) are similar AND, not OR for some reason
            topics = ''.join(list(map(lambda x: f'&subject={x}', topics)))
            q += topics
        
        else:
            topics = ','.join(topics)
            q += topics
        
        print(q)

        time.sleep(2) #being polite
        response = requests.get(q).json()
        
        if bool(response['docs']): #falsy
            return response['docs']
        else:
            return '' #nothing returned

    else:
        return None

Use most recent common parents

In [None]:
prop_syll = get_prop_occurrences(topics_syll, lcc_stop, 'by word', 3) #by phrase isn't working at the moment
prop_div = get_prop_occurrences(topics, lcc_stop, 'by word', 3)

subject_terms = list(prop_div.keys()) + list(prop_syll.keys())

In [None]:
suggestions = []
for k,v in mrcp.items(): 
    if '-' in v:
        lst = v.split('-')
        lccn_query = '[' + lst[0] + ' TO ' + k + lst[1] + ']'
        print(search_subjects(lccn_query, subject_terms, 'title', 3, True)) #AT LEAST ONE search is not working i fear


https://openlibrary.org/search.json?q=lcc:[HQ1 TO HQ2044]&fields=title&limit=3&subject="gay"&subject="homosexuality"&subject="lesbianism"&subject="education"&subject="school"&subject="family"
[{'title': 'Men are From Mars, Women are From Venus'}, {'title': 'Being Mortal'}, {'title': 'Women Who Love Too Much'}]
https://openlibrary.org/search.json?q=lcc:[LB5 TO LB3640]&fields=title&limit=3&subject="gay"&subject="homosexuality"&subject="lesbianism"&subject="education"&subject="school"&subject="family"
[{'title': 'Stick Man'}, {'title': "Représentation du monde chez l'enfant"}, {'title': "A Mother's Reckoning"}]
https://openlibrary.org/search.json?q=lcc:[LC8 TO LC6691]&fields=title&limit=3&subject="gay"&subject="homosexuality"&subject="lesbianism"&subject="education"&subject="school"&subject="family"
[{'title': 'Ko munsŏ chipsŏng'}, {'title': 'Let her fly'}, {'title': 'Household education. By Harriet Martineau'}]
https://openlibrary.org/search.json?q=lcc:[BF1 TO BF990]&fields=title&limit=

Trying Rao's Entropy!

In [None]:
import pandas as pd
import numpy as np
import tensorflow_hub
from sentence_transformers import SentenceTransformer #"tensorflow>=1.7.0", tensorflow-hub

In [None]:
# Load model for embeddings
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
model = tensorflow_hub.load(module_url)

print ("module %s loaded" % module_url)

def embed(input):
  return model(input)

This is not a measure of entropy over the syllabus, this is a comparison between the syllabus and the diversity area topics

In [None]:
# Load pre-trained model for sentence embeddings

# Define Rao's entropy formula
def raos_entropy(all_cats):
    #i'm aware this is presently incorrect bc the probably of topics is not btwn 0 and 1, but This Is a Start!
    entropy = 0.0

    # Calculate pairwise cosine distances between topics
    tags = list(all_cats.keys())
    embeddings = embed(tags) #needs to be embedded over one space
    distance_matrix = np.inner(embeddings, embeddings) #cosine sim

    # rao's entropy
    rqe = 0.0

    for i, cat_i in enumerate(tags):
        for j, cat_j in enumerate(tags):
            p_i = all_cats.get(cat_i, 0) # Probability for category i (fall through if 0)
            p_j = all_cats.get(cat_j, 0)
            #print(p_i, p_j)
            # cosine distance (1 - cosine similarity)
            distance = 1 - distance_matrix[i, j]
            #print(distance)
            
            rqe += p_i * p_j * distance

    return rqe

# Calculate diversity using Rao's entropy
all_cats = {**prop_syll, **prop_div}
entropy = raos_entropy(all_cats)
print(f"Rao's Entropy (Diversity): {entropy}")

Rao's Entropy (Diversity): 2.07819346733668


To be implemented: RQE for set of book recommendations!