# Analyses of MARC21 metadata records - bibliographies


This Notebook contains code for the analyses described in the paper 
_Ohren, O. Getting meaning out of metadata - analysis of selected bibliographies at the National Library of Norway. Oslo 2024_

In [None]:
from pprint import pprint as pp
import re #regular expressions
import requests
import urllib, urllib.parse     # used for percent-encoding strings
import xml
from xml import etree
from xml.etree import ElementTree
from io import StringIO
import pymarc
from pymarc import Record, marcxml, Field, XMLWriter
from collections import Counter
import pandas as pd
import matplotlib.pyplot as plt
import csv
import difflib
from difflib import SequenceMatcher
import itertools
import numpy
import unicodedata as ucd
# debugging
import pdb
import traceback
#data storage
import sqlite3
from wordcloud import WordCloud
#for Analysis 5 (geomapping)
import geocoder
import folium


In [None]:
import os
import sys
# Put the local tkpy checkout on sys.path so the project modules below can be imported.
# NOTE(review): the path is relative to the notebook's working directory - confirm before running elsewhere.
repopath=os.path.abspath('../Gitrepos/tkpy')
if repopath not in sys.path:
    sys.path.append(repopath)
# Project-local helper modules (presumably located in the tkpy repository added above).
import harvest, iomarc, iogeneral, marcpy1, marcpy2
from marcpy1 import valueCounter, leaderValues, similar
from marcpy2 import select, selectAssigned, filterRecordsByControlField, filterRecordsByLeader, filterRecords
from marcpy2 import fetchRecordSimple, indexRecords
from iomarc import printFields, printFieldss, writeMarcToFile 
from iogeneral import writeToFile, readlines

## Utilities and string functions

In [None]:
def trim(strng):
    """Return strng as a single line.

    Every line is stripped of leading and trailing whitespace, the lines are
    joined with one space, and the joined text is stripped once more.
    A blank line inside strng still contributes a separator, so it shows up
    as two consecutive spaces in the result.
    """
    pieces = [line.strip() for line in strng.splitlines()]
    return ' '.join(pieces).strip()

def oneLineStr(s):
    """Return a one-line version of s.

    Line separators are removed and each line loses its leading and trailing
    whitespace; the lines are separated by a single space.
    """
    parts = []
    for line in s.splitlines():
        parts.append(line.strip() + ' ')
    return ''.join(parts).strip()

def reverseDict (d):
    """Invert a dictionary.

    Returns a new dict whose keys are the values of d and whose values are
    lists of the keys of d that map to that value, in the iteration order
    of d.

    Only entries whose value is exactly a str, int or float are included;
    entries with any other value type (lists, bool, None, ...) are skipped.
    """
    r = dict()
    # Single pass: group each key under its value instead of rescanning
    # the whole dict for every distinct value.
    for key, value in d.items():
        if type(value) in (str, int, float):
            r.setdefault(value, []).append(key)
    return r

def mergeDicts(dictionaries, nullVal=0):
    """Merge several dictionaries into one "matrix" dictionary.

    dictionaries: a list of dicts.
    nullVal: the value inserted where a dict has no entry for a key.

    Returns a dict with the sorted union of all keys. Each value is a list
    with one element per input dict (in input order): that dict's value for
    the key, or nullVal if the dict lacks the key.
    """
    #collect all keys
    allkeys = set()
    for d in dictionaries:
        allkeys.update(d)
    #one row per key; membership is tested directly on the dict
    return {k: [d[k] if k in d else nullVal for d in dictionaries]
            for k in sorted(allkeys)}

def reduceNumDict(dictionary, threshold, aggrKey='other'):
    """Collapse the small entries of a numeric dictionary into one key.

    dictionary: dict with numeric values, e.g. {'nor': 43, 'dan': 5}.
    threshold: values strictly less than this are summed up.
    aggrKey: the key that receives the sum of the small values.

    Returns a new dict with all entries at or above threshold, plus aggrKey
    holding the aggregated sum (0 if nothing was below the threshold).
    If dictionary already has an entry for aggrKey that was kept, the
    aggregated sum is added to it instead of replacing it.
    """
    res = dict()
    aggrval = 0
    for k, v in dictionary.items():
        if v < threshold:
            aggrval += v
        else:
            res[k] = v
    # Add to, rather than overwrite, a pre-existing aggrKey entry
    res[aggrKey] = res.get(aggrKey, 0) + aggrval
    return res

def flatten(l):
    """Flatten l by one level.

    Elements of l that are lists are replaced by their own elements; all
    other elements (including tuples and deeper nested lists) are kept as is.
    """
    result = []
    for elt in l:
        result += elt if type(elt) is list else [elt]
    return result

def compress(seq, toRemove):
    """Return a copy of seq without the elements found in toRemove.

    A tuple is returned when seq is a tuple; otherwise a list is returned.
    """
    kept = [elt for elt in seq if elt not in toRemove]
    return tuple(kept) if type(seq) is tuple else kept

def removePrefixes(l):
    """Drop every element of l that is a proper prefix of another element.

    Elements are compared through their str() form, so non-string elements
    are handled as strings. Only one level is handled (no nesting).
    The result is built from a set: duplicates are removed and the order of
    the returned list is not defined.
    """
    texts = [str(e) for e in l]
    deleted = set()
    for e1, s1 in zip(l, texts):
        if any(s2.startswith(s1) and len(s1) < len(s2) for s2 in texts):
            deleted.add(e1)
    return list(set(l).difference(deleted))

def iscapitalized(strng):
    """Tell whether strng is capitalized.

    True when strng starts with an uppercase letter and either
    - it is a single character, or
    - the rest of its cased letters are lowercase, or
    - it ends with a full stop (e.g. an abbreviation or initial).
    False otherwise, including for the empty string.
    """
    if strng == '' or not strng[0].isupper():
        return False
    if len(strng) == 1:
        return True
    return strng[1:].islower() or strng.endswith('.')

def transpose(lstlst):
    """Transpose a list of sequences.

    lstlst is a list of sequences. Returns a list of lists in which list i
    holds element i of every input sequence. The result is truncated to the
    length of the shortest input sequence, e.g.
        transpose([[1,2,3], [4,5,6], [7,8,9,10]])
        returns [[1,4,7], [2,5,8], [3,6,9]]
    An empty lstlst gives an empty list.
    """
    # zip stops at the shortest sequence and yields nothing for no input
    return [list(column) for column in zip(*lstlst)]

def sum(lst, start=0):
    """Return start plus the elements of lst, added from left to right.

    NOTE: this definition shadows the builtin sum() for every later cell in
    the notebook. The optional start argument mirrors the builtin's second
    parameter, so calls written for the builtin, such as sum(lists, []),
    keep working. The start object itself is never modified.
    Plain left-to-right addition is kept on purpose, so float totals stay
    exactly as computed before.
    """
    res = start
    for i in lst:
        res = res + i
    return res
    

# Bibliography-specific functions

In [None]:
def overlap(bibl1, codebibl2):
    """Return the records of bibl1 that also belong to another data set.

    The other data set is identified by codebibl2, which is looked for in
    field 913 through filterRecords. The first character of codebibl2 is
    accepted in lower as well as upper case.
    An empty bibl1 gives an empty list without calling filterRecords.
    """
    first, rest = codebibl2[0], codebibl2[1:]
    #The parentheses around the alternation are necessary!
    pattern = '(' + first.lower() + '|' + first.upper() + ')' + rest
    if bibl1 == []:
        return []
    return filterRecords(bibl1, pattern, ['913'])

def authorGender(autrecs, biblrecs):
    """Classify bibliographic records by the gender of their main author.

    autrecs: authority records; gender is read from their field 375.
    biblrecs: bibliographic records.

    Returns a list of 3 lists:
    1. the biblrecs with female main authors (375 starts with 'f' or 'F')
    2. the biblrecs with male main authors (375 starts with 'm' or 'M')
    3. the biblrecs with no usable gender info on the main author: the
       author ID is not found in autrecs, the authority has no 375 field,
       or the 375 value is empty or starts with anything else.
    Only records with a 100$0 (authorised main author) are considered, so
    biblrecs minus the union of the 3 lists are the records without a main
    author or without an author ID.
    """
    females=[]
    males=[]
    noGenderInfo=[]
    #Create an index of autrecs, for more efficient retrieval
    autindx=indexRecords(autrecs)
    #Look only at those with 100$0 field (has main author (person) and is authorised)
    withMainAuth=selectAssigned(biblrecs,'100', subfields=['0'])
    for rec in withMainAuth:
        #remove the 10-character prefix from $0
        #(presumably a source prefix like '(NO-TrBIB)' - TODO confirm)
        autid = rec.get_fields('100')[0].get_subfields('0')[0][10:]
        gender = ''
        if autid in autindx.keys():
            gf = autindx[autid].get_fields('375')
            if gf != []:
                #[:1] instead of [0]: an empty 375 value must not raise
                gender = gf[0].value()[:1]
        if gender in {'f', 'F'}:
            females.append(rec)
        elif gender in {'m', 'M'}:
            males.append(rec)
        else:
            #Every record with 100$0 lands in exactly one list; values that
            #are neither f nor m are no longer dropped silently.
            noGenderInfo.append(rec)
    return [females, males, noGenderInfo]

def authorGender2(biblrecs, girls, boys):
    """Classify bibliographic records by main author gender, guessed from first names.

    biblrecs: bibliographic records.
    girls, boys: lists of first names extracted from SSB
                 (https://data.ssb.no/api/v0/no/console).

    Returns a list of 3 lists:
    1. the biblrecs where a forename of the main author is in girls
    2. the biblrecs where a forename is in boys (and none is in girls)
    3. the biblrecs where no forename matches either list
    Only records with a 100$a (main author with a name) are considered.
    """
    females=[]
    males=[]
    noGenderInfo=[]
    #Build the name sets once instead of once per record
    girlNames=set(girls)
    boyNames=set(boys)
    #Look only at those with 100$a field (has main author (person) and a name)
    withMainAuth=selectAssigned(biblrecs,'100', subfields=['a'])
    #Identify the individual first names in 100$a
    for rec in withMainAuth:
        names=set(forenames(rec.get_fields('100')[0].get_subfields('a')[0]))
        if girlNames & names:
            females.append(rec)
        elif boyNames & names:
            males.append(rec)
        else:
            noGenderInfo.append(rec)
    return [females, males, noGenderInfo]

def forenames (namestring):
    """Return the list of forenames found in a 100$a name string.

    The string is expected on the form <last name>, <forenames>, e.g.
      Kvamme, Ole Andreas   --> returns ['Ole', 'Andreas']
      Downs, Brian H.       --> returns ['Brian']
    Everything after the first comma is split on single spaces.
    Abbreviations/initials (one-character tokens and tokens ending with a
    full stop) are ignored. Without a comma the result is empty.
    """
    _, _, given = namestring.partition(',')
    res = []
    for token in given.strip().split(' '):
        token = token.strip()
        if len(token) > 1 and not token.endswith('.'):
            res.append(token)
    return res
    

def publishedYears(records, groupSz=0):
    """Return a DataFrame with the number of records per publication year.

    The year is taken from positions 7-10 of field 008 via valueCounter.
    The frame is indexed by year and has one column with the counts.
    groupSz is accepted but not used.
    """
    countsByYear = valueCounter(records, ['008'], slice=(7,11))   #sorted by keys (years)
    years = countsByYear.keys()
    counts = countsByYear.values()
    return pd.DataFrame(counts, index=years)

def publishedBetween(records, fromYear=0, toYear=2040):
    """Return the number of records published in the given interval.

    The publication year is read from positions 7-10 of field 008.
    fromYear is inclusive and toYear is exclusive. Year values that are not
    all digits are ignored.
    """
    yrCounter = valueCounter(records, ['008'], slice=(7,11))   #sorted by keys (years)
    total = 0
    for year, count in yrCounter.items():
        if not year.isdigit():
            continue
        if fromYear <= int(year) < toYear:
            total += count
    return total

def textvolume (records):
    """Return the approximate total number of text pages or leaves in records.

    This is the third element of the tuple computed by textvolumeInfo.
    """
    _, _, pages = textvolumeInfo(records)
    return pages

def textvolumeInfo (records):
    """Calculate the approximate total number of pages or leaves in records.

    The text records are those with 'a' in Leader position 6; their extent
    is computed from 300$a by textExtent.
    (An earlier variant selected on 007/00-01 = 'ta' instead.)

    Returns a tuple of 3 elements:
    (1) the number of records,
    (2) the number of text records,
    (3) the number of text pages or leaves.
    """
    textrecs = filterRecordsByLeader(records, 'a', posint=(6,7))
    extents = [textExtent(rec) for rec in textrecs]
    return (len(records), len(textrecs), sum(extents))

def textExtent(record):
    """Return the number of pages or leaves stated in 300$a of record.

    Only the first 300 field and its first $a subfield are used.
    Returns 0 when the record has no 300 field or the field has no $a.
    """
    f300 = record.get_fields('300')
    if f300 == []:
        return 0
    sf300a = f300[0].get_subfields('a')
    if sf300a == []:
        return 0
    return gatherTextExtent(sf300a[0])

def gatherTextExtent(extentstring):
    """Return the total number of pages or leaves stated in a 300$a value.

    extentstring is the total content of 300$a. A leading volume statement
    of the form '<n> b. i 1' or '<n> b.' is removed first (presumably
    "n volumes (in 1)" - TODO confirm); the rest should detail the pages.
    The remainder is split on commas and every part is evaluated by
    calcTextExtent; the results are added up.
    """
    extentstr = extentstring
    # The longer form is tried first over the whole string, then the short
    # one. \d+ (not \d) so that counts of 10 or more are removed completely
    # and do not leave a stray digit that would be counted as pages.
    for pattern in (r'\d+ b\. i 1', r'\d+ b\.'):
        if re.search(pattern, extentstr) is not None:
            extentstr = re.sub(pattern, '', extentstr, count=1)
            break
    ext = 0
    for extentcomp in extentstr.split(','):
        ext += calcTextExtent(extentcomp)
    return ext

def calcTextExtent(extentstring):
    """Calculate the number of pages or leaves expressed by extentstring.

    extentstring is 1 statement in 300$a (which may contain several
    statements separated by comma). Example of 300$a:
        1 bl., 4,  [2] s., S. 595-1088, [2] s.
    This contains 5 extent statements, to be processed separately here.
    Examples of statements:
        148 s. | 154 bl. | S. 95-96 | Side 95-96 | S. 96-[118] | S. [103]-130
        S. [109]-[121] | S. 67 | [6] | 134 | V

    The first rule that applies decides the result:
    1. a number followed by a unit (s./S., sider/Sider, bl./Bl., blad/Blad)
       gives that number;
    2. a page span, with or without 'S.'/'Side' in front, gives the number
       of pages from first to last page inclusive (S. 95-96 is 2 pages).
       A reversed span such as '198-9' counts as 1 page, never negative;
    3. a single page like 'S. 67' gives 1;
    4. a number without unit (as in 300$a = '134, 56 s.') is taken as a
       number of pages or leaves;
    otherwise (e.g. roman numerals) the result is 0.
    """
    unit = r'(?:[sS]\.|[sS]ider|[bB]l\.|[bB]lad)'
    pagePrefix = r'(?:[sS]\.|[sS]ide)'
    #1. Number of units, like 15 s. or [2] s. or 15 blad.
    #   The number is taken from the match itself, not from the first
    #   digits anywhere in the statement.
    found = re.search(r'\[?(\d+)\]?\s*' + unit, extentstring)
    if found is not None:
        return int(found.group(1))
    #2. Spans, first with a page prefix, then without
    #   (the latter occurs when 300$a includes e.g. 's. [1]-284, 285-467')
    found = re.search(pagePrefix + r'\s*\[?(\d+)\]?\s*-\s*\[?(\d+)', extentstring)
    if found is None:
        found = re.search(r'(\d+)\]?\s*-\s*\[?(\d+)', extentstring)
    if found is not None:
        firstPage = int(found.group(1))
        lastPage = int(found.group(2))
        return max(lastPage - firstPage + 1, 1)
    #3. Single pages, like S. 67
    if re.search(pagePrefix + r'\s*\[?\d+', extentstring) is not None:
        return 1
    #4. Page number without unit, like in 134
    found = re.search(r'\d+', extentstring)
    if found is not None:
        return int(found.group(0))
    return 0
        


# 1. Read the data sets

In [None]:
#Re-read the data sets, because of suspected errors in the 600 fields

# NOTE(review): handler is not referenced again in this cell.
handler=pymarc.marcxml.XmlHandler()
# Parse one MARCXML file per bibliography into a list of records.
solstad=pymarc.marcxml.parse_xml_to_array('solstad2022.xml') #actually returns a list
proysen=pymarc.marcxml.parse_xml_to_array('proysen2022.xml')
hamsun=pymarc.marcxml.parse_xml_to_array('hamsun2022.xml')
collett=pymarc.marcxml.parse_xml_to_array('collett2022.xml')
bjornson=pymarc.marcxml.parse_xml_to_array('bjornson2022.xml')
norskeboker=pymarc.parse_xml_to_array('norske-boker-1519-1850_2022.xml')
bibl1814=pymarc.parse_xml_to_array('1814_2022.xml')
noram=pymarc.parse_xml_to_array('noram2022.xml')
samisk=pymarc.parse_xml_to_array('samisk2022.xml')
# The same nine data sets in three forms; all three use the same order,
# which later cells rely on when they combine bibliografier with bibliografiNavn.keys().
bibliografier=[solstad,proysen, hamsun, collett, bjornson, norskeboker, bibl1814, noram, samisk]
# display name -> list of records
bibliografiNavn={'solstad':solstad, 'prøysen': proysen , 
                 'hamsun':hamsun, 'collett':collett, 'bjørnson': bjornson, 
                 'norske bøker': norskeboker, '1814': bibl1814, 'norsk-amerikansk': noram, 'samisk': samisk}
# display name -> name of the variable holding the records
bibliografiVar={'solstad':'solstad', 'prøysen': 'proysen' , 
                 'hamsun':'hamsun', 'collett':'collett', 'bjørnson': 'bjornson', 
                 'norske bøker': 'norskeboker', '1814': 'bibl1814', 'norsk-amerikansk': 'noram', 'samisk': 'samisk'}

In [None]:
# Print the number of records in each bibliography, in alphabetical order of name.
for k in sorted(bibliografiNavn.keys()):
    print(k+': ', len(bibliografiNavn[k]))

# 2. Inspect and analyse the bibliographies

Pandas cheat sheets: 
https://www.dataquest.io/blog/pandas-cheat-sheet/
https://towardsdatascience.com/my-python-pandas-cheat-sheet-746b11e44368

User guide:
https://pandas.pydata.org/docs/user_guide/10min.html  (intro)
https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html (IO facilities)
https://pandas.pydata.org/docs/user_guide/text.html  (working with text)
https://pandas.pydata.org/pandas-docs/stable/user_guide/visualization.html (visualization, plotting)

MatPlotLib:
User guide:
https://matplotlib.org/users/index.html


## 2.1. Overview, number of records, pages, material type

### 2.1.1. Number of records

In [None]:
# Bar chart of the number of records per bibliography, saved as PDF and JPEG.
# bibliografier and bibliografiNavn list the data sets in the same order.
bLength=list(map(lambda x: len(x), bibliografier))
overview= pd.Series(bLength, index=bibliografiNavn.keys())
df_overview=pd.DataFrame(overview, columns=['Antall poster'])
fig0=df_overview.plot(kind='bar', legend=False, fontsize=16, figsize=(12,7)).get_figure()
fig0.savefig('overview.pdf', bbox_inches='tight')
fig0.savefig('overview.jpeg', bbox_inches='tight')
#df_overview.plot(kind='bar', legend=False, fontsize=14, figsize=(8,4)).get_figure()

### 2.1.2. Type of material (medium)

In [None]:
# Count the values of Leader position 6 (field '000', slice 6-7) per bibliography
# and merge the counts into one dict: value -> list of counts, one per bibliography.
dicts=[]
for bn in bibliografiNavn.keys():
    dicts.append(valueCounter(bibliografiNavn[bn], ['000'], slice=(6,7)))
mediaTypes=mergeDicts(dicts)
mediaTypes
#Only distinguish between language material and others

In [None]:
#Consider only types a, c, i and j. The rest is summed up into 'other'
#This is reduction based on keys, not on values
dictsReduced=[]
keep=['a', 'c', 'i', 'j']
for bn in bibliografiNavn.keys():
    tmp=valueCounter(bibliografiNavn[bn], ['000'], slice=(6,7))
    reduced=dict()
    for k in keep:
        #A bibliography may lack a type entirely: count it as 0 instead of
        #failing with KeyError. This also keeps every row at 5 values.
        reduced[k]=tmp.get(k, 0)
    #Sum up the rest
    oth=0
    for k in tmp.keys():
        if k not in keep:
            oth+=tmp[k]
    reduced['other']=oth
    dictsReduced.append(reduced)
mediaTypesRed=mergeDicts(dictsReduced)
mediaTypesRed

In [None]:
# Show the reduced counts as one row of values per bibliography.
list (map (lambda x: list(x.values()), dictsReduced))

In [None]:
# Table of material types per bibliography. The column labels follow the
# key order of the reduced dicts: a, c, i, j, other.
data=list (map (lambda x: list(x.values()), dictsReduced))
df_media=pd.DataFrame(data, 
                      columns=['Text', 'Sheet music', 'Audio (not music)', 'Audio (music)', 'Other'], 
                      index=bibliografiNavn.keys())

s=df_media.style
s

In [None]:
# Number of records in the Collett bibliography.
len(collett)

### 2.1.3 Size of textual content

In [None]:
# Bar chart of the approximate number of pages/leaves per bibliography
# (as computed by textvolume), saved as PDF and JPEG.
bPages=list(map(lambda x: textvolume(x), bibliografier))
overview2= pd.Series(bPages, index=bibliografiNavn.keys())
df_overview2=pd.DataFrame(overview2, columns=['Antall sider/blad'])
fig02=df_overview2.plot(kind='bar', legend=False, fontsize=16, figsize=(12,7)).get_figure()
fig02.savefig('overview2.pdf', bbox_inches='tight')
fig02.savefig('overview2.jpeg', bbox_inches='tight')
#df_overview2.plot(kind='bar', legend=False, fontsize=14, figsize=(8,5)).get_figure()

In [None]:
#Criterion for text: 007/00-01 = 'ta'. Do not use! It does not include text in computer files; more than this is text.
# NOTE(review): textvolumeInfo as defined above selects on Leader position 6 = 'a',
#     so this cell gives the same figures as the next one - confirm the heading is still relevant.
# size is (number of records, number of text records, number of pages/leaves).
# NOTE(review): the divisions fail with ZeroDivisionError for a data set without (text) records.
for bn in bibliografiNavn.keys():
    size=textvolumeInfo(bibliografiNavn[bn])
    pagesPerRecord=int(size[2]/size[0])
    pagesPerTextRecord=int(size[2]/size[1])
    print(bn, size, pagesPerRecord, 'per record', pagesPerTextRecord, 'per textual record')

In [None]:
#Criterion for text: Using LDR/06 = 'a' (includes printed and digital text, but not audiobooks)
# Collect the average number of pages per text record for each bibliography
# (truncated to an integer) and put the figures in a one-column DataFrame.
pagesPerRecList=[]
for bn in bibliografiNavn.keys():
    size=textvolumeInfo(bibliografiNavn[bn])
    pagesPerRecord=int(size[2]/size[0])
    pagesPerTextRecord=int(size[2]/size[1])
    pagesPerRecList.append(pagesPerTextRecord)
    #print(bn, size, pagesPerRecord, 'per record', pagesPerTextRecord, 'per textual record')
df_pagePerTextrec=pd.DataFrame(pagesPerRecList, 
                      columns=['Number of pages per text resource'],
                      index=bibliografiNavn.keys()
                    )

s=df_pagePerTextrec
s    


In [None]:
# Line chart of pages per text record, saved as PDF and JPEG.
figpgPerRec=df_pagePerTextrec.plot(kind='line', legend=False, fontsize=16, figsize=(12,7), rot=270).get_figure()
figpgPerRec.savefig('pgPerTextRec_line.pdf', bbox_inches='tight')
figpgPerRec.savefig('pgPerTextRec_line.jpeg', bbox_inches='tight')

## 2.2. Bibliographical level

In [None]:
# Bibliographic level: monographs or articles/chapters?
# Leader position 7 is matched against 'm', 'a', 's' and 'i'.

bibliografiskNiva=[]
for bn in bibliografiNavn.keys():
    b=bibliografiNavn[bn]
    mono=filterRecordsByLeader(b,'m', (7,8))
    comp=filterRecordsByLeader(b,'a', (7,8))
    ser=filterRecordsByLeader(b,'s', (7,8))
    integr=filterRecordsByLeader(b,'i', (7,8))
    # tuple: (name, monographs, articles/chapters, serials, integrating, total)
    bibliografiskNiva.append((bn, len(mono), len(comp), len(ser), len (integr), len(b)))

# Print counts and rounded percentages; the count of 'i' records (tp[4]) is not printed.
for tp in bibliografiskNiva:
    print (tp[0]+':','\t', 'Monografier:', tp[1],'('+ str(round((100*tp[1])/tp[5]))+'%)', '\t', 
           'Artikler/Kapitler:', tp[2],'('+ str(round((100*tp[2])/tp[5]))+'%)','\t', 'Serier:', tp[3], 
           '\t','Hele bibliografien:', tp[5])

In [None]:
# Bibliographic level: monographs or articles/chapters? Uses DataFrames

bibliografiskNiva=[]
for bn in bibliografiNavn.keys():
    b=bibliografiNavn[bn]
    mono=filterRecordsByLeader(b,'m', (7,8))
    comp=filterRecordsByLeader(b,'a', (7,8))
    ser=filterRecordsByLeader(b,'s', (7,8))
    integr=filterRecordsByLeader(b,'i', (7,8))
    #Serials and integrating resources are merged into 'Other'
    bibliografiskNiva.append([len(mono), len(comp), len(ser)+len(integr), len(b)])
df_bibliografiskNiva=pd.DataFrame(bibliografiskNiva, index=bibliografiNavn.keys(),
                                  columns=['Monographs', 'Articles/Chapters', 'Other','Total number'])
#After transposing: one column per bibliography, one row per category (last row = total)
df_bibliografiskNiva=df_bibliografiskNiva.T
blevel=df_bibliografiskNiva[:-1].T.plot(kind='bar', stacked=True, fontsize=24, 
                                      title='Bibliographic level (absolute values)', 
                                      legend=False,figsize=(12,8)).get_figure()
plt.legend(bbox_to_anchor=(1,1), fontsize='xx-large')
blevel.savefig('biblLevelAbs.pdf', bbox_inches='tight')
blevel.savefig('biblLevelAbs.jpeg', bbox_inches='tight')
#Make df with relative distribution of material type
reldf=pd.DataFrame(columns=bibliografiNavn.keys(), 
                               index=['Monographs', 'Articles/Chapters', 'Other', 'Total number '])
for bn in bibliografiNavn.keys():
    #Write through .loc/.iloc on the frame itself. Chained assignment
    #(reldf[bn][:-1] = ...) may act on a temporary copy, and integer keys
    #like [-1] on a label-indexed Series are not reliably positional.
    #to_numpy() keeps the assignment positional, as before.
    reldf.loc[reldf.index[:-1], bn] = ((df_bibliografiskNiva[bn].iloc[:-1] 
                                        / df_bibliografiskNiva[bn].iloc[-1]) * 100).round(0).to_numpy()
    reldf.loc[reldf.index[-1], bn]=100  #df_bibliografiskNiva[bn][-1]
blevelx=reldf[:-1].T.plot(kind='bar', stacked=True, fontsize=24, title='Bibliographic level (relative distribution)', legend=False,
                       figsize=(12,8), rot=270).get_figure()
blevelx.tight_layout()
plt.legend(bbox_to_anchor=(1,1), fontsize='xx-large')
blevelx.savefig('biblLevel.pdf', bbox_inches='tight') # the latter must be included to include the legend in the file
blevelx.savefig('biblLevel.jpeg', bbox_inches='tight')

#Line diagram, to compare with number of pages per text record. 
# Only include monographs portion (= 1st value in each reldf[bn])
monogr_percentage=[list(map(lambda x: reldf[x].iloc[0],bibliografiNavn.keys()))]
monogr_df=pd.DataFrame(data=monogr_percentage, columns=bibliografiNavn.keys()) 
mlevelx_line=monogr_df.T.plot(kind='line', fontsize=24, title='Percentage of monographs', legend=True,
                       figsize=(12,8), rot=270).get_figure()   #rot=270 gives vertical Xticks labels
mlevelx_line.tight_layout()
plt.legend(bbox_to_anchor=(1,1), fontsize='xx-large')
mlevelx_line.savefig('monogrLevelLine.pdf', bbox_inches='tight') # the latter must be included to include the legend in the file
mlevelx_line.savefig('monogrLevelLine.jpeg', bbox_inches='tight')
reldf


## 2.3  Overlap between data sets

In [None]:
#The datasets are parsed separately, hence set intersection functions of no use

#Are there records that are found in all the bibliographies? Answer: 0
#Start with the records of the first bibliography and narrow the list down
#with every following bibliography name; stop as soon as nothing is left.
#(Iterating over the names from the second one onwards avoids indexing
# past the end of the key list when the overlap never becomes empty.)
overlapAll=bibliografiNavn[list(bibliografiNavn.keys())[0]]
for k2 in list(bibliografiNavn.keys())[1:]:
    if overlapAll == []:
        break
    overlapAll=overlap(overlapAll, k2)

In [None]:
# Show the records of the first bibliography in bibliografiNavn.
bibliografiNavn[list(bibliografiNavn.keys())[0]]

In [None]:
#pairwise overlap
# Row bn1, column bn2: the number of records in bibliography bn1 selected by
# overlap() for the name bn2. The diagonal pairs each bibliography with its own name.
overlapPairs=[]
for bn1 in bibliografiNavn.keys():
    overl=[]
    for bn2 in bibliografiNavn.keys():
        overl.append(len(overlap(bibliografiNavn[bn1], bn2)))
    overlapPairs.append(overl)

overlapsFrame=pd.DataFrame(overlapPairs, index=bibliografiNavn.keys(),columns=bibliografiNavn.keys())
overlapsFrame
    

In [None]:
#list(map(lambda x: len(x), bibliografier))
#print(145/4200)
# Inspect the records of 'norske bøker' that overlap with '1814':
# print their 100, 008, 245 and 650 fields.
x=overlap(bibliografiNavn['norske bøker'], '1814')
printFields(x, ['100','008', '245', '650'])

In [None]:
#Remove overlap with same bibliography
for k in bibliografiNavn.keys():
    #Set the diagonal cell through .loc on the frame itself; the chained
    #form overlapsFrame[k][k]=0 may act on a temporary copy.
    overlapsFrame.loc[k, k]=0
#Stacked bar chart of the pairwise overlaps, saved as PDF and JPEG
fig2=overlapsFrame.plot(kind='bar', title='Overlap between bibliographies', legend=False, figsize=(14,10),
                        fontsize=24, stacked=True).get_figure()
#fig2.tight_layout()
plt.legend(bbox_to_anchor=(1,1), fontsize='xx-large')
#fig1.savefig('mtype.pdf', bbox_inches='tight')
fig2.savefig('overlap.pdf', bbox_inches='tight')
fig2.savefig('overlap.jpeg', bbox_inches='tight')

## 2.4. Studying the authors and contributors

In [None]:
#Degree of unauthorised responsibles

def unauthorisedAgentsInfo(records, fieldtags, autrefSubfield):
    """Calculate the proportion of records with unauthorised agents.

    A record counts as "with field" when selectAssigned finds one of the
    tags in fieldtags in it, and as "unauthorised" when, for one of those
    tags, selectAssigned does not return it for the subfield autrefSubfield.

    Returns a tuple
    (number of unauthorised records, number of records with any of the
     fields, ratio in % rounded to an integer).
    The ratio is 0 when no record has any of the fields.
    """
    unauth = set()
    withFld = set()
    for fld in fieldtags:
        having = set(selectAssigned(records, fld))
        withFld |= having
        unauth |= having.difference(selectAssigned(records, fld, autrefSubfield))
    if not withFld:
        #Nothing to compare against: avoid dividing by zero
        return (0, 0, 0)
    return (len(unauth), len(withFld), round(100*len(unauth)/len(withFld)))

def unauthorisedAgents(records, fieldtags, autrefSubfield):
    """Return the records with at least one unauthorised agent field.

    For each tag in fieldtags, the records that selectAssigned finds for the
    tag, minus those it finds for the tag with subfield autrefSubfield, are
    collected. The result is a list without duplicates, in no defined order.
    """
    unauth = set()
    for fld in fieldtags:
        withFld = set(selectAssigned(records, fld))
        unauth |= withFld.difference(selectAssigned(records, fld, autrefSubfield))
    return list(unauth)

In [None]:
# Print, for six of the bibliographies, the tuple from unauthorisedAgentsInfo:
# (records with unauthorised agents, records with agent fields, ratio in %),
# looking at fields 100/110/111/700/710/711 and subfield $0.
print('Poster med uautoriserte agenter:')
print('Prøysen:', unauthorisedAgentsInfo(proysen, ['100','110','111','700', '710','711'], '0'))
print('Solstad:', unauthorisedAgentsInfo(solstad, ['100','110','111','700', '710','711'], '0'))
print('Collett:', unauthorisedAgentsInfo(collett, ['100','110','111','700', '710','711'], '0'))
print('Hamsun:', unauthorisedAgentsInfo(hamsun, ['100','110','111','700', '710','711'], '0'))
print('Bjørnson:', unauthorisedAgentsInfo(bjornson, ['100','110','111','700', '710','711'], '0'))
#print('Undset:', unauthorisedAgentsInfo(undset, ['100','110','700', '710'], '0'))
print('Norske bøker:', unauthorisedAgentsInfo(norskeboker, ['100','110','111','700', '710','711'], '0'))


### 2.4.1 Exporting and reading all authorities

In [None]:
#Based on Export function in API (https://authority.bibsys.no/authority/)
#Uses a downloaded export
# expPath is the folder holding the export files and fstr the common file name prefix.
# NOTE(review): expPath is an absolute path on one user's machine - adjust before running elsewhere.
expPath='C:/Users/oddruno/OneDrive - Nasjonalbiblioteket/Dokumenter/Verksregister/SHARE-VDE/Authorities export/20240202/'
fstr='2024-01-31-010002_'
def toStr3(num):
    """Return num as a string, left-padded with zeros to at least 3 characters.

    Numbers below 10 get two leading zeros, numbers below 100 get one, and
    larger numbers are returned unpadded.
    """
    if num < 10:
        return '00' + str(num)
    if num < 100:
        return '0' + str(num)
    return str(num)

def toStr2(num):
    """Return num as a string, left-padded with a zero to at least 2 characters.

    Numbers below 10 get one leading zero; all larger numbers, including
    those of 100 and above, are returned unpadded.
    """
    if num < 10:
        return '0' + str(num)
    # Covers num >= 100 as well, which previously left the result unassigned
    return str(num)

In [None]:
#Read and parse all authorities
# The export consists of numbered files <expPath><fstr>000.xml, 001.xml, ...;
# every file is parsed and its records are appended to authorities.
authorities=[]
#The range end point must be 1 + the number of files. 
# NOTE(review): range(0,243) reads the files numbered 000-242 - check that this matches the export.
for i in range(0,243):
    flind=toStr3(i)
    recs=pymarc.parse_xml_to_array(expPath + fstr + flind + '.xml')
    authorities.extend(recs)

#Extract the set which have gender info
# (the authorities for which selectAssigned finds a field 375)
withGender=selectAssigned(authorities, '375')

In [None]:
# Select the authorities by 'kat1', 'kat2' and 'kat3' in field 901, then print
# the counts and their rounded percentage shares of all authorities.
# The trailing numbers are counts noted by the author from an earlier run.
kat1=select(authorities, '901', ['kat1']) #300740
kat2=select(authorities, '901', ['kat2']) #1727664 
kat3=select(authorities, '901', ['kat3']) #178327 
print(len(kat1), len(kat2), len(kat3), 'sum:', len(kat1)+len(kat2)+len(kat3), len(authorities)) #authorities=2208423
sumkat=len(kat1)+len(kat2)+len(kat3)
sumtot=len(authorities)
k1perc=round(len(kat1)*100/sumtot, 0)
k2perc=round(len(kat2)*100/sumtot, 0)
k3perc=round(len(kat3)*100/sumtot, 0)
print('prosentfordeling:', k1perc, k2perc, k3perc, 'totalt:', k1perc+k2perc+k3perc )

In [None]:
# Person authorities are those with a field 100; personsWG are the persons with a field 375 (gender).
persons=selectAssigned(authorities, '100')
personsWG=selectAssigned(persons, '375')
print("% of persons with gender:", round(len(personsWG)*100/len(persons),0), '%')
print('% of authorities which are persons:', round(len(persons)*100/len(authorities), 0), '%')
len(persons)

In [None]:
#Crosscheck
# Compare the persons with gender info against all authorities with gender info.
# NOTE(review): == on two lists is True only if they hold equal records in the same order.
print('Number of person authorities with gender info: ', len(personsWG))
personsWG == withGender

In [None]:
# For each category (kat1-kat3): the number of person authorities, how many
# of them have gender info (field 375), and the share in % with 1 decimal.
# NOTE(review): the trailing '#263973' is repeated on all three print lines;
#     it presumably belongs to kat1 only - confirm.
kat1Persons=selectAssigned(kat1, '100')
wG1=selectAssigned(kat1Persons, '375')
print('kat1 persons:', len(kat1Persons), 'with gender:', len(wG1), round(len(wG1)*100/len(kat1Persons),1), '%') #263973
# kat1 authorities with a field 110
kat1Corps=selectAssigned(kat1, '110')
len(kat1Corps) #29638
kat2Persons=selectAssigned(kat2, '100')
wG2=selectAssigned(kat2Persons, '375')
print('kat2 persons:', len(kat2Persons),'with gender:', len(wG2), round(len(wG2)*100/len(kat2Persons),1), '%') #263973
kat3Persons=selectAssigned(kat3, '100')
wG3=selectAssigned(kat3Persons, '375')
print('kat3 persons:', len(kat3Persons),'with gender:', len(wG3), round(len(wG3)*100/len(kat3Persons),1), '%') #263973

### 2.4.2 Extract gender info from authority registry

In [None]:
#Overview of the gender distribution of main authors in the different bibliographies
#First the records themselves
#NB: Takes 4 hours, remove Samisk next time
# genderDict maps bibliography name -> [female records, male records, records without gender info]
genderDict=dict()
for bn in bibliografiNavn.keys():
    genderDict[bn]=authorGender(withGender, bibliografiNavn[bn])
    #Print to file, for easier access next time around
    writeMarcToFile(genderDict[bn][0], bn +'-f'+'.xml')  
    writeMarcToFile(genderDict[bn][1], bn +'-m'+'.xml')
    writeMarcToFile(genderDict[bn][2], bn +'-noGender'+'.xml')
#Then the counting
# genderDictCount maps bibliography name -> the lengths of the three lists
genderDictCount=dict()
for bn in bibliografiNavn.keys():
    genderDictCount[bn]=list (map (lambda x: len(x), genderDict[bn]))
genderDictCount

{'solstad': [171, 1539, 862],
 'prøysen': [486, 7129, 1085],
 'hamsun': [487, 3036, 3843],
 'collett': [612, 171, 158],
 'bjørnson': [129, 3803, 897],
 'norske bøker': [72, 3767, 1459],
 '1814': [213, 2147, 1090],
 'norsk-amerikansk': [262, 1545, 4729],
 'samisk': [9383, 13451, 4398]}

In [None]:
#Calculate ratios of lacking gender info in autreg
# The ratio is the third count (no gender info) in % of the sum of the three counts.
for bn in bibliografiNavn.keys():
    noGenderRatio=round(100*genderDictCount[bn][2]/sum(genderDictCount[bn]), 0)
    print(bn, 'no gender ratio:', noGenderRatio, '%')

### 2.4.3 For records where no gender info was found, check against female and male first names extracted from SSB

In [None]:
#Check the records where the gender of the author was not found, against SSB's lists of girls' and boys' names.
#Read the girls' names and boys' names extracted from SSB.
j_ssb=readlines('Jentenavn2013-2023.txt')
g_ssb=readlines('Guttenavn2013-2023.txt')

# genderDictExt maps bibliography name -> [female, male, still undecided],
# computed only for the records left undecided by authorGender.
genderDictExt=dict()

for bn in bibliografiNavn.keys():
    undecided=genderDict[bn][2]   #3rd element in result from authorGender
    genderDictExt[bn]=authorGender2(undecided, j_ssb, g_ssb)

#Then the counting
genderDictExtCount=dict()
for bn in bibliografiNavn.keys():
    genderDictExtCount[bn]=list (map (lambda x: len(x), genderDictExt[bn]))
genderDictExtCount

### 2.4.4 Merge the results from 1 and 2 above

In [None]:
### DO NOT MERGE the lists as  THE  BELOW --- .extend changes genderDict!!  #############
##genderDictMerged=dict()
##for bn in bibliografiNavn.keys():
##    genderDictMerged[bn]=[genderDict[bn][0].extend(genderDictExt[bn][0]), 
##                         genderDict[bn][1].extend(genderDictExt[bn][1]),
##                         genderDictExt[bn][2]]

#Merge counts directly   
# female and male counts are the sums from both steps; the third count is
# what is still undecided after the check against the SSB names.
genderDictMergedCount=dict()
for bn in bibliografiNavn.keys():
    genderDictMergedCount[bn]=[genderDictCount[bn][0]+genderDictExtCount[bn][0],
                              genderDictCount[bn][1]+genderDictExtCount[bn][1],
                              genderDictExtCount[bn][2]]
genderDictMergedCount


In [None]:
#Calculate ratios for lacking gender info, after checking against SSB data
# Same calculation as before the SSB check, now on the merged counts.
for bn in bibliografiNavn.keys():
    noGenderRatioFinal=round(100*genderDictMergedCount[bn][2]/sum(genderDictMergedCount[bn]), 0)
    print(bn, 'no gender ratio (final):', noGenderRatioFinal, '%')

In [None]:
#Check if correct. sum(genderDictMergedCount[bn]) should equal sum(genderDictCount[bn]))
# Prints 'OK' per bibliography when the totals agree, otherwise 'feil' ("error").
for bn in bibliografiNavn.keys():
    if sum(genderDictMergedCount[bn]) == sum(genderDictCount[bn]):
        print(bn, 'OK')
    else:
        print(bn, 'feil')

In [None]:
# data=list(genderDictCount.values())
#df_gender: one row per bibliography with the merged gender counts
df_gender=pd.DataFrame(list(genderDictMergedCount.values()), index=bibliografiNavn.keys(), 
                     columns=['female', 'male', 'no gender info'])
#df_gender_full: one column per bibliography; the three gender counts plus
#the remaining records, which have no main author
df_gender_full=pd.DataFrame(columns=bibliografiNavn.keys(), 
                     index=['female', 'male', 'no gender info/no autid', 'no main author'])
for bn in bibliografiNavn.keys():
    #Write through .loc on the frame itself. Chained assignment
    #(df_gender_full[bn][:-1] = ...) may act on a temporary copy, and [-1]
    #on a label-indexed Series is not reliably positional.
    #to_numpy() keeps the assignment positional; the row labels of the two
    #frames differ ('no gender info' vs 'no gender info/no autid').
    df_gender_full.loc[df_gender_full.index[:-1], bn]=df_gender.loc[bn].to_numpy()
    df_gender_full.loc[df_gender_full.index[-1], bn]=len(bibliografiNavn[bn])-df_gender_full[bn].iloc[:-1].sum()
df_gender_full

df_gender_rel=pd.DataFrame(columns=bibliografiNavn.keys(), 
                     index=['female', 'male', 'no gender info/no autid', 'no main author'])
#for bn in bibliografiNavn.keys():
    #reldf[bn][:-1] = round((df_bibliografiskNiva[bn][:-1] / df_bibliografiskNiva[bn][-1]) * 100, 0)
    #df_gender_rel[bn] = (df_gender_full[bn] // sum(df_gender_full[bn])) * 100
    #df_gender_rel[bn] = (df_gender_full[bn] / df_gender_full[bn].sum()) * 100
df_gender_full.to_csv('genderData.csv')
df_gender_full

#The plots are made in Excel, in gender.xlsx (in the Notabene folder)


## 2.5 A geographical perspective (about places)

### 2.5.1 Functions

In [None]:
def placeName(placeString):
    #Returns the place name without a possible qualifier, i.e. the text before
    #the first '(' with surrounding whitespace removed.
    name, _, _ = placeString.partition('(')
    return name.strip()

def qualifier(placeString):
    #Returns the qualifier of the place string, i.e. the text between the first '('
    #and the first ')' after it, with surrounding whitespace removed.
    #Returns '' if there is no '('. If ')' is missing, everything after '(' is returned.
    _, _, afterOpen = placeString.partition('(')
    inner, _, _ = afterOpen.partition(')')
    return inner.strip()
    
# Place strings from 651 $a in noram651, each mapped to a number that is used as
# the occurrence count when the maps are drawn further down.
# NOTE(review): noram651 is created dynamically in the "Preliminaries" cell below
# (unless it is also defined earlier) - make sure it exists before running this.
# NOTE(review): the trailing "#864" presumably records the number of entries - confirm.
noramPlDict = valueCounter(
    noram651,
    ['651'],
    subfieldtags=['a'],
    countDupl=False,
)  #864


In [None]:
#NB: geonameUsr is your username in Geonames (a string)

def lookupPlace(plStr, featureClasses=None, fpri=None, user=geonameUsr, useQualifier=0):
    #Looks up in Geonames (via geocoder) the place represented by plStr (typically fetched from 651$a).
    #useQualifier selects which part of plStr is searched for:
    #   0: only placeName(plStr)
    #   1: placeName(plStr) first; if no coordinates are found and a qualifier exists, qualifier(plStr)
    #   2: qualifier(plStr) first if it exists; if no coordinates are found, placeName(plStr).
    #      Without a qualifier, placeName(plStr) is looked up directly.
    #   any other value: returns None without doing a lookup
    #fpri is one featureClass to be prioritized, featureClasses defines the featureClasses to search amongst.
    #Returns the result of the last lookup performed, also when it has no coordinates.
    def _search(term):
        return lookupPl(term, featureClasses=featureClasses, fpri=fpri, user=user)

    def _notFound(hit):
        return hit.lat is None or hit.lng is None

    if useQualifier == 0:
        return _search(placeName(plStr))
    if useQualifier not in (1, 2):
        return None

    name = placeName(plStr)
    qual = qualifier(plStr)
    if qual == '':
        #nothing to fall back on (mode 1) or to start with (mode 2)
        return _search(name)

    first, fallback = (name, qual) if useQualifier == 1 else (qual, name)
    hit = _search(first)
    if _notFound(hit):
        hit = _search(fallback)
    return hit

def lookupPl(plStr, featureClasses=None, fpri=None, user=geonameUsr):
    #Looks up plStr in Geonames via geocoder.
    #If fpri (a single featureClass) is given it is tried first; when that lookup gives
    #no latitude or longitude, the search is repeated with featureClasses.
    #Returns the geocoder result of the last lookup performed.
    if fpri is not None:
        prioritized = geocoder.geonames(plStr, featureClass=fpri, key=user)
        if not (prioritized.lat is None or prioritized.lng is None):
            return prioritized
    return geocoder.geonames(plStr, featureClass=featureClasses, key=user)

### 2.5.1 Preliminaries

In [None]:
# Selections from noram for field 662, for field 651, and for either of them
w662 = selectAssigned(noram, '662')
w651 = selectAssigned(noram, '651')
wgeo = list(set(w651) | set(w662))
print('651:', len(w651), '662:', len(w662), 'Den ene eller andre:', len(wgeo))
# True means that every record selected for 662 is also selected for 651
set(wgeo) == set(w651)

In [None]:
#The ratio (in percent) of the documents in each bibliography that are about a geographical topic.
#Side effect: for each bibliography the selections are stored in the notebook-level
#variables <bibliografiVar[k]>651 and <bibliografiVar[k]>662 (e.g. noram651).
#The names are bound through globals() instead of building code strings for exec/eval,
#and each selection is computed once.
for k in bibliografiNavn.keys():
    varName = bibliografiVar[k]
    with651 = selectAssigned(bibliografiNavn[k], '651')
    with662 = selectAssigned(bibliografiNavn[k], '662')
    globals()[varName + '651'] = with651
    globals()[varName + '662'] = with662
    print(varName + '651:', round(len(with651) * 100 / len(bibliografiNavn[k])))
    print(varName + '662:', round(len(with662) * 100 / len(bibliografiNavn[k])))



#### Why field 662 is not used in the geographical analysis

### 2.5.2 Map for geographical topics of noram

#### Try with useQualifier=2 (use qualifier first. If not found, use the placename term)

In [None]:
# Geonames lookup for every place string in noramPlDict: qualifier first
# (useQualifier=2), prioritizing feature class 'A'.
# Filled entry by entry, so results obtained so far are kept if a lookup fails.
m1dict = {}
for pl in noramPlDict:
    m1dict[pl] = lookupPlace(pl, featureClasses=['A', 'P', 'L'], fpri='A', useQualifier=2)
    

In [None]:
#Generate the map for the lookups in m1dict (useQualifier=2, fpri='A')
import folium
m1 = folium.Map(location=[62, 10])    #Norway
m1nf = []   #place strings for which no coordinates were found
for pl in noramPlDict:
    g = m1dict[pl]
    if g.lat is None or g.lng is None:
        m1nf.append(pl)
        continue
    c = noramPlDict[pl]
    #The radius grows linearly with the count
    #(alternative that was tried: r=500*max(1, math.sqrt(c)))
    r = 500 * c / 2
    circle = folium.Circle(location=[g.lat, g.lng], radius=r,
                           popup=pl + ', ' + str(c), color="crimson", fill=True)
    circle.add_to(m1)
m1.save("noram_q2_fpri_A.html")

#### Try with useQualifier=1 (use the specific name first. If not found, use the qualifier term)

In [None]:
# Geonames lookup for every place string in noramPlDict: place name first, then
# the qualifier (useQualifier=1), with no prioritized feature class.
# Filled entry by entry, so results obtained so far are kept if a lookup fails.
m1dict2 = {}
for pl in noramPlDict:
    m1dict2[pl] = lookupPlace(pl, featureClasses=['A', 'P', 'L'], useQualifier=1)
    

In [None]:
#Generate the map for the lookups in m1dict2 (useQualifier=1, no fpri)
#Note: m1 is rebound here, so the map object from the previous cell is replaced.
m1 = folium.Map(location=[62, 10])    #Norway
m1nf2 = []   #place strings for which no coordinates were found
for pl in noramPlDict:
    g = m1dict2[pl]
    if g.lat is None or g.lng is None:
        m1nf2.append(pl)
        continue
    c = noramPlDict[pl]
    #The radius grows linearly with the count
    #(alternative that was tried: r=500*max(1, math.sqrt(c)))
    r = 500 * c / 2
    circle = folium.Circle(location=[g.lat, g.lng], radius=r,
                           popup=pl + ', ' + str(c), color="crimson", fill=True)
    circle.add_to(m1)
m1.save("noram_q1_fpri_None.html")

## 2.6 Publishing years

In [None]:
# One line plot of publishedYears(...) per bibliography, all with the same options
yearPlotOpts = {'kind': 'line', 'figsize': (12, 8)}
# Norske bøker
publishedYears(norskeboker).plot(**yearPlotOpts)
# 1814
publishedYears(bibl1814).plot(**yearPlotOpts)
# Norwegian-American
publishedYears(noram).plot(**yearPlotOpts)
# Hamsun
publishedYears(hamsun).plot(**yearPlotOpts)
# Bjørnson
publishedYears(bjornson).plot(**yearPlotOpts)
# Collett
publishedYears(collett).plot(**yearPlotOpts)
# Sami
publishedYears(samisk).plot(**yearPlotOpts)


In [None]:
# Publishing-year plot for the 1814 bibliography, saved as both PDF and JPEG
dfYear = publishedYears(bibl1814)
axYear = dfYear.plot(kind='line', figsize=(12, 8), legend=False,
                     title='Publishing year distribution in the 1814 bibliography')
figYear = axYear.get_figure()
for yearFile in ('publyear1814.pdf', 'publyear1814.jpeg'):
    figYear.savefig(yearFile)


## 2.7 Genre and form

### 2.7.1 Genres in the 1814 bibliography

In [None]:
#Word cloud of genres in the 1814 bibliography

valueCounter(bibl1814, ['655'], subfieldtags=('a', '2'))   #result is not used
medSjanger1814 = filterRecords(bibl1814, '(norvok)|(bib1814)', ['655'], ['2'])
tempDict1814 = valueCounter(medSjanger1814, ['655'], subfieldtags=('a', '2'))
#Only look at genre terms from norvok and bib1814, and cut the source marker off the key.
#NOTE(review): the slicing assumes the marker ('$norvok' / '$bib1814') is the last part of the key - confirm.
norvokSjangre1814 = {}
for k, antall in tempDict1814.items():
    for kilde in ('$norvok', '$bib1814'):
        if kilde in k:
            nk = k[:-len(kilde)]   #strip the source marker
            #some terms may have norvok in some records, bib1814 in others, so the counts are added up
            norvokSjangre1814[nk] = norvokSjangre1814.get(nk, 0) + antall

wc = WordCloud(background_color='white', width=800, height=400, mode='RGB')
wc = wc.generate_from_frequencies(norvokSjangre1814)
wc.to_file('wc-bibl1814.jpeg')

### 2.7.2  Genres in Norske bøker 1539-1850

In [None]:
#Word cloud of genres in Norske bøker

medSjangerNB = filterRecords(norskeboker, 'norvok', ['655'], ['2'])
tempDictNB = valueCounter(medSjangerNB, ['655'], subfieldtags=('a', '2'))
#Only look at norvok genre terms, and cut '$norvok' off the key.
#(A separate source code for Norske bøker has never been created.)
#NOTE(review): the slicing assumes '$norvok' is the last part of the key - confirm.
norvokSjangreNB = {k[:-7]: antall for k, antall in tempDictNB.items() if '$norvok' in k}

wc = WordCloud(background_color='white', width=800, height=400, mode='RGB')
wc = wc.generate_from_frequencies(norvokSjangreNB)
wc.to_file('wc-norskeboker.jpeg')

## 2.8 Of and about authors

In [None]:
#For each author bibliography four selections are made:
#   om<Author>        records where 600 $a/$t matches the author's name followed by at most two characters
#   om<Author>sVerker records where 600 $a/$t matches the name followed by a space and more text
#   av<Author>        records where field 100 matches the author's name
#   bio_<x>           records selected on 008 position 34 with the pattern 'b|d'
#The patterns containing \w are raw strings: in a normal string literal '\w' is an
#invalid escape sequence (a warning today, an error in a future Python version).

#Solstad
omSolstad=filterRecords(solstad, 'Solstad, Dag.{0,2}$', ['600'], ['a', 't'])
omSolstadsVerker=filterRecords(solstad, r'Solstad, Dag \w+', ['600'], ['a', 't'])
avSolstad=filterRecords(solstad, 'Solstad, Dag', ['100'])
bio_s=filterRecordsByControlField(solstad, 'b|d', '008', (34,35))
print('Solstad: Av:\t',len(avSolstad), '\tOm:', len(omSolstad), '\tBiografikode:', 
      len(bio_s),'\tOm verker:',len(omSolstadsVerker),  '\tTotalt:', len(solstad))

#Prøysen
omProysen=filterRecords(proysen, 'Prøysen, Alf.{0,2}$', ['600'], ['a', 't'])
omProysensVerker=filterRecords(proysen, r'Prøysen, Alf \w+', ['600'], ['a', 't'])
avProysen=filterRecords(proysen, 'Prøysen, Alf', ['100'])

#Special handling of Prøysen, since his contributions in terms of song lyrics are not reflected in the 100 field
#   but in the 700 field, with role = aut or lyr
tmp=filterRecords(proysen, 'Prøysen, Alf', ['700'])
contribProysen=filterRecords(tmp, '(lyr)|(aut)', ['700'], subfieldtags=['4'])
avProysen= list(set(avProysen).union(set(contribProysen)))

bio_p=filterRecordsByControlField(proysen, 'b|d', '008', (34,35))
print('Prøysen: Av:\t',len(avProysen), '\tOm:', len(omProysen), '\tBiografikode:', 
      len(bio_p),'\tOm verker:',len(omProysensVerker),  '\tTotalt:', len(proysen))

#Collett
omCollett=filterRecords(collett, 'Collett, Camilla.{0,2}$', ['600'] , ['a', 't'])
omCollettsVerker=filterRecords(collett, r'Collett, Camilla \w+', ['600'] , ['a', 't'])
avCollett=filterRecords(collett, 'Collett, Camilla', ['100'])
bio_c=filterRecordsByControlField(collett, 'b|d', '008', (34,35))
print('Collett: Av:\t',len(avCollett), '\tOm:', len(omCollett), '\tBiografikode:', 
      len(bio_c),'\tOm verker:',len(omCollettsVerker),  '\tTotalt:', len(collett))

#Hamsun
omHamsun=filterRecords(hamsun, 'Hamsun, Knut.{0,2}$', ['600'], ['a', 't'])
omHamsunsVerker=filterRecords(hamsun, r'Hamsun, Knut \w+', ['600'], ['a', 't'])
avHamsun=filterRecords(hamsun, 'Hamsun, Knut', ['100'])
bio_h=filterRecordsByControlField(hamsun, 'b|d', '008', (34,35))
print('Hamsun: Av:\t',len(avHamsun), '\tOm:', len(omHamsun), '\tBiografikode:', 
      len(bio_h),'\tOm verker:',len(omHamsunsVerker),  '\tTotalt:', len(hamsun))

#Bjørnson
omBjornson=filterRecords(bjornson, 'Bjørnson, Bjørnstjerne.{0,2}$', ['600'], ['a', 't'] )
omBjornsonsVerker=filterRecords(bjornson, r'Bjørnson, Bjørnstjerne \w+', ['600'], ['a', 't'] )
avBjornson=filterRecords(bjornson, 'Bjørnson, Bjørnstjerne', ['100'])
bio_b=filterRecordsByControlField(bjornson, 'b|d', '008', (34,35))
print('Bjørnson: Av:\t',len(avBjornson), '\tOm:', len(omBjornson), '\tBiografikode:', 
      len(bio_b),'\tOm verker:',len(omBjornsonsVerker),  '\tTotalt:', len(bjornson))


In [None]:
# Absolute (df_av_om) and relative (df_av_om_rel, percent of the bibliography size)
# numbers of records by the author ('Av'), about the author ('Om') and about the
# author's works ('Om verk'), one column per author bibliography.
authorDict = {'Bjornson': bjornson, 'Collett': collett, 'Hamsun': hamsun,
              'Proysen': proysen, 'Solstad': solstad}
# The selections per author, in the row order of the tables. An explicit mapping
# replaces eval() on variable names built from strings.
authorSelections = {
    'Bjornson': (avBjornson, omBjornson, omBjornsonsVerker),
    'Collett': (avCollett, omCollett, omCollettsVerker),
    'Hamsun': (avHamsun, omHamsun, omHamsunsVerker),
    'Proysen': (avProysen, omProysen, omProysensVerker),
    'Solstad': (avSolstad, omSolstad, omSolstadsVerker),
}
avOmRows = ['Av', 'Om', 'Om verk']
sums = []   # per author: the three counts added together
df_av_om = pd.DataFrame(columns=authorDict.keys(), index=avOmRows)
df_av_om_rel = pd.DataFrame(columns=authorDict.keys(), index=avOmRows)
for au, auRecords in authorDict.items():
    avOmCounts = [len(selection) for selection in authorSelections[au]]
    sums.append(sum(avOmCounts))
    # Write each column with a single .loc call. Chained assignment (df[au][0] = ...)
    # is not guaranteed to reach the frame and is a no-op under pandas Copy-on-Write;
    # integer keys on a label-indexed Series are deprecated as well.
    df_av_om.loc[avOmRows, au] = avOmCounts
    df_av_om_rel.loc[avOmRows, au] = [round(n / len(auRecords) * 100) for n in avOmCounts]
df_av_om_rel

In [None]:
# Stacked bar chart of df_av_om_rel, one bar per author.
# The legend entries are reversed explicitly. The earlier legend={'reverse'} passed a
# set, not the string 'reverse', so pandas never reversed anything, and the legend
# was rebuilt in default order by plt.legend() afterwards in any case.
avomAx = df_av_om_rel.T.plot(kind='bar', figsize=(12, 8), fontsize='xx-large',
                             stacked=True, legend=True)
avomHandles, avomLabels = avomAx.get_legend_handles_labels()
avomAx.legend(avomHandles[::-1], avomLabels[::-1], bbox_to_anchor=(1, 1), fontsize='large')
avomfig = avomAx.get_figure()
avomfig.savefig('av-om-verk.jpeg', bbox_inches='tight')

## About language distribution, translations, etc. in the documents (not included in the paper)

In [None]:
#Translations
#Criteria for being regarded as a translation (each one is sufficient on its own):
#1.There is a 246 field with a $i containing "originaltit" (with upper or lower case o)
#2.There is an 041 field with $h
#3.There is a 765 field (original language entry)

def translations(records):
    #returns the records in records that appear to be translations, as a list without
    #   duplicates (built from a set, so the order is not defined)
    #The pattern has to be '[Oo]riginaltit'. In '(O|originaltit)' the alternation applies to
    #   the whole group, so a single capital O is enough to match (e.g. "Omslagstittel").
    originalTitle = set(filterRecords(records, '[Oo]riginaltit', ['246'], ['i']))
    originalLanguage = set(selectAssigned(records, '041', ['h']))
    originalEntry = set(selectAssigned(records, '765'))
    return list(originalTitle.union(originalLanguage, originalEntry))

#Per bibliography: (name, number of translations, number of records, share of translations in percent)
oversettelserAndel = []
for bn, b in bibliografiNavn.items():
    overs = translations(b)
    andel = round((len(overs) * 100) / len(b))
    oversettelserAndel.append((bn, len(overs), len(b), andel))

for navn, antallOvers, antallPoster, andel in oversettelserAndel:
    print(navn + ':', '\t', 'Antall oversettelser:', antallOvers, '\t',
          'Antall poster i bibl:', antallPoster, '\t', 'Andel oversettelser', andel, '%')


In [None]:
#Languages of the documents in the bibliographies: one valueCounter result per
#bibliography, taken from 008 (positions 35-37) and 041 $a.
#NOTE(review): the order below has to match the key order of bibliografiNavn, since
#   those keys are used as column names for the merged table further down - confirm.
languageDicts = [
    valueCounter(bibl, ['008', '041'], ['a'], slice=(35, 38))
    for bibl in (solstad, proysen, hamsun, collett, bjornson,
                 norskeboker, bibl1814, noram, samisk)
]

#NOTE(review): reduceNumDict(d, 50) presumably keeps the languages with a count of at least 50 - confirm.
reducedLanguageDicts = [reduceNumDict(d, 50) for d in languageDicts]

languageTable = mergeDicts(languageDicts)
reducedLanguageTable = mergeDicts(reducedLanguageDicts)

In [None]:
#Print reducedLanguageTable as a tab-separated table: a heading with the first four
#characters of each bibliography name, then one row per key.
mergeDicts(reducedLanguageDicts)   #result is not used
heading = ''.join('\t' + bnavn[0:4] for bnavn in list(bibliografiNavn))
print(heading)
for key in list(reducedLanguageTable):
    row = key + ':' + ''.join('\t' + str(n) for n in reducedLanguageTable[key])
    print(row)

In [None]:
# langFrame:     one row per key of reducedLanguageTable, one column per bibliography
# langFrame_rel: the same table with each column as whole percentages of its column sum
langFrame = pd.DataFrame(list(reducedLanguageTable.values()), columns=bibliografiNavn.keys(),
                         index=reducedLanguageTable.keys())
langFrame_rel = pd.DataFrame(columns=bibliografiNavn.keys(),
                             index=reducedLanguageTable.keys())
for bn in bibliografiNavn.keys():
    langFrame_rel[bn] = round((langFrame[bn] / langFrame[bn].sum()) * 100, 0)

# Stacked bar chart, one bar per bibliography.
# The legend entries are reversed explicitly. The earlier legend={'reverse'} passed a
# set, not the string 'reverse', so pandas never reversed anything, and the legend
# was rebuilt in default order by plt.legend() afterwards in any case.
langAx = langFrame_rel.T.plot(kind='bar', figsize=(12, 8), fontsize='xx-large',
                              stacked=True, legend=True)
langHandles, langLabels = langAx.get_legend_handles_labels()
langAx.legend(langHandles[::-1], langLabels[::-1], bbox_to_anchor=(1, 1), fontsize='large')
langFig = langAx.get_figure()
langFig.savefig('langs2.jpeg', bbox_inches='tight')

In [None]:
# Pie chart of the relative language distribution in the 'samisk' column, saved as JPEG
samiskAx = langFrame_rel['samisk'].plot(kind='pie', figsize=(20, 20),
                                        fontsize='xx-large', legend=False)
langSamisk = samiskAx.get_figure()
langSamisk.savefig('langSamisk.jpeg')

In [None]:
# Pie chart of the absolute counts in the 'samisk' column (shown only, not saved)
langFrame['samisk'].plot(kind='pie').get_figure()

In [None]:
# Ad hoc inspection of the records in norskeboker selected on field 041 with subfield $h
overs_h=selectAssigned(norskeboker, '041', ['h'])
len(overs_h)
#showMarcRecord(overs_h[2])
# Tag of the fifth field of the third selected record
overs_h[2].get_fields()[4].tag