In [1]:
#! /usr/bin/env python3

import os
import csv

from nameparser import HumanName
from pymarc import MARCReader
from titlecase import titlecase

In [2]:
def lookup_uid(record):
    """Return the value of the record's first 001 field with every 'AAI' removed."""
    control_field = record.get_fields('001')[0]
    raw_identifier = control_field.value()
    return raw_identifier.replace('AAI', '')

In [3]:
# Root folder that holds the thesis PDFs and their MARC export files.
folder_on_U = '/media/francis/U/ProquestDissertations/Theses and Dissertations/ProquestDissertations/'

# Read every record of the unrestricted MARC file into a list.
with open(os.path.join(folder_on_U, 'UnrestrictedTheses', 'MARCDATA.MRC'), 'rb') as f:
    reader = MARCReader(f)
    marc_unrestricted_records = list(reader)

In [4]:
# Read every record of the restricted MARC file into a list.
with open(os.path.join(folder_on_U, 'RestrictedTheses', 'MARCDATA.MRC'), 'rb') as f:
    reader = MARCReader(f)
    marc_restricted_records = list(reader)

In [5]:
# Folder that holds the image-disc PDFs and their own MARC export file.
images_path = '/media/francis/U/ProquestDissertations/Theses and Dissertations/Image Discs and Supplement Files'

# Read every record of the image-disc MARC file into a list.
with open(os.path.join(images_path, 'MARCDATA.MRC'), 'rb') as f:
    reader = MARCReader(f)
    marc_images_records = list(reader)

In [6]:
# Restricted and unrestricted records merged into a single set.
marc_all_records = set(marc_restricted_records) | set(marc_unrestricted_records)

# uid sets derived from each record collection via lookup_uid.
marc_unrestricted_uids = set(map(lookup_uid, marc_unrestricted_records))
marc_restricted_uids = set(map(lookup_uid, marc_restricted_records))
marc_all_uids = set(map(lookup_uid, marc_all_records))


In [7]:
def _numeric_stems(folder):
    """Return the file names in `folder` that are all-numeric once '.pdf' is removed.

    The '.pdf' text is removed with str.replace (same as the original cells),
    and only names for which str.isnumeric() is true are kept.
    """
    stems = (name.replace('.pdf', '') for name in os.listdir(folder))
    return {stem for stem in stems if stem.isnumeric()}


# Numeric PDF names found on disk for each thesis folder.
folder_restricted = _numeric_stems(os.path.join(folder_on_U, 'RestrictedTheses'))
folder_unrestricted = _numeric_stems(os.path.join(folder_on_U, 'UnrestrictedTheses'))
folder_duplicated = _numeric_stems(os.path.join(folder_on_U, 'ETDDuplicates'))

# Image-disc PDFs are selected by a case-insensitive extension check, so the
# stem is taken with splitext as well; str.replace('.pdf', '') would leave an
# upper-case '.PDF' extension in place and would also eat a '.pdf' occurring
# in the middle of a name.
folder_images = {os.path.splitext(name)[0]
                 for name in os.listdir(images_path)
                 if os.path.splitext(name)[1].lower() == '.pdf'}

# The image-disc folder is deliberately not part of folder_all.
folder_all = folder_restricted | folder_unrestricted | folder_duplicated

In [8]:
# Line up MARC-derived counts against on-disk PDF counts; the 'pass' strings
# occupy the positions of folder_all and folder_duplicated in the second line.
expected_counts = ('pass', len(marc_unrestricted_uids), 'pass', len(marc_restricted_uids), len(marc_images_records))
print('expected:', *expected_counts)
observed_counts = (len(folder_all), len(folder_unrestricted), len(folder_duplicated), len(folder_restricted), len(folder_images))
print('observed:', *observed_counts)

expected: pass 7308 pass 40 441
observed: 8618 7308 1270 40 441


## Making the crosswalk

In [9]:
def lookup_clean_title(record):
    """Return the first 245 field's value, title-cased and tidied.

    After titlecase() the function collapses ':  ' to ': ' and applies every
    replacement listed in wrong_roman_numeral, in that dict's order.
    """
    title = titlecase(record.get_fields('245')[0].value())
    title = title.replace(':  ', ": ")
    for wrong, right in wrong_roman_numeral.items():
        # str.replace is a no-op when `wrong` is absent, so no membership test is needed.
        title = title.replace(wrong, right)
    return title

In [10]:
# Title-casing turns Roman numerals into e.g. 'Iii'; this maps the mangled
# forms back to upper case.  lookup_clean_title applies the entries in
# insertion order, so longer numerals must come BEFORE their prefixes:
# if ' Ii' ran first, ' Iii ' would become ' IIi ' and never match again.
# Every replacement keeps the surrounding spaces/punctuation of its key.
wrong_roman_numeral = {' Viii': ' VIII',
                       '-Viii': '-VIII',
                       ' Vii': ' VII',
                       '-Vii ': '-VII ',
                       ' Vi ': ' VI ',
                       ' Iii ': ' III ',
                       '-Iii': '-III',
                       ' Iii.': ' III.',
                       ' Iv ': ' IV ',
                       ' Iv.': ' IV.',
                       ' Iv)': ' IV)',
                       ' Ii': ' II',
                       }

In [43]:
def make_dropbox_url(record):
    """Return the Dropbox URL of the record's PDF.

    Restricted records point into the 'Rs' subfolder, unrestricted records
    into the top-level 'Proquests' folder.  A record that is in neither list
    yields '' (previously this raised UnboundLocalError because `url` was
    never assigned).
    """
    uid = lookup_uid(record)
    base_url = 'https://dl.dropboxusercontent.com/u/302551934/Proquests'
    if record in marc_restricted_records:
        return '{}/Rs/{}.pdf'.format(base_url, uid)
    if record in marc_unrestricted_records:
        return '{}/{}.pdf'.format(base_url, uid)
    return ''

In [12]:
# Maps a degree abbreviation (periods already removed, see match_degrees) to
# the full degree name that is written to the CSV.
abbr_degree = {"MPT": "Master of Physical Therapy (MPT)",
    "MUP": "Master of Urban Planning (MUP)",
    "DM": "Doctor of Music (DM)",
    "MTS": "Master of Theological Studies (MTS)",
    "AuD": "Doctor of Audiology (AuD)",
    "MSEE": "Master of Science in Electrical Engineering (MSEE)",
    "MSIB": "Master of Science in International Business (MSIB)",
    "MCSM": "Master of Construction Science and Management (MCSM)",
    "PsyD": "Doctor of Psychology (PsyD)",
    "MSEM": "Master of Science in Engineering Management (MSEM)",
    "MSMSE": "Master of Science in Materials Science and Engineering (MSMSE)",
    "RhD": "Doctor of Rehabilitation (RhD)",
    "MATE": "Master of Arts in the Teaching of English (MATE)",
    "DPT": "Doctor of Physical Therapy (DPT)",
    "MSAgE": "Master of Science in Agricultural Engineering (MSAgE)",
    "PhDOtol": "PhD Otolaryngology (PhDOtol)",
    "MSHRM": "Master of Science in Human Resources Management (MSHRM)",
    "MIM": "Master of International Management (MIM)",
    "DMin": "Doctor of Ministry (DMin)",
    "MSIE": "Master of Science in Industrial Engineering (MSIE)",
    "MSISE": "Master of Science in Infrastructure Systems Engineering (MSISE)",
    "DPA": "Doctor of Public Administration (DPA)",
    "HSOP": "Doctor of Philosophy in Health Services Research (HSOP)",
    "MMatSE": "Master of Materials Science and Engineering (MMatSE)",
    "MAeroE": "Master of Aeronautical Engineering (MAeroE)",
    "MMT": "Master in Management of Technology (MMT)",
    "MSJ": "Master of Science in Jurisprudence (MSJ)",
    "MHP": "Master of Historic Preservation (MHP)",
    "DEng": "Doctor of Engineering (DEng)",
    "MBA": "Master of Business Administration (MBA)",
    "MRED": "Master of Real Estate Development (MRED)",
    "MCTE": "Master of Career and Technology Education (MCTE)",
    "MSAeroE": "Master of Science in Aerospace Engineering (MSAeroE)",
    "MAR": "Master of Arts in Religion (MAR)",
    "MST": "Master's of Science in Teaching (MST)",
    "MJS": "Master of Judicial Studies (MJS)",
    "MALA": "Master of Arts in Liberal Arts (MALA)",
    "MSETM": "Master of Science in Environmental Technology Management (MSETM)",
    "MSHTM": "Master of Science in Hospitality and Tourism Management (MSHTM)",
    "Th.M": "Master of Theology (Th.M)",
    "MSM": "Master of Science in Management (MSM)",
    "MCRP": "Master of City and Regional Planning (MCRP)",
    "MBS": "Master of Building Science (MBS)",
    "MAIS": "Master of Arts in Interdisciplinary Studies (MAIS)",
    "DBA": "Doctor of Business Administration (DBA)",
    "MPH": "Master of Public Health (MPH)",
    "MIDS": "Master of Interdisciplinary Studies (MIDS)",
    "MPA/JD": "Master of Public Administration/Juris Doctorate (MPA/JD)",
    "PhD": "Doctor of Philosophy (PhD)",
    "DMgt": "Doctor of Management (DMgt)",
    "MCIS": "Master of Computer and Information Science (MCIS)",
    "MAE": "Master of Arts in Education (MAE)",
    "MHD": "Master of Human Development (MHD)",
    "MM": "Master of Music (MM)",
    "MGS": "Master of General Studies (MGS)",
    "MSN": "Master of Science in Nursing (MSN)",
    "M.Div": "Master of Divinity (M.Div)",
    "MAC": "Master of Arts in Counseling (MAC)",
    "MCJ": "Master of Criminal Justice (MCJ)",
    "MFR": "Master of Forest Resources (MFR)",
    # NOTE(review): key is MSSS but the name says (MSCS) - confirm which is intended.
    "MSSS": "Master of Science in Computer Science (MSCS)",
    "MSA": "Master of Science in Administration (MSA)",
    "MURP": "Master of Urban and Regional Planning (MURP)",
    "MAS": "Master in Advanced Studies (MAS)",
    "ND": "Doctor of Nursing (ND)",
    "ME": "Master of Engineering (ME)",
    "MSCRP": "Master of Science in Community and Regional Planning (MSCRP)",
    "MArch": "Master of Architecture (MArch)",
    "MLIS": "Master of Library and Information Science (MLIS)",
    "MSOtol": "MS Otolaryngology (MSOtol)",
    "MLS": "Master of Library Science/Master of Life Sciences (MLS)",
    "MSMANFE": "Master of Science in Manufacturing Engineering (MSMANFE)",
    "MSSE": "Master of Science and Software Engineering (MSSE)",
    "MEngr": "Master of Engineering (MEngr)",
    "MSB": "Masters of Science in Bioscience (MSB)",
    "PED": "Doctor of Physical Education (PED)",
    "MFA": "Master of Fine Arts (MFA)",
    "MMC": "Master of Mass Communication (MMC)",
    "MSBAE": "Master of Science in Biological and Agricultural Engineering (MSBAE)",
    "MAgEd": "Master of Agricultural Education (MAgEd)",
    "MSECE": "Master of Science in Electrical and Computer Engineering (MSECE)",
    "DMD": "Doctor of Dental Medicine (DMD)",
    "MSMatSE": "Master of Science in Material Science Engineering (MSMatSE)",
    "MAPC": "Master of Arts in Pastoral Counseling (MAPC)",
    "MSEd": "Master of Science in Education (MSEd)",
    "DPDS": "Doctor of Planning and Development Studies (DPDS)",
    "MRP": "Master of Regional Planning (MRP)",
    "MNS": "Master of Natural Sciences (MNS)",
    "EdD": "Doctor of Education (EdD)",
    "DrPH": "Doctor of Public Health (DrPH)",
    "DNS": "Doctor of Nursing Science (DNS)",
    "MSIEOR": "Master of Science in Industrial Engineering and Operations Research (MSIEOR)",
    "MAT": "Master of Arts in Teaching (MAT)",
    "MEE": "Master of Electrical Engineering (MEE)",
    "MS": "Master of Science (MS)",
    "MSECO": "Master of Science in Economics (MSECO)",
    "MLA": "Master of Landscape Architecture (MLA)",
    # Spelling fixed: was "PhD Surgergy".
    "PhDSurg": "PhD Surgery (PhDSurg)",
    "MSES": "Master of Science in Engineering Science (MSES)",
    "MHI": "Masters of Health Informatics (MHI)",
    "MSME": "Master of Science in Mechanical Engineering (MSME)",
    "MMUS": "Master of Music (MMUS)",
    "MSW": "Master of Social Work (MSW)",
    "MME": "Master of Music Education (MME)",
    "DMA": "Doctor of Musical Arts (DMA)",
    "MPA": "Master of Public Administration (MPA)",
    "DA": "Doctor of Arts (DA)",
    "MApStat": "Master of Applied Statistics (MApStat)",
    "MSP": "Master of Science in Planning (MSP)",
    "MPP": "Master of Public Policy (MPP)",
    # NOTE(review): "Medical Surgeon" looks wrong for an MS abbreviation - confirm the official name.
    "MSExpSurg": "Medical Surgeon in Experimental Surgery (MSExpSurg)",
    "EdS": "Education Specialist (EdS)",
    "MF": "Master of Forestry (MF)",
    "MPlan": "Master of Planning (MPlan)",
    "MBT": "Master of Business Taxation (MBT)",
    "HSD": "Doctor of Health and Safety (HSD)",
    "MHRD": "Master of Human Resource Development (MHRD)",
    "MSPH": "Master of Science in Public Health (MSPH)",
    "MChE": "Master of Chemical Engineering (MChE)",
    "MSPE": "Master of Science in Petroleum Engineering (MSPE)",
    "MCompE": "Master of Computer Engineering (MCompE)",
    "MT": "Master in Taxation (MT)",
    "MAcc": "Master of Accounting (MAcc)",
    "MPM": "Master of Public Management (MPM)",
    "MSE": "Master of Science in Engineering (MSE)",
    "DME": "Doctor of Music Education (DME)",
    "DSW": "Doctor of Social Work (DSW)",
    "MSCE": "Master of Science in Civil Engineering (MSCE)",
    "DVM": "Doctor of Veterinary Medicine (DVM)",
    "MCE": "Master of Civil Engineering (MCE)",
    "MES": "Master of Environmental Studies (MES)",
    "MECom": "Master of Electronic Commerce (MECom)",
    "MHA": "Master of Health Administration (MHA)",
    "PharmD": "Doctor of Pharmacy (PharmD)",
    "MA": "Master of Arts (MA)",
    "Ded": "Doctor of Education (Ded)",
    "MEnvE": "Master of Environmental Engineering (MEnvE)",
    "ReD": "Doctor of Recreation (ReD)",
    "JD": "Juris Doctorate (JD)",
    "MSBiosyAgE": "Master of Science in Biosystems and Agricultural Engineering (MSBiosyAgE)",
    "PMBA": "Professional Master of Business Administration (PMBA)",
    "MHAMS": "Master of Historical Administration and Museum Studies (MHAMS)",
    "MSIS": "Master of Science in Interdisciplinary Studies (MSIS)",
    "IMES": "International Master of Environmental Sciences (IMES)",
    "MSChE": "Master of Science in Chemical Engineering (MSChE)",
    "MPAcc": "Master of Professional Accounting (MPAcc)",
    "MGIS": "Master of Geographic Information Science (MGIS)",
    "MBioSci": "Master of Biological Science (MBioSci)",
    "MCM": "Master of Construction Management (MCM)",
    "MSMS": "Master of Science in Medical Sciences (MSMS)",
    "MD": "Medical Doctor (MD)",
    "Medical Science": "Doctor of Philosophy (Medical Science)",
    "MGeoE": "Master of Geomechanics Engineering (MGeoE)",
    "MEd": "Master of Education (MEd)",
    "MAM": "Master in Agricultural Management (MAM)",
    "MPRTM": "Master of Parks, Recreation and Tourism Management (MPRTM)",
    "MAgr": "Master of Agriculture (MAgr)",
    "POCS": "Doctor of Oceanography & Coastal Sciences (POCS)",
    "PVMPB": "Doctor of Biomedical and Veterinary Medical Sciences-Pathobiological Sciences (PVMPB)",
    "PNFS": "Doctor of Nutrition and Food Sciences (PNFS)",
    "PENTM": "Doctor of Entomology (PENTM)",
    }



# non_matching_degree_abbrevs = dict()
# def expand_degree_type(degree_name):
#     if degree_name in abbr_degree:
#         return abbr_degree[degree_name]
#     else:
#         if degree_name not in non_matching_degree_abbrevs:
#             non_matching_degree_abbrevs[degree_name] = []
            
# for record in ori:
#     stated_degree = record.get_fields('791')[0].value()
#     stated_degree = stated_degree.replace('.', '')
#     if stated_degree in first_conversion:
#         stated_degree = first_conversion[stated_degree]
#     if stated_degree not in abbr_degree:
#         print(lookup_uid(record))
# #         print(stated_degree, record.get_fields('856')[0].value())

In [13]:
# Degree strings that need translating to an abbr_degree key; match_degrees
# consults this table after it has removed periods from the 791 value.
first_conversion = dict(
    EducatD='EdD',
    DED='EdD',
    DMus='DMA',
    OCS='POCS',
    VetMedSc='PVMPB',
    DrSciEng='PNFS',
    SCDC='PENTM',
)

def match_degrees(record):
    """Return the full degree name for the record's first 791 field.

    Returns '' when the record has no 791 field (previously an IndexError)
    or when that field's value is empty.  Otherwise periods are removed, the
    result is translated through first_conversion when listed there, and the
    abbr_degree entry is returned; unknown abbreviations yield the literal
    'not yet implemented'.
    """
    fields_791 = record.get_fields('791')
    if not fields_791:
        return ''
    stated_degree = fields_791[0].value()
    if not stated_degree:
        return ''
    alph_degree = stated_degree.replace('.', '')
    alph_degree = first_conversion.get(alph_degree, alph_degree)
    return abbr_degree.get(alph_degree, 'not yet implemented')


In [14]:
def interpret_directors(record):
    """Return a 3-tuple of advisor names taken from the record's second 500 note."""
    notes = parse_500(record)
    # Only the second element returned by parse_500 is used here.
    return split_directors(notes[1])

def parse_500(record):
    """Return the values of the record's first two 500 fields as a 2-tuple.

    Missing fields are represented by '': a record without any 500 field
    gives ('', '') (previously an IndexError), a record with a single one
    gives (value, '').  Fields beyond the second are ignored.
    """
    value_500 = [field.value() for field in record.get_fields('500')]
    if not value_500:
        return '', ''
    if len(value_500) == 1:
        return value_500[0], ''
    return value_500[0], value_500[1]

def split_directors(text_b):
    """Return exactly three advisor names parsed from `text_b`, padded with ''.

    The names come from parse_advisors_field.  Fewer than three names are
    padded with empty strings.  When more than three are listed, the first
    three are kept; previously that case fell through and returned three
    empty strings, discarding every advisor.
    """
    # parse_advisors_field returns '' when nothing was found.
    directors_list = parse_advisors_field(text_b) or []
    first_three = list(directors_list[:3])
    first_three.extend([''] * (3 - len(first_three)))
    return tuple(first_three)

def parse_advisors_field(text):
    """Return the advisor names found in a note such as 'Directors: A; B.'.

    The first known label (e.g. 'Director: ', 'Co-Chairs: ') contained in
    `text` is removed, one trailing period is dropped, and the remainder is
    split on '; ' into a list of stripped names.  Returns '' when no label
    is present or nothing is left after removing it (the empty-remainder
    case previously crashed while stripping the period).
    """
    labels = ('Directors: ',
              'Director: ',
              'Co-Chairs: ',
              'Co-chairs: ',
              'Co-Chairmen: ',
              'Adviser: ',
              'Advisers: ',
              'Chair: ',
              'Directed: ')
    for title in labels:
        if title not in text:
            continue
        text = text.replace(title, '')
        # Drop a single trailing period; endswith is safe on an empty string.
        if text.endswith('.'):
            text = text[:-1]
        if text:
            return [name.strip() for name in text.split('; ')]
    return ''

def unperiod(text):
    """Return `text` without its final character when that character is a period.

    Only one trailing period is removed.  An empty string is returned
    unchanged (indexing text[-1] used to raise IndexError on '').
    """
    if text.endswith('.'):
        return text[:-1]
    return text

In [15]:
def find_source(record):
    """Return the text of the first 500 note containing 'Source'.

    One trailing period is removed via unperiod and every 'Source: '
    occurrence is stripped.  Returns '' when no 500 field mentions 'Source'
    (previously an IndexError).
    """
    source_notes = [field.value() for field in record.get_fields('500')
                    if 'Source' in field.value()]
    if not source_notes:
        return ''
    return unperiod(source_notes[0]).replace('Source: ', '')

In [16]:
def combine_520(record):
    """Return the values of all 520 fields joined by single spaces ('' if none)."""
    # Joining an empty sequence already yields '', so no special case is needed.
    return ' '.join(field.value() for field in record.get_fields('520'))

In [17]:
def combine_650(record):
    """Return all 650 values, capitalized and without periods, joined by '; '."""
    subjects = []
    for field in record.get_fields('650'):
        # capitalize() first, then drop every period, in the same order as before.
        subjects.append(field.value().capitalize().replace('.', ''))
    return '; '.join(subjects)

In [18]:
def parse_author_names(record):
    """Return (first, middle, last, suffix) parsed from the first 100 field.

    The field value loses one trailing period, is parsed with HumanName, and
    the suffix is normalized by standardize_suffix.  A nickname, when
    present, is appended to the first name after a space.
    """
    parsed = HumanName(unperiod(record.get_fields('100')[0].value()))
    suffix = standardize_suffix(parsed.suffix)
    first_name = "{} {}".format(parsed.first, parsed.nickname) if parsed.nickname else parsed.first
    # NOTE(review): str.capitalize() lowercases everything after the first
    # character, which flattens names with inner capitals.
    return (first_name.capitalize(),
            parsed.middle.capitalize(),
            parsed.last.capitalize(),
            suffix)

def standardize_suffix(text):
    """Return `text` with upper-case suffix spellings replaced by their standard forms."""
    # Applied in this order; str.replace leaves the text alone when nothing matches.
    substitutions = (('JR', 'Jr'), ('SR', 'Sr'), ('3RD', 'III'), ('ED', 'Ed.'))
    for wrong, right in substitutions:
        text = text.replace(wrong, right)
    return text

In [19]:
def lookup_inst(record):
    """Return the first 710 field's value with one trailing period removed."""
    return unperiod(record.get_fields('710')[0].value())

In [20]:
def lookup_isbn(record):
    """Return the value of the first 020 field, or '' when the record has none."""
    isbn_fields = record.get_fields('020')
    return isbn_fields[0].value() if isbn_fields else ''

In [21]:
def determine_dtype(record):
    """Return "dissertation" for doctoral degree names, otherwise "thesis".

    The decision is a substring test of the listed abbreviations against the
    expanded degree name from match_degrees.  That name is computed once per
    record; it used to be recomputed for every abbreviation in the loop.
    """
    degree_name = match_degrees(record)
    doctoral_markers = ('PhD', 'DMA', 'EdD', 'DBA', 'PENTM', 'PNFS', 'PVMPB', 'POCS')
    if any(marker in degree_name for marker in doctoral_markers):
        return "dissertation"
    return "thesis"

In [22]:
def armageddon_if_restricted(record):
    """Return the embargo date "9999-12-01" for restricted records, '' otherwise."""
    return "9999-12-01" if record in marc_restricted_records else ''

In [23]:
def is_restricted(record):
    """Return 'withheld' when the record is in marc_restricted_records, else 'unrestricted'."""
    return 'withheld' if record in marc_restricted_records else 'unrestricted'

## Making the csv

In [24]:
def csv_writer(data, path):
    """Write the rows in `data` to `path` as a UTF-8 CSV with every value quoted."""
    with open(path, "w", newline='', encoding='utf-8') as csv_file:
        out = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
        out.writerows(data)

In [46]:
def build_csv(records_list,
              output_folder='/home/francis/Desktop/lsu-git/Proquest_to_DigitalCommons/output',
              filename='scrap_Proquest.csv'):
    """Write one CSV row per MARC record, preceded by a header row.

    Parameters:
        records_list: iterable of MARC records to convert.
        output_folder: folder that receives the CSV; created when missing.
            Defaults to the location this notebook has always used.
        filename: name of the CSV file inside `output_folder`.

    Columns that have no MARC source here (email, availability description,
    disciplines, comments, department, season, release date) are written as
    empty strings.  The destination path is derived from `output_folder`
    instead of repeating the folder literal, so the folder that is created
    and the folder that is written to cannot drift apart.
    """
    csvfieldnames = ['uid',
                     "title",
                     "fulltext_url",
                     'keywords',
                     'abstract',
                     "author1_fname",
                     'author1_mname',
                     'author1_lname',
                     'author1_suffix',
                     'author1_email',
                     'author1_institution',
                     'advisor1',
                     'advisor2',
                     'advisor3',
                     'availability',
                     'availability_description',
                     'disciplines',
                     'comments',
                     'degree_name',
                     'department',
                     "document_type",
                     'embargo_date',
                     'publication_date',
                     'season',
                     'release_date',
                     'isbn',
                     'pagelength',
                     'source',
                     'diss_note',
                     'host_item',
                     'language',
                     'host_url',
                     ]
    csv_data = [csvfieldnames]

    for record in records_list:
        csv_first_name, csv_middle_name, csv_last_name, csv_suffix = parse_author_names(record)
        csv_advisor1, csv_advisor2, csv_advisor3 = interpret_directors(record)

        # Keep this list in the same order as csvfieldnames above.
        csv_data.append([
            lookup_uid(record),                                        # uid
            lookup_clean_title(record),                                # title
            make_dropbox_url(record),                                  # fulltext_url
            combine_650(record),                                       # keywords
            combine_520(record),                                       # abstract
            csv_first_name,                                            # author1_fname
            csv_middle_name,                                           # author1_mname
            csv_last_name,                                             # author1_lname
            csv_suffix,                                                # author1_suffix
            '',                                                        # author1_email
            lookup_inst(record),                                       # author1_institution
            csv_advisor1,                                              # advisor1
            csv_advisor2,                                              # advisor2
            csv_advisor3,                                              # advisor3
            is_restricted(record),                                     # availability
            '',                                                        # availability_description
            '',                                                        # disciplines
            '',                                                        # comments
            match_degrees(record),                                     # degree_name
            '',                                                        # department
            determine_dtype(record),                                   # document_type
            armageddon_if_restricted(record),                          # embargo_date
            record.get_fields('792')[0].value(),                       # publication_date
            '',                                                        # season
            '',                                                        # release_date
            lookup_isbn(record),                                       # isbn
            record.get_fields('300')[0].value().replace(' p.', ''),    # pagelength
            find_source(record),                                       # source
            unperiod(record.get_fields('502')[0].value()),             # diss_note
            unperiod(record.get_fields('773')[0].value()),             # host_item
            record.get_fields('793')[0].value(),                       # language
            record.get_fields('856')[0].value(),                       # host_url
        ])

    os.makedirs(output_folder, exist_ok=True)
    csv_writer(csv_data, os.path.join(output_folder, filename))

In [48]:
# Build the crosswalk CSV from the unrestricted MARC records only.
build_csv(marc_unrestricted_records)

## Testing things

In [27]:
# an example of one record
# orig_unrestricted_records[1000].as_dict()

In [28]:
# show all unique values for field 650

# all_650a = set()
# for record in to_do_records:
#     for i in record.get_fields('650'):
#         all_650a.add(i.value())
#     all_650a.add(record.get_fields('650')[0].value())
# print(all_650a)

In [29]:
# do any field values have an @ in it?

# for record in to_do_records:
#     all_fields = [i.value() for i in record.get_fields()]
#     for text in all_fields:
#         if '@' in text:
#             print(text)

In [30]:
# test whether all uids in the marc file match a pdf on the U drive
# short answer: they all do

# pdf_not_on_U = list()

# for record in orig_unrestricted_records:
#     uid = lookup_uid(record)
#     if os.path.isfile('/media/francis/U/ProquestDissertations/UnrestrictedTheses/{}.pdf'.format(uid)):
#         continue
#     pdf_not_on_U.append(uid)

# print(pdf_not_on_U)


In [31]:
# how many unique values for each field/subfield?

# counting_items = dict()

# def add_to_if_not_yet(k, v):
#     v = v.strip()
#     if v == "None" or not v or v == None:
#         return
#     if k in counting_items:
#         counting_items[k].add(v)
#     else:
#         counting_items[k] = set()
#         counting_items[k].add(v)

# for record_as_marc in to_do_records:
#     record = record_as_marc.as_dict()
#     if not record['fields']:
#         break
#     for dictionary in record['fields']:
#         for k, v in dictionary.items():
#             if isinstance(v, str) and v:
#                 add_to_if_not_yet(k, v)
#             if isinstance(v, dict) and v:
#                 ind1 = v['ind1']
#                 fullpath = '{}/ind1'.format(k)
#                 add_to_if_not_yet(fullpath, ind1)
#                 ind2 = v['ind2']
#                 fullpath = '{}/ind2'.format(k)
#                 add_to_if_not_yet(fullpath, ind2)
#                 subfields = v['subfields']
#                 for subdictionary in subfields:
#                     for x, y in subdictionary.items():
#                         fullpath = '{}/subfields/{}'.format(k, x)
#                         add_to_if_not_yet(fullpath, y)
                        
# for k, v in counting_items.items():
#     print(k, len(v))

In [32]:
# how many unique values for each field/subfield?

# keys_lengths = dict()
# all_unique_keys = dict()

# def add_to_if_not_yet(dictionary, k, v):
#     if k in dictionary:
#         dictionary[k].add(v)
#     else:
#         dictionary[k] = set()
#         dictionary[k].add(v)

# for record_as_marc in to_do_records:
#     record = record_as_marc.as_dict()
#     if not record['fields']:
#         break        
#     field_keys = {k for field in record['fields'] for k in field.keys()}
#     fields_list = [k for field in record['fields'] for k in field.keys()]
#     for unique_field in field_keys:
#         add_to_if_not_yet(keys_lengths, unique_field, fields_list.count(unique_field))
        
# for record in to_do_records:
#     for field in record.get_fields():
#         add_to_if_not_yet(all_unique_keys, field.tag, field.value())

# print('this (key) shows up {times} in a record:\n', sorted(keys_lengths.items()))

# print('\nthis (key) has {unique values} across the repo:')
# for k, v in sorted(all_unique_keys.items()):
#     print(k, len(v))

In [33]:
# all_unique_keys['020']

In [34]:
# this is supposed to check for broken utf-8, but I don't trust that it's working

# longest_field = 0

# for record_as_marc in to_do_records:
#     for field in record_as_marc.get_fields():
#         value = field.value()
#         try:
#             bytes_value = value.encode()
#             ascii_value = bytes_value.decode('ascii', "strict")
#             if len(ascii_value) > longest_field:
#                 longest_field = len(ascii_value)
#                 print(record_as_marc)
#         except:
#             print(value)

In [35]:
# print a full record matching a specified uid

def find_print_record(uid):
    """Return as_dict() of the first restricted record whose uid equals `uid`, else None."""
    matches = (record.as_dict()
               for record in marc_restricted_records
               if lookup_uid(record) == uid)
    return next(matches, None)
        
# Look up uid '8811418' among the restricted records; the returned dict is the cell's output.
find_print_record('8811418')

{'fields': [{'001': 'AAI8811418'},
  {'005': '20140908111618.5'},
  {'008': '140908s1987    ||||||||||||||||| ||eng d'},
  {'035': {'ind1': ' ',
    'ind2': ' ',
    'subfields': [{'a': '(MiAaPQ)AAI8811418'}]}},
  {'040': {'ind1': ' ',
    'ind2': ' ',
    'subfields': [{'a': 'MiAaPQ'}, {'c': 'MiAaPQ'}]}},
  {'100': {'ind1': '1', 'ind2': ' ', 'subfields': [{'a': 'Lee, Duc Hee.'}]}},
  {'245': {'ind1': '1',
    'ind2': '2',
    'subfields': [{'a': 'A study of "Suite No. 1, Seven Korean Folksongs for Violin (or Violoncello) and Piano" (1958) by Min-jong Park.'}]}},
  {'300': {'ind1': ' ', 'ind2': ' ', 'subfields': [{'a': '178 p.'}]}},
  {'500': {'ind1': ' ',
    'ind2': ' ',
    'subfields': [{'a': 'Source: Dissertation Abstracts International, Volume: 49-07, Section: A, page: 1616.'}]}},
  {'500': {'ind1': ' ',
    'ind2': ' ',
    'subfields': [{'a': 'Director: Thaddeus Brys.'}]}},
  {'502': {'ind1': ' ',
    'ind2': ' ',
    'subfields': [{'a': 'Thesis (D.M.A.)--Louisiana State Univer

In [36]:
# find a record with a certain text in any value

# Print the record's uid once for every field whose value contains the search text.
needle = 'Gipson'
for record in marc_unrestricted_records:
    for field in record.get_fields():
        if needle not in field.value():
            continue
        print(lookup_uid(record))

## Editing MARC & moving pdfs

In [37]:
# from pymarc import MARCWriter

# trash_file = '/home/francis/Desktop/trash.marc'

# # actual_restricted_records = [i for i in orig_restricted_records if lookup_uid(i) not in all_restricteds]
# # actual_restricted_records.extend([i for i in orig_unrestricted_records if lookup_uid(i) in restricted_uids])

# print(len(expected_unrestricted_records))


# with open(trash_file, 'wb') as f:
#     for record in expected_unrestricted_records:
#         f.write(record.as_marc())
    
# with open(trash_file, 'rb') as f:
#     reader = MARCReader(f)
#     trash_records = list()
#     for record in reader:
#         trash_records.append(record)
#     print(len(trash_records), len(expected_unrestricted_records))

# print(expected_unrestricted_records[0].as_dict())

In [38]:
# import shutil

# for uid in folder_duplicated:
#     source = os.path.join(folder_on_U, 'UnrestrictedTheses', '{}.pdf'.format(uid))
#     dest = os.path.join(folder_on_U, 'TitlesDuplicatedInETDDatbase', '{}.pdf'.format(uid))
#     if os.path.isfile(source):
#         print(source, '\n', dest)
#         shutil.move(source, dest)

        
# for uid in folder_true_restrict:
#     source = os.path.join(folder_on_U, 'UnrestrictedTheses', '{}.pdf'.format(uid))
#     dest = os.path.join(folder_on_U, 'RestrictedTheses', '{}.pdf'.format(uid))
#     if os.path.isfile(source):
#         print(source, '\n', dest)
#         shutil.move(source, dest)

In [39]:
# Collect the [-6:-1] slice of every expanded degree name among the unrestricted records.
set_a = {match_degrees(record)[-6:-1] for record in marc_unrestricted_records}

In [40]:
# Display the distinct degree-name slices collected in set_a.
set_a

{' (DBA',
 ' (DMA',
 ' (EdD',
 ' (MMC',
 ' (PhD',
 '(PNFS',
 '(POCS',
 'PENTM',
 'PVMPB',
 'e (MS'}

In [41]:
# Report, for every image-disc record, whether its uid also appears in the
# unrestricted and/or restricted MARC sets.  "not found dupe" is printed only
# when the uid is in neither set; the former `else` was attached to the
# restricted check alone, so a uid found in the unrestricted set was reported
# as a dupe and as "not found" at the same time.
for item in marc_images_records:
    uid = lookup_uid(item)
    in_unrestricted = uid in marc_unrestricted_uids
    in_restricted = uid in marc_restricted_uids
    if in_unrestricted:
        print("{}: dupe in unrestricted".format(uid))
    if in_restricted:
        print("{}: dupe in restricted".format(uid))
    if not (in_unrestricted or in_restricted):
        print('{}: not found dupe'.format(uid))

DP69149: not found dupe
DP69150: not found dupe
DP69151: not found dupe
DP69152: not found dupe
DP69153: not found dupe
DP69154: not found dupe
DP69155: not found dupe
DP69156: not found dupe
DP69157: not found dupe
DP69158: not found dupe
DP69159: not found dupe
DP69160: not found dupe
DP69161: not found dupe
DP69162: not found dupe
DP69163: not found dupe
DP69164: not found dupe
DP69165: not found dupe
DP69166: not found dupe
DP69167: not found dupe
DP69168: not found dupe
DP69169: not found dupe
DP69170: not found dupe
DP69171: not found dupe
DP69172: not found dupe
DP69173: not found dupe
DP69174: not found dupe
DP69175: not found dupe
DP69176: not found dupe
DP69177: not found dupe
DP69178: not found dupe
DP69179: not found dupe
DP69180: not found dupe
DP69181: not found dupe
DP69182: not found dupe
DP69183: not found dupe
DP69184: not found dupe
DP69185: not found dupe
DP69186: not found dupe
DP69187: not found dupe
DP69188: not found dupe
DP69189: not found dupe
DP69190: not fou

In [42]:
# Print the top-level key/value pairs of the first image-disc record only.
for item in marc_images_records[:1]:
    for k, v in item.as_dict().items():
        print(k, v)

leader 01210nam a2200301   4500
fields [{'001': 'AAIDP69149'}, {'005': '20150728081845.5'}, {'008': '150728s1935    ||||||||||||||||| ||eng d'}, {'035': {'subfields': [{'a': '(MiAaPQ)AAIDP69149'}], 'ind1': ' ', 'ind2': ' '}}, {'040': {'subfields': [{'a': 'MiAaPQ'}, {'c': 'MiAaPQ'}], 'ind1': ' ', 'ind2': ' '}}, {'100': {'subfields': [{'a': 'Carter, Gipson Lafayette.'}], 'ind1': '1', 'ind2': ' '}}, {'245': {'subfields': [{'a': 'A survey of the status of Ramie.'}], 'ind1': '1', 'ind2': '2'}}, {'300': {'subfields': [{'a': '120 p.'}], 'ind1': ' ', 'ind2': ' '}}, {'500': {'subfields': [{'a': 'Source: Dissertation Abstracts International, Volume: 76-06(E), Section: B.'}], 'ind1': ' ', 'ind2': ' '}}, {'502': {'subfields': [{'a': 'Thesis (Ph.D.)--Louisiana State University and Agricultural & Mechanical College, 1935.'}], 'ind1': ' ', 'ind2': ' '}}, {'506': {'subfields': [{'a': 'This item must not be sold to any third party vendors.'}], 'ind1': ' ', 'ind2': ' '}}, {'506': {'subfields': [{'a': 'T