In [1]:
#! /usr/bin/env python3

import os
import csv

from nameparser import HumanName
from pymarc import MARCReader

In [2]:
with open('source_data/MARCDATA.MRC', 'rb') as f:
    reader = MARCReader(f)
    all_records = list()
    for record in reader:
        all_records.append(record)

In [3]:
# an example of one record
all_records[1000].as_dict()

{'fields': [{'001': 'AAI3167161'},
  {'005': '20140908111943.5'},
  {'008': '140908s2005    ||||||||||||||||| ||eng d'},
  {'020': {'ind1': ' ', 'ind2': ' ', 'subfields': [{'a': '9780542025891'}]}},
  {'035': {'ind1': ' ',
    'ind2': ' ',
    'subfields': [{'a': '(MiAaPQ)AAI3167161'}]}},
  {'040': {'ind1': ' ',
    'ind2': ' ',
    'subfields': [{'a': 'MiAaPQ'}, {'c': 'MiAaPQ'}]}},
  {'100': {'ind1': '1', 'ind2': ' ', 'subfields': [{'a': 'Song, In-hyouk.'}]}},
  {'245': {'ind1': '1',
    'ind2': '0',
    'subfields': [{'a': 'Laterally movable gate field effect transistor (LMGFET) for microsensor and microactuator applications.'}]}},
  {'300': {'ind1': ' ', 'ind2': ' ', 'subfields': [{'a': '171 p.'}]}},
  {'500': {'ind1': ' ',
    'ind2': ' ',
    'subfields': [{'a': 'Source: Dissertation Abstracts International, Volume: 66-03, Section: B, page: 1638.'}]}},
  {'500': {'ind1': ' ',
    'ind2': ' ',
    'subfields': [{'a': 'Director: Pratul K. Ajmera.'}]}},
  {'502': {'ind1': ' ',
    'i

In [4]:
def lookup_uid(record):
    return record.get_fields('001')[0].value().replace('AAI','')

In [5]:
lookup_uid(all_records[0])

'0000418'

In [6]:
# make a list of restricted items
# from the previously-made ranges
# they match 1247 records from the marc file

restricted_range_a = set(range(3048322, 3335145))
restricted_range_b = {3021429, 3030348, 3451495}
restricted_range_c = {3049191, 3049223, 3051440, 3053695, 3053696}
restricted_range_d = set(range(3049188, 3329096))
all_restricteds = set().union(restricted_range_a, 
                              restricted_range_b,
                              restricted_range_c,
                              restricted_range_d)
all_restricteds.remove(3136164)

restricted_uids = []

for record in all_records:
    uid = lookup_uid(record)
    if int(uid) in all_restricteds:
        restricted_uids.append(uid)
        continue
        
# print(len(restricted_uids))

In [7]:
# make a list of duplicated uids

with open('source_data/DuplicatedInDigitalCommons.txt', 'r', encoding='utf-8') as f:
    duplicate_uids = []
    for line in f.readlines():
        duplicate_uids.append(line.replace('.pdf', '').strip())

In [8]:
# make a list of records not in duplicated or in restricted

to_do_records = [i for i in all_records if lookup_uid(i) not in restricted_uids
                                        and lookup_uid(i) not in duplicate_uids]


## Testing things

In [9]:
# test of all uids in marc file match a pdf on U drive
# short answer: they all do

# pdf_not_on_U = list()

# for record in all_records:
#     uid = lookup_uid(record)
#     if os.path.isfile('/media/francis/U/ProquestDissertations/UnrestrictedTheses/{}.pdf'.format(uid)):
#         continue
#     pdf_not_on_U.append(uid)

# print(pdf_not_on_U)


In [10]:
# show all unique values for field 650

# all_650a = set()
# for record in to_do_records:
#     for i in record.get_fields('650'):
#         all_650a.add(i.value())
#     all_650a.add(record.get_fields('650')[0].value())
# print(all_650a)

## Making the crosswalk

In [12]:
# be sure to preserve order of items in field_520

def combine_520(record):
    list_520 = [i for i in record.get_fields('520')]
    if list_520:
        combined_text = ' '.join([i.value() for i in list_520])
    else:
        combined_text = ''
    return combined_text


In [13]:
def combine_650(record):
    value_650 = [i.value() for i in record.get_fields('650')]
    value_650 = [i.capitalize().replace('.', '') for i in value_650]
    if value_650:
        combined_text = '; '.join(value_650)
    else:
        combined_text = ''
    return combined_text

In [14]:
def parse_author_names(record):
    name_clump = record.get_fields('100')[0].value()
    name = HumanName(name_clump)
    last_name = name.last
    middle_name = name.middle
    suffix = name.suffix
    if name.nickname:
        first_name = "{} {}".format(name.first, name.nickname)
    else:
        first_name = name.first
    return first_name.capitalize(), middle_name.capitalize(), last_name.capitalize(), suffix.capitalize()

## Making the csv

In [15]:
def csv_writer(data, path):
    with open(path, "w", newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
        for line in data:
            writer.writerow(line)

In [16]:
def build_csv(to_do_records):
    csv_data = []
    
    csvfieldnames = ["title",
                     "fulltext_url",
                     'keywords',
                     'abstract',
                     "author1_fname",
                     'author1_mname',
                     'author1_lname',
                     'author1_suffix',
                     'author1_email',
                     'author1_institution',
                     'advisor1',
                     'advisor2',
                     'advisor3',
                     'disciplines',
                     'comments'
                     'degree_name',
                     'department',
                     "document_type",
                     'publication_date',
                     'season',
                     'release_date',
                     'urn',
                    ]
    csv_data.append(csvfieldnames)

    for record in to_do_records:
        csv_title = record.get_fields('245')[0].value()
        fulltext_url = ''
#         fulltext_url = parse_dropbox_url(urn)
#         fulltext_url = gdrive_dict[urn]
        csv_keywords = combine_650(record)
        csv_abstract = combine_520(record)
        csv_first_name, csv_middle_name, csv_last_name, csv_suffix = parse_author_names(record)
        csv_author_email = ''
        csv_institution = ''
        csv_advisor1, csv_advisor2 = parse_500(record)
        csv_advisor3 = ''
        csv_disciplines = ''
        csv_comments = ''
        csv_degree_name = ''
        csv_department = ''
        csv_document_type = ''
        csv_publication_date = ''
        csv_season = ''
        csv_release_date = ''
        csv_urn = lookup_uid(record)

        csv_data.append([csv_title,
                         fulltext_url,
                         csv_keywords,
                         csv_abstract,
                         csv_first_name,
                         csv_middle_name,
                         csv_last_name,
                         csv_suffix,
                         csv_author_email,
                         csv_institution,
                         csv_advisor1,
                         csv_advisor2,
                         csv_advisor3,
                         csv_disciplines,
                         csv_comments,
                         csv_degree_name,
                         csv_department,
                         csv_document_type,
                         csv_publication_date,
                         csv_season,
                         csv_release_date,
                         csv_urn,
                         ])
    output_folder = '/home/francis/Desktop/lsu-git/Proquest_to_DigitalCommons/output'
    os.makedirs(output_folder, exist_ok=True)
    csv_writer(csv_data, '/home/francis/Desktop/lsu-git/Proquest_to_DigitalCommons/output/scrap_Proquest.csv')

In [17]:
build_csv(to_do_records)

In [18]:
for record in to_do_records:
    combined_text = combine_650(record)
    if combined_text:
        print(combined_text)
        break

Philosophy


In [19]:
# how many unique values for each field/subfield?

# counting_items = dict()

# def add_to_if_not_yet(k, v):
#     v = v.strip()
#     if v == "None" or not v or v == None:
#         return
#     if k in counting_items:
#         counting_items[k].add(v)
#     else:
#         counting_items[k] = set()
#         counting_items[k].add(v)

# for record_as_marc in to_do_records:
#     record = record_as_marc.as_dict()
#     if not record['fields']:
#         break
#     for dictionary in record['fields']:
#         for k, v in dictionary.items():
#             if isinstance(v, str) and v:
#                 add_to_if_not_yet(k, v)
#             if isinstance(v, dict) and v:
#                 ind1 = v['ind1']
#                 fullpath = '{}/ind1'.format(k)
#                 add_to_if_not_yet(fullpath, ind1)
#                 ind2 = v['ind2']
#                 fullpath = '{}/ind2'.format(k)
#                 add_to_if_not_yet(fullpath, ind2)
#                 subfields = v['subfields']
#                 for subdictionary in subfields:
#                     for x, y in subdictionary.items():
#                         fullpath = '{}/subfields/{}'.format(k, x)
#                         add_to_if_not_yet(fullpath, y)
                        
# for k, v in counting_items.items():
#     print(k, len(v))

In [20]:
# how many unique values for each field/subfield?

# keys_lengths = dict()

# def add_to_if_not_yet(k, v):
#     if k in keys_lengths:
#         keys_lengths[k].add(v)
#     else:
#         keys_lengths[k] = set()
#         keys_lengths[k].add(v)

# for record_as_marc in to_do_records:
#     record = record_as_marc.as_dict()
#     if not record['fields']:
#         break        
#     field_keys = {k for field in record['fields'] for k in field.keys()}
#     fields_list = [k for field in record['fields'] for k in field.keys()]
#     for unique_field in field_keys:
#         add_to_if_not_yet(unique_field, fields_list.count(unique_field))

# print(keys_lengths)

In [21]:
# this is supposed to check for broken utf-8, but i don't trust it's working

# longest_field = 0

# for record_as_marc in to_do_records:
#     for field in record_as_marc.get_fields():
#         value = field.value()
#         try:
#             bytes_value = value.encode()
#             ascii_value = bytes_value.decode('ascii', "strict")
#             if len(ascii_value) > longest_field:
#                 longest_field = len(ascii_value)
#                 print(record_as_marc)
#         except:
#             print(value)


In [22]:
def find_print_record(uid):
    for record in all_records:
        if lookup_uid(record) == uid:
            return record.as_dict()

In [23]:
find_print_record('0000418')

{'fields': [{'001': 'AAI0000418'},
  {'005': '20140908111845.5'},
  {'008': '140908s1941    ||||||||||||||||| ||eng d'},
  {'035': {'ind1': ' ',
    'ind2': ' ',
    'subfields': [{'a': '(MiAaPQ)AAI0000418'}]}},
  {'040': {'ind1': ' ',
    'ind2': ' ',
    'subfields': [{'a': 'MiAaPQ'}, {'c': 'MiAaPQ'}]}},
  {'100': {'ind1': '1',
    'ind2': ' ',
    'subfields': [{'a': 'FLOWERS, FRANK C.'}]}},
  {'245': {'ind1': '1',
    'ind2': '0',
    'subfields': [{'a': "MARK TWAIN'S THEORIES OF MORALITY."}]}},
  {'300': {'ind1': ' ', 'ind2': ' ', 'subfields': [{'a': '239 p.'}]}},
  {'500': {'ind1': ' ',
    'ind2': ' ',
    'subfields': [{'a': 'Source: Dissertation Abstracts International, Volume: 04-01, page: 2700.'}]}},
  {'502': {'ind1': ' ',
    'ind2': ' ',
    'subfields': [{'a': 'Thesis (Ph.D.)--Louisiana State University and Agricultural & Mechanical College, 1941.'}]}},
  {'590': {'ind1': ' ',
    'ind2': ' ',
    'subfields': [{'a': 'School code: 0107.'}]}},
  {'650': {'ind1': ' ', 'ind

In [81]:
def parse_500(record):
    value_500 = [i.value() for i in record.get_fields('500')]
#     value_500 = [i for i in value_500]
    if len(value_500) == 1:
        return value_500[0], ''
    else:
        return value_500[0], value_500[1]
    
source, single_mult_director = parse_500(record)
 

In [85]:
def split_directors(text_b):
    directors_list = interpret_single_mult_director(text_b)
    if len(directors_list) == 3:
        return directors_list[0], directors_list[1], directors_list[2]
    elif len(directors_list) == 2:
        return directors_list[0], directors_list[1], ''
    elif len(directors_list) == 1:
        return directors_list[0], '', ''
    else:
        return ''

def interpret_single_mult_director(text):
    if "Directors" in text:
        return None
        text = text.replace('Directors: ', '')
        if text[-1] == '.':
            text = text[:-1]
        if text:
            return [i for i in text.split('; ')]
    elif "Director" in text:
        text = text.replace('Director: ', '')
        if text[-1] == '.':
            text = text[:-1]
        if text:
            return text
    else:
        return ''
        
for record in to_do_records:
    text_a, text_b = parse_500(record)
    split_directors(text_b)

TypeError: object of type 'NoneType' has no len()