In [None]:
#! /usr/bin/env python3

import os
import csv

from nameparser import HumanName
from pymarc import MARCReader
from titlecase import titlecase

In [None]:
# Read every record of the MARC export into memory so later cells can
# iterate over the collection more than once.
with open('source_data/MARCDATA.MRC', 'rb') as f:
    reader = MARCReader(f)
    all_records = list(reader)

In [None]:
# make a list of duplicated uids

with open('source_data/DuplicatedInDigitalCommons.txt', 'r', encoding='utf-8') as f:
    # Each line is turned into a uid by removing '.pdf' and the
    # surrounding whitespace (including the trailing newline).
    duplicate_uids = [line.replace('.pdf', '').strip() for line in f]

In [None]:
def lookup_uid(record):
    """Return the value of the record's first 001 field with 'AAI' removed.

    Every occurrence of the substring 'AAI' is stripped from the value.
    Raises IndexError when the record has no 001 field.
    """
    control_field = record.get_fields('001')[0]
    return control_field.value().replace('AAI', '')

In [None]:
# make a list of restricted items
# from the previously-made ranges
# they match 1247 records from the marc file

# Two contiguous uid ranges (end value exclusive) and two hand-listed groups.
restricted_range_a = set(range(3048322, 3335145))
restricted_range_b = {3021429, 3030348, 3451495}
restricted_range_c = {3049191, 3049223, 3051440, 3053695, 3053696}
restricted_range_d = set(range(3049188, 3329096))

# Merge the four groups into a single lookup set.
all_restricteds = (restricted_range_a
                   | restricted_range_b
                   | restricted_range_c
                   | restricted_range_d)

# 3136164 lies inside range a but is taken back out of the restricted set.
all_restricteds.remove(3136164)

In [None]:
# identify restricted items

# Collect the uid of every non-duplicated record whose numeric uid is in
# all_restricteds. The uids are stored as strings, exactly as lookup_uid()
# returns them, so that membership tests against lookup_uid() results match.
#
# The list was previously created but never filled (matches were only
# printed), which left the restricted filter on the to-do list a no-op.
restricted_uids = []

for record in all_records:
    uid = lookup_uid(record)
    if uid in duplicate_uids:
        continue
    if int(uid) in all_restricteds:
        restricted_uids.append(uid)

# Report a summary instead of printing every matching uid.
print('{} restricted records found'.format(len(restricted_uids)))

In [None]:
# make a list of records not in duplicated or in restricted

# Keep only the records whose uid is neither restricted nor duplicated.
to_do_records = []
for candidate in all_records:
    candidate_uid = lookup_uid(candidate)
    if candidate_uid in restricted_uids or candidate_uid in duplicate_uids:
        continue
    to_do_records.append(candidate)


## Making the crosswalk

In [None]:
def lookup_clean_title(record):
    """Return the record's first 245 value, title-cased and tidied.

    After title-casing, a colon followed by two spaces is collapsed to a
    colon followed by one space, and every key of ``wrong_roman_numeral``
    found in the text is replaced by its value.
    Raises IndexError when the record has no 245 field.
    """
    title = titlecase(record.get_fields('245')[0].value())
    title = title.replace(':  ', ": ")
    # str.replace is a no-op when the key is absent, so no membership
    # check is needed before each replacement.
    for wrong, right in wrong_roman_numeral.items():
        title = title.replace(wrong, right)
    return title

In [None]:
# Title-cased Roman numerals and their corrected, fully upper-cased form.
#
# The entries are applied one after another in insertion order, so longer
# numerals must come before their shorter prefixes: if ' Ii' ran first it
# would turn ' Iii ' into ' IIi ' and the III entries would never match.
# Each replacement also keeps the surrounding space/punctuation of its key so
# that neighbouring words are not glued together.
wrong_roman_numeral = {' Viii': ' VIII',
                       '-Viii': '-VIII',
                       ' Vii': ' VII',
                       '-Vii ': '-VII ',
                       ' Iii ': ' III ',
                       ' Iii.': ' III.',
                       '-Iii': '-III',
                       ' Iv ': ' IV ',
                       ' Iv.': ' IV.',
                       ' Iv)': ' IV)',
                       ' Vi ': ' VI ',
                       ' Ii': ' II',
                       }

In [None]:
def make_dropbox_url(record):
    """Return the Dropbox URL string for the PDF named after the record's uid."""
    return 'some.dropbox.url/public/something/{}.pdf'.format(lookup_uid(record))

In [None]:
def interpret_directors(record):
    """Return a 3-tuple of advisor names parsed from the record's second 500 value.

    The first 500 value returned by parse_500 is not used here.
    """
    _, advisors_note = parse_500(record)
    return split_directors(advisors_note)

def parse_500(record):
    """Return the first two 500 field values of a record as a pair of strings.

    A missing value is returned as ''. A record with one 500 field yields
    ``(value, '')``; a record with none yields ``('', '')`` instead of
    raising IndexError. Any 500 fields beyond the second are ignored.
    """
    notes = [field.value() for field in record.get_fields('500')]
    # Pad with blanks so the slice always produces exactly two items.
    first, second = (notes + ['', ''])[:2]
    return first, second

def split_directors(text_b):
    """Return exactly three advisor names parsed from an advisors note.

    The names come from parse_advisors_field(text_b). Missing positions are
    filled with ''. When more than three names are present, the first three
    are kept; previously such a note fell through every branch and produced
    three blanks, discarding all of the names.
    """
    # parse_advisors_field returns '' when nothing is found; normalise to a list.
    directors = list(parse_advisors_field(text_b) or [])
    first, second, third = (directors + ['', '', ''])[:3]
    return first, second, third

def parse_advisors_field(text):
    """Return the advisor names in a note as a list, or '' when there are none.

    Each known label (e.g. 'Director: ') is tried in turn. When a label occurs
    in the text, it is removed, a trailing period is dropped via unperiod, and
    the remainder is split on '; ' into stripped names. If the remainder is
    empty, the search continues with the next label on the shortened text.
    '' is returned when no label produces a non-empty remainder.
    """
    labels = ('Directors: ',
              'Director: ',
              'Co-Chairs: ',
              'Co-chairs: ',
              'Co-Chairmen: ',
              'Adviser: ',
              'Advisers: ',
              'Chair: ',
              'Directed: ')
    for label in labels:
        if label not in text:
            continue
        text = unperiod(text.replace(label, ''))
        if text:
            return [name.strip() for name in text.split('; ')]
    return ''

def unperiod(text):
    """Return ``text`` without its final character when that character is '.'.

    Only a single trailing period is removed. An empty string is returned
    unchanged; indexing ``text[-1]`` raised IndexError for that input.
    """
    if text.endswith('.'):
        return text[:-1]
    return text

In [None]:
def find_source(record):
    """Return the text of the first 500 value that contains 'Source'.

    A trailing period is dropped first, then every 'Source: ' label is removed.
    Raises IndexError when no 500 value contains 'Source'.
    """
    source_notes = [note.value()
                    for note in record.get_fields('500')
                    if 'Source' in note.value()]
    return unperiod(source_notes[0]).replace('Source: ', '')

In [None]:
def combine_520(record):
    """Return all 520 values of a record joined by single spaces ('' if none)."""
    # Joining an empty sequence already yields '', so no special case is needed.
    return ' '.join(field.value() for field in record.get_fields('520'))


In [None]:
def combine_650(record):
    """Return all 650 values of a record joined by '; ' ('' if none).

    Each value is passed through str.capitalize() (first character upper-cased,
    the rest lower-cased) and has every period removed before joining.
    """
    subjects = []
    for field in record.get_fields('650'):
        subjects.append(field.value().capitalize().replace('.', ''))
    return '; '.join(subjects)

In [None]:
def parse_author_names(record):
    """Split the record's first 100 value into name parts.

    The value has a trailing period dropped (unperiod) and is parsed with
    nameparser's HumanName. Returns a 4-tuple
    ``(first, middle, last, suffix)`` where:

    * ``first`` is the first name, followed by the nickname when one was parsed;
    * ``first``, ``middle`` and ``last`` are passed through str.capitalize();
    * ``suffix`` is normalised by standardize_suffix.

    Raises IndexError when the record has no 100 field.
    """
    name_clump = unperiod(record.get_fields('100')[0].value())
    name = HumanName(name_clump)

    suffix = standardize_suffix(name.suffix)
    if name.nickname:
        first_name = "{} {}".format(name.first, name.nickname)
    else:
        first_name = name.first

    # NOTE(review): str.capitalize() lower-cases everything after the first
    # character, so multi-word or mixed-case parts (e.g. 'McDonald') are
    # flattened. Confirm this is acceptable for the source data.
    return (first_name.capitalize(),
            name.middle.capitalize(),
            name.last.capitalize(),
            suffix)

def standardize_suffix(text):
    """Return ``text`` with upper-case name suffixes rewritten.

    Replaces, in this order, every 'JR' with 'Jr', 'SR' with 'Sr',
    '3RD' with 'III' and 'ED' with 'Ed.'. Matching is by plain substring.
    """
    replacements = (('JR', 'Jr'), ('SR', 'Sr'), ('3RD', 'III'), ('ED', 'Ed.'))
    for wrong, right in replacements:
        text = text.replace(wrong, right)
    return text
    

In [None]:
def lookup_inst(record):
    """Return the record's first 710 value without a trailing period.

    Raises IndexError when the record has no 710 field.
    """
    return unperiod(record.get_fields('710')[0].value())

In [None]:
def lookup_isbn(record):
    """Return the record's first 020 value, or '' when it has no 020 field."""
    isbn_fields = record.get_fields('020')
    if not isbn_fields:
        return ''
    return isbn_fields[0].value()

## Making the csv

In [None]:
def csv_writer(data, path):
    """Write ``data`` (an iterable of rows) to ``path`` as UTF-8 CSV.

    Every value is quoted (csv.QUOTE_ALL); the delimiter is a comma and the
    quote character is a double quote. An existing file is overwritten.
    """
    with open(path, "w", newline='', encoding='utf-8') as csv_file:
        row_writer = csv.writer(csv_file,
                                delimiter=',',
                                quotechar='"',
                                quoting=csv.QUOTE_ALL)
        row_writer.writerows(data)

In [None]:
def _build_csv_row(record):
    """Return one CSV row (a list of strings) for a MARC record.

    The values are in the same order as the header row written by build_csv.
    Columns with no source in the record are emitted as ''.
    Raises IndexError when a field read with ``[0]`` is missing.
    """
    first_name, middle_name, last_name, suffix = parse_author_names(record)
    # All three advisors are kept. A misspelled variable name used to discard
    # the third one, so the advisor3 column was always blank.
    advisor1, advisor2, advisor3 = interpret_directors(record)

    return [
        lookup_uid(record),                                       # urn
        lookup_clean_title(record),                               # title
        make_dropbox_url(record),                                 # fulltext_url
        combine_650(record),                                      # keywords
        combine_520(record),                                      # abstract
        first_name,                                               # author1_fname
        middle_name,                                              # author1_mname
        last_name,                                                # author1_lname
        suffix,                                                   # author1_suffix
        '',                                                       # author1_email
        lookup_inst(record),                                      # author1_institution
        advisor1,                                                 # advisor1
        advisor2,                                                 # advisor2
        advisor3,                                                 # advisor3
        '',                                                       # disciplines
        '',                                                       # comments
        record.get_fields('791')[0].value(),                      # degree_name
        '',                                                       # department
        'Thesis',                                                 # document_type
        record.get_fields('792')[0].value(),                      # publication_date
        '',                                                       # season
        '',                                                       # release_date
        lookup_isbn(record),                                      # ISBN
        record.get_fields('300')[0].value().replace(' p.', ''),   # pagelength
        find_source(record),                                      # source
        unperiod(record.get_fields('502')[0].value()),            # diss_note
        unperiod(record.get_fields('773')[0].value()),            # host_item
        record.get_fields('793')[0].value(),                      # language
        record.get_fields('856')[0].value(),                      # host_url
    ]


def build_csv(to_do_records,
              output_folder='/home/francis/Desktop/lsu-git/Proquest_to_DigitalCommons/output'):
    """Crosswalk MARC records to rows and write them to 'scrap_Proquest.csv'.

    Parameters
    ----------
    to_do_records : iterable of MARC records
        The records to export, one CSV row each, after a header row.
    output_folder : str, optional
        Directory that receives 'scrap_Proquest.csv'. It is created when it
        does not exist. Defaults to the location this notebook has always
        written to.

    Raises IndexError when a record lacks one of the fields that
    _build_csv_row reads with ``[0]``.
    """
    csvfieldnames = ['urn',
                     "title",
                     "fulltext_url",
                     'keywords',
                     'abstract',
                     "author1_fname",
                     'author1_mname',
                     'author1_lname',
                     'author1_suffix',
                     'author1_email',
                     'author1_institution',
                     'advisor1',
                     'advisor2',
                     'advisor3',
                     'disciplines',
                     'comments',
                     'degree_name',
                     'department',
                     "document_type",
                     'publication_date',
                     'season',
                     'release_date',
                     'ISBN',
                     'pagelength',
                     'source',
                     'diss_note',
                     'host_item',
                     'language',
                     'host_url',
                    ]

    csv_data = [csvfieldnames]
    csv_data.extend(_build_csv_row(record) for record in to_do_records)

    os.makedirs(output_folder, exist_ok=True)
    # Derive the file path from output_folder rather than repeating the literal.
    csv_writer(csv_data, os.path.join(output_folder, 'scrap_Proquest.csv'))

In [None]:
# Run the crosswalk over the filtered records; build_csv writes the CSV file.
build_csv(to_do_records)

## Testing things

In [None]:
# an example of one record
# all_records[1000].as_dict()

In [None]:
# show all unique values for field 650

# all_650a = set()
# for record in to_do_records:
#     for i in record.get_fields('650'):
#         all_650a.add(i.value())
#     all_650a.add(record.get_fields('650')[0].value())
# print(all_650a)

In [None]:
# does any field value have an @ in it?

# for record in to_do_records:
#     all_fields = [i.value() for i in record.get_fields()]
#     for text in all_fields:
#         if '@' in text:
#             print(text)

In [None]:
# test whether all uids in the marc file match a pdf on the U drive
# short answer: they all do

# pdf_not_on_U = list()

# for record in all_records:
#     uid = lookup_uid(record)
#     if os.path.isfile('/media/francis/U/ProquestDissertations/UnrestrictedTheses/{}.pdf'.format(uid)):
#         continue
#     pdf_not_on_U.append(uid)

# print(pdf_not_on_U)


In [None]:
# how many unique values for each field/subfield?

# counting_items = dict()

# def add_to_if_not_yet(k, v):
#     v = v.strip()
#     if v == "None" or not v or v == None:
#         return
#     if k in counting_items:
#         counting_items[k].add(v)
#     else:
#         counting_items[k] = set()
#         counting_items[k].add(v)

# for record_as_marc in to_do_records:
#     record = record_as_marc.as_dict()
#     if not record['fields']:
#         break
#     for dictionary in record['fields']:
#         for k, v in dictionary.items():
#             if isinstance(v, str) and v:
#                 add_to_if_not_yet(k, v)
#             if isinstance(v, dict) and v:
#                 ind1 = v['ind1']
#                 fullpath = '{}/ind1'.format(k)
#                 add_to_if_not_yet(fullpath, ind1)
#                 ind2 = v['ind2']
#                 fullpath = '{}/ind2'.format(k)
#                 add_to_if_not_yet(fullpath, ind2)
#                 subfields = v['subfields']
#                 for subdictionary in subfields:
#                     for x, y in subdictionary.items():
#                         fullpath = '{}/subfields/{}'.format(k, x)
#                         add_to_if_not_yet(fullpath, y)
                        
# for k, v in counting_items.items():
#     print(k, len(v))

In [None]:
# how many times does each field show up in a record, and how many unique values does it have?

# keys_lengths = dict()
# all_unique_keys = dict()

# def add_to_if_not_yet(dictionary, k, v):
#     if k in dictionary:
#         dictionary[k].add(v)
#     else:
#         dictionary[k] = set()
#         dictionary[k].add(v)

# for record_as_marc in to_do_records:
#     record = record_as_marc.as_dict()
#     if not record['fields']:
#         break        
#     field_keys = {k for field in record['fields'] for k in field.keys()}
#     fields_list = [k for field in record['fields'] for k in field.keys()]
#     for unique_field in field_keys:
#         add_to_if_not_yet(keys_lengths, unique_field, fields_list.count(unique_field))
        
# for record in to_do_records:
#     for field in record.get_fields():
#         add_to_if_not_yet(all_unique_keys, field.tag, field.value())

# print('this (key) shows up {times} in a record:\n', sorted(keys_lengths.items()))

# print('\nthis (key) has {unique values} across the repo:')
# for k, v in sorted(all_unique_keys.items()):
#     print(k, len(v))

In [None]:
# all_unique_keys['020']

In [None]:
# this is supposed to check for broken utf-8, but i don't trust it's working

# longest_field = 0

# for record_as_marc in to_do_records:
#     for field in record_as_marc.get_fields():
#         value = field.value()
#         try:
#             bytes_value = value.encode()
#             ascii_value = bytes_value.decode('ascii', "strict")
#             if len(ascii_value) > longest_field:
#                 longest_field = len(ascii_value)
#                 print(record_as_marc)
#         except:
#             print(value)

In [None]:
# print a full record matching a specified uid

def find_print_record(uid):
    """Return the first record in all_records whose uid equals ``uid``, as a dict.

    The uid is compared with the string returned by lookup_uid. Returns None
    when no record matches.
    """
    matches = (record.as_dict()
               for record in all_records
               if lookup_uid(record) == uid)
    return next(matches, None)
        
# find_print_record('9951617')

In [None]:
# find a record with a certain text in any value

# for record in to_do_records:
#     for field in record.get_fields():
#         if 'Sethumadhava' in field.value():
#             print(lookup_uid(record))