In [1]:
#! /usr/bin/env python3

from pymarc import MARCReader
import os
import csv

In [2]:
with open('source_data/MARCDATA.MRC', 'rb') as f:
    # Drain the reader while the file is still open so every record
    # is held in memory for the cells that follow.
    reader = MARCReader(f)
    all_records = list(reader)

In [3]:
# an example of one record
# all_records[0].as_dict()

In [4]:
def lookup_uid(record):
    """Return the record's uid as a string.

    The uid is the value of the first '001' field with every occurrence
    of 'AAI' removed. Raises IndexError if the record has no '001' field.
    """
    control_field = record.get_fields('001')[0]
    raw_value = control_field.value()
    return raw_value.replace('AAI', '')

In [5]:
# spot check: show the uid that lookup_uid extracts from the first loaded record
lookup_uid(all_records[0])

'0000418'

In [6]:
# make a list of restricted items
# from the previously-made ranges
# they match 1247 records from the marc file

restricted_range_a = set(range(3048322, 3335145))
restricted_range_b = {3021429, 3030348, 3451495}
restricted_range_c = {3049191, 3049223, 3051440, 3053695, 3053696}
restricted_range_d = set(range(3049188, 3329096))

# every restricted number from the four groups, except 3136164
all_restricteds = (restricted_range_a
                   | restricted_range_b
                   | restricted_range_c
                   | restricted_range_d)
all_restricteds.remove(3136164)

# keep the uid strings (not the ints) of records whose numeric uid is restricted
restricted_uids = [uid
                   for uid in map(lookup_uid, all_records)
                   if int(uid) in all_restricteds]

print(len(restricted_uids))

1247


In [10]:
# make a list of duplicated uids

with open('source_data/DuplicatedInDigitalCommons.txt', 'r', encoding='utf-8') as f:
    # one entry per line of the file: remove every '.pdf', then trim whitespace
    duplicate_uids = [line.replace('.pdf', '').strip() for line in f]

In [21]:
# make a list of records not in duplicated or in restricted
# NOTE: despite its name, to_do_uids holds the record objects themselves,
# not uid strings.

# Merge both exclusion lists into one set so each record needs a single
# lookup_uid call and a single constant-time membership test, instead of
# two calls and two linear list scans per record.
excluded_uids = set(restricted_uids) | set(duplicate_uids)

to_do_uids = [i for i in all_records if lookup_uid(i) not in excluded_uids]


## Testing things

In [None]:
# test whether all uids in the marc file match a pdf on the U drive
# short answer: they all do

# pdf_not_on_U = list()

# for record in all_records:
#     uid = lookup_uid(record)
#     if os.path.isfile('/media/francis/U/ProquestDissertations/UnrestrictedTheses/{}.pdf'.format(uid)):
#         continue
#     pdf_not_on_U.append(uid)

# print(pdf_not_on_U)


In [22]:
## Making the csv

In [None]:
def csv_writer(data, path):
    """Write the rows in `data` to a CSV file at `path`.

    The file is opened in write mode as UTF-8, so existing content is
    replaced. Fields are comma-separated and every field is wrapped in
    double quotes.
    """
    with open(path, "w", newline='', encoding='utf-8') as csv_file:
        out = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
        out.writerows(data)

In [None]:
def build_csv(all_data_parsed):
    """Build a header row plus one row per urn and write them to a CSV file.

    The rows are collected in a list and handed to csv_writer, which writes
    them to 'output/scrap_Proquest.csv'. Nothing is returned.

    NOTE(review): the `all_data_parsed` parameter is never referenced in the
    body. The function instead reads names from the enclosing scope
    (all_urns, main_sheet, catalog_sheet, advisors_sheet, keywords_sheet,
    filenames_sheet and a number of helper functions) that are not defined
    in the visible part of this notebook -- confirm where they come from
    before running this cell.
    """
    csv_data = []
    # first row is all fieldnames, following rows are each uid's info
    
    # Column order here must stay in step with the positional row that is
    # appended for each urn further down.
    csvfieldnames = ["title",
                     "fulltext_url",
                     'keywords',
                     'abstract',
                     "author1_fname",
                     'author1_mname',
                     'author1_lname',
                     'author1_suffix',
                     'author1_email',
                     'author1_institution',
                     'advisor1',
                     'advisor2',
                     'advisor3',
                     "advisor1_email",
                     'advisor1_title',
                     "advisor2_email",
                     'advisor2_title',
                     "advisor3_email",
                     'advisor3_title',
                     'advisor4',
                     "advisor4_email",
                     'advisor4_title',
                     'advisor5',
                     "advisor5_email",
                     'advisor5_title',
                     'advisor6',
                     "advisor6_email",
                     'advisor6_title',
                     'advisor7',
                     "advisor7_email",
                     'advisor7_title',
                     'availability',
                     'availability_description',
                     'disciplines',
                     'defense_date',
                     'degree_name',
                     'department',
                     "document_type",
                     'file_name',
                     'file_size',
                     'hide_author_email',
                     'legacy_department',
                     'publication_date',
                     'season',
                     'submission_date',
                     'urn',
                    ]
    csv_data.append(csvfieldnames)

    for urn in all_urns:
        csv_title = combine_title(catalog_sheet, main_sheet, urn)
        # NOTE(review): csv_urn is assigned but never used; the row uses `urn` directly.
        csv_urn = urn
        fulltext_url = parse_dropbox_url(urn)
#             fulltext_url = gdrive_dict[urn]
        csv_first_name = main_sheet[urn].first_name
        csv_middle_name = main_sheet[urn].middle_name
        csv_last_name = main_sheet[urn].last_name
        csv_suffix = main_sheet[urn].suffix
        # presumably maps a null suffix to an empty value -- helper not visible here, verify
        csv_suffix = replace_null_with_nothing(csv_suffix)
        csv_author_email = lookup_email(urn)
        csv_hide_author_email = lookup_hide_email(urn)
        # Indices 0 through 6 are read below, so this must yield at least seven
        # entries -- presumably padded with blanks; TODO confirm in organize_advisors.
        sorted_advisors = organize_advisors(advisors_sheet, urn)
        csv_document_type = main_sheet[urn].dtype.lower()
        csv_degree = expand_degree_type(main_sheet[urn].degree)
        csv_legacy_department = main_sheet[urn].department
        csv_department = lookup_current_dept(main_sheet[urn].department)
        # disciplines is always written as an empty string
        csv_disciplines = ''
        csv_keywords = concatinate_keywords(keywords_sheet, urn)
        csv_abstract = main_sheet[urn].abstract
        csv_publication_date = find_pub_date(catalog_sheet, urn)
        csv_defense_date = find_defense_date(main_sheet, urn)
        csv_submission_date = find_submission_date(main_sheet, urn)
        csv_availability = main_sheet[urn].availability
        csv_availability_desc = main_sheet[urn].availability_description
        # find_thesis is called twice with the same arguments, once per attribute
        csv_filename = find_thesis(filenames_sheet, urn).filename
        csv_filesize = find_thesis(filenames_sheet, urn).size
        # season is always written as an empty string
        csv_season = ''

        # NOTE(review): `filename` is computed here but never used afterwards;
        # the row below takes its file name from csv_filename instead.
        if urn in filenames_sheet:
            filename = filenames_sheet[urn][0].filename
        else:
            filename = ''
        # One positional value per entry in csvfieldnames, in the same order.
        csv_data.append([csv_title,
                         fulltext_url,
                         csv_keywords,
                         csv_abstract,
                         csv_first_name,
                         csv_middle_name,
                         csv_last_name,
                         csv_suffix,
                         csv_author_email,
                         # author1_institution is the same fixed string for every row
                         'Louisiana State University and Agricultural and Mechanical College',
                         # names of advisors 1-3 first, then their email/title pairs,
                         # mirroring the header order
                         sorted_advisors[0].advisor_name,
                         sorted_advisors[1].advisor_name,
                         sorted_advisors[2].advisor_name,
                         sorted_advisors[0].advisor_email,
                         sorted_advisors[0].advisor_title,
                         sorted_advisors[1].advisor_email,
                         sorted_advisors[1].advisor_title,
                         sorted_advisors[2].advisor_email,
                         sorted_advisors[2].advisor_title,
                         # advisors 4-7 are grouped as name, email, title
                         sorted_advisors[3].advisor_name,
                         sorted_advisors[3].advisor_email,
                         sorted_advisors[3].advisor_title,
                         sorted_advisors[4].advisor_name,
                         sorted_advisors[4].advisor_email,
                         sorted_advisors[4].advisor_title,
                         sorted_advisors[5].advisor_name,
                         sorted_advisors[5].advisor_email,
                         sorted_advisors[5].advisor_title,
                         sorted_advisors[6].advisor_name,
                         sorted_advisors[6].advisor_email,
                         sorted_advisors[6].advisor_title,
                         csv_availability,
                         csv_availability_desc,
                         csv_disciplines,
                         csv_defense_date,
                         csv_degree, 
                         csv_department,
                         csv_document_type,
                         csv_filename,
                         csv_filesize,
                         csv_hide_author_email,
                         csv_legacy_department,
                         csv_publication_date,
                         csv_season,
                         csv_submission_date,
                         urn,
                         ])
#     print(csv_data)
    # write the header and all collected rows in one go
    csv_writer(csv_data, 'output/scrap_Proquest.csv')