In [1]:
#! /usr/bin/env python3

import os
import shutil
import csv
from collections import namedtuple
from datetime import datetime
import urllib.request

import openpyxl


## Scripts for scrutinizing the datasets

In [2]:
def list_all_sheets(workbook):
    """Print the names of every sheet in ``workbook`` (non-essential sanity check).

    Args:
        workbook: any object exposing ``get_sheet_names()``, e.g. the workbook
            returned by ``read_workbook``.

    Returns:
        None. The list of sheet names is printed.
    """
    # Inspect the workbook that was passed in; the previous version ignored the
    # argument and read a notebook-level global instead.
    sheets = list(workbook.get_sheet_names())
    print(sheets)

In [3]:
def show_all_info(urn):
    """Non-essential helper: print every record stored under ``urn``.

    Reads the notebook-level mappings ``main_sheet``, ``filenames_sheet``,
    ``keywords_sheet``, ``advisors_sheet`` and ``catalog_sheet`` and prints the
    entry of each one that contains ``urn``, in that order.
    """
    labelled_sheets = (
        ("Main example:", main_sheet),
        ("Filenames example:", filenames_sheet),
        ("Keywords example:", keywords_sheet),
        ("Advisors example:", advisors_sheet),
        ("Catalog example:", catalog_sheet),
    )
    for label, sheet in labelled_sheets:
        if urn in sheet:
            print(label, sheet[urn], "\n")

In [4]:
def show_combinations_of_advisors(advisors_sheet):
    """Print and return the distinct committee make-ups found in ``advisors_sheet``.

    Advisor titles are grouped by each row's own ``urn`` attribute. The set of
    all titles seen is printed first. Then, per urn, a 4-tuple of counts is
    built in the order (Committee Chair, Committee Co-Chair, Committee Member,
    Dean's Representative); every distinct tuple is printed and the set of
    tuples is returned.
    """
    titles_by_urn = {}
    for advisor_rows in advisors_sheet.values():
        for row in advisor_rows:
            titles_by_urn.setdefault(row.urn, []).append(row.advisor_title)

    distinct_titles = {title
                       for titles in titles_by_urn.values()
                       for title in titles}
    print(distinct_titles)

    counted_roles = ('Committee Chair',
                     'Committee Co-Chair',
                     'Committee Member',
                     "Dean's Representative")
    advisors_permutations = set()
    for titles in titles_by_urn.values():
        advisors_permutations.add(tuple(titles.count(role) for role in counted_roles))

    for combination in advisors_permutations:
        print(combination)
    return advisors_permutations

In [5]:
def find_mismatching_files(filenames_sheet):
    """Report urns whose file rows disagree on ``availability``.

    The availability of the first row seen for each urn (taken from the row's
    own ``urn`` attribute) is remembered; any later row for that urn with a
    different availability triggers a printed message.

    Returns:
        dict mapping urn -> availability of the first row seen for it.
    """
    first_availability = {}
    for file_rows in filenames_sheet.values():
        for row in file_rows:
            if row.urn not in first_availability:
                first_availability[row.urn] = row.availability
            elif first_availability[row.urn] != row.availability:
                print('there should be one {}'.format(row.urn))
    return first_availability

In [6]:
def find_misnamed_extensions(filenames_sheet):
    """List files whose name does not end in a recognised extension shape.

    A filename passes when its fourth-from-last character is a dot (a
    three-letter extension such as ``.pdf``) or when its last four characters
    are ``docx`` or ``r.gz``. Everything else is printed and collected.

    Filenames that are missing or shorter than four characters cannot satisfy
    either rule, so they are reported as misnamed instead of raising
    IndexError/TypeError on the ``[-4]`` lookup.

    Args:
        filenames_sheet: dict mapping urn -> list of rows with a ``filename``
            attribute.

    Returns:
        list of ``(urn, filename)`` tuples for the offending files.
    """
    accepted_four_char_endings = ("docx", "r.gz")
    misnamed_urn_filename = []
    for urn, filenames_namedtuple_list in filenames_sheet.items():
        for item in filenames_namedtuple_list:
            name = item.filename
            if not name or len(name) < 4:
                looks_misnamed = True
            else:
                looks_misnamed = (name[-4] != "."
                                  and name[-4:] not in accepted_four_char_endings)
            if looks_misnamed:
                misnamed_urn_filename.append((urn, name))
                print(urn, name)
    return misnamed_urn_filename

In [7]:
def find_legacy_school_names(main_sheet):
    """Group urns by the ``department`` value of their main-sheet record.

    Every distinct department name is printed once, in first-seen order.

    Returns:
        dict mapping department -> list of urns carrying that department.
    """
    urns_by_department = {}
    for urn, record in main_sheet.items():
        urns_by_department.setdefault(record.department, []).append(urn)
    for department in urns_by_department:
        print(department)
    return urns_by_department

In [8]:
def find_page_by_page_pdfs(filenames_sheet):
    """Find urns whose upload appears to be split into several files.

    Filenames are grouped by each row's own ``urn`` attribute. A urn is
    reported (printed and collected) when it has more than one file and at
    least one filename contains "chap", case-insensitively.

    Returns:
        list of ``(urn, [filenames])`` tuples.
    """
    names_by_urn = {}
    for file_rows in filenames_sheet.values():
        for row in file_rows:
            names_by_urn.setdefault(row.urn, []).append(row.filename)

    page_by_page_pdfs = []
    for urn, names in names_by_urn.items():
        # Evaluate every filename (no short-circuit) before deciding.
        chapter_flags = ["chap" in name.lower() for name in names]
        if len(names) > 1 and any(chapter_flags):
            print(urn, '\n', names, '\n')
            page_by_page_pdfs.append((urn, names))
    return page_by_page_pdfs

In [9]:
def is_catalog_subset_of_database(catalog_sheet, main_sheet):
    """Collect the catalog keys that are absent from ``main_sheet``.

    Prints how many such keys exist and returns them as a list, in
    ``catalog_sheet`` iteration order.
    """
    outside_uris = [uri for uri in catalog_sheet if uri not in main_sheet]
    print(len(outside_uris))
    return outside_uris

In [10]:
def are_all_urns_in_main_sheet(main_sheet, catalog_sheet, filenames_sheet, keywords_sheet, advisors_sheet):
    """Print the urns known to any sheet but missing from ``main_sheet``.

    The full urn set comes from ``make_set_all_urns``. If nothing is missing,
    a confirmation message is printed instead.
    """
    every_urn = make_set_all_urns(main_sheet, catalog_sheet, filenames_sheet, keywords_sheet, advisors_sheet)
    stray_urns = every_urn - set(main_sheet)
    if stray_urns:
        print(stray_urns)
    else:
        print('all urns in mainsheet')

In [11]:
# do all files have a pdf, (or no file at all).
def check_for_no_file_objects(main_sheet, catalog_sheet, filenames_sheet, keywords_sheet, advisors_sheet):
    """Report urns lacking a pdf and urns lacking any uploaded file.

    For every urn from ``make_set_all_urns``: urns absent from
    ``filenames_sheet`` are collected; urns that have files but none whose
    name contains "pdf" (case-insensitive) are printed one per line. The
    collected list is printed at the end.
    """
    no_files = []
    for urn in make_set_all_urns(main_sheet, catalog_sheet, filenames_sheet, keywords_sheet, advisors_sheet):
        if urn not in filenames_sheet:
            no_files.append(urn)
            continue
        # Inspect every file name (no short-circuit) before deciding.
        pdf_flags = ['pdf' in row.filename.lower() for row in filenames_sheet[urn]]
        if not any(pdf_flags):
            print(urn)
    print('these have no uploaded files:', no_files)

## Reading & parsing source files

In [12]:
def read_workbook(workbook_name):
    """Open ``data/databasetables/prod_etd_<workbook_name>_database.xlsx``.

    Returns:
        whatever ``openpyxl.load_workbook`` returns for that path.
    """
    workbook_path = os.path.join(
        'data/databasetables',
        'prod_etd_{}_database.xlsx'.format(workbook_name),
    )
    return openpyxl.load_workbook(workbook_path)

In [13]:
def parse_main_sheet(all_db_workbooks):
    """Merge the 'etd_main table' sheet of the three workbooks into one dict.

    Args:
        all_db_workbooks: exactly three workbooks (available, submitted,
            withheld), processed in that order.

    Returns:
        {urn: MainSheet namedtuple}. The namedtuple fields are taken from the
        first row of each sheet; a urn seen again later overwrites the earlier
        record. Fields are expected to be either
            (urn first_name middle_name last_name suffix author_email
             publish_email degree department dtype title abstract availability
             availability_description copyright_statement ddate sdate adate
             cdate rdate pid url notice notice_response timestamp
             survey_completed)
        or
            (urn first_name middle_name last_name suffix author_email
             publish_email degree department dtype title abstract availability
             availability_description copyright_statement ddate sdate adate
             cdate rdate pid url notices timestamp)
    """
    available, submitted, withheld = all_db_workbooks
    records_by_urn = {}
    for workbook in (available, submitted, withheld):
        rows = iter(workbook.get_sheet_by_name('etd_main table').iter_rows())
        header_row = next(rows, None)
        if header_row is None:
            # An empty sheet contributes nothing.
            continue
        MainSheet = namedtuple('MainSheet', (cell.value for cell in header_row))
        for row in rows:
            record = MainSheet(*(cell.value for cell in row))
            records_by_urn[record.urn] = record
    return records_by_urn

In [14]:
def parse_filename_sheet(all_db_workbooks):
    """Merge the 'filename_by_urn table' sheet of the three workbooks.

    Args:
        all_db_workbooks: exactly three workbooks (available, submitted,
            withheld), processed in that order.

    Returns:
        {urn: [Filenames namedtuple, ...]} after being passed through
        ``sort_descending_size``. Namedtuple fields come from each sheet's
        first row; this function reads ``urn``, ``filename`` and
        ``timestamp`` (format ``%Y-%m-%d %H:%M:%S``).

    When the same filename appears more than once for a urn, only the row
    with the newest timestamp is kept. The previous version replaced the entry
    in a temporary list, so the newer row was silently dropped.
    """
    timestamp_format = "%Y-%m-%d %H:%M:%S"
    (available_wb, submitted_wb, withheld_wb) = all_db_workbooks
    filenames_sheet = dict()
    for wb in (available_wb, submitted_wb, withheld_wb):
        current_sheet = wb.get_sheet_by_name('filename_by_urn table')
        for num, row in enumerate(current_sheet.iter_rows()):
            if num == 0:
                # First row holds the column names.
                Filenames = namedtuple('Filenames', (i.value for i in row))
                continue
            item = Filenames(*(i.value for i in row))

            entries = filenames_sheet.setdefault(item.urn, [])
            for position, previous in enumerate(entries):
                if previous.filename == item.filename:
                    row_timestamp = datetime.strptime(item.timestamp, timestamp_format)
                    previous_timestamp = datetime.strptime(previous.timestamp, timestamp_format)
                    if row_timestamp > previous_timestamp:
                        # Replace inside the stored list so the update sticks.
                        entries[position] = item
                    break
            else:
                entries.append(item)
    filenames_sheet = sort_descending_size(filenames_sheet)
    return filenames_sheet

def sort_descending_size(filenames_sheet):
    """Sort every urn's file list by ``size``, largest first, and return the dict.

    The lists are sorted in place. The previous version assigned the result of
    ``sorted()`` to the loop variable, which left the stored lists unsorted.

    Args:
        filenames_sheet: dict mapping urn -> list of rows whose ``size`` can be
            converted with ``int()``.

    Returns:
        the same ``filenames_sheet`` object.
    """
    for list_of_namedtuples in filenames_sheet.values():
        list_of_namedtuples.sort(key=lambda x: int(x.size), reverse=True)
    return filenames_sheet
        

In [15]:
def parse_keyword_sheet(all_db_workbooks):
    """Merge the 'keyword_by_urn table' sheet of the three workbooks.

    Args:
        all_db_workbooks: exactly three workbooks (available, submitted,
            withheld), processed in that order.

    Returns:
        {urn: [Keywords namedtuple, ...]} in row order. Namedtuple fields come
        from each sheet's first row and are expected to be
        ('keyword', 'urn', 'timestamp').
    """
    available, submitted, withheld = all_db_workbooks
    keywords_by_urn = {}
    for workbook in (available, submitted, withheld):
        rows = iter(workbook.get_sheet_by_name('keyword_by_urn table').iter_rows())
        header_row = next(rows, None)
        if header_row is None:
            # An empty sheet contributes nothing.
            continue
        Keywords = namedtuple('Keywords', (cell.value for cell in header_row))
        for row in rows:
            record = Keywords(*(cell.value for cell in row))
            keywords_by_urn.setdefault(record.urn, []).append(record)
    return keywords_by_urn

In [16]:
def parse_advisors_sheet(all_db_workbooks):
    """Merge the 'advisor_by_urn table' sheet of the three workbooks.

    Args:
        all_db_workbooks: exactly three workbooks (available, submitted,
            withheld), processed in that order.

    Returns:
        {urn: [Advisor namedtuple, ...]}. Namedtuple fields come from each
        sheet's first row and are expected to be ('urn', 'advisor_name',
        'advisor_title', 'advisor_email', 'approval', 'timestamp').

    When the same ``advisor_name`` appears more than once for a urn, only the
    row with the newest timestamp (format ``%Y-%m-%d %H:%M:%S``) is kept. The
    previous version compared the name against the list of namedtuples, which
    never matched, so every duplicate row was appended.
    """
    timestamp_format = "%Y-%m-%d %H:%M:%S"
    advisors_sheet = dict()
    (available_wb, submitted_wb, withheld_wb) = all_db_workbooks
    for wb in (available_wb, submitted_wb, withheld_wb):
        current_sheet = wb.get_sheet_by_name('advisor_by_urn table')
        for num, row in enumerate(current_sheet.iter_rows()):
            if num == 0:
                # First row holds the column names.
                Advisors = namedtuple('Advisor', (i.value for i in row))
                continue
            item = Advisors(*(i.value for i in row))

            entries = advisors_sheet.setdefault(item.urn, [])
            for position, previous in enumerate(entries):
                if previous.advisor_name == item.advisor_name:
                    row_timestamp = datetime.strptime(item.timestamp, timestamp_format)
                    previous_timestamp = datetime.strptime(previous.timestamp, timestamp_format)
                    if row_timestamp > previous_timestamp:
                        # Replace inside the stored list so the update sticks.
                        entries[position] = item
                    break
            else:
                entries.append(item)
    return advisors_sheet


In [17]:
def parse_catalog_sheet(sourcepath='data/Catalogtables', sourcefile='ETDCatalogRecords20161108.csv'):
    """Read the pipe-delimited catalog export into a dict keyed by urn.

    Args:
        sourcepath: directory holding the export (defaults to the original
            hard-coded location).
        sourcefile: file name of the export inside ``sourcepath``.

    Returns:
        {urn: Catalog namedtuple}. Namedtuple fields come from the first row
        of the file; the ``URL`` field is required because the urn is derived
        from it.

    The urn is the last path component of whichever half of
    ``os.path.split(URL)`` contains 'etd-'. Rows whose URL yields no urn are
    reported with a printed message and skipped. Previously a URL without
    'etd-' raised IndexError before that message could be printed.
    """
    catalog_sheet = dict()
    with open(os.path.join(sourcepath, sourcefile), encoding='utf-8') as csvfile:
        csvreader = csv.reader(csvfile, delimiter='|')
        for num, row in enumerate(csvreader):
            if num == 0:
                # First row holds the column names.
                Catalog = namedtuple('Catalog', row)
                continue
            item = Catalog(*row)
            url_parts = [part for part in os.path.split(item.URL) if 'etd-' in part]
            urn = os.path.split(url_parts[0])[1] if url_parts else ''
            if not urn:
                print('No urn for URL:', item.URL)
            else:
                catalog_sheet[urn] = item
    return catalog_sheet

## Scraping the binaries

In [18]:
def retrieve_binary(url):
    """Open ``url`` with urllib and return the complete response body as bytes."""
    response = urllib.request.urlopen(url)
    with response:
        payload = response.read()
    return payload

In [19]:
def write_binary_to_file(binary, folder, filename):
    """Write ``binary`` to ``folder/filename``, creating ``folder`` when absent.

    An existing file at that path is overwritten.
    """
    os.makedirs(folder, exist_ok=True)
    with open(os.path.join(folder, filename), 'bw') as out_file:
        out_file.write(binary)

In [20]:
# Hand-maintained tuple of urns.
# NOTE(review): the name suggests items whose files were joined after upload,
# but nothing in the visible cells reads this tuple -- confirm its purpose.
joined_postupload = (
    'etd-06182004-122626',
    'etd-09012004-114224',
    'etd-0327102-091522',
    'etd-0707103-142120',
    'etd-0710102-054039',
    'etd-0409103-184148',
    'etd-04152004-142117',
    'etd-0830102-145811',
    'etd-0903103-141852',
)

In [21]:
def scrape_binaries(filenames_sheet, target_dir='./ETDbinaries/'):
    """Download every listed file that is not already present locally.

    Each file is saved as ``<target_dir>/<urn>/<filename>``. The download URL
    is ``http://etd.lsu.edu/`` followed by the components of ``item.path``
    from index 3 onward (after splitting on '/') and the filename.

    Args:
        filenames_sheet: dict mapping urn -> list of rows with ``path``,
            ``filename`` and ``availability`` attributes.
        target_dir: root folder for the downloads (defaults to the original
            hard-coded location).

    Returns:
        list of ``(urn, availability, filename)`` for every file that could
        not be fetched or written. Its length is also printed.

    Download/write failures are recorded rather than raised. Only
    ``Exception`` subclasses are caught, so Ctrl-C (KeyboardInterrupt) and
    SystemExit still stop a long scrape; the previous bare ``except:``
    swallowed them.
    """
    didnt_grab = []
    for urn, filenames_namedtuples_list in filenames_sheet.items():
        local_dir = os.path.join(target_dir, urn)
        local_files = os.listdir(local_dir) if os.path.isdir(local_dir) else []
        for item in filenames_namedtuples_list:
            if item.filename in local_files:
                # Already downloaded on an earlier run.
                continue
            url = 'http://etd.lsu.edu/{}/{}'.format("/".join(item.path.split('/')[3:]),
                                                    item.filename)
            try:
                binary = retrieve_binary(url)
                write_binary_to_file(binary, local_dir, item.filename)
            except Exception:
                didnt_grab.append((urn, item.availability, item.filename))
    print(len(didnt_grab))
    return didnt_grab

In [22]:
# didnt_grab = scrape_binaries(filenames_sheet)

In [23]:
def what_files_arent_in_ETD_dump():
    """Return the ``didnt_grab`` entries whose filename is absent from the dump.

    Collects every file name found under '/media/francis/ETD/withheld/' and
    keeps the ``(urn, availability, filename)`` tuples from the notebook-level
    ``didnt_grab`` whose filename is not among them.

    Returns:
        set of ``(urn, availability, filename)`` tuples.
    """
    withheld_files = set()
    for _root, _dirs, names in os.walk('/media/francis/ETD/withheld/'):
        withheld_files.update(names)
    return {(urn, availability, filename)
            for urn, availability, filename in didnt_grab
            if filename not in withheld_files}

In [24]:
# missing_files = what_files_arent_in_ETD_dump()

In [25]:
# print(didnt_grab[:10])

In [26]:
# missing_list = sorted([*missing_files], key=lambda x:x[0])
# for i in missing_list[:10]:
#     print(i)

In [27]:
# print(missing_files)

In [28]:
# ETD_source = [os.path.join(root, file) 
#               for root, dirs, files in os.walk('/media/francis/ETD/')
#               for file in files]

In [29]:
# for urn, availability, sought_filename in didnt_grab:
#     for file in ETD_source:
#         found_filename = os.path.split(file)[1]
#         found_urn = file.split('/')[5]
#         if sought_filename == found_filename and found_urn == urn:
#             target_file = os.path.join('./ETDbinaries/', urn, found_filename)
#             if not os.path.isfile(target_file):
# #                 print(file, target_file)
#                 shutil.copy2(file, target_file)

## Building the csv

In [30]:
def csv_writer(data, path):
    """Write the rows in ``data`` to ``path`` as UTF-8 CSV with every field quoted.

    Args:
        data: iterable of rows, each an iterable of field values.
        path: destination file; overwritten if it exists.
    """
    with open(path, "w", newline='', encoding='utf-8') as csv_file:
        csv.writer(csv_file,
                   delimiter=',',
                   quotechar='"',
                   quoting=csv.QUOTE_ALL).writerows(data)

In [31]:
def concatinate_keywords(keywords_sheet, urn):
    """Join the non-empty keywords of ``urn`` with ', '.

    Returns an empty string when ``urn`` has no entry in ``keywords_sheet``.
    """
    if urn not in keywords_sheet:
        return ''
    kept_keywords = [row.keyword for row in keywords_sheet[urn] if row.keyword]
    return ', '.join(kept_keywords)

In [32]:
def organize_advisors(advisors_sheet, urn):
    """Return exactly seven advisor records for ``urn``, ordered by role.

    Advisors are ordered Committee Chair, Committee Co-Chair, Committee
    Member, Dean's Representative, and alphabetically by name within a role.
    With more than seven advisors the list is cut to seven; if the last one is
    a Dean's Representative, that record is kept as the seventh. Shorter lists
    are padded with blank records.

    Args:
        advisors_sheet: dict mapping urn -> list of rows with ``advisor_name``
            and ``advisor_title`` attributes.
        urn: key to look up. A urn with no advisor rows yields seven blank
            records instead of raising KeyError, matching the other per-urn
            helpers that tolerate missing urns.

    Returns:
        list of seven namedtuple records.

    Raises:
        KeyError: if an advisor carries a title outside the four known roles.
    """
    Advisor = namedtuple('Advisor', ('urn', 'advisor_name', 'advisor_title',
                                     'advisor_email', 'approval', 'timestamp'))
    blank_Advisor = Advisor('', '', '', '', '', '')
    advisors_rank = {'Committee Chair': 1, 'Committee Co-Chair': 2, 'Committee Member': 3, "Dean's Representative": 4}
    slots = 7

    Advisors_nt = advisors_sheet[urn] if urn in advisors_sheet else []
    # One stable sort on (rank, name) gives the same order as sorting by name
    # and then by rank.
    sorted_advisors = sorted(Advisors_nt,
                             key=lambda x: (advisors_rank[x.advisor_title], x.advisor_name))

    if len(sorted_advisors) > slots:
        if sorted_advisors[-1].advisor_title == "Dean's Representative":
            sorted_advisors = sorted_advisors[:slots - 1] + sorted_advisors[-1:]
        else:
            sorted_advisors = sorted_advisors[:slots]
    else:
        sorted_advisors.extend([blank_Advisor] * (slots - len(sorted_advisors)))
    return sorted_advisors

In [33]:
def strip_slash_and_padding(text):
    """Trim surrounding whitespace and one trailing '/' from ``text``.

    Args:
        text: a string, or a falsy value (``None``, ``''``).

    Returns:
        The cleaned string, or ``None`` when ``text`` is falsy. Whitespace-only
        input returns ``''``; it previously raised IndexError because the
        stripped text was indexed without checking that it was non-empty.
    """
    if not text:
        return None
    text = text.strip()
    # endswith() is safe on the empty string, unlike text[-1].
    if text.endswith('/'):
        text = text[:-1]
    return text.strip()

In [34]:
def replace_null_with_nothing(text):
    """Delete every 'NULL' substring from ``text``; falsy input becomes ''."""
    return text.replace('NULL', '') if text else ''

In [35]:
def interpret_publish_email(text):
    """Map the publish-email flag to 'yes' or 'no'.

    Only the exact values 'YES' and 'NO' are honoured; anything else is
    treated as 'no'.
    """
    recognised = text in ('YES', 'NO')
    return text.lower() if recognised else 'NO'.lower()

In [36]:
def combine_title(catalog_sheet, main_sheet, urn):
    """Build the title for ``urn``, preferring the catalog record.

    If ``urn`` is in ``catalog_sheet``, its ``Title`` and ``Subtitle`` are
    cleaned with ``strip_slash_and_padding``, one trailing ':' is removed from
    the title, and the two are joined as ``"<title>:  <subtitle>"`` (two
    spaces) when both are present. Otherwise the cleaned ``title`` of the
    main-sheet record is returned.

    A blank or missing catalog title no longer crashes on ``title[-1]``: the
    subtitle alone (or ``''``) is returned in that case.

    Args:
        catalog_sheet: dict mapping urn -> record with ``Title``/``Subtitle``.
        main_sheet: dict mapping urn -> record with ``title``.
        urn: key to look up.

    Raises:
        KeyError: if ``urn`` is in neither sheet.
    """
    if urn in catalog_sheet:
        record = catalog_sheet[urn]
        # strip_slash_and_padding returns None for empty input; normalise to ''.
        title = strip_slash_and_padding(record.Title) or ''
        subtitle = strip_slash_and_padding(record.Subtitle) or ''
        if title.endswith(':'):
            title = title[:-1]
        if title and subtitle:
            csv_title = "{}:  {}".format(title, subtitle)
        else:
            csv_title = title or subtitle
    else:
        csv_title = strip_slash_and_padding(main_sheet[urn].title)
    return csv_title

In [37]:
def remove_all_brackets(text):
    """Return ``text`` without any angle, square, curly or round brackets, or full stops."""
    unwanted = ('<', '>', '[', ']', '{', '}', '(', ')', '.')
    return text.translate(str.maketrans('', '', ''.join(unwanted)))

In [38]:
def find_pub_date(catalog_sheet, urn, main_sheet=None):
    """Return the four-character publication year for ``urn``.

    Sources are tried in order: the catalog record's ``SeriesDate``, its
    ``PubDate``, then the main-sheet record's ``ddate``. The chosen value is
    passed through ``remove_all_brackets`` and cut to its first four
    characters.

    Args:
        catalog_sheet: dict mapping urn -> record with ``SeriesDate`` and
            ``PubDate``.
        urn: key to look up.
        main_sheet: optional dict mapping urn -> record with ``ddate``. When
            omitted, the notebook-level ``main_sheet`` is used if it exists,
            which keeps existing two-argument calls working.

    Returns:
        The year string, or ``''`` when no source provides a date. Previously
        a missing urn or empty ``ddate`` in the main sheet raised.
    """
    if urn in catalog_sheet:
        record = catalog_sheet[urn]
        if record.SeriesDate:
            return remove_all_brackets(record.SeriesDate)[:4]
        if record.PubDate:
            return remove_all_brackets(record.PubDate)[:4]
    if main_sheet is None:
        # Fall back to the notebook global the original implementation read.
        main_sheet = globals().get('main_sheet') or {}
    if urn in main_sheet and main_sheet[urn].ddate:
        return remove_all_brackets(main_sheet[urn].ddate)[:4]
    return ''

In [39]:
def find_defense_date(main_sheet, urn):
    """Return the first four characters of the bracket-free ``ddate`` for ``urn``.

    Returns ``''`` when the record's ``ddate`` is empty.
    """
    defended = main_sheet[urn].ddate
    return remove_all_brackets(defended)[:4] if defended else ''

def find_submission_date(main_sheet, urn):
    """Return the first four characters of the bracket-free ``sdate`` for ``urn``.

    Returns ``''`` when the record's ``sdate`` is empty.
    """
    submitted = main_sheet[urn].sdate
    return remove_all_brackets(submitted)[:4] if submitted else ''

In [40]:
def find_filename(filenames_sheet, urn):
    """Return the ``filename`` of the first file listed for ``urn``, or '' if none."""
    if urn not in filenames_sheet:
        return ''
    first_entry = filenames_sheet[urn][0]
    return first_entry.filename

def find_filesize(filenames_sheet, urn):
    """Return the ``size`` of the first file listed for ``urn``, or '' if none."""
    if urn not in filenames_sheet:
        return ''
    first_entry = filenames_sheet[urn][0]
    return first_entry.size

def find_filelocation(filenames_sheet, urn):
    """Placeholder: always returns the string 'not yet implemented'.

    Both arguments are accepted for interface compatibility but are not used
    yet. The previous body branched on ``urn in filenames_sheet`` and returned
    the same value from both arms, so the conditional has been removed.
    """
    return 'not yet implemented'

In [41]:
def make_set_all_urns(main_sheet, catalog_sheet, filenames_sheet, keywords_sheet, advisors_sheet):
    """Return the union of the keys of all five sheets, minus known junk.

    The test item 'etd-0807101-102716' and the ``None`` key are dropped.
    ``discard`` is used so the function also works when either of them is
    absent; ``remove`` raised KeyError in that case.

    Returns:
        set of urns.
    """
    all_urns = set()
    for sheet in (main_sheet, catalog_sheet, filenames_sheet, keywords_sheet, advisors_sheet):
        all_urns.update(sheet)
    all_urns.discard('etd-0807101-102716')  # a test item
    all_urns.discard(None)  # rows that had no urn value
    return all_urns

In [42]:
# Department names passed to lookup_current_dept() that had no entry in the
# legacy-to-current mapping; that function adds to this set as a side effect.
overlooked_degrees = set()

def read_legacy_dept_map(sourcepath='data/LegacyNames.csv'):
    """Read the tab-delimited legacy-department file into a lookup dict.

    Column 0 is the current department name and column 1 the legacy name; both
    are stripped of surrounding whitespace.

    Args:
        sourcepath: path of the file (defaults to the original hard-coded
            location).

    Returns:
        {legacy_name: current_name}. Rows with an empty current name are
        ignored, the legacy value 'New' is never stored, and a legacy name
        that is already mapped keeps its first mapping while a message is
        printed.

    Blank lines and rows with fewer than two columns are skipped; they used to
    raise IndexError.

    NOTE(review): the first row is processed like any other row, as in the
    previous version. If the file starts with a header line, that line ends up
    as one extra mapping -- confirm whether it should be skipped.
    """
    legacy_current = dict()
    with open(sourcepath, encoding='utf-8') as csvfile:
        for row in csv.reader(csvfile, delimiter='\t'):
            if len(row) < 2:
                continue
            current, legacy = row[0].strip(), row[1].strip()
            if not current:
                continue
            if legacy in legacy_current:
                print(legacy, 'has two mappings')
            elif legacy not in ('New', ):
                legacy_current[legacy] = current
    return legacy_current

def lookup_current_dept(legacy_dept):
    """Translate a legacy department name into its current name.

    The mapping from ``read_legacy_dept_map`` is loaded on the first call and
    cached on this function object, so the file is no longer re-read for every
    lookup. Re-running the cell that defines this function resets the cache.

    Args:
        legacy_dept: department name as found in the source data.

    Returns:
        The current name when a mapping exists; otherwise ``legacy_dept``
        unchanged, which is also added to the module-level
        ``overlooked_degrees`` set.
    """
    legacy_current = getattr(lookup_current_dept, '_legacy_current', None)
    if legacy_current is None:
        legacy_current = read_legacy_dept_map()
        lookup_current_dept._legacy_current = legacy_current
    if legacy_dept in legacy_current:
        return legacy_current[legacy_dept]
    overlooked_degrees.add(legacy_dept)
    return legacy_dept

In [43]:
# Degree abbreviations that expand_degree_type() could not expand. Each
# unmatched abbreviation is registered as a key whose initial value is an
# empty list.
non_matching_degree_abbrevs = dict()

def make_dict_of_degree_nicknames():
    """Build a lookup from degree abbreviation to the full degree entry.

    The embedded text is split on ';'. Entries without '(' are skipped. For the
    rest, the key is the text after the first '(' with every ')' removed, and
    the value is the whole entry exactly as it appears between semicolons
    (including its leading whitespace and the parenthesised abbreviation).
    When an abbreviation occurs more than once, the last entry wins.

    NOTE(review): the literal contains embedded line breaks that fall in the
    middle of some degree names; confirm whether they are intentional.
    """
    raw_text = """Candidate in Philosophy; Doctor of Arts (DA); Doctor of Audiology (AuD); Doctor of Business Administration (DBA); Doctor of Dental Medicine (DMD); Doctor of Education (Ded); Doctor of Education (EdD); Doctor of Engineering (DEng); Doctor of Health and Safety (HSD); Doctor of Management (DMgt); Doctor of Ministry (DMin); Doctor of Music (DM); Doctor of Music Education (DME); Doctor of Musical Arts (DMA); Doctor of Nursing (ND); Doctor of Nursing Science (DNS); Doctor of Pharmacy (PharmD); Doctor of Philosophy (Medical Science); Doctor of Philosophy (PhD); Doctor of Philosophy in Health Services Research (HSOP); Doctor of Physical Education (PED); Doctor of Physical Therapy (DPT); Doctor of Planning and Development Studies (DPDS); Doctor of Psychology (PsyD); Doctor of Public Administration (DPA); Doctor of Public Health (DrPH); Doctor of Recreation (ReD); Doctor of Rehabilitation (RhD); Doctor of Social Work (DSW); Doctor of Veterinary Medicine (DVM); Educat
 ion Specialist (EdS); International Master of Environmental Sciences (IMES); Juris Doctorate (JD); Master in Advanced Studies (MAS); Master in Agricultural Management (MAM); Master in Management of Technology (MMT); Master in Taxation (MT); Master of Accounting (MAcc); Master of Aeronautical Engineering (MAeroE); Master of Agricultural Education (MAgEd); Master of Agriculture (MAgr); Master of Architectural Engineering (MAE); Master of Architecture (MArch); Master of Architecture (MArch)/Master of Business Administration (MBA); Master of Architecture (MArch)/Master of Fine Arts (MFA); Master of Architecture (MArch)/Master of Urban Planning (MUP); Master of Arts (MA); Master of Arts in Counseling (MAC); Master of Arts in Education (MAE); Master of Arts in Interdisciplinary Studies (MAIS); Master of Arts in Pastoral Counseling (MAPC); Master of Arts in Religion (MAR); Master of Arts in Teaching (MAT); Master of Arts in the Teaching of English (MATE); Master of Biological Scien
 ce (MBioSci); Master of Building Science (MBS); Master of Business Administration (MBA); Master of Business Administration/Master of Science in Information Systems; Master of Business Taxation (MBT); Master of Career and Technology Education (MCTE); Master of Chemical Engineering (MChE); Master of City and Regional Planning (MCRP); Master of Civil Engineering (MCE); Master of Community Planning; Master of Computer and Information Science (MCIS); Master of Computer Engineering (MCompE); Master of Construction Management (MCM); Master of Construction Science and Management (MCSM); Master of Criminal Justice (MCJ); Master of Divinity (M.Div); Master of Education (MEd); Master of Electrical Engineering (MEE); Master of Electronic Commerce (MECom); Master of Engineering (ME); Master of Engineering (MEngr); Master of Environmental Engineering (MEnvE); Master of Environmental Studies (MES); Master of Fine Arts (MFA); Master of Forest Resources (MFR); Master of Forestry (MF); Master
  of General Studies (MGS); Master of Geographic Information Science (MGIS); Master of Geomechanics Engineering (MGeoE); Master of Health Administration (MHA); Master of Historic Preservation (MHP); Master of Historical Administration and Museum Studies (MHAMS); Master of Human Development (MHD); Master of Human Resource Development (MHRD); Master of Interdisciplinary Studies (MIDS); Master of International Management (MIM); Master of Judicial Studies (MJS); Master of Landscape Architecture (MLA); Master of Library Science/Master of Life Sciences (MLS); Master of Management (MM); Master of Materials Science and Engineering (MMatSE); Master of Music (MM); Master of Music (MMUS); Master of Music Education (MME); Master of Parks, Recreation and Tourism Management (MPRTM); Master of Physical Therapy (MPT); Master of Planning (MPlan); Master of Professional Accounting (MPAcc); Master of Public Administration (MPA); Master of Public Administration/Juris Doctorate (MPA/JD); Master o
 f Public Health (MPH); Master of Public Management (MPM); Master of Public Policy (MPP); Master of Real Estate Development (MRED); Master of Regional Planning (MRP); Master of School Administration (MSA); Master of Science (MS); Master of Science and Software Engineering (MSSE); Master of Science in Acountancy (MSA); Master of Science in Administration (MSA); Master of Science in Aerospace Engineering (MSAeroE); Master of Science in Agricultural Engineering (MSAgE); Master of Science in Biomedical Engineering; Master of Science in Biosystems and Agricultural Engineering (MSBiosyAgE); Master of Science in Chemical Engineering (MSChE); Master of Science in Civil Engineering (MSCE); Master of Science in Community and Regional Planning (MSCRP); Master of Science in Economics (MSECO); Master of Science in Education (MSEd); Master of Science in Electrical and Computer Engineering (MSECE); Master of Science in Electrical Engineering (MSEE); Master of Science in Engineering (MSE); M
 aster of Science in Engineering Management (MSEM); Master of Science in Environmental Technology Management (MSETM); Master of Science in Hospitality and Tourism Management (MSHTM); Master of Science in Human Resources Management (MSHRM); Master of Science in Industrial Engineering (MSIE); Master of Science in Industrial Engineering and Operations Research (MSIEOR); Master of Science in Information Systems (MSIS); Master of Science in Infrastructure Systems Engineering (MSISE); Master of Science in Interdisciplinary Studies (MSIS); Master of Science in International Business (MSIB); Master of Science in Jurisprudence (MSJ); Master of Science in Management (MSM); Master of Science in Manufacturing Engineering (MSMANFE); Master of Science in Material Science Engineering (MSMatSE); Master of Science in Materials Science and Engineering (MSMSE); Master of Science in Mechanical Engineering (MSME); Master of Science in Medical Sciences (MSMS); Master of Science in Nursing (MSN); M
 aster of Science in Petroleum Engineering (MSPE); Master of Science in Planning (MSP); Master of Science in Public Health (MSPH); Master of Social Welfare (MSW); Master of Social Work (MSW); Master of Theological Studies (MTS); Master of Theology (Th.M); Master of Urban and Regional Planning (MURP); Master of Urban Planning (MUP); Masters of Health Informatics (MHI); Masters of Science in Bioscience (MSB); Master's of Science in Teaching (MST); Medical Doctor (MD); Medical Surgeon in Experimental Surgery (MSExpSurg); MS Otolaryngology (MSOtol); PhD Otolaryngology (PhDOtol); PhD Surgergy (PhDSurg); Professional Master of Business Administration (PMBA)"""
    return {entry.split('(')[1].replace(')', ''): entry
            for entry in raw_text.split(';')
            if '(' in entry}

def expand_degree_type(degree_name):
    """Look up the full degree entry for the abbreviation ``degree_name``.

    Uses the dict from ``make_dict_of_degree_nicknames``. When the
    abbreviation is unknown it is registered in the module-level
    ``non_matching_degree_abbrevs`` (with an empty list, unless already
    present) and ``None`` is returned.
    """
    full_names = make_dict_of_degree_nicknames()
    if degree_name not in full_names:
        non_matching_degree_abbrevs.setdefault(degree_name, [])
        return None
    return full_names[degree_name]

In [44]:
def split_email(text):
    """Pick one address out of a comma-separated list of e-mail addresses.

    The first address that does not contain 'lsu.edu' is preferred; if every
    address contains it, the first address is used. The result is stripped of
    surrounding whitespace. Falsy input yields ''.
    """
    if not text:
        return ''
    candidates = text.split(',')
    chosen = next((address for address in candidates if 'lsu.edu' not in address),
                  candidates[0])
    return chosen.strip()
    

In [45]:
# current_department & department mapped -- we need department & legacy department.

def build_csv(main_sheet, catalog_sheet, filenames_sheet, keywords_sheet, advisors_sheet,
              output_path='../../scrap.csv'):
    """Build one CSV row per URN from the parsed sheets and write the result.

    The header row and one data row per URN are collected into a list of
    lists, which is handed to ``csv_writer`` together with ``output_path``.

    Args:
        main_sheet: Mapping of URN -> record exposing ``first_name``,
            ``middle_name``, ``last_name``, ``suffix``, ``author_email``,
            ``publish_email``, ``dtype``, ``degree``, ``department``,
            ``abstract``, ``availability`` and ``availability_description``.
        catalog_sheet: Passed to ``combine_title`` and ``find_pub_date``.
        filenames_sheet: Passed to ``find_filename`` and ``find_filesize``.
        keywords_sheet: Passed to ``concatinate_keywords``.
        advisors_sheet: Passed to ``organize_advisors``.
        output_path: Destination handed to ``csv_writer``.  Defaults to the
            previously hard-coded '../../scrap.csv'.

    Note:
        Every URN returned by ``make_set_all_urns`` is looked up in
        ``main_sheet``, and the result of ``organize_advisors`` is indexed
        at positions 0-6, so it must supply at least seven entries, each
        with ``advisor_title``, ``advisor_name`` and ``advisor_email``.
    """
    advisor_slots = 7  # the output layout has columns advisor1 .. advisor7

    csvfieldnames = ["title",
                     "fulltext_url",
                     "author1_fname",
                     'author1_mname',
                     'author1_lname',
                     'author1_suffix',
                     'author1_email',
                     'author1_email_pub',
                     'author1_institution',
                     ]
    for slot in range(1, advisor_slots + 1):
        csvfieldnames.extend(['advisor{}_title'.format(slot),
                              'advisor{}'.format(slot),
                              'advisor{}_email'.format(slot),
                              ])
    csvfieldnames.extend(["document_type",
                          'degree_name',
                          'legacy_department',
                          'department',
                          'disciplines',
                          'keywords',
                          'abstract',
                          'publication_date',
                          'defense_date',
                          'submission_date',
                          'availability',
                          'availability_description',
                          'urn',
                          'file_name',
                          'file_size',
                          'season',
                          ])
    csv_data = [csvfieldnames]

    all_urns = make_set_all_urns(main_sheet, catalog_sheet, filenames_sheet, keywords_sheet, advisors_sheet)
    # NOTE(review): all_urns is presumably a set (going by the helper's name),
    # whose iteration order for strings differs between interpreter runs.
    # Sorting makes the row order of the output file reproducible.
    for urn in sorted(all_urns, key=str):
        csv_title = combine_title(catalog_sheet, main_sheet, urn)
        record = main_sheet[urn]
        sorted_advisors = organize_advisors(advisors_sheet, urn)

        row = [csv_title,
               '',  # fulltext_url is left blank
               record.first_name,
               record.middle_name,
               record.last_name,
               replace_null_with_nothing(record.suffix),
               split_email(record.author_email),
               interpret_publish_email(record.publish_email),
               'Louisiana State University and Agricultural and Mechanical College',
               ]
        for slot in range(advisor_slots):
            # Index explicitly (rather than slicing) so that a short advisor
            # list still fails loudly instead of shifting later columns.
            advisor = sorted_advisors[slot]
            row.extend([advisor.advisor_title,
                        advisor.advisor_name,
                        advisor.advisor_email,
                        ])
        row.extend([record.dtype.lower(),
                    expand_degree_type(record.degree),
                    record.department,                       # legacy_department
                    lookup_current_dept(record.department),  # department
                    "not yet implemented",                   # disciplines
                    concatinate_keywords(keywords_sheet, urn),
                    record.abstract,
                    find_pub_date(catalog_sheet, urn),
                    find_defense_date(main_sheet, urn),
                    find_submission_date(main_sheet, urn),
                    record.availability,
                    record.availability_description,
                    urn,
                    find_filename(filenames_sheet, urn),
                    find_filesize(filenames_sheet, urn),
                    '',  # season is left blank
                    ])
        csv_data.append(row)

    csv_writer(csv_data, output_path)

# if __name__ == '__main__':

In [46]:
# Load the three source workbooks through read_workbook(), selected by the
# names 'available', 'submitted' and 'withheld'.
# These are module-level names: list_all_sheets() near the top of the notebook
# reads available_wb directly instead of using its own parameter.
available_wb = read_workbook('available')
submitted_wb = read_workbook('submitted')
withheld_wb = read_workbook('withheld')

In [47]:
# Merge the matching sheets from all three workbooks into one data structure
# per sheet type. Each parser receives the same tuple of workbooks.
# The resulting module-level names (main_sheet, filenames_sheet, keywords_sheet,
# advisors_sheet) are read as globals by show_all_info() and by the analysis
# cells further down.
all_db_workbooks = (available_wb, submitted_wb, withheld_wb)

main_sheet = parse_main_sheet(all_db_workbooks)
filenames_sheet = parse_filename_sheet(all_db_workbooks)
keywords_sheet = parse_keyword_sheet(all_db_workbooks)
advisors_sheet = parse_advisors_sheet(all_db_workbooks)

In [48]:
# Parse the catalog data. Unlike the other parsers, parse_catalog_sheet() is
# called without the workbooks tuple.
catalog_sheet = parse_catalog_sheet()

In [49]:
# Combine all parsed sheets into CSV rows; build_csv() hands the rows to
# csv_writer() and returns nothing.
build_csv(main_sheet, catalog_sheet, filenames_sheet, keywords_sheet, advisors_sheet)


In [50]:
# Dump overlooked_degrees, which is populated elsewhere in the notebook.
# NOTE(review): despite the name, the recorded output lists department names;
# presumably these are the departments lookup_current_dept() could not map --
# confirm against that helper.
print(overlooked_degrees)

{'Construction Management', 'Physics & Astronomy', 'Finance', 'Environmental Sciences', 'English', 'Accounting', 'Management (Business Administration)', 'Chemical Engineering', 'Animal Science (Animal, Dairy, & Poultry Sciences)', 'Landscape Architecture', 'Communication Studies', 'Mass Communication', 'Engineering Science (Interdepartmental Program)', 'Education', 'Foreign Languages & Literatures', 'Entomology', 'Finance (Business Administration)', 'Human Resource Education & Workforce Development', 'Mechanical Engineering', 'Geography & Anthropology', 'Civil & Environmental Engineering', 'Civil and Environmental Engineering', 'Information Systems & Decision Sciences', 'Geology & Geophysics', 'Communication Sciences & Disorders', 'Theatre', 'Educational Leadership, Research & Counseling', 'Plant, Enviromental & Soil Sciences', 'Forestry, Wildlife, and Fisheries', 'Biological Sciences', 'Mathematics', 'Psychology', 'Oceanography and Coastal Sciences', 'Art', 'French Studies', 'Petroleu

In [51]:
# How many URNs appear in the main sheet but have no entry in the catalog sheet?
main_sheet_urns = set(main_sheet)
catalog_sheet_urns = set(catalog_sheet)
in_main_not_catalog = main_sheet_urns.difference(catalog_sheet_urns)

print(len(in_main_not_catalog))

2193


In [52]:
# for urn, a_list in filenames_sheet.items():
#     for nt in a_list:
#         if nt.availability not in ('available', 'unrestricted'):
#             print(urn, nt.availability, nt.filename)

In [53]:
# Count main-sheet records whose catalog date, once stripped by
# remove_all_brackets(), differs from the first four characters of adate.
count = 0
for urn, pack in main_sheet.items():
    if urn not in catalog_sheet:
        continue
    catalog_entry = catalog_sheet[urn]
    # PubDate takes precedence; SeriesDate is consulted only when PubDate is empty.
    catalog_date = catalog_entry.PubDate or catalog_entry.SeriesDate
    if catalog_date and remove_all_brackets(catalog_date) != pack.adate[:4]:
        count += 1
print(count)

51


In [54]:
# Tally filenames that are attached to more than one record.
# listed_filenames holds every filename seen so far; duplicate_filenames maps a
# filename to the number of times it was seen again after its first occurrence.
listed_filenames = set()
duplicate_filenames = dict()

for urn, item_list in filenames_sheet.items():
    for item in item_list:
        fname = item.filename
        if fname not in listed_filenames:
            listed_filenames.add(fname)
        else:
            duplicate_filenames[fname] = duplicate_filenames.get(fname, 0) + 1
        # Spot check: show which URNs share this particular filename.
        if fname == "Nelson_thesis.pdf":
            print(urn)

etd-1104103-132505
etd-11152010-155245


In [55]:
# Look up the title of the second URN printed by the "Nelson_thesis.pdf" spot check.
main_sheet['etd-11152010-155245'].title

'Depolarization by Transient Receptor Potential Melastatin 4 in Pancreatic Alpha-Cells Regulates Glucagon Secretion'

In [56]:
# Number of distinct filenames that occur more than once in filenames_sheet.
len(duplicate_filenames)

471

In [57]:
# List the filenames that were repeated more than five times. The stored value
# counts repeats after the first occurrence, not total occurrences.
count = 0  # reset kept from the original cell; the loop below does not read it
for dup_name, repeats in duplicate_filenames.items():
    if repeats <= 5:
        continue
    print(dup_name, repeats)

Thesis.pdf 45
Smith_thesis.pdf 13
Kim_dis.pdf 6
Lee_dis.pdf 8
DISSERTATION.pdf 6
Li_dis.pdf 11
Johnson_thesis.pdf 6
Williams_thesis.pdf 6
Wang_thesis.pdf 10
thesis.pdf 36
Zhang_thesis.pdf 10
Li_thesis.pdf 8
Zhang_dis.pdf 12
Wang_diss.pdf 8
THESIS.pdf 13
Smith_dis.pdf 6
Jones_thesis.pdf 9
Johnson_dis.pdf 6
Dissertation.pdf 40
Wang_dis.pdf 10
dissertation.pdf 12


In [58]:
# Total number of entries in main_sheet.
print(len(main_sheet))

8398
