In [1]:
#! /usr/bin/env python3

import os
import csv
from collections import namedtuple
from datetime import datetime
import urllib.request

import openpyxl


## Scripts for scrutinizing the datasets

In [2]:
def list_all_sheets(workbook):
    """Print the names of every worksheet in ``workbook``.

    Non-essential sanity check.

    Args:
        workbook: an object exposing ``get_sheet_names()`` (the workbook
            objects returned by ``read_workbook`` are used this way
            elsewhere in the notebook).

    Returns:
        None. The list of sheet names is printed.
    """
    # Inspect the workbook that was passed in rather than the module-level
    # `available_wb`, so the check works for any workbook and does not
    # depend on a global having been defined by a later cell.
    sheets = list(workbook.get_sheet_names())
    print(sheets)

In [3]:
def show_all_info(urn):
    """Print the entry stored under ``urn`` in each of the parsed sheets.

    Non-essential inspection helper. It reads the module-level
    ``main_sheet``, ``filenames_sheet``, ``keywords_sheet``,
    ``advisors_sheet`` and ``catalog_sheet`` mappings, so those must
    already exist; a urn missing from any of them raises ``KeyError``.
    """
    # Each lookup is wrapped in a lambda so a sheet is only touched right
    # before its own line is printed, one sheet after another.
    lookups = (
        ("Main example:", lambda: main_sheet[urn]),
        ("Filenames example:", lambda: filenames_sheet[urn]),
        ("Keywords example:", lambda: keywords_sheet[urn]),
        ("Advisors example:", lambda: advisors_sheet[urn]),
        ("Catalog example:", lambda: catalog_sheet[urn]),
    )
    for label, fetch in lookups:
        print(label, fetch(), "\n")

In [4]:
def show_combinations_of_advisors(advisors_sheet):
    """Report which advisor titles occur and in which per-urn combinations.

    Args:
        advisors_sheet: mapping of urn -> list of rows exposing ``urn`` and
            ``advisor_title`` attributes.

    Prints the set of distinct titles, then every distinct count tuple.

    Returns:
        A set of 4-tuples ``(chairs, co_chairs, members, dean_reps)``, one
        per distinct combination of title counts found for a urn. Titles
        other than those four are not counted.
    """
    # Rows are grouped by the urn recorded on the row itself.
    titles_by_urn = {}
    for advisor_rows in advisors_sheet.values():
        for advisor in advisor_rows:
            titles_by_urn.setdefault(advisor.urn, []).append(advisor.advisor_title)

    distinct_titles = {
        title
        for titles in titles_by_urn.values()
        for title in titles
    }
    print(distinct_titles)

    counted_titles = (
        'Committee Chair',
        'Committee Co-Chair',
        'Committee Member',
        "Dean's Representative",
    )
    advisors_permutations = {
        tuple(titles.count(wanted) for wanted in counted_titles)
        for titles in titles_by_urn.values()
    }
    for combination in advisors_permutations:
        print(combination)
    return advisors_permutations

In [5]:
def find_mismatching_files(filenames_sheet):
    """Flag urns whose file rows disagree about ``availability``.

    Args:
        filenames_sheet: mapping of urn -> list of rows exposing ``urn`` and
            ``availability`` attributes.

    Prints one line for every row whose availability differs from the first
    availability seen for that row's urn.

    Returns:
        dict mapping each urn to the first availability value seen for it.
    """
    first_availability = {}
    for file_rows in filenames_sheet.values():
        for entry in file_rows:
            if entry.urn not in first_availability:
                first_availability[entry.urn] = entry.availability
                continue
            if first_availability[entry.urn] != entry.availability:
                print('there should be one {}'.format(entry.urn))
    return first_availability

In [6]:
def find_misnamed_extensions(filenames_sheet):
    """List files whose name does not end in a recognised extension shape.

    A filename is accepted when its fourth-from-last character is a dot
    (a three-character extension) or when its last four characters are
    ``docx`` or ``r.gz``. Everything else is reported, including names
    shorter than four characters and empty/``None`` names, which cannot
    satisfy either rule.

    Args:
        filenames_sheet: mapping of urn -> list of rows exposing a
            ``filename`` attribute.

    Returns:
        list of ``(urn, filename)`` tuples, in iteration order. Each one is
        also printed.
    """
    accepted_tails = ("docx", "r.gz")
    misnamed_urn_filename = []
    for urn, filenames_namedtuple_list in filenames_sheet.items():
        for item in filenames_namedtuple_list:
            name = item.filename or ''
            # The length guard prevents name[-4] from raising IndexError on
            # very short names; such names are reported as misnamed.
            if len(name) >= 4 and (name[-4] == "." or name[-4:] in accepted_tails):
                continue
            misnamed_urn_filename.append((urn, item.filename))
            print(urn, item.filename)
    return misnamed_urn_filename

In [7]:
def find_legacy_school_names(main_sheet):
    """Group urns by the ``department`` value recorded in the main sheet.

    Args:
        main_sheet: mapping of urn -> row exposing a ``department`` attribute.

    Prints every distinct department value once.

    Returns:
        dict mapping each department value to the list of urns carrying it.
    """
    urns_by_department = {}
    for urn, record in main_sheet.items():
        urns_by_department.setdefault(record.department, []).append(urn)
    for department in urns_by_department:
        print(department)
    return urns_by_department

In [8]:
def find_page_by_page_pdfs(filenames_sheet):
    """Find urns whose document appears to be split into chapter files.

    A urn is reported when it has more than one file and at least one of its
    filenames contains ``chap`` (case-insensitive).

    Args:
        filenames_sheet: mapping of urn -> list of rows exposing ``urn`` and
            ``filename`` attributes.

    Returns:
        list of ``(urn, [filenames])`` tuples; each is also printed.
    """
    names_by_urn = {}
    for file_rows in filenames_sheet.values():
        for entry in file_rows:
            names_by_urn.setdefault(entry.urn, []).append(entry.filename)

    page_by_page_pdfs = []
    for urn, names in names_by_urn.items():
        # A list (not a generator) is built so every filename is inspected,
        # with no short-circuiting after the first match.
        mentions_chapter = any(["chap" in name.lower() for name in names])
        if mentions_chapter and len(names) > 1:
            print(urn, '\n', names, '\n')
            page_by_page_pdfs.append((urn, names))
    return page_by_page_pdfs

In [9]:
def is_catalog_superset_of_database(catalog_sheet, main_sheet):
    """Collect the catalog keys that have no counterpart in the main sheet.

    Args:
        catalog_sheet: iterable of catalog keys (a dict keyed by urn works).
        main_sheet: container supporting ``in`` for the same keys.

    Prints how many such keys were found.

    Returns:
        list of the catalog keys absent from ``main_sheet``, in iteration
        order.
    """
    outside_uris = [uri for uri in catalog_sheet if uri not in main_sheet]
    print(len(outside_uris))
    return outside_uris

## Reading & parsing source files

In [10]:
def read_workbook(workbook_name):
    """Load ``data/databasetables/prod_etd_<workbook_name>_database.xlsx``.

    Args:
        workbook_name: the middle part of the spreadsheet filename.

    Returns:
        Whatever ``openpyxl.load_workbook`` returns for that path.
    """
    workbook_path = os.path.join(
        'data/databasetables',
        'prod_etd_{}_database.xlsx'.format(workbook_name),
    )
    return openpyxl.load_workbook(workbook_path)

In [11]:
def parse_main_sheet(all_db_workbooks):
    """Merge the 'etd_main table' sheet of three workbooks into one dict.

    Args:
        all_db_workbooks: exactly three workbook objects, each providing
            ``get_sheet_by_name``.

    Returns:
        dict in the form ``{urn: NamedTuple, urn: NamedTuple}``. The first
        row of each sheet supplies the NamedTuple field names, which must
        include ``urn``. When a urn occurs more than once, the row read last
        wins.

    The header row is expected to be either
        (urn first_name middle_name last_name suffix author_email
         publish_email degree department dtype title abstract availability
         availability_description copyright_statement ddate sdate adate
         cdate rdate pid url notice notice_response timestamp
         survey_completed)
    or
        (urn first_name middle_name last_name suffix author_email
         publish_email degree department dtype title abstract availability
         availability_description copyright_statement ddate sdate adate
         cdate rdate pid url notices timestamp)
    """
    first_wb, second_wb, third_wb = all_db_workbooks
    main_dict = {}
    for workbook in (first_wb, second_wb, third_wb):
        sheet = workbook.get_sheet_by_name('etd_main table')
        for row_index, cells in enumerate(sheet.iter_rows()):
            cell_values = [cell.value for cell in cells]
            if row_index == 0:
                # The header row defines the record type for this workbook.
                MainSheet = namedtuple('MainSheet', cell_values)
                continue
            record = MainSheet(*cell_values)
            main_dict[record.urn] = record
    return main_dict

In [12]:
def parse_filename_sheet(all_db_workbooks):
    """Merge the 'filename_by_urn table' sheet of three workbooks.

    Args:
        all_db_workbooks: exactly three workbook objects, each providing
            ``get_sheet_by_name``.

    Returns:
        dict in the form::

            urn: [NamedTuple, NamedTuple, ],
            urn: [NamedTuple, ]

        The first row of each sheet supplies the NamedTuple field names.
        This function relies on ``urn``, ``filename`` and ``timestamp``
        being among them.

    When the same filename appears more than once for a urn, only the row
    with the most recent ``timestamp`` is kept, in the position of the
    first occurrence. Timestamps are only parsed when such a duplicate is
    found and must then be strings formatted ``%Y-%m-%d %H:%M:%S``.

    Raises:
        ValueError / TypeError: from ``datetime.strptime`` when the
            timestamp of a duplicated row is not a string in that format.
    """
    timestamp_format = "%Y-%m-%d %H:%M:%S"
    (available_wb, submitted_wb, withheld_wb) = all_db_workbooks
    filenames_sheet = dict()
    for wb in (available_wb, submitted_wb, withheld_wb):
        current_sheet = wb.get_sheet_by_name('filename_by_urn table')
        for num, row in enumerate(current_sheet.iter_rows()):
            values = [cell.value for cell in row]
            if num == 0:
                Filenames = namedtuple('Filenames', values)
                continue
            item = Filenames(*values)

            entries = filenames_sheet.setdefault(item.urn, [])
            # Compare filename against filename. Testing the string against
            # the list of rows can never match, so duplicates would never
            # be detected.
            duplicate_at = next(
                (pos for pos, known in enumerate(entries)
                 if known.filename == item.filename),
                None,
            )
            if duplicate_at is None:
                entries.append(item)
                continue

            previous_timestamp = datetime.strptime(entries[duplicate_at].timestamp,
                                                   timestamp_format)
            row_timestamp = datetime.strptime(item.timestamp, timestamp_format)
            if row_timestamp > previous_timestamp:
                # Replace inside the stored list itself so the newer row is
                # actually kept.
                entries[duplicate_at] = item
    return filenames_sheet

In [13]:
def parse_keyword_sheet(all_db_workbooks):
    """Merge the 'keyword_by_urn table' sheet of three workbooks.

    Args:
        all_db_workbooks: exactly three workbook objects, each providing
            ``get_sheet_by_name``.

    Returns:
        dict in the form ``{urn: [NamedTuple, NamedTuple, ...]}``. The first
        row of each sheet supplies the NamedTuple field names, expected to
        be ('keyword', 'urn', 'timestamp'). Rows are kept in the order read
        and nothing is de-duplicated.
    """
    first_wb, second_wb, third_wb = all_db_workbooks
    keywords_sheet = {}
    for workbook in (first_wb, second_wb, third_wb):
        sheet = workbook.get_sheet_by_name('keyword_by_urn table')
        for row_index, cells in enumerate(sheet.iter_rows()):
            cell_values = [cell.value for cell in cells]
            if row_index == 0:
                # The header row defines the record type for this workbook.
                Keywords = namedtuple('Keywords', cell_values)
                continue
            record = Keywords(*cell_values)
            keywords_sheet.setdefault(record.urn, []).append(record)
    return keywords_sheet

In [14]:
def parse_advisors_sheet(all_db_workbooks):
    """Merge the 'advisor_by_urn table' sheet of three workbooks.

    Args:
        all_db_workbooks: exactly three workbook objects, each providing
            ``get_sheet_by_name``.

    Returns:
        dict in the form ``{urn: [NamedTuple, NamedTuple, ...]}``. The first
        row of each sheet supplies the NamedTuple field names, expected to
        be ('urn', 'advisor_name', 'advisor_title', 'advisor_email',
        'approval', 'timestamp').

    When the same advisor_name appears more than once for a urn, only the
    row with the most recent ``timestamp`` is kept, in the position of the
    first occurrence. Timestamps are only parsed when such a duplicate is
    found and must then be strings formatted ``%Y-%m-%d %H:%M:%S``.

    Raises:
        ValueError / TypeError: from ``datetime.strptime`` when the
            timestamp of a duplicated row is not a string in that format.
    """
    timestamp_format = "%Y-%m-%d %H:%M:%S"
    advisors_sheet = dict()
    (available_wb, submitted_wb, withheld_wb) = all_db_workbooks
    for wb in (available_wb, submitted_wb, withheld_wb):
        current_sheet = wb.get_sheet_by_name('advisor_by_urn table')
        for num, row in enumerate(current_sheet.iter_rows()):
            values = [cell.value for cell in row]
            if num == 0:
                Advisors = namedtuple('Advisor', values)
                continue
            item = Advisors(*values)

            entries = advisors_sheet.setdefault(item.urn, [])
            # Compare name against name. Testing the string against the
            # list of rows can never match, so duplicates would never be
            # detected.
            duplicate_at = next(
                (pos for pos, known in enumerate(entries)
                 if known.advisor_name == item.advisor_name),
                None,
            )
            if duplicate_at is None:
                entries.append(item)
                continue

            previous_timestamp = datetime.strptime(entries[duplicate_at].timestamp,
                                                   timestamp_format)
            row_timestamp = datetime.strptime(item.timestamp, timestamp_format)
            if row_timestamp > previous_timestamp:
                # Replace inside the stored list itself so the newer row is
                # actually kept.
                entries[duplicate_at] = item
    return advisors_sheet


In [15]:
def parse_catalog_sheet(sourcepath='data/Catalogtables',
                        sourcefile='CatalogETDSelectMetadata.csv'):
    """Read the catalog CSV export into a dict keyed by urn.

    Args:
        sourcepath: directory holding the CSV file.
        sourcefile: name of the CSV file inside ``sourcepath``.

    Returns:
        dict in the form ``{urn: NamedTuple, urn: NamedTuple}``. The first
        non-blank row supplies the NamedTuple field names, which must
        include ``URL``. When a urn occurs more than once, the row read
        last wins.

    The urn is taken from the row's URL: ``os.path.split`` cuts the URL into
    (head, tail), the first half containing ``'etd-'`` is picked, and its
    final path component becomes the urn. Rows whose URL yields no urn are
    reported with a printed message and left out. Blank lines are skipped.
    """
    catalog_sheet = dict()
    # newline='' is how the csv module expects its input file to be opened.
    with open(os.path.join(sourcepath, sourcefile), encoding='utf-8', newline='') as csvfile:
        csvreader = csv.reader(csvfile, delimiter=',')
        Catalog = None
        for row in csvreader:
            if not row:
                # csv.reader yields [] for a blank line; it carries no data.
                continue
            if Catalog is None:
                Catalog = namedtuple('Catalog', row)
                continue
            item = Catalog(*row)
            candidates = [part for part in os.path.split(item.URL) if 'etd-' in part]
            # Check for "no candidate" before indexing, so a URL without an
            # 'etd-' component is reported instead of raising IndexError.
            urn = os.path.split(candidates[0])[1] if candidates else ''
            if not urn:
                print('No urn for URL:', item.URL)
            else:
                catalog_sheet[urn] = item
    return catalog_sheet

## Scraping the binaries

In [16]:
def retrieve_binary(url):
    """Open ``url`` with ``urllib.request.urlopen`` and return the full body.

    Args:
        url: anything ``urlopen`` accepts.

    Returns:
        The bytes produced by reading the response to the end.
    """
    with urllib.request.urlopen(url) as response:
        payload = response.read()
    return payload

In [17]:
def write_binary_to_file(binary, folder, filename):
    """Write ``binary`` to ``folder/filename``, creating ``folder`` if needed.

    Args:
        binary: bytes to write.
        folder: destination directory; missing directories are created.
        filename: name of the file inside ``folder``. An existing file is
            overwritten.
    """
    os.makedirs(folder, exist_ok=True)
    with open(os.path.join(folder, filename), 'bw') as handle:
        handle.write(binary)

In [18]:
# NOTE(review): this tuple is not referenced anywhere else in the visible
# notebook; presumably these are urns whose files were joined after
# upload -- confirm before relying on it.
joined_postupload = (
    'etd-06182004-122626',
    'etd-09012004-114224',
    'etd-0327102-091522',
    'etd-0707103-142120',
    'etd-0710102-054039',
    'etd-0409103-184148',
    'etd-04152004-142117',
    'etd-0830102-145811',
    'etd-0903103-141852',
)

In [19]:
def scrape_binaries(filenames_sheet, target_dir='./ETDbinaries/'):
    """Download every listed file that is not already present locally.

    Files are stored as ``<target_dir>/<urn>/<filename>``. The download URL
    is ``http://etd.lsu.edu/`` followed by the row's ``path`` with its first
    three ``/``-separated components dropped, then the filename.

    Args:
        filenames_sheet: mapping of urn -> list of rows exposing ``path``
            and ``filename`` attributes.
        target_dir: root directory for the downloaded files.

    Returns:
        list of ``(urn, filename)`` tuples for the files that could not be
        fetched or written. Each failure is printed as it happens, and the
        total number of failures is printed at the end.
    """
    didnt_grab = []
    for urn, filenames_namedtuples_list in filenames_sheet.items():
        local_dir = os.path.join(target_dir, urn)
        local_files = set(os.listdir(local_dir)) if os.path.isdir(local_dir) else set()
        for item in filenames_namedtuples_list:
            if item.filename in local_files:
                continue
            url = 'http://etd.lsu.edu/{}/{}'.format("/".join(item.path.split('/')[3:]),
                                                    item.filename)
            try:
                binary = retrieve_binary(url)
                write_binary_to_file(binary, local_dir, item.filename)
            except Exception as err:
                # Best effort: record the failure and keep going. Catching
                # Exception rather than using a bare `except` lets Ctrl-C
                # (KeyboardInterrupt) still stop a long scrape.
                didnt_grab.append((urn, item.filename))
                print(urn, item.filename, err)
    print(len(didnt_grab))
    return didnt_grab

## Building the csv

In [50]:
def csv_writer(data, path):
    """Write ``data`` to ``path`` as a UTF-8 CSV with every field quoted.

    Args:
        data: iterable of rows, each an iterable of field values.
        path: destination file; an existing file is overwritten.
    """
    with open(path, "w", newline='', encoding='utf-8') as csv_file:
        csv.writer(
            csv_file,
            delimiter=',',
            quotechar='"',
            quoting=csv.QUOTE_ALL,
        ).writerows(data)

In [117]:
def concatinate_keywords(keywords_sheet, urn):
    """Join the non-empty keywords recorded for ``urn`` with ``', '``.

    Args:
        keywords_sheet: mapping of urn -> list of rows exposing a
            ``keyword`` attribute.
        urn: key to look up.

    Returns:
        The joined keywords, or ``''`` when the urn has no entry.
    """
    if urn not in keywords_sheet:
        return ''
    non_empty = [row.keyword for row in keywords_sheet[urn] if row.keyword]
    return ', '.join(non_empty)

In [97]:
def organize_advisors(advisors_sheet, urn):
    """Return exactly seven advisor rows for ``urn``, ranked and padded.

    Advisors are ordered by title (Committee Chair, Committee Co-Chair,
    Committee Member, Dean's Representative) and alphabetically by name
    within a title. With more than seven advisors the list is cut to seven;
    if the last-ranked advisor is a Dean's Representative, that row is kept
    as the seventh entry. Shorter lists are padded with blank rows.

    Args:
        advisors_sheet: dict of urn -> list of rows exposing
            ``advisor_name`` and ``advisor_title`` attributes.
        urn: key to look up. A urn with no entry yields seven blank rows.

    Returns:
        list of seven rows; padding rows are ``Advisor`` namedtuples whose
        six fields are all ``''``.

    Raises:
        KeyError: if a row carries a title other than the four listed above.
    """
    Advisor = namedtuple('Advisor', ('urn', 'advisor_name', 'advisor_title',
                                     'advisor_email', 'approval', 'timestamp'))
    blank_Advisor = Advisor('', '', '', '', '', '')
    slots = 7
    advisors_rank = {'Committee Chair': 1, 'Committee Co-Chair': 2, 'Committee Member': 3, "Dean's Representative": 4}

    # .get() so a urn without any advisor rows produces blanks instead of a
    # KeyError. A single sort on (rank, name) orders rows the same way as
    # sorting by name and then stably by rank; `or ''` keeps a missing name
    # from breaking the comparison.
    sorted_advisors = sorted(
        advisors_sheet.get(urn, ()),
        key=lambda adv: (advisors_rank[adv.advisor_title], adv.advisor_name or ''),
    )

    if len(sorted_advisors) > slots:
        if sorted_advisors[-1].advisor_title == "Dean's Representative":
            sorted_advisors = sorted_advisors[:slots - 1] + sorted_advisors[-1:]
        else:
            sorted_advisors = sorted_advisors[:slots]
    sorted_advisors.extend([blank_Advisor] * (slots - len(sorted_advisors)))
    return sorted_advisors

In [60]:
def strip_slash_and_padding(text):
    """Trim surrounding whitespace and one trailing ``/`` from ``text``.

    Args:
        text: a string, or a falsy value (``None``, ``''``).

    Returns:
        ``None`` when ``text`` is falsy; otherwise the stripped text with at
        most one trailing slash removed, together with any whitespace that
        preceded it. Whitespace-only input returns ``''``.
    """
    if not text:
        return None
    text = text.strip()
    # endswith() is safe on the empty string, which text[-1] is not.
    if text.endswith('/'):
        text = text[:-1].strip()
    return text

In [70]:
def replace_null_with_nothing(text):
    """Remove every ``NULL`` substring from ``text``.

    Returns:
        ``''`` when ``text`` is falsy, otherwise ``text`` with each
        occurrence of ``NULL`` deleted.
    """
    return text.replace('NULL', '') if text else ''

In [62]:
def combine_title(catalog_sheet, main_sheet, urn):
    """Build the title cell for ``urn``, preferring the catalog record.

    When the urn is in ``catalog_sheet``, the catalog ``Title`` and
    ``Subtitle`` are each cleaned with ``strip_slash_and_padding``, one
    trailing ``:`` is dropped from the title, and the two are joined as
    ``"<title>:  <subtitle>"``. If only one of them has content, that one
    is returned alone; if neither has, ``''`` is returned.

    When the urn is not in the catalog, the cleaned ``title`` of the main
    sheet row is returned (``None`` when that title is empty).

    Args:
        catalog_sheet: mapping of urn -> row exposing ``Title`` and
            ``Subtitle``.
        main_sheet: mapping of urn -> row exposing ``title``.
        urn: key to look up.
    """
    if urn not in catalog_sheet:
        return strip_slash_and_padding(main_sheet[urn].title)

    entry = catalog_sheet[urn]
    # `or ''` because the cleaner returns None for an empty cell, and an
    # empty catalog title must not crash the ':' check below.
    title = strip_slash_and_padding(entry.Title) or ''
    subtitle = strip_slash_and_padding(entry.Subtitle)
    if title.endswith(':'):
        title = title[:-1]
    if title and subtitle:
        return "{}:  {}".format(title, subtitle)
    return title or subtitle or ''

In [123]:
def remove_all_brackets(text):
    """Return ``text`` with every ``< > [ ] { } ( )`` character removed."""
    for bracket in '<>[]{}()':
        text = text.replace(bracket, '')
    return text

In [131]:
def find_pub_date(catalog_sheet, urn):
    """Return the catalog publication date for ``urn`` without brackets.

    ``SeriesDate`` is preferred; ``PubDate`` is used when ``SeriesDate`` is
    empty.

    Returns:
        The chosen date passed through ``remove_all_brackets``, or ``''``
        when the urn is not in the catalog or both fields are empty.
    """
    if urn not in catalog_sheet:
        return ''
    entry = catalog_sheet[urn]
    # The second field is only read when the first one is empty.
    for field in ('SeriesDate', 'PubDate'):
        raw_date = getattr(entry, field)
        if raw_date:
            return remove_all_brackets(raw_date)
    return ''

In [133]:
def find_defense_date(main_sheet, urn):
    """Return the first four characters of the row's bracket-free ``ddate``.

    Returns ``''`` when ``ddate`` is empty.
    """
    raw_date = main_sheet[urn].ddate
    return remove_all_brackets(raw_date)[:4] if raw_date else ''

def find_submission_date(main_sheet, urn):
    """Return the first four characters of the row's bracket-free ``sdate``.

    Returns ``''`` when ``sdate`` is empty.
    """
    raw_date = main_sheet[urn].sdate
    if not raw_date:
        return ''
    cleaned = remove_all_brackets(raw_date)
    return cleaned[:4]

In [139]:
def build_csv(main_sheet, catalog_sheet, filenames_sheet, keywords_sheet, advisors_sheet,
              output_path='../../trash.csv'):
    """Assemble one CSV row per main-sheet urn and write the file.

    Args:
        main_sheet: mapping of urn -> row with the author, degree,
            department, abstract, date and availability fields read below.
        catalog_sheet: mapping of urn -> catalog row (title and
            publication date source).
        filenames_sheet: accepted for interface compatibility; no column of
            the output is derived from it.
        keywords_sheet: mapping of urn -> keyword rows.
        advisors_sheet: mapping of urn -> advisor rows.
        output_path: destination of the CSV file.

    Returns:
        None. The header row and the data rows are handed to ``csv_writer``.
    """
    advisor_slots = 7

    csvfieldnames = ["title",
                     "urn",
                     "author_fname",
                     'author_mname',
                     'author_lname',
                     'author_suffix',
                     'author Email',
                     'author_email_pub',
                     ]
    for slot in range(1, advisor_slots + 1):
        csvfieldnames.extend(['advisor{}_title'.format(slot),
                              'advisor{}_name'.format(slot),
                              'advisor{}_email'.format(slot),
                              ])
    csvfieldnames.extend(["document_type",
                          'degree_name',
                          'department',
                          'legacy_department',
                          'disciplines',
                          'keywords',
                          'abstract',
                          'publication_date',
                          'defense_date',
                          'submission_date',
                          'availability',
                          'availability_description',
                          ])

    csv_data = [csvfieldnames]
    for urn, record in main_sheet.items():
        csv_department = 'awaiting mapping legacy:current'
        csv_disciplines = 'awaiting mapping ???:disciplines'

        row = [combine_title(catalog_sheet, main_sheet, urn),
               urn,
               record.first_name,
               record.middle_name,
               record.last_name,
               replace_null_with_nothing(record.suffix),
               record.author_email,
               record.publish_email,
               ]
        # organize_advisors returns exactly seven rows (padded with blanks),
        # which fill the advisor1..advisor7 column triples in order.
        for advisor in organize_advisors(advisors_sheet, urn):
            row.extend([advisor.advisor_title,
                        advisor.advisor_name,
                        advisor.advisor_email,
                        ])
        row.extend([record.dtype,
                    record.degree,
                    # Order follows the header: 'department' first, then
                    # 'legacy_department' (the value stored in the database).
                    csv_department,
                    record.department,
                    csv_disciplines,
                    concatinate_keywords(keywords_sheet, urn),
                    record.abstract,
                    find_pub_date(catalog_sheet, urn),
                    find_defense_date(main_sheet, urn),
                    find_submission_date(main_sheet, urn),
                    record.availability,
                    # No trailing comma here: the cell must hold the text
                    # itself, not a 1-tuple wrapping it.
                    record.availability_description,
                    ])
        csv_data.append(row)
    csv_writer(csv_data, output_path)

# if __name__ == '__main__':

In [56]:
# Load the three database exports; read_workbook() turns each name into
# data/databasetables/prod_etd_<name>_database.xlsx.
available_wb = read_workbook('available')
submitted_wb = read_workbook('submitted')
withheld_wb = read_workbook('withheld')

In [57]:
# Merges the matching sheets from all 3 workbooks into one data structure per sheet type.
# Each parser unpacks exactly three workbooks from this tuple.
all_db_workbooks = (available_wb, submitted_wb, withheld_wb)

# Every result is a dict keyed by urn: one row per urn for the main sheet,
# a list of rows per urn for the other three.
main_sheet = parse_main_sheet(all_db_workbooks)
filenames_sheet = parse_filename_sheet(all_db_workbooks)
keywords_sheet = parse_keyword_sheet(all_db_workbooks)
advisors_sheet = parse_advisors_sheet(all_db_workbooks)

In [58]:
# Parse the catalog CSV export into a dict keyed by urn.
catalog_sheet = parse_catalog_sheet()

In [141]:
# Combine the parsed sheets into one CSV; build_csv writes the file through
# csv_writer and returns nothing.
build_csv(main_sheet, catalog_sheet, filenames_sheet, keywords_sheet, advisors_sheet)
