In [1]:
import os
import csv
from collections import namedtuple
from datetime import datetime

import openpyxl

In [2]:
# Availables!

sourcepath = 'data/databasetables'
available_file = 'prod_etd_available_database.xlsx'
available_wb = openpyxl.load_workbook(os.path.join(sourcepath, available_file))

In [3]:
# these items' files were chpt-by-chpt pdfs, joined into one pdf post upload.
# we must be careful to ingest to Digital Commons the joined files instead of the split files.

joined_postupload = ('etd-06182004-122626', 'etd-09012004-114224', 'etd-0327102-091522', 'etd-0707103-142120',
                    'etd-0710102-054039', 'etd-0409103-184148', 'etd-04152004-142117', 'etd-0830102-145811',
                    'etd-0903103-141852', )

In [4]:
# list all the sheets within the xsls workbook

available_sheets = [sheet for sheet in available_wb.get_sheet_names()]
print(available_sheets)

['prod_etd_available all tables', 'keyword_by_urn table', 'filename_by_urn table', 'etd_main table', 'advisor_by_urn table']


In [5]:
# Submitteds!

sourcepath = 'data/databasetables'
submitted_file = 'prod_etd_submitted_database.xlsx'
submitted_wb = openpyxl.load_workbook(os.path.join(sourcepath, submitted_file))

In [6]:
submitted_sheets = [sheet for sheet in submitted_wb.get_sheet_names()]
print(submitted_sheets)

['prod_etd_submitted all tables', 'keyword_by_urn table', 'filename_by_urn table', 'etd_main table', 'advisor_by_urn table']


In [7]:
# Withhelds!

sourcepath = 'data/databasetables'
withheld_file = 'prod_etd_withheld_database.xlsx'
withheld_wb = openpyxl.load_workbook(os.path.join(sourcepath, withheld_file))

In [8]:
withheld_sheets = [sheet for sheet in withheld_wb.get_sheet_names()]
print(withheld_sheets)

['prod_etd_withheld all tables', 'keyword_by_urn table', 'filename_by_urn table', 'etd_main table', 'advisor_by_urn table']


Ok, we've read the Available, Withheld, and Submitted xlsx files into memory.  Everything else is fast now.


In [9]:
def parse_main_sheet():
    """ returns a dictionary in form of:
    {urn: NamedTuple
     urn: NamedTuple
    }
    NamedTuple is expected to have attributes: (urn first_name middle_name last_name suffix author_email 
                                                publish_email degree department dtype title abstract availability
                                                availability_description copyright_statement ddate sdate adate
                                                cdate rdate pid url notice notice_response timestamp
                                                survey_completed)
                                            or: (urn first_name middle_name last_name suffix author_email
                                                publish_email degree department dtype title abstract availability
                                                availability_description copyright_statement ddate sdate adate
                                                cdate rdate pid url notices timestamp)
    """
    main_dict = dict()
    for wb in (available_wb, submitted_wb, withheld_wb):
        current_sheet = wb.get_sheet_by_name('etd_main table')
        for num, row in enumerate(current_sheet.iter_rows()):
            if num == 0:
                headers = (i.value for i in row)
                MainSheet = namedtuple('MainSheet', headers)
                continue
            values = (i.value for i in row)
            item = MainSheet(*values)
            main_dict[item.urn] = item
    return main_dict

In [10]:
def parse_filename_sheet():
    """ returns a dictionary in form of:
        urn: [NamedTuple, NamedTuple, ],
        urn: [NamedTuple, ]
    NamedTuple is expected to have attributes (path, size, available, description, page_count, timestamp)
    """
    filenames_sheet = dict()
    for wb in (available_wb, submitted_wb, withheld_wb):
        current_sheet = wb.get_sheet_by_name('filename_by_urn table')
        for num, row in enumerate(current_sheet.iter_rows()):
            if num == 0:
                headers = (i.value for i in row)
                Filenames = namedtuple('Filenames', headers)
                continue
            values = (i.value for i in row)
            item = Filenames(*values)

            if item.urn not in filenames_sheet:
                filenames_sheet[item.urn] = [item, ]
            else:
                row_timestamp = datetime.strptime(item.timestamp, "%Y-%m-%d %H:%M:%S")
                if item.filename in filenames_sheet[item.urn]:             
                    previous_filename_entry = [i for i in filenames_sheet[item.urn] if i.filename == item.filename]
                    previous_timestamp = datetime.strptime(previous_filename_entry[0].timestamp, "%Y-%m-%d %H:%M:%S")
                    if row_timestamp > previous_timestamp:
                        print('oops')
                        previous_filename_entry[0] = item
                else:
                    filenames_sheet[item.urn].append(item)
    return filenames_sheet

In [11]:
def parse_keyword_sheet():
    """ returns a dictionary in form of:
    {urn: [NamedTuple,
           NamedTuple,
           ]}
    NamedTuple is expected to have attributes ('keyword', 'urn', 'timestamp')
    """
    keywords_sheet = dict()
    for wb in (available_wb, submitted_wb, withheld_wb):
        current_sheet = wb.get_sheet_by_name('keyword_by_urn table')
        for num, row in enumerate(current_sheet.iter_rows()):
            if num == 0:
                headers = (i.value for i in row)
                Keywords = namedtuple('Keywords', headers)
                continue
            values = (i.value for i in row)
            item = Keywords(*values)
            if item.urn not in keywords_sheet:
                keywords_sheet[item.urn] = [item, ]
            else:
                keywords_sheet[item.urn].append(item)
    return keywords_sheet

In [12]:
def parse_advisors_sheet():
    """ returns a dictionary in form of:
        {urn: [NamedTuple, 
               NamedTuple, 
               ]}
        NamedTuple is expected to have attributes ('urn', 'advisor_name', 'advisor_title',
                                                   'advisor_email', 'approval', 'timestamp')
   """
    advisors_sheet = dict()
    for wb in (available_wb, submitted_wb, withheld_wb):
        current_sheet = wb.get_sheet_by_name('advisor_by_urn table')
        for num, row in enumerate(current_sheet.iter_rows()):
            if num == 0:
                headers = (i.value for i in row)
                Advisors = namedtuple('Advisor', headers)
                continue
            values = (i.value for i in row)
            item = Advisors(*values)
            if item.urn not in advisors_sheet:
                advisors_sheet[item.urn] = [item, ]
            else:
                row_timestamp = datetime.strptime(item.timestamp, "%Y-%m-%d %H:%M:%S")
                if item.advisor_name in advisors_sheet[item.urn]:
                    previous_advisor_entry = [i for i in advisors_sheet[item.urn] if i.advisor_name == item.advisor_name]
                    previous_timestamp = datetime.strptime(previous_advisor_entry[0].timestamp, "%Y-%m-%d %H:%M:%S")
                    if row_timestamp > previous_timestamp:
                        previous_advisor_entry[0] = item
                else:
                    advisors_sheet[item.urn].append(item)
    return advisors_sheet

In [13]:
def parse_catalog_sheet():
    """ returns a dictionary in form of:
        {urn: [NamedTuple, NamedTuple, ]
         urn: [NamedTuple, ]}
    """
    catalog_sheet = dict()
    sourcepath = 'data/Catalogtables'
    sourcefile = 'CatalogETDSelectMetadata.csv'
    with open(os.path.join(sourcepath, sourcefile)) as csvfile:
        csvreader = csv.reader(csvfile, delimiter=',')
        for num, row in enumerate(csvreader):
            if num == 0:
                headers = (i for i in row)
                Catalog = namedtuple('Catalog', headers)
                continue
            values = (i for i in row)
            item = Catalog(*values)
            urn = [i for i in os.path.split(item.URL) if 'etd-' in i]
            urn = urn[0]
            if not urn:
                print('No urn for URL:', item.URL)
            if urn not in catalog_sheet:
                catalog_sheet[urn] = [item, ]
            else:
                catalog_sheet[urn].append(item)
    return catalog_sheet

In [14]:
# we just made plans for how to convert the openpyxls datastructure into python datastructures.  Now let's do it.
# take note that each sheet gets a slightly different datastructure, as they describe different data.

# print an example of each, for later reference.

main_sheet = parse_main_sheet()
catalog_sheet = parse_catalog_sheet()
filenames_sheet = parse_filename_sheet()
keywords_sheet = parse_keyword_sheet()
advisors_sheet = parse_advisors_sheet()

# print("Main example:", main_sheet['etd-0821101-100809'], "\n")
# print("Catalog example:", catalog_sheet['etd-0821101-100809'], "\n")
# print("Filenames example:", filenames_sheet['etd-0821101-100809'], "\n")
# print("Keywords example:", keywords_sheet['etd-0821101-100809'], "\n")
# print("Advisors example:", advisors_sheet['etd-0821101-100809'], "\n")

In [15]:
# investigate the advisors sheet.  What are the permutations of the advisor types in all the etd records?

urn_advisortitles = dict()
for urn, advisors_nt_list in advisors_sheet.items():
    for item in advisors_nt_list:
        if item.urn in urn_advisortitles:
            urn_advisortitles[item.urn].append(item.advisor_title)
        else:
            urn_advisortitles[item.urn] = [item.advisor_title,]

a_set = set()
for urn, titles in urn_advisortitles.items():
    for title in titles:
        a_set.add(title)
print(a_set)

advisors_permutations = set()

for urn, titles in urn_advisortitles.items():
    this_permutation = (titles.count('Committee Chair'),
                        titles.count('Committee Co-Chair'),
                        titles.count('Committee Member'),
                        titles.count("Dean's Representative"),
                       )
    advisors_permutations.add(this_permutation)
# for i in advisors_permutations:
#     print(i)

{'Committee Co-Chair', 'Committee Member', 'Committee Chair', "Dean's Representative"}


In [16]:
# locates urns without the expected file

# also locates items labeled 'available' in one place & 'withheld' in another
#   (irrelevant since we're treating these the same)

sames = dict()
for urn, filenames_namedtuple_list in filenames_sheet.items():
    for item in filenames_namedtuple_list:
        if item.urn in sames:
            if sames[item.urn] != item.availability:
                print('there should be one {}'.format(item.urn))
        else:
            sames[item.urn] = item.availability

# there should be one etd-06092008-192351
            

for urn, itemnamedtuple in main_sheet.items():
    if urn not in sames:
        print('{} wasnt in filenames sheet'.format(urn))
    elif sames[urn] != itemnamedtuple.availability:
        print(urn, sames[urn], itemnamedtuple.availability)
        
# etd-08082016-164729 wasnt in filenames sheet   --  no pdf submitted yet 
# etd-06092008-192351 unrestricted withheld      --  listed in available & withheld databases
# etd-06062010-192030 wasnt in filenames sheet   --  no pdf submitted yet


there should be one etd-06092008-192351
etd-08082016-164729 wasnt in filenames sheet
etd-06092008-192351 unrestricted withheld
etd-06062010-192030 wasnt in filenames sheet


In [17]:
# files without proper file extension

for urn, filenames_namedtuple_list in filenames_sheet.items():
    for item in filenames_namedtuple_list:
        if item.filename[-4] != "." and item.filename[-4:] not in ("docx", "r.gz"):
            print(urn, item.filename)

In [18]:
# Legacy school names

schools_etds = dict()

for urn, itemnamedtuple in main_sheet.items():
    if itemnamedtuple.department in schools_etds:
        schools_etds[itemnamedtuple.department].append(urn)
    else:
        schools_etds[itemnamedtuple.department] = [urn, ]

# for school, urns in schools_etds.items():
#     print(school)

In [19]:
# Files split into parts

split_files = dict()

for urn, filenames_namedtuples_list in filenames_sheet.items():
    for item in filenames_namedtuples_list:
        if item.urn not in split_files:
            split_files[item.urn] = [item.filename, ]
        else:
            split_files[item.urn].append(item.filename)

for urn, filelist in split_files.items():
    split = False
    for i in filelist:
        if "chap" in i.lower():
            split = True
    if len(filelist) > 1 and split == True:
        print(urn, '\n', filelist, '\n')

In [20]:
# Scripts for scraping binaries from the webpage to our local drive

import urllib.request

def retrieve_binary(url):
    with urllib.request.urlopen(url) as response:
        return response.read()

In [21]:
def write_binary_to_file(binary, folder, filename):
    os.makedirs(folder, exist_ok=True)
    filepath = os.path.join(folder, filename)
    with open(filepath, 'bw') as f:
        f.write(binary)

In [38]:
# Scrape all the findable binaries (withhelds are locked away from public view)

import os

didnt_grab = []
target_dir = './ETDbinaries/'

count = 0

for urn, filenames_namedtuples_list in filenames_sheet.items():
#     if count > 10:
#         break
    local_dir = os.path.join(target_dir, urn)
    if os.path.isdir(local_dir):
        local_files = os.listdir(local_dir)
    else:
        local_files = None

    for item in filenames_namedtuples_list:
#         if count > 10:
#             break
        if item.filename in local_files:
            if 'with' in item.availability:
                print(urn, 'withheld ripped')
            pass
        else:
            url = 'http://etd.lsu.edu/{}/{}'.format("/".join(item.path.split('/')[3:]), item.filename)
#             print(url)
#             print(item.filename)
            try:
                binary = retrieve_binary(url)
#                 write_binary_to_file(binary, local_dir, filename)
            except:
                count += 1
                pass
#                 didnt_grab.append((urn, item.filename))
#                 print(urn, item.filename)
print(count)

etd-06092008-192351 withheld ripped
1006


In [39]:
# Helper script that prints all the info related to a urn.

def show_all_info(urn):
    print("Main example:", main_sheet[urn], "\n")
    print("Filenames example:", filenames_sheet[urn], "\n")
    print("Keywords example:", keywords_sheet[urn], "\n")
    print("Advisors example:", advisors_sheet[urn], "\n")
    print("Catalog example:", catalog_sheet[urn], "\n")

In [40]:
# prints the urn, file location, and filename for urns of which we couldn't grab the binaries

# for urn, filename in didnt_grab:
#     print(urn, filenames_sheet[urn][filename][0], filename)

In [41]:
# show_all_info('etd-09012004-114224')

In [17]:
length = 0
for uri in catalog_sheet:
    uri = os.path.split(uri)[1]
    if uri not in main_sheet:
#         print(uri)
        length += 1
print(length)

0


In [43]:
def csv_writer(data, path):
    """
    Write data to a CSV file path
    """
    with open(path, "w", newline='') as csv_file:
        writer = csv.writer(csv_file, delimiter='\t')
        for line in data:
            writer.writerow(line)

In [83]:
def concatinate_keywords(urn):
    return ', '.join(nt.keyword for nt in keywords_sheet[urn]
                     if nt.keyword)


def organize_advisors(urn):
    Advisors = advisors_sheet[urn]
    advisors_rank = {'Committee Chair': 1, 'Committee Co-Chair': 2, 'Committee Member': 3, "Dean's Representative": 4}
    sorted_alpha = sorted(Advisors, key=lambda x: x.advisor_name)
    sorted_advisors = sorted(Advisors, key=lambda x: advisors_rank[x.advisor_title])
    if len(sorted_advisors) > 7 and "Dean's Representative" in sorted_advisors[-1]:
        sorted_advisors = sorted_advisors[:6] + sorted_advisors[-1:]
    elif len(sorted_advisors) > 7:
        sorted_advisors = sorted_advisors[:7]
    return sorted_advisors
    
# print(concatinate_keywords('etd-07062010-081041'))
#print(organize_advisors('etd-07062010-081041'))

# csv_writer(scratch_data, 'scratch_csv.csv')

# for urn in advisors_sheet:
#     organize_advisors(urn)

#print("Filenames example:", filenames_sheet['etd-08082016-164729'], "\n")


csv_data = []

names = ["f_name", 'm_name', 'l_name', 'dv1_name', 'adv1_title', "adv1_email",'filenam','department']
csv_data.append(names)
for urn in main_sheet:
    sorted_advisors = organize_advisors(urn)
    if urn in filenames_sheet:
        filename = filenames_sheet[urn][0].filename
        #filename = filename.split('/')[-3]
    else:
        filename = ''
    csv_data.append([main_sheet[urn].first_name,
                     main_sheet[urn].middle_name,
                     main_sheet[urn].last_name,
                     sorted_advisors[0].advisor_name,
                     sorted_advisors[0].advisor_title,
                     sorted_advisors[0].advisor_email,
                     filename,
                     main_sheet[urn].department])
print(csv_data)
csv_writer(csv_data, '../../trash.csv')
#'advisor_name', 'advisor_title'

[['f_name', 'm_name', 'l_name', 'dv1_name', 'adv1_title', 'adv1_email', 'filenam', 'department'], ['Jamie', 'Hughes', 'Collier', 'Robert Tague', 'Committee Chair', 'rtague@lsu.edu', 'Collier_thesis.pdf', 'Geography & Anthropology'], ['Taehee', None, 'Kim', 'Rod Parker', 'Committee Chair', 'rparke5@lsu.edu', 'Kim_thesis.pdf', 'Art'], ['Jeong Tae', None, 'Ok', 'Park, Sunggook', 'Committee Chair', 'sunggook@lsu.edu', 'jtok2011.pdf', 'Engineering Science (Interdepartmental Program)'], ['Skyler', None, 'Neylon', 'Samuel Bentley', 'Committee Chair', 'sjb@lsu.edu', 'Neylon_thesis.pdf', 'Oceanography & Coastal Sciences'], ['Yujie', None, 'Wang', 'Hu, Gang', 'Committee Chair', 'gang.hu@pbrc.edu', 'Wang_diss.pdf', 'Human Ecology'], ['Karen', None, 'Marrolli', 'Fulton, Kenneth', 'Committee Chair', 'wfulton@lsu.edu', 'Marrolli_diss.pdf', 'Music'], ['Matthew', 'Daniel', 'Voitier', 'Ramsay Smith', 'Committee Chair', 'wsmith@lsu.edu', 'Voitier_thesis.pdf', 'Renewable Natural Resources'], ['Amanda', '