In [None]:
#! /usr/bin/env python3

import os
import shutil
import csv
from collections import namedtuple
from datetime import datetime
import urllib.request
import re

import pdir
import openpyxl

## read and parse the Digital Commons revision spreadsheet (must convert to xlsx first)

In [None]:
# Load the revision spreadsheet (already converted from .xls to .xlsx).
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
wb = openpyxl.load_workbook('/home/francis/Downloads/gradschool_dissertations_1.xls_Tue_Mar_21_20_58_54_2017part_1.xlsx')
# Look the sheet up by name with workbook indexing; Workbook.get_sheet_by_name()
# is deprecated in openpyxl in favour of wb[sheetname].
current_sheet = wb['8734444']

In [None]:
# Build wb_dict: sheet row index (header is row 0, so data rows start at 1)
# mapped to a {column header: cell value} dict for that row.
wb_dict = {}
for num, row in enumerate(current_sheet.iter_rows()):
    cell_values = [cell.value for cell in row]
    if num == 0:
        # The first row holds the column headers used as keys for every data row.
        keys = cell_values
    else:
        wb_dict[num] = dict(zip(keys, cell_values))

## read and parse the mapping spreadsheet of deg/dept to discipline

In [None]:
# Load the spreadsheet that maps department/degree combinations to disciplines.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
matches = openpyxl.load_workbook('/home/francis/Downloads/Disciplines for imported documents final.xlsx')
# Look the sheet up by name with workbook indexing; Workbook.get_sheet_by_name()
# is deprecated in openpyxl in favour of wb[sheetname].
matches_sheet = matches['Sheet1']

In [None]:
# Collect every mapping row as a `Matches` namedtuple.
# Note: despite its name, matches_dict is a set of namedtuples, not a dict.
matches_dict = set()
for num, row in enumerate(matches_sheet.iter_rows()):
    cell_values = [cell.value for cell in row]
    if num:
        matches_dict.add(Matches(*cell_values))
    else:
        # Header row: strip spaces from the column titles so they can serve
        # as namedtuple field names.
        Matches = namedtuple('Matches', [title.replace(' ', '') for title in cell_values])

## pick the matching discipline & assign it to each item

In [None]:
def match_discipline(obs_dept, obs_degree):
    """Return the discipline mapped to a department/degree pair, or None.

    Scans the module-level ``matches_dict`` collection of mapping rows. A row
    whose department and degree both equal the observed values always wins.
    A row for the same department with a blank degree is a department-wide
    fallback and is used only when no degree-specific row matches.

    The collection is unordered (a set), so returning on the first department
    hit would let a fallback row shadow a degree-specific one depending on
    iteration order; the fallback is therefore only remembered during the scan
    and returned at the end.

    Args:
        obs_dept: department value of the item being classified.
        obs_degree: degree name of the item being classified.

    Returns:
        The ``thendisciplinesis`` value of the chosen mapping row, or None
        when no row applies.
    """
    fallback = None
    have_fallback = False
    for item in matches_dict:
        if obs_dept != item.Ifdepartmentequals:
            continue
        degree = item.anddegree_nameequals
        if not degree:
            # Department-wide row: keep the first one seen as a fallback only.
            if not have_fallback:
                fallback = item.thendisciplinesis
                have_fallback = True
        elif obs_degree == degree:
            # Exact department + degree match takes priority over any fallback.
            return item.thendisciplinesis
    return fallback

In [None]:
# Fill in the 'disciplines' column for every row that has a mapping;
# rows without a match keep whatever value they already had.
for record in wb_dict.values():
    matched = match_discipline(record['department'], record['degree_name'])
    if matched:
        record['disciplines'] = matched

In [None]:
# Fix date formatting after the fact: no date format was specified when the
# file was read, so datetime cell values are converted to 'YYYY-MM-DD' strings.

for record in wb_dict.values():
    record.update({
        column: cell_value.strftime('%Y-%m-%d')
        for column, cell_value in record.items()
        if isinstance(cell_value, datetime)
    })

## make the csv, which you'll inspect then copy/paste over the original revision xls file

In [None]:
def csv_writer(data, path):
    """Write rows to a UTF-8 CSV file with every field quoted.

    Args:
        data: iterable of rows, each row an iterable of field values.
        path: destination file path; an existing file is overwritten.
    """
    with open(path, "w", newline='', encoding='utf-8') as out_file:
        quoted_writer = csv.writer(
            out_file,
            delimiter=',',
            quotechar='"',
            quoting=csv.QUOTE_ALL,
        )
        quoted_writer.writerows(data)

In [None]:
def build_csv(wb_dict, current_sheet, path='../../gradschool_theses_revision.csv'):
    """Assemble the revised rows and write them to a CSV file.

    The first row of ``current_sheet`` supplies the column order; each row
    dict in ``wb_dict`` is then flattened into that order and the result is
    handed to ``csv_writer``.

    Args:
        wb_dict: mapping of row number to a {column header: value} dict.
        current_sheet: worksheet whose first row holds the column headers.
        path: destination CSV path; defaults to the original output location.

    Raises:
        KeyError: if a row dict lacks one of the sheet's column headers.
    """
    # Worksheet.rows is a generator in current openpyxl, so it cannot be
    # subscripted with [0]; take the first row from iter_rows() instead.
    header_row = next(iter(current_sheet.iter_rows()), ())
    fieldnames = [cell.value for cell in header_row]

    csv_data = [fieldnames]
    csv_data.extend(
        [row_dict[fieldname] for fieldname in fieldnames]
        for row_dict in wb_dict.values()
    )
    csv_writer(csv_data, path)

In [None]:
# Build the CSV from the updated rows and write it to disk.
build_csv(wb_dict, current_sheet)

## error checking

In [None]:
# Check for items that will not be assigned a discipline and report their
# department/degree combinations.

unmatched_pairs = [
    (item['department'], item['degree_name'])
    for item in wb_dict.values()
    if not item['disciplines']
]
count = len(unmatched_pairs)
no_disciplines = set(unmatched_pairs)

# Totals: rows without a discipline, distinct dept/degree pairs, all rows.
print(count, len(no_disciplines), len(wb_dict))

for dept, degree in no_disciplines:
    print('{}\t{}'.format(dept, degree))