In [None]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [None]:
from openpyxl import load_workbook

def parse_xlsx(filename, sheetname):
    """Read one worksheet of an .xlsx workbook into a list of row dictionaries.

    The first row of the sheet supplies the dictionary keys; every following
    row becomes one dict mapping those header values to that row's cell values.

    Args:
        filename: Path of the workbook, passed straight to ``load_workbook``.
        sheetname: Name of the worksheet to read (looked up as ``wb[sheetname]``).

    Returns:
        A list with one dict per data row, in sheet order. The list is empty
        when the sheet has no rows or only a header row.

    Any exception raised while opening the workbook or looking up the sheet
    propagates to the caller; once the workbook has been opened it is always
    closed again.
    """
    wb = load_workbook(filename=filename, read_only=True)
    try:
        ws = wb[sheetname]
        rows = iter(ws.rows)

        header_row = next(rows, None)
        if header_row is None:
            # Completely empty sheet: nothing to key the rows by.
            return []
        headers = [cell.value for cell in header_row]

        # zip() pairs headers with cells positionally and stops at the shorter
        # of the two, exactly as the row-by-row loop did before.
        return [dict(zip(headers, [cell.value for cell in row])) for row in rows]
    finally:
        # A read-only workbook keeps the underlying file open until close()
        # is called explicitly, so release it even if reading failed.
        wb.close()

In [None]:
# Main workbook: hard-coded, machine-specific location; rows are read from its 'Sheet1' tab.
main_file = '/home/raleigh/Desktop/MikeFuzzyMatch/ILLRECPUR FY16_thru_FY18_with_usage_thru _12-20-2018.xlsx'
main_file_parsed = parse_xlsx(filename=main_file, sheetname='Sheet1')

In [None]:
# Supplementary workbook: hard-coded, machine-specific location; rows are read from its 'Sheet1' tab.
supp_file = '/home/raleigh/Desktop/MikeFuzzyMatch/ILL_Borrowing_Requests_(Loan)-1-1-2016-12-31-2018-Patron_Recommendations.xlsx'
supp_file_parsed = parse_xlsx(filename=supp_file, sheetname='Sheet1')

In [None]:
def make_matches(parsed_sheet1, parsed_sheet2, threshold=62):
    """Pair every sheet-1 row with the sheet-2 rows whose titles look similar.

    Each sheet-1 title is compared against each sheet-2 title with
    ``fuzz.token_sort_ratio``; a pair is kept when the score is strictly
    greater than ``threshold``. Authors are carried along in the result but
    are not used for scoring.

    Args:
        parsed_sheet1: Iterable of row dicts with the keys ``'author'``,
            ``'title'`` and ``'itemid'``.
        parsed_sheet2: Iterable of row dicts with the keys ``'Loan Author'``
            and ``'Loan Title'``. It is iterated once per sheet-1 row, so it
            must be re-iterable (e.g. a list).
        threshold: Minimum score (exclusive) for a pair to count as a possible
            match. Defaults to 62, the cutoff this function used before it
            became configurable.

    Returns:
        A dict mapping ``(title, author, itemid)`` of a sheet-1 row to the list
        of ``(title, author)`` tuples of its sheet-2 candidates, in sheet-2
        order. Sheet-1 rows without any candidate are absent from the dict.

    Raises:
        KeyError: If a row lacks one of the keys listed above.
    """
    possible_matches = dict()

    for sheet1_row in parsed_sheet1:
        sheet1_row_author = sheet1_row['author']
        sheet1_row_title = sheet1_row['title']
        # The key depends only on the sheet-1 row, so build it once per row
        # rather than once per matching pair.
        sheet1_key = (sheet1_row_title, sheet1_row_author, sheet1_row['itemid'])

        for sheet2_row in parsed_sheet2:
            sheet2_row_author = sheet2_row['Loan Author']
            sheet2_row_title = sheet2_row['Loan Title']
            title_ratio = fuzz.token_sort_ratio(sheet1_row_title, sheet2_row_title)
            if title_ratio > threshold:
                possible_matches.setdefault(sheet1_key, []).append(
                    (sheet2_row_title, sheet2_row_author)
                )
    return possible_matches

In [None]:
# Compare every main-file title against every supplementary-file title and keep the candidates.
possible_matches = make_matches(parsed_sheet1=main_file_parsed, parsed_sheet2=supp_file_parsed)

In [None]:
import csv

# Export the candidate pairs as tab-separated text: a header line, then one
# line per (sheet-1 row, sheet-2 candidate) combination.
with open('/home/raleigh/Desktop/MikeFuzzyMatch/possible_matches_62_percent.csv', 'w', newline='') as csvfile:
    spamwriter = csv.writer(
        csvfile,
        delimiter='\t',
        quotechar='"',
        quoting=csv.QUOTE_MINIMAL,
    )
    column_names = [
        'sheet1_title',
        'sheet1_author',
        'sheet1_transaction_number',
        'sheet2_title',
        'sheet2_author',
    ]
    spamwriter.writerow(column_names)

    for sheet1_key, sheet2_candidates in possible_matches.items():
        # The key is the (title, author, itemid) tuple of the sheet-1 row.
        sheet1_title, sheet1_author, sheet1_id = sheet1_key
        spamwriter.writerows(
            [sheet1_title, sheet1_author, sheet1_id, sheet2_title, sheet2_author]
            for sheet2_title, sheet2_author in sheet2_candidates
        )
            

In [None]:
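# Manual step (done outside this notebook): Mike edited the exported possible-matches
# file and removed the duplicates, so only unique, correct matches remain in the
# edited workbook that the next cell loads.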

In [None]:
# Hand-edited matches workbook: hard-coded, machine-specific location; rows are
# read from its 'possible_matches_62_percent' tab.
clean_matches = '/home/raleigh/Desktop/MikeFuzzyMatch/ILL_Report_Copy_of_possible_matches_62_percent-edited.xlsx'
clean_matches_parsed = parse_xlsx(filename=clean_matches, sheetname='possible_matches_62_percent')

In [None]:
# print(clean_matches_parsed[0])
# print(main_file_parsed[0])
# print(supp_file_parsed[0])

In [None]:
# For every confirmed match, look up the complete source rows on both sides.
# A source row qualifies when its title AND author equal the values recorded in
# the confirmed match. `not in` compares row dicts by value, so a row that is
# equal to one already collected is kept only once.
both_matches = []
for confirmed in clean_matches_parsed:
    main_hits = []
    for main_row in main_file_parsed:
        same_title = main_row['title'] == confirmed['sheet1_title']
        if same_title and main_row['author'] == confirmed['sheet1_author'] and main_row not in main_hits:
            main_hits.append(main_row)

    supp_hits = []
    for supp_row in supp_file_parsed:
        same_title = supp_row['Loan Title'] == confirmed['sheet2_title']
        if same_title and supp_row['Loan Author'] == confirmed['sheet2_author'] and supp_row not in supp_hits:
            supp_hits.append(supp_row)

    # One (main rows, supplementary rows) pair per confirmed match, in input order.
    both_matches.append((main_hits, supp_hits))

In [None]:
import csv

# Write the final report as tab-separated text: for every confirmed match, one
# line per (main-file row, supplementary-file row) combination, with all
# main-file columns followed by all supplementary-file columns.

# Take the column order from the first confirmed match that actually has rows
# on the given side. Indexing both_matches[0][0][0] directly raised IndexError
# whenever there were no matches at all, or the first match found no rows on
# one side. Resolving the headers before opening the file also means a failure
# here can no longer leave a truncated output file behind.
main_file_headers = next(
    (list(match[0][0].keys()) for match in both_matches if match[0]),
    [],
)
supp_file_headers = next(
    (list(match[1][0].keys()) for match in both_matches if match[1]),
    [],
)
headers = main_file_headers + supp_file_headers

with open('/home/raleigh/Desktop/MikeFuzzyMatch/final_output.csv', 'w', newline='') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter='\t', quotechar='"',
                           quoting=csv.QUOTE_MINIMAL)
    spamwriter.writerow(headers)

    for match in both_matches:
        for main_file_item in match[0]:
            for supp_file_item in match[1]:
                # A line is only produced when both sides have rows, so both
                # header lists are non-empty whenever this body runs.
                row = [main_file_item[i] for i in main_file_headers]
                row.extend(supp_file_item[i] for i in supp_file_headers)
                spamwriter.writerow(row)