forked from OpenAPC/openapc-de
-
Notifications
You must be signed in to change notification settings - Fork 0
/
auto_corrector.py
executable file
·156 lines (137 loc) · 6.73 KB
/
auto_corrector.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
import argparse
import codecs
import copy
import csv
import os
import sys
import openapc_toolkit as oat
from test import whitelists as wl
# Help texts for the command line arguments, keyed by argument name.
ARG_HELP_STRINGS = {"csv_file": "The enriched file to check for errors"}

# Reference data sets the ISSN lookup tables are built from.
APC_DE_FILE = "../data/apc_de.csv"
TA_FILE = "../data/transformative_agreements/transformative_agreements.csv"

# One lookup table per ISSN column: ISSN value -> established journal metadata.
# Every column gets its own, initially empty, dict.
ISSN_DICTS = {
    column: {}
    for column in ("issn", "issn_print", "issn_electronic", "issn_l")
}

# Report printed when a row disagrees with the established metadata.
MISMATCH_MSG = (
    'Line {}: Detected a mismatch in the {} field for {} {}:\n'
    ' is_hybrid : {}\n'
    ' publisher : {}\n'
    ' journal_full_title : {}\n'
    'The journal occurs {} times in all OpenAPC collections with the following metadata:\n'
    ' is_hybrid : {}\n'
    ' publisher : {}\n'
    ' journal_full_title : {}\n'
)

# Interactive menu shown after every mismatch report.
CORRECT_MSG = (
    "How do you want to proceed?\n"
    "1) Correct {} to {}\n"
    "2) Correct {} to {} and abort (saving all changes to out.csv)\n"
    "3) Do nothing\n"
    "4) Do nothing and abort (saving all changes to out.csv)\n"
    "5) Get additional information from the EZB\n> "
)

# 19 flags handed to the output writer: True, False, False, then 16 x True.
# NOTE(review): presumably one quoting flag per output column - confirm
# against oat.OpenAPCUnicodeWriter.
QUOTE_MASK = [True, False, False] + [True] * 16

# Shared EZB client, created once at import time.
EZBS = oat.EZBSrcaping()
def _prepare_ezb_info(issn):
    """Build a printable summary of what the EZB reports for an ISSN.

    Queries the module-level EZBS client. When the lookup is not flagged as
    successful, a short "no information" notice is returned. Otherwise the
    text lists title, colorized access message and remarks of every returned
    record, each followed by a separator line.
    """
    lookup = EZBS.get_ezb_info(issn)
    if not lookup["success"]:
        return "\n\nNo Information could be obtained from the EZB for ISSN " + issn

    chunks = ["\n\nThe following additional information could be obtained from the EZB for ISSN " + issn + ":\n"]
    for entry in lookup["data"]:
        chunks.append("Title: " + entry["title"] + "\n")
        chunks.append("Access: " + oat.colorize(entry["access_msg"], entry["access_color"]) + "\n")
        # Only the remarks are passed through str() before concatenation.
        chunks.append("Comments: " + str(entry["remarks"]) + "\n")
        chunks.append("-------------------------------------------------------\n")
    return "".join(chunks)
def _load_issn_mappings(file_path):
    """Add every row of the CSV file at file_path to the ISSN_DICTS tables.

    For each ISSN column, the first row seen for an ISSN value defines the
    established metadata (journal title, publisher, hybrid status); later
    rows with the same value in that column only increase its "count".
    """
    with open(file_path, "r") as handle:
        reader = csv.DictReader(handle)
        oat.print_b("Preparing mapping tables from " + file_path + "...")
        for line in reader:
            data = {
                "journal_full_title": line["journal_full_title"],
                "publisher": line["publisher"],
                "is_hybrid": line["is_hybrid"],
                "count": 1
            }
            for issn_type, mapping in ISSN_DICTS.items():
                issn = line[issn_type]
                if issn not in mapping:
                    # Store a private copy per table: sharing one dict between
                    # all four tables would let a "count" increment made
                    # through one table leak into the others and inflate the
                    # reported number of occurrences.
                    mapping[issn] = dict(data)
                else:
                    mapping[issn]["count"] += 1
            if reader.line_num % 10000 == 0:
                oat.print_b(str(reader.line_num) + " lines processed")


def _ask_for_decision(field_type, established_value, issn):
    """Show the correction menu and return the chosen option ("1" to "4").

    Option "5" prints the EZB information for issn (fetched once, on first
    request) and asks again; any other input simply repeats the question.
    """
    suggestion = oat.colorize(established_value, "green")
    ask_msg = CORRECT_MSG.format(field_type, suggestion, field_type, suggestion)
    ezb_msg = None
    ret = input(ask_msg)
    while ret not in ("1", "2", "3", "4"):
        if ret == "5":
            if ezb_msg is None:
                ezb_msg = _prepare_ezb_info(issn)
            print(ezb_msg)
        ret = input("Please select an option from 1 to 5 > ")
    print("\n\n\n\n")
    return ret


def _review_line(line, line_num):
    """Compare one input row against the established metadata.

    Every non-whitelisted mismatch is reported and the user is asked how to
    proceed; accepted corrections are written into line in place.

    Returns True if the user chose one of the "abort" options, in which case
    the remaining checks for this row are skipped as well, otherwise False.
    """
    for issn_type, mapping in ISSN_DICTS.items():
        issn = line[issn_type]
        if not oat.has_value(issn) or issn not in mapping:
            continue
        established = mapping[issn]
        for field_type in ("is_hybrid", "publisher", "journal_full_title"):
            new_value = line[field_type]
            established_value = established[field_type]
            if new_value == established_value:
                continue
            if is_whitelisted(field_type, new_value, established_value, line["issn"],
                              line["issn_print"], line["issn_electronic"], line["issn_l"]):
                continue
            print(MISMATCH_MSG.format(
                line_num, oat.colorize(field_type, "cyan"), issn_type, issn,
                line["is_hybrid"], line["publisher"], line["journal_full_title"],
                oat.colorize(str(established["count"]), "cyan"),
                established["is_hybrid"], established["publisher"],
                established["journal_full_title"]))
            ret = _ask_for_decision(field_type, established_value, issn)
            if ret in ("1", "2"):
                line[field_type] = established_value
            if ret in ("2", "4"):
                # Leave both loops at once: merely breaking out of the field
                # loop would keep prompting for the remaining ISSN types of
                # this row although the user asked to abort.
                return True
    return False


def main():
    """Interactively align an enriched CSV file with established metadata.

    Builds the ISSN lookup tables from APC_DE_FILE and TA_FILE, walks through
    the file given on the command line, lets the user decide on every
    mismatch and finally writes all rows (corrected or not) to out.csv.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"])
    args = parser.parse_args()

    for file_path in [APC_DE_FILE, TA_FILE]:
        _load_issn_mappings(file_path)

    modified_content = []
    header = None
    with open(args.csv_file) as csv_file:
        reader = csv.DictReader(csv_file)
        header = list(reader.fieldnames)
        stopped = False
        for line in reader:
            # After an abort the remaining rows are copied over unchecked.
            if not stopped:
                stopped = _review_line(line, reader.line_num)
            modified_content.append(line)

    modified_lines = [header]
    modified_lines.extend(list(line.values()) for line in modified_content)
    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, QUOTE_MASK, True, True, False)
        writer.write_rows(modified_lines)
def is_whitelisted(field_type, new_value, established_value, issn, issn_p, issn_e, issn_l):
    """Tell whether a detected metadata mismatch is covered by a whitelist.

    field_type names the mismatching field; new_value and established_value
    are the two conflicting values; the remaining arguments are the four ISSN
    values of the row under review.

    Returns True for a whitelisted mismatch, False otherwise. Only the
    "is_hybrid" and "publisher" fields have whitelists, any other field type
    yields False.
    """
    issns = [issn, issn_p, issn_e, issn_l]
    # The JOURNAL_HYBRID_STATUS_CHANGED wl only lists one of the issns,
    # so we have to compare all issn types for a match.
    if field_type == "is_hybrid":
        return not set(issns).isdisjoint(wl.JOURNAL_HYBRID_STATUS_CHANGED)
    # The publisher wls, on the other hand, list all issn types, so every
    # ISSN that carries a value has to pass the whitelist check.
    if field_type == "publisher":
        usable_issns = [value for value in issns if oat.has_value(value)]
        if not usable_issns:
            # Without any usable ISSN nothing can be looked up; do not report
            # the mismatch as whitelisted just because no check was run.
            return False
        return all(wl.in_whitelist(value, established_value, new_value)
                   for value in usable_issns)
    return False
# Start the interactive correction only when the file is run as a script.
if __name__ == "__main__":
    main()