forked from OpenAPC/openapc-de
-
Notifications
You must be signed in to change notification settings - Fork 0
/
add_grid_ids.py
executable file
·100 lines (87 loc) · 2.95 KB
/
add_grid_ids.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
import json
import os
import sys
from Levenshtein import ratio
import openapc_toolkit as oat
# Match quality tiers, listed in ascending order of "min_ratio".
# Each entry holds:
#   "min_ratio"  - lowest similarity ratio that qualifies for the tier
#   "print_func" - oat print helper used to display a match of this tier
#                  (presumably colour-coded output; confirm in openapc_toolkit)
#   "name"       - label shown to the user in the match message
MATCH_TYPES = [
    {"min_ratio": threshold, "print_func": printer, "name": label}
    for threshold, printer, label in (
        (0.8, oat.print_r, "Weak"),
        (0.9, oat.print_y, "Possible"),
        (0.95, oat.print_g, "Good"),
        (1.0, oat.print_b, "Perfect"),
    )
]
def get_match_type(ratio, match_types=None):
    """
    Return the strictest match type whose threshold the given ratio reaches.

    Args:
        ratio: Similarity score to classify. Inside this function the name
            shadows the imported Levenshtein ratio() function; it is kept
            because it is part of the existing signature.
        match_types: Optional iterable of match type dicts, each providing a
            "min_ratio" key. Defaults to the module-level MATCH_TYPES.

    Returns:
        The dict with the highest "min_ratio" that is still <= ratio, or
        None if the ratio is below every threshold (or no types are given).
    """
    if match_types is None:
        match_types = MATCH_TYPES
    qualifying = [
        match_type for match_type in match_types
        if ratio >= match_type["min_ratio"]
    ]
    # Choose by threshold value instead of list position, so the result does
    # not depend on the table being sorted in ascending order.
    return max(qualifying, key=lambda match_type: match_type["min_ratio"], default=None)
def get_best_match(grid_names, institutions_name):
    """
    Find the Grid name most similar to an institution name.

    Every entry of grid_names is scored against institutions_name with the
    Levenshtein ratio() function. On equal scores the earliest entry wins.

    Returns:
        A (name, score) tuple for the best candidate, or (None, 0.0) when
        grid_names is empty or no candidate scores above 0.0.
    """
    scored = [(ratio(candidate, institutions_name), candidate) for candidate in grid_names]
    # max() returns the first maximal element, matching a strict ">" scan.
    top_score, top_name = max(scored, key=lambda pair: pair[0], default=(0.0, None))
    if top_score > 0.0:
        return top_name, top_score
    return None, 0.0
def write_out_file(ins_header, ins_content):
    """
    Write the institutions table to "out.csv" in the working directory.

    Args:
        ins_header: List of header rows; written first.
        ins_content: List of content rows; written after the header rows.
    """
    # The institutions file is requested with "utf-8" when it is loaded, so
    # pin the same encoding here instead of relying on the platform default.
    with open("out.csv", "w", encoding="utf-8") as out_file:
        # No column is quoted.
        # NOTE(review): the mask has 7 entries, while the script assigns to
        # column index 7 (an 8th column) - confirm OpenAPCUnicodeWriter
        # tolerates rows longer than the mask.
        quote_mask = [False] * 7
        writer = oat.OpenAPCUnicodeWriter(out_file, quote_mask, False, False)
        writer.write_rows(ins_header + ins_content)
# Interactive session: for every active Grid institute, look for OpenAPC
# institutions whose name resembles the Grid name or one of its aliases and
# ask the user whether the Grid ID should be assigned.
ins_header, ins_content = oat.get_csv_file_content("../data/institutions.csv", "utf-8", True, False)

# Read the Grid dump with an explicit encoding rather than the platform
# default, and close the file before the interactive session starts.
with open("grid.json", encoding="utf-8") as grid_file:
    grid_list = json.load(grid_file)["institutes"]

# Progress markers (10% ... 90%) keyed by the list index at which they are
# printed. The mapping does not change, so it is built once up front.
deciles = {round((len(grid_list) / 10) * i): str(i * 10) + "%" for i in range(1, 10)}

for index, ins in enumerate(grid_list):
    if index in deciles:
        print(deciles[index])
    if ins["status"] != "active":
        continue
    grid_names = [ins["name"]]
    if "aliases" in ins:
        grid_names += ins["aliases"]
    for institutions_row in ins_content:
        # Column 7 holds the Grid ID; rows that already have one are skipped.
        if oat.has_value(institutions_row[7]):
            continue
        # Column 2 is the name that gets compared against the Grid names.
        institutions_name = institutions_row[2]
        grid_name, highest_ratio = get_best_match(grid_names, institutions_name)
        match_type = get_match_type(highest_ratio)
        if match_type is None:
            # Similarity is below the weakest threshold: nothing to ask.
            continue
        grid_id = ins["id"]
        msg = '{} match: "{}" might be Grid institution "{}" ({}).'
        question = 'Assign Grid ID {} ({}) (y/n/q)?'
        msg = msg.format(match_type["name"], institutions_name, grid_name, highest_ratio)
        question = question.format(grid_id, ins["name"])
        match_type["print_func"](msg)
        start = input(question)
        while start not in ["y", "n", "q"]:
            start = input("Please type 'y', 'n' or 'q':")
        if start == "y":
            institutions_row[7] = grid_id
        elif start == "q":
            # Save the assignments made so far before quitting.
            write_out_file(ins_header, ins_content)
            sys.exit()

# All institutes processed: save the result.
write_out_file(ins_header, ins_content)