/
csv_crosswalk.py
105 lines (91 loc) · 4.68 KB
/
csv_crosswalk.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#!/usr/bin/python
# -*- coding: utf-8 -*-
# coding=utf-8
import csv
from datetime import datetime
import logging
from biblib.metajson import Document
from biblib.services import creator_service
from biblib.util import constants
from biblib.util import jsonbson
def csv_dict_reader_to_metasjon_list(csv_dict_reader, input_format, source, rec_id_prefix, only_first_record):
logging.info(csv_dict_reader)
for csv_row in csv_dict_reader:
yield csv_dict_reader_to_metasjon(csv_row, input_format, source, rec_id_prefix)
def csv_dict_reader_to_metasjon(csv_row, input_format, source, rec_id_prefix):
document = Document()
if source:
document["rec_source"] = source
if input_format == constants.FORMAT_CSV_SITPOL:
#logging.debug("csv_dict_reader_to_metasjon type(csv_row): {}".format(type(csv_row)))
#print csv_row
document["title"] = csv_row["title"]
classifications_sitpol = [x.strip() for x in csv_row["classifications_sitpol"].split(";") if x.strip()]
if classifications_sitpol:
document["classifications_sitpol"] = classifications_sitpol
classifications_ddc = [x.strip() for x in csv_row["classifications_ddc"].split(";") if x.strip()]
if classifications_ddc:
document["classifications_ddc"] = classifications_ddc
formatted_names = [x.strip() for x in csv_row["creators@role=pbl"].split(";") if x.strip()]
if formatted_names:
#logging.debug("formatted_names: {}".format(formatted_names))
creators = []
for formatted_name in formatted_names:
if formatted_name:
creator = creator_service.formatted_name_to_creator(formatted_name, None, "pbl")
if creator:
creators.append(creator)
if creators:
document["creators"] = creators
document["date_last_accessed"] = datetime.now().isoformat()
document["descriptions"] = [{"language":"fr", "value":csv_row["descriptions@lang=fr"]}]
keywords_fr = [x.strip() for x in csv_row["keywords@lang=fr"].split(";") if x.strip()]
keywords_en = [x.strip() for x in csv_row["keywords@lang=en"].split(";") if x.strip()]
keywords = {}
if keywords_fr:
keywords["fr"] = keywords_fr
if keywords_en:
keywords["en"] = keywords_en
if keywords:
document["keywords"] = keywords
document["languages"] = [x.strip() for x in csv_row["languages"].split(";") if x.strip()]
note = csv_row["notes@lang=fr"]
if note:
document["notes"] = note
document["publication_countries"] = [x.strip() for x in csv_row["publication_countries"].split(";") if x.strip()]
if "rec_created_user" in csv_row:
document["rec_created_user"] = csv_row["rec_created_user"]
document["rec_type_cerimes"] = csv_row["rec_type_cerimes"]
specific_agents = [x.strip() for x in csv_row["specific_agents"].split(";") if x.strip()]
if specific_agents:
document["specific_agents"] = specific_agents
document["specific_actor_type"] = csv_row["specific_actor_type"]
document["target_audiences_cerimes"] = csv_row["target_audiences_cerimes"]
document["url"] = csv_row["url"]
document["rec_type"] = constants.DOC_TYPE_WEBENTITY
document["webentity_type"] = csv_row["webentity_type"]
elif input_format == constants.FORMAT_CSV_METAJSON:
document["rec_type"] = "DatasetQuali"
creators = []
if "Laboratoire d'inventaire" in csv_row:
creators.append(creator_service.formatted_name_to_creator(csv_row["Laboratoire d'inventaire"], constants.REC_CLASS_ORGUNIT, "dpt"))
document["title"] = csv_row["Titre de l'enquete"]
if "Sujet(s) de l'enquete" in csv_row:
document["keywords"] = [x.strip() for x in csv_row["Sujet(s) de l'enquete"].split("\n") if x.strip()]
if "Nom Auteur 1" in csv_row:
name_familly = csv_row["Nom Auteur 1"]
name_given = affiliation = ""
if "Prenom Auteur 1" in csv_row:
name_given = csv_row["Prenom Auteur 1"]
if "Affiliation Auteur 1" in csv_row:
affiliation = csv_row["Affiliation Auteur 1"]
document["creators"] = creators
else:
logging.error("Unknown input_format: {}".format(input_format))
logging.info(jsonbson.dumps_json(document, True))
return document
def metajson_list_to_csv_metajson(documents):
csvwriter = csv.DictWriter(csv_file, delimiter=',', fieldnames=constants.fieldnames)
csvwriter.writeheader()
for key in sorted(results.iterkeys()):
row = results[key]