-
Notifications
You must be signed in to change notification settings - Fork 3
/
initialize.py
122 lines (95 loc) · 3.5 KB
/
initialize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import os
import json
import csv
import sqlite3
import click
from common import (
sanitize_field_name,
get_default_structures_path,
get_blueleaks_path,
get_dbs_path,
)
def exec_sql(c, sql):
try:
c.execute(sql)
except sqlite3.OperationalError as e:
click.echo("")
click.secho(sql, dim=True)
raise e
def progress(site, table=None, row_count=None, finished=False):
# Clear previous output
click.echo("\r" + " " * 80, nl=False)
# Print progress
s = click.style(f"\r{site}", bold=True)
if finished:
s += " finished"
elif table:
s += " | " + f"{table}.csv"
if row_count:
s += " | " + click.style(f"{row_count:,} rows", dim=True)
click.echo(s, nl=False)
def load_file(path):
with open(path) as f:
structure = json.load(f)
site = structure["name"]
# Start the database
database_filename = os.path.join(get_dbs_path(), f"{site}.sqlite3")
if os.path.exists(database_filename):
click.secho(f"{database_filename} already exists so skipping", dim=True)
return
conn = sqlite3.connect(database_filename)
c = conn.cursor()
# For each table
for table in structure["tables"]:
progress(site, table)
if site.startswith("usa"):
csv_filename = os.path.join(get_blueleaks_path(), "usao", site, f"{table}.csv")
else:
csv_filename = os.path.join(get_blueleaks_path(), site, f"{table}.csv")
with open(csv_filename) as csv_file:
reader = csv.DictReader(csv_file)
fields = [sanitize_field_name(field) for field in reader.fieldnames]
for i in range(len(fields)):
if fields[i] == None:
fields[i] = ""
fields = ",".join([f"'{field}'" for field in fields])
sql = f"CREATE TABLE '{table}' ({fields})"
exec_sql(c, sql)
conn.commit()
row_count = 0
# Import rows
for row in reader:
values = []
for field in row:
if row[field] == None:
values.append("")
else:
values.append(
row[field].replace("\\\\", "/").replace("'", "''")
)
for i in range(len(values)):
if values[i] == None:
values[i] = ""
values = ",".join([f"'{value}'" for value in values])
sql = f"INSERT INTO '{table}' VALUES ({values})"
# click.secho(sql, dim=True)
exec_sql(c, sql)
row_count += 1
if row_count % 1000 == 0:
progress(site, table, row_count)
conn.commit()
conn.commit()
progress(site, finished=True)
conn.close()
click.echo()
def is_struct_json(filename):
return filename.find(".json") > -1
def import_data():
for filename in os.listdir(get_default_structures_path()):
if is_struct_json(filename):
load_file(
os.path.join(get_default_structures_path(), filename),
)
if __name__ == "__main__":
click.echo(click.style("BlueLeaks Explorer", fg="yellow"))
import_data()