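"""Classify new broken-site user reports with the bugbug HTTP service.

The job reads user reports filed since the last successful run from BigQuery,
asks the bugbug model for a valid/invalid classification, appends the labels
to BigQuery, and records the run so the next invocation only picks up newer
reports.
"""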
import datetime
import logging
import time

import click
import requests
from google.cloud import bigquery

BUGBUG_HTTP_SERVER = "https://bugbug.herokuapp.com"
# Mapping from the model's class index to a human-readable label.
CLASSIFICATION_LABELS = {0: "valid", 1: "invalid"}


def classification_http_request(url, reports):
    """POST one batch of reports to the bugbug service and return its JSON response."""
reports_list = list(reports.values())
response = requests.post(
url, headers={"X-Api-Key": "docker-etl"}, json={"reports": reports_list}
)
response.raise_for_status()
return response.json()


def get_reports_classification(model, reports, retry_count=21, retry_sleep=10):
    """Get the classification for a list of reports.

    Args:
        model: The model to use for the classification.
        reports: The dict of reports to classify, keyed by uuid.
        retry_count: The number of times to retry the request.
        retry_sleep: The number of seconds to sleep between retries.

    Returns:
        A dictionary with the uuids as keys and classification results as values.
    """
if len(reports) == 0:
return {}
url = f"{BUGBUG_HTTP_SERVER}/{model}/predict/broken_site_report/batch"
json_response = {}
for _ in range(retry_count):
response = classification_http_request(url, reports)
# Check which reports are ready
for uuid, data in response["reports"].items():
if not data.get("ready", True):
continue
            # The report is ready: move it out of the current batch
            # and into the collected results.
            reports.pop(uuid, None)
json_response[uuid] = data
if len(reports) == 0:
break
else:
time.sleep(retry_sleep)
else:
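        # for/else: every retry was exhausted without emptying the batch.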
total_sleep = retry_count * retry_sleep
msg = f"Couldn't get {len(reports)} report classifications in {total_sleep} seconds, aborting" # noqa
logging.error(msg)
raise Exception(msg)
return json_response
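
# A minimal usage sketch (uuids and field values are illustrative; the dict
# shape matches what this script sends to the batch endpoint). Note that
# get_reports_classification mutates the dict it is given:
#
#     reports = {"uuid-1": {"uuid": "uuid-1", "title": "https://example.com",
#                           "body": "page does not load"}}
#     results = get_reports_classification("invalidcompatibilityreport", reports)
#     # e.g. CLASSIFICATION_LABELS[results["uuid-1"]["class"]] -> "invalid"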


def add_classification_results(client, bq_dataset_id, results):
    """Append classification results to the `labels` table via a load job."""
    res = []
for uuid, result in results.items():
bq_result = {
"report_uuid": uuid,
"label": CLASSIFICATION_LABELS[result["class"]],
"created_at": datetime.datetime.utcnow().isoformat(),
"probability": result["prob"][result["class"]],
"is_ml": True,
}
res.append(bq_result)
job_config = bigquery.LoadJobConfig(
source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
schema=[
bigquery.SchemaField("report_uuid", "STRING", mode="REQUIRED"),
bigquery.SchemaField("label", "STRING", mode="REQUIRED"),
bigquery.SchemaField("created_at", "DATETIME", mode="REQUIRED"),
bigquery.SchemaField("probability", "FLOAT"),
bigquery.SchemaField("is_ml", "BOOLEAN", mode="REQUIRED"),
],
write_disposition="WRITE_APPEND",
)
labels_table = f"{bq_dataset_id}.labels"
job = client.load_table_from_json(
res,
labels_table,
job_config=job_config,
)
logging.info("Writing to `labels` table")
    try:
        job.result()
    except Exception as e:
        logging.error(e)
        if job.errors:
            for error in job.errors:
                logging.error(error)
table = client.get_table(labels_table)
logging.info(f"Loaded {len(res)} rows into {table}")


def record_classification_run(client, bq_dataset_id, is_ok, count):
    """Record the outcome of this run in the `bugbug_classification_runs` table."""
rows_to_insert = [
{
"run_at": datetime.datetime.utcnow().isoformat(),
"is_ok": is_ok,
"report_count": count,
},
]
bugbug_runs_table = f"{bq_dataset_id}.bugbug_classification_runs"
errors = client.insert_rows_json(bugbug_runs_table, rows_to_insert)
if errors:
logging.error(errors)
else:
logging.info("Last classification run recorded")


def get_last_classification_datetime(client, bq_dataset_id):
    """Return the run time of the last successful run, or a default start date."""
query = f"""
SELECT MAX(run_at) AS last_run_at
FROM `{bq_dataset_id}.bugbug_classification_runs`
WHERE is_ok = TRUE
"""
res = client.query(query).result()
row = list(res)[0]
last_run_time = (
row["last_run_at"] if row["last_run_at"] is not None else "2023-11-20T00:00:00"
)
return last_run_time


def get_reports_since_last_run(client, last_run_time):
    """Fetch reports with non-empty descriptions filed after the last run."""
query = f"""
SELECT
uuid,
comments as body,
url as title
FROM `moz-fx-data-shared-prod.org_mozilla_broken_site_report.user_reports`
WHERE comments != "" AND reported_at > "{last_run_time}"
ORDER BY reported_at
"""
query_job = client.query(query)
return list(query_job.result())


@click.command()
@click.option("--bq_project_id", help="BigQuery project id", required=True)
@click.option("--bq_dataset_id", help="BigQuery dataset id", required=True)
def main(bq_project_id, bq_dataset_id):
client = bigquery.Client(project=bq_project_id)
# Get datetime of the last classification run
last_run_time = get_last_classification_datetime(client, bq_dataset_id)
# Only get reports that were filed since last classification run
# and have non-empty descriptions
rows = get_reports_since_last_run(client, last_run_time)
if not rows:
logging.info(
f"No new reports with filled descriptions were found since {last_run_time}"
)
return
objects_dict = {
row["uuid"]: {field: value for field, value in row.items()} for row in rows
}
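    # objects_dict now maps each uuid to its {"uuid", "body", "title"} fields,
    # the shape expected by get_reports_classification above.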
is_ok = True
result_count = 0
try:
logging.info("Getting classification results from bugbug.")
result = get_reports_classification("invalidcompatibilityreport", objects_dict)
if result:
result_count = len(result)
logging.info("Saving classification results to BQ.")
add_classification_results(client, bq_dataset_id, result)
except Exception as e:
logging.error(e)
is_ok = False
record_classification_run(client, bq_dataset_id, is_ok, result_count)


if __name__ == "__main__":
logging.getLogger().setLevel(logging.INFO)
main()
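
# Example invocation (project and dataset ids here are placeholders):
#
#     python main.py --bq_project_id my-project --bq_dataset_id webcompat_user_reports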