-
Notifications
You must be signed in to change notification settings - Fork 0
/
issues.py
499 lines (389 loc) · 17.3 KB
/
issues.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
import json
import os
import pandas as pd
import tqdm
# directory containing the parsed files
DIR = '/Users/narayansajeev/Desktop/MIT/parsed_files'
# dictionary to store the number of rows with issues
rows = {}
files = {}
sources = {}
def loop_fnames(prov):
# loop through the files in the directory and return a list of files that have been parsed
current = []
files = []
curr_dir = '%s/%s' % (DIR, prov)
for file in os.listdir(curr_dir):
is_xl = file.endswith(('.xlsx', '.xls'))
pkl = file + '.pkl.gz'
is_food = not any([i in file for i in ['业', '商']])
if is_xl and file not in current and pkl in os.listdir(curr_dir) and is_food:
files.append(file)
return files
def get_df(prov, fname):
# read in the first file from the list using pandas
return pd.read_pickle('%s/%s/%s.pkl.gz' % (DIR, prov, fname))
def get_known_cols():
# checking column classifier
known_cols_fn = '/Users/narayansajeev/Desktop/MIT/known_columns.json'
with open(known_cols_fn) as f:
return json.load(f)
def clean(col_headers):
# clean up the headers by removing newline, carriage return, non-breaking space and space characters
return [n.replace('\n', '').replace('\r', '').replace('\xa0', '').replace(' ', '') for n in col_headers]
def substr_check(substr_sets, k):
# check for substrings in the column headers
substr_dict = {}
for s in substr_sets:
try:
substr_dict[s] = any([i in k for i in substr_sets[s]])
except TypeError:
print(k)
raise
return substr_dict
def substring(df, known_cols):
# list of column headers
col_headers = ['抽样编号', '序号', '被抽样单位名称', '被抽样单位单位地址', '被抽样省份', '标识生产企业名称',
'标识生产企业地址', '食品名称', '规格型号', '生产日期/批号', '商标', '不合格项目║检验结果║标准值',
'分类', '公告号', '公告日期', '任务来源/项目名称', '检验机构名称', '备注']
col_headers = clean(col_headers)
# dictionary of substrings to check for in column headers
substr_sets = {
'announcement_date': ['检', '抽', '报'],
'address': ['地址', '所在地'],
'region': ['省', '县', '市', '区'],
'been_sampled': ['受', '被'],
'name': ['名称', '单位', '机构', '人'],
'testing_agency': ['采样', '检', '抽', '委托'],
'value_or_result': ['值', '结果', '要求'],
'limit': ['标', '限'],
'actual': ['测', '检', '实', '不合格'],
'result': ['结论', '结果', '判定'],
'not_qualified': ['不合格', '不符合'],
'item_or_reason': ['项', '原因'],
'produce': ['生产'],
'illegal': ['违法']
}
# list of possible inspection result values
insp_res_vals = [
'合格',
'合 格',
'合格(阴性)',
'符合',
'不合格',
'不符合',
'所检项目符合标准',
'铝不符合国家标准',
'所检项目符合标准要求。',
'所检项目符合国家标准或地方标准']
# list to store unmatched column headers
unmatched_cols = []
# review these cols
review_cols = []
# check for unmatched column headers
for term in col_headers:
if term not in known_cols:
unmatched_cols.append(term)
# check substrings for unmatched columns
for k in unmatched_cols:
substr_dict = substr_check(substr_sets, k)
if '日期' in k and '生产' in k:
continue
if '日期' in k and substr_dict['announcement_date']:
continue
if substr_dict['address'] and '生产' in k and not substr_dict['region']:
continue
if substr_dict['address'] and substr_dict['been_sampled'] and not substr_dict['region']:
continue
if substr_dict['name'] and substr_dict['been_sampled'] and not substr_dict['address']:
cup = k
for i in substr_sets['name']:
cup = cup.replace(i, '')
for i in substr_sets['been_sampled']:
cup = cup.replace(i, '')
for i in substr_sets['testing_agency']:
cup = cup.replace(i, '')
if len(cup) <= 1:
continue
if substr_dict['name'] and '生产' in k and not substr_dict['address'] and not '违法' in k:
continue
if substr_dict['name'] and substr_dict['testing_agency'] and not substr_dict['been_sampled']:
cup = k
for i in substr_sets['name']:
cup = cup.replace(i, '')
for i in substr_sets['testing_agency']:
cup = cup.replace(i, '')
if len(cup) <= 1:
continue
if substr_dict['value_or_result'] and substr_dict['limit']:
if substr_dict['actual']:
continue
else:
continue
if substr_dict['value_or_result'] and substr_dict['actual']:
continue
if substr_dict['not_qualified'] and substr_dict['item_or_reason'] and '数' not in k:
continue
if '规格' in k:
continue
if substr_dict['result']:
if len(df.index) >= 5:
first_obs = df.iloc[0:4][k]
else:
first_obs = df.loc[:][k]
if any([n in insp_res_vals for n in first_obs]):
continue
else:
review_cols.append(k)
return review_cols
def drop_columns(df, col_headers):
# list of columns to keep
cols = ['manufacturer_name', 'manufacturer_address', 'sampled_location_name', 'sampled_location_address',
'food_name', 'specifications_model', 'announcement_date', 'production_date',
'product_classification', 'task_source_or_project_name', 'testing_agency', 'adulterant',
'inspection_results', 'failing_results', 'test_outcome', 'legal_limit']
# select only the existing columns from the dataframe
df = df[[col for col in cols if col in df.columns]]
# drop columns that are not in the list of column headers
drop_df = df.dropna(axis=1, how='all')
dropped = []
for col in df.columns:
if col not in drop_df.columns:
dropped.append(col)
col_headers = sorted(col_headers)
for _ in ['商标', '备注', '序号', '抽样编号', '购进日期', '被抽样单位省', '被抽样单位盟市', '被抽样单位所在盟市',
'公告网址链接', '产品具体名称', '销售单位/电商']:
try:
col_headers.remove(_)
except:
pass
if len(dropped) > 0 and len(col_headers) > 0:
print(col_headers)
print(dropped)
return drop_df
def newline(df, prov):
# Define global variables
global rows, files, sources
# Count the number of newlines in each row of the dataframe
count = df.apply(lambda row: row.astype(str).str.contains('\n').any(), axis=1).sum()
# Get the name of the current function
name = newline.__name__
# Update the 'rows' dictionary with the count of newlines
rows[name] = rows.get(name, 0) + count
# If there are newlines in the dataframe, update the 'files' and 'sources' dictionaries
if count > 0:
# Update the 'files' dictionary with the count of files containing newlines
files[name] = files.get(name, 0) + 1
# Update the 'sources' dictionary with the provenance of the data
if name not in sources.keys():
sources[name] = [prov]
elif prov not in sources[name]:
lst = sources[name]
lst.append(prov)
sources[name] = sorted(lst)
def tab(df, prov):
# Define global variables
global rows, files, sources
# Count the number of rows in the dataframe that contain a tab character
count = df.apply(lambda row: row.astype(str).str.contains('\t').any(), axis=1).sum()
# Get the name of the current function
name = tab.__name__
# Update the 'rows' dictionary with the count
rows[name] = rows.get(name, 0) + count
# If there are any rows with a tab character
if count > 0:
# Update the 'files' dictionary with the count
files[name] = files.get(name, 0) + 1
# If the function name is not already in the 'sources' dictionary
if name not in sources.keys():
# Add it along with the 'prov' value
sources[name] = [prov]
# If the 'prov' value is not already associated with the function name in the 'sources' dictionary
elif prov not in sources[name]:
# Add it to the list of values associated with the function name
lst = sources[name]
lst.append(prov)
sources[name] = sorted(lst)
def adltrnt_msrmnt(df, prov):
# Define global variables
global rows, files, sources
# Check if 'adulterant' column exists in the dataframe
if 'adulterant' in df.columns:
# Count the number of rows where 'adulterant' contains both "/" and "g"
count = df[df['adulterant'].str.contains("/", na=False) & df['adulterant'].str.contains("g", na=False)].shape[0]
# Get the name of the current function
name = adltrnt_msrmnt.__name__
# Update the 'rows' dictionary with the count
rows[name] = rows.get(name, 0) + count
# If count is greater than 0, update 'files' dictionary and 'sources' dictionary
if count > 0:
# Update the 'files' dictionary with the count
files[name] = files.get(name, 0) + 1
# If current function name is not in 'sources', add it with 'prov' as its value
if name not in sources.keys():
sources[name] = [prov]
# If current function name is in 'sources' but 'prov' is not in its value list, append 'prov' to the list
elif prov not in sources[name]:
lst = sources[name]
lst.append(prov)
sources[name] = sorted(lst)
def adltrnt_none(df, prov):
# Define global variables
global rows, files, sources
# Check if 'adulterant' column exists in the dataframe
if 'adulterant' in df.columns:
# Count the number of null values in 'adulterant' column
count = df['adulterant'].isnull().sum()
# Get the name of the current function
name = adltrnt_none.__name__
# Update the 'rows' dictionary with the count of null values
rows[name] = rows.get(name, 0) + count
# If there are null values, update the 'files' dictionary
if count > 0:
files[name] = files.get(name, 0) + 1
# If the current function name is not in 'sources', add it
if name not in sources.keys():
sources[name] = [prov]
# If the provider is not in the list of sources for this function, add it
elif prov not in sources[name]:
lst = sources[name]
lst.append(prov)
sources[name] = sorted(lst)
def headers(df, prov):
# Define global variables
global rows, files, sources
# Count the number of times '标称生产企业名称' appears in the dataframe
count = df.applymap(lambda x: '标称生产企业名称' in str(x)).sum().sum()
# Get the name of the current function
name = headers.__name__
# Update the 'rows' dictionary with the count
rows[name] = rows.get(name, 0) + count
# If '标称生产企业名称' appears in the dataframe
if count > 0:
# Update the 'files' dictionary with the count
files[name] = files.get(name, 0) + 1
# If the function name is not in the 'sources' dictionary
if name not in sources.keys():
# Add it along with the province name
sources[name] = [prov]
# If the province name is not in the list of provinces for this function
elif prov not in sources[name]:
# Get the list of provinces for this function
lst = sources[name]
# Add the new province to the list
lst.append(prov)
# Sort the list and update the 'sources' dictionary
sources[name] = sorted(lst)
def none(df, prov):
# Declare 'rows', 'files', and 'sources' as global variables
global rows, files, sources
# Count the number of rows in the DataFrame where all values are null
count = df.isnull().all(axis=1).sum()
# Get the name of the current function
name = none.__name__
# Update the 'rows' dictionary with the count of null rows
rows[name] = rows.get(name, 0) + count
# If there are any null rows
if count > 0:
# Update the 'files' dictionary with the count of files with null rows
files[name] = files.get(name, 0) + 1
# If the function name is not already in the 'sources' dictionary
if name not in sources.keys():
# Add the provider to the 'sources' dictionary under the function name
sources[name] = [prov]
# If the provider is not already listed under the function name in the 'sources' dictionary
elif prov not in sources[name]:
# Get the list of providers for this function name
lst = sources[name]
# Add the new provider to the list
lst.append(prov)
# Sort the list and update it in the 'sources' dictionary under the function name
sources[name] = sorted(lst)
def test_legal_none(df, prov):
# Global variables to store the results
global rows, files, sources
# Check if both 'test_outcome' and 'legal_limit' columns exist in the dataframe
if 'test_outcome' in df.columns and 'legal_limit' in df.columns:
# Count the number of rows where both 'test_outcome' and 'legal_limit' are None
count = len(df[(df['test_outcome'].isna()) & (df['legal_limit'].isna())])
# Get the name of the current function
name = test_legal_none.__name__
# Update the 'rows' dictionary with the count
rows[name] = rows.get(name, 0) + count
# If there are any such rows, update the 'files' dictionary and 'sources' dictionary
if count > 0:
# Update the 'files' dictionary with the count
files[name] = files.get(name, 0) + 1
# Update the 'sources' dictionary with the provider name
if name not in sources.keys():
sources[name] = [prov]
elif prov not in sources[name]:
lst = sources[name]
lst.append(prov)
sources[name] = sorted(lst)
def test_concat_legal_none(df, prov):
# Define global variables
global rows, files, sources
# Check if 'test_outcome' and 'legal_limit' columns exist in the dataframe
if 'test_outcome' in df.columns and 'legal_limit' in df.columns:
# Count the number of rows where 'test_outcome' is not null and 'legal_limit' is null
count = df[(~df['test_outcome'].isnull()) & (df['legal_limit'].isnull())].shape[0]
# Get the name of the current function
name = test_concat_legal_none.__name__
# Update the 'rows' dictionary with the count
rows[name] = rows.get(name, 0) + count
# If count is greater than 0, update the 'files' dictionary and 'sources' dictionary
if count > 0:
files[name] = files.get(name, 0) + 1
# If the current function name is not in the keys of 'sources', add it with the provider as its value
if name not in sources.keys():
sources[name] = [prov]
# If the provider is not in the values of the current function name in 'sources', add it
elif prov not in sources[name]:
lst = sources[name]
lst.append(prov)
sources[name] = sorted(lst)
# Loop through all the files in the specified directory
for prov in tqdm.tqdm(os.listdir(DIR)):
# Skip the '.DS_Store' file
if prov == '.DS_Store':
continue
# Loop through all the file names in the current directory
for fname in loop_fnames(prov):
# Get the dataframe for the current file
df = get_df(prov, fname)
# Get the columns that contain a specific substring
review_cols = substring(df, get_known_cols())
# Drop the specified columns from the dataframe
df = drop_columns(df, review_cols)
newline(df, prov)
tab(df, prov)
adltrnt_msrmnt(df, prov)
adltrnt_none(df, prov)
headers(df, prov)
none(df, prov)
test_legal_none(df, prov)
test_concat_legal_none(df, prov)
print()
# Sort the dictionary 'rows' in descending order based on its values
rows = {k: v for k, v in sorted(rows.items(), key=lambda item: item[1], reverse=True)}
# Iterate over the keys in the sorted dictionary 'rows'
for func in rows.keys():
# Print the current key
print(func)
try:
# Print the value associated with the current key in 'rows'
print('Rows:', rows[func])
except:
pass
try:
# Print the value associated with the current key in 'files'
print('Files:', files[func])
except:
pass
try:
# Print the sources associated with the current key, joined by commas
print('Sources:', ', '.join(sources[func]))
except:
pass
print()