/
filechecker.py
executable file
·1819 lines (1647 loc) · 98.6 KB
/
filechecker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# -*- coding: utf-8 -*-
# This file is part of Mylar.
#
# Mylar is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Mylar is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Mylar. If not, see <http://www.gnu.org/licenses/>.
import os
import re
import sys
import glob
import shutil
import operator
import urllib.request, urllib.parse, urllib.error
import logging
import unicodedata
import optparse
from fnmatch import fnmatch
import datetime as dt
import subprocess
from subprocess import CalledProcessError, check_output
import mylar
from mylar import logger, helpers
class FileChecker(object):
def __init__(self, dir=None, watchcomic=None, Publisher=None, AlternateSearch=None, manual=None, sarc=None, justparse=None, file=None, pp_mode=False):
    """Initialize the FileChecker with the series/search context.

    Parameters:
        dir: full path to the series Comic Location (manual pp will just be
             passing the already parsed filename).
        watchcomic: unicode name of the series that is being searched against.
        Publisher: publisher of watchcomic.
        AlternateSearch: list of alternate search names.
        manual: truthy if it's a manual post-processing run.
        sarc: truthy if it's being run against an existing story-arc.
        justparse: truthy when manually post-processing - will quickly parse the
             filename to find the series name in order to query the sql instead
             of cycling through each series in the watchlist.
        file: parse just one filename (used primarily during import/scan);
             setting this forces justparse on.
        pp_mode: truthy when in post-processing mode.
    """
    self.dir = dir if dir else None
    if watchcomic:
        self.og_watchcomic = watchcomic
        # BUGFIX: chain each substitution on the previous result. The original
        # code re-applied every re.sub to the raw watchcomic, so the '?'-strip
        # and the em-dash replacement were discarded and only the en-dash
        # substitution actually survived.
        self.watchcomic = re.sub(r'\?', '', watchcomic).strip()  # strip the ? since it affects the regex
        self.watchcomic = re.sub('\u2014', ' - ', self.watchcomic).strip()  # em-dash -> ' - '
        self.watchcomic = re.sub('\u2013', ' - ', self.watchcomic).strip()  # en-dash -> ' - '
        if type(self.watchcomic) != str:
            # non-str (byte) input gets normalized down to plain ASCII
            self.watchcomic = unicodedata.normalize('NFKD', self.watchcomic).encode('ASCII', 'ignore')
    else:
        self.watchcomic = None
    # publisher = publisher of watchcomic
    self.publisher = Publisher if Publisher else None
    # alternatesearch = list of alternate search names
    self.AlternateSearch = AlternateSearch if AlternateSearch else None
    # manual = true / false if it's a manual post-processing run
    self.manual = manual if manual else None
    # sarc = true / false if it's being run against an existing story-arc
    self.sarc = sarc if sarc else None
    # justparse = true/false when manually post-processing, will quickly parse the
    # filename to find the series name in order to query the sql instead of
    # cycling through each series in the watchlist.
    self.justparse = bool(justparse)
    # file = parse just one filename (used primarily during import/scan)
    if file:
        self.file = file
        self.justparse = True
    else:
        self.file = None
    self.pp_mode = bool(pp_mode)
    self.failed_files = []
    self.dynamic_handlers = ['/','-',':',';','\'',',','&','?','!','+','(',')','\\u2014','\\u2013']
    self.dynamic_replacements = ['and','the']
    self.rippers = ['-empire','-empire-hd','minutemen-','-dcp','Glorith-HD']
    # pre-generate the AS_Alternates now
    AS_Alternates = self.altcheck()
    self.AS_Alt = AS_Alternates['AS_Alt']
    self.AS_Tuple = AS_Alternates['AS_Tuple']
def listFiles(self):
comiclist = []
watchmatch = {}
dirlist = []
comiccnt = 0
if self.file:
runresults = self.parseit(self.dir, self.file)
return {'parse_status': runresults['parse_status'],
'sub': runresults['sub'],
'comicfilename': runresults['comicfilename'],
'comiclocation': runresults['comiclocation'],
'series_name': runresults['series_name'],
'series_name_decoded': runresults['series_name_decoded'],
'issueid': runresults['issueid'],
'dynamic_name': runresults['dynamic_name'],
'series_volume': runresults['series_volume'],
'alt_series': runresults['alt_series'],
'alt_issue': runresults['alt_issue'],
'issue_year': runresults['issue_year'],
'issue_number': runresults['issue_number'],
'scangroup': runresults['scangroup'],
'reading_order': runresults['reading_order'],
'booktype': runresults['booktype']
}
else:
filelist = self.traverse_directories(self.dir)
for files in filelist:
filedir = files['directory']
filename = files['filename']
filesize = files['comicsize']
if filename.startswith('.'):
continue
logger.debug('[FILENAME]: %s' % filename)
runresults = self.parseit(self.dir, filename, filedir)
if runresults:
try:
if runresults['parse_status']:
run_status = runresults['parse_status']
except:
if runresults['process_status']:
run_status = runresults['process_status']
if any([run_status == 'success', run_status == 'match']):
if self.justparse:
comiclist.append({
'sub': runresults['sub'],
'comicfilename': runresults['comicfilename'],
'comiclocation': runresults['comiclocation'],
'series_name': runresults['series_name'], #helpers.conversion(runresults['series_name']),
'series_name_decoded': runresults['series_name_decoded'],
'issueid': runresults['issueid'],
'alt_series': runresults['alt_series'], #helpers.conversion(runresults['alt_series']),
'alt_issue': runresults['alt_issue'],
'dynamic_name': runresults['dynamic_name'],
'series_volume': runresults['series_volume'],
'issue_year': runresults['issue_year'],
'issue_number': runresults['issue_number'],
'scangroup': runresults['scangroup'],
'reading_order': runresults['reading_order'],
'booktype': runresults['booktype']
})
else:
comiclist.append({
'sub': runresults['sub'],
'ComicFilename': runresults['comicfilename'],
'ComicLocation': runresults['comiclocation'],
'ComicSize': files['comicsize'],
'ComicName': runresults['series_name'], #helpers.conversion(runresults['series_name']),
'SeriesVolume': runresults['series_volume'],
'IssueYear': runresults['issue_year'],
'JusttheDigits': runresults['justthedigits'],
'AnnualComicID': runresults['annual_comicid'],
'issueid': runresults['issueid'],
'scangroup': runresults['scangroup'],
'booktype': runresults['booktype']
})
comiccnt +=1
else:
#failure
self.failed_files.append({'parse_status': 'failure',
'sub': runresults['sub'],
'comicfilename': runresults['comicfilename'],
'comiclocation': runresults['comiclocation'],
'series_name': runresults['series_name'], #helpers.conversion(runresults['series_name']),
'series_volume': runresults['series_volume'],
'alt_series': runresults['alt_series'], #helpers.conversion(runresults['alt_series']),
'alt_issue': runresults['alt_issue'],
'issue_year': runresults['issue_year'],
'issue_number': runresults['issue_number'],
'issueid': runresults['issueid'],
'scangroup': runresults['scangroup'],
'booktype': runresults['booktype']
})
watchmatch['comiccount'] = comiccnt
if len(comiclist) > 0:
watchmatch['comiclist'] = comiclist
else:
watchmatch['comiclist'] = []
if len(self.failed_files) > 0:
logger.info('FAILED FILES: %s' % self.failed_files)
return watchmatch
def parseit(self, path, filename, subpath=None):
path_list = None
if subpath is None:
subpath = path
tmppath = None
path_list = None
else:
logger.fdebug('[CORRECTION] Sub-directory found. Altering path configuration.')
#basepath the sub if it exists to get the parent folder.
logger.fdebug('[SUB-PATH] Checking Folder Name for more information.')
#sub = re.sub(origpath, '', path).strip()})
logger.fdebug('[SUB-PATH] Original Path : %s' % path)
logger.fdebug('[SUB-PATH] Sub-directory : %s' % subpath)
#subpath = helpers.conversion(subpath)
if 'windows' in mylar.OS_DETECT.lower():
if path in subpath:
ab = len(path)
tmppath = subpath[ab:]
else:
tmppath = subpath.replace(path, '').strip()
path_list = os.path.normpath(tmppath)
if '/' == path_list[0] or '\\' == path_list[0]:
#need to remove any leading slashes so the os join can properly join the components
path_list = path_list[1:]
logger.fdebug('[SUB-PATH] subpath set to : %s' % path_list)
#parse out the extension for type
comic_ext = ('.cbr','.cbz','.cb7','.pdf')
if os.path.splitext(filename)[1].endswith(comic_ext):
filetype = os.path.splitext(filename)[1]
else:
filetype = 'unknown'
#find the issue number first.
#split the file and then get all the relevant numbers that could possibly be an issue number.
#remove the extension.
modfilename = re.sub(filetype, '', filename).strip()
reading_order = None
#if it's a story-arc, make sure to remove any leading reading order #'s
if self.sarc and mylar.CONFIG.READ2FILENAME:
removest = modfilename.find('-') # the - gets removed above so we test for the first blank space...
if mylar.CONFIG.FOLDER_SCAN_LOG_VERBOSE:
logger.fdebug('[SARC] Checking filename for Reading Order sequence - Reading Sequence Order found #: %s' % modfilename[:removest])
if modfilename[:removest].isdigit() and removest <= 3:
reading_order = {'reading_sequence': str(modfilename[:removest]),
'filename': filename[removest+1:]}
modfilename = modfilename[removest+1:]
if mylar.CONFIG.FOLDER_SCAN_LOG_VERBOSE:
logger.fdebug('[SARC] Removed Reading Order sequence from subname. Now set to : %s' % modfilename)
#make sure all the brackets are properly spaced apart
if modfilename.find('\s') == -1:
#if no spaces exist, assume decimals being used as spacers (ie. nzb name)
modspacer = '.'
else:
modspacer = ' '
m = re.findall('[^()]+', modfilename)
cnt = 1
#2019-12-24----fixed to accomodate naming convention like Amazing Mary Jane (2019) 002.cbr, and to account for brackets properly
try:
while cnt < len(m):
#logger.fdebug('[m=%s] modfilename.find: %s' % (m[cnt], modfilename[modfilename.find('('+m[cnt]+')')+len(m[cnt])+2]))
#logger.fdebug('mod_1: %s' % modfilename.find('('+m[cnt]+')'))
if modfilename[modfilename.find('('+m[cnt]+')')-1] != modspacer and modfilename.find('('+m[cnt]+')') != -1:
#logger.fdebug('before_space: %s' % modfilename[modfilename.find('('+m[cnt]+')')-1])
#logger.fdebug('after_space: %s' % modfilename[modfilename.find('('+m[cnt]+')')+len(m[cnt])+2])
modfilename = '%s%s%s' % (modfilename[:modfilename.find('('+m[cnt]+')')], modspacer, modfilename[modfilename.find('('+m[cnt]+')'):])
cnt+=1
except Exception as e:
#logger.warn('[ERROR] %s' % e)
pass
#---end 2019-12-24
#grab the scanner tags here.
scangroup = None
rippers = [x for x in self.rippers if x.lower() in modfilename.lower()]
if rippers:
#it's always possible that this could grab something else since tags aren't unique. Try and figure it out.
if len(rippers) > 0:
m = re.findall('[^()]+', modfilename)
#--2019-11-30 needed for Glorith naming conventions when it's an nzb name with all formatting removed.
if len(m) == 1:
spf30 = re.compile(r"[^.]+", re.UNICODE)
#logger.fdebug('spf30: %s' % spf30)
split_file30 = spf30.findall(modfilename)
#logger.fdebug('split_file30: %s' % split_file30)
if len(split_file30) > 3 and 'Glorith-HD' in modfilename:
scangroup = 'Glorith-HD'
sp_pos = 0
for x in split_file30:
if sp_pos+1 > len(split_file30):
break
if x[-1] == ',' and self.checkthedate(split_file30[sp_pos+1]):
modfilename = re.sub(x, x[:-1], modfilename, count=1)
break
sp_pos+=1
#-- end 2019-11-30
cnt = 1
for rp in rippers:
while cnt < len(m):
if m[cnt] == ' ':
pass
elif rp.lower() in m[cnt].lower():
scangroup = re.sub('[\(\)]', '', m[cnt]).strip()
logger.fdebug('Scanner group tag discovered: %s' % scangroup)
modfilename = modfilename.replace(m[cnt],'').strip()
break
cnt +=1
modfilename = modfilename.replace('()','').strip()
issueid = None
x = modfilename.find('[__')
if x != -1:
y = modfilename.find('__]', x)
if y != -1:
issueid = modfilename[x+3:y]
logger.fdebug('issueid: %s' % issueid)
modfilename = '%s %s'.strip() % (modfilename[:x], modfilename[y+3:])
logger.fdebug('issueid %s removed successfully: %s' % (issueid, modfilename))
#here we take a snapshot of the current modfilename, the intent is that we will remove characters that match
#as we discover them - namely volume, issue #, years, etc
#the remaining strings should be the series title and/or issue title if present (has to be detected properly)
modseries = modfilename
#try and remove /remember unicode character strings here (multiline ones get seperated/removed in below regex)
pat = re.compile('[\x00-\x7f]{3,}', re.UNICODE)
replack = pat.sub('XCV', modfilename)
wrds = replack.split('XCV')
tmpfilename = modfilename
if len(wrds) > 1:
for i in list(wrds):
if i != '':
tmpfilename = tmpfilename.replace(i, 'XCV')
tmpfilename = ''.join(tmpfilename)
modfilename = tmpfilename
sf3 = re.compile(r"[^,\s_]+", re.UNICODE)
split_file3 = sf3.findall(modfilename)
#--2019-11-30
if len(split_file3) == 1 or all([len(split_file3) == 2, scangroup == 'Glorith-HD']):
#--end 2019-11-30
logger.fdebug('Improperly formatted filename - there is no seperation using appropriate characters between wording.')
sf3 = re.compile(r"[^,\s_\.]+", re.UNICODE)
split_file3 = sf3.findall(modfilename)
logger.fdebug('NEW split_file3: %s' % split_file3)
ret_sf2 = ' '.join(split_file3)
sf = re.findall('''\( [^\)]* \) |\[ [^\]]* \] |\[ [^\#]* \]|\S+''', ret_sf2, re.VERBOSE)
#sf = re.findall('''\( [^\)]* \) |\[ [^\]]* \] |\S+''', ret_sf2, re.VERBOSE)
ret_sf1 = ' '.join(sf)
#here we should account for some characters that get stripped out due to the regex's
#namely, unique characters - known so far: +, &, @
#c11 = '\+'
#f11 = '\&'
#g11 = '\''
ret_sf1 = re.sub('\+', 'c11', ret_sf1).strip()
ret_sf1 = re.sub('\&', 'f11', ret_sf1).strip()
ret_sf1 = re.sub('\'', 'g11', ret_sf1).strip()
ret_sf1 = re.sub('\@', 'h11', ret_sf1).strip()
#split_file = re.findall('(?imu)\([\w\s-]+\)|[-+]?\d*\.\d+|\d+[\s]COVERS+|\d{4}-\d{2}-\d{2}|\d+[(th|nd|rd|st)]+|[\(^\)+]|\d+|[\w-]+|#?\d\.\d+|#[\.-]\w+|#[\d*\.\d+|\w+\d+]+|#(?<![\w\d])XCV(?![\w\d])+|#[\w+]|\)', ret_sf1, re.UNICODE)
#updated to keep words within square brackets together.
split_file = re.findall('(?imu)\([\w\s-]+\)|[-+]?\d*\.\d+|\d+[\s]COVERS+|\d+[(\s|\-)]PAGE+|\d{4}-\d{2}-\d{2}|\d+[(th|nd|rd|st)]+|[\(^\)+]|\[.*?\]|\d+|[\w-]+|#?\d\.\d+|#[\.-]\w+|#[\d*\.\d+|\w+\d+]+|#(?<![\w\d])XCV(?![\w\d])+|#[\w+]|\)', ret_sf1, re.UNICODE)
#10-20-2018 ---START -- attempt to detect '01 (of 7.3)'
#10-20-2018 -- attempt to detect '36p ctc' as one element
#4-7-2020 -- remove '####px' as it's useless and will muck up the parser.
spf = []
mini = False
wrdcnt = 0
for x in split_file:
if x == 'of':
if split_file[wrdcnt-1].isdigit():
mini = True
wrdcnt+=1
spf.append(x)
continue
if mini is True:
mini = False
try:
logger.fdebug('checking now: %s' % x)
if x.lower() == 'infinity':
raise Exception
if x.isdigit():
logger.fdebug('[MINI-SERIES] MAX ISSUES IN SERIES: %s' % x)
spf.append('(of %s)' % x)
elif float(x) > 0:
logger.fdebug('[MINI-DECIMAL SERIES] MAX ISSUES IN SERIES: %s' % x)
spf.append('(of %s)' % x)
except Exception as e:
spf.append(x)
elif x == ')' or x == '(':
pass
elif x == 'p' or x == 'ctc' or x == 'px':
try:
if spf[wrdcnt-1].isdigit():
logger.debug('THIS SHOULD BE : %s%s' % (spf[wrdcnt-1], x))
newline = '%s%s' % (spf[wrdcnt-1], x)
spf[wrdcnt -1] = newline
#wrdcnt =-1
elif spf[wrdcnt-1][-1] == 'p' and spf[wrdcnt-1][:-1].isdigit() and x == 'ctc':
logger.fdebug('THIS SHOULD BE : %s%s' % (spf[wrdcnt-1], x))
newline = '%s%s' % (spf[wrdcnt-1], x)
spf[wrdcnt -1] = newline
#wrdcnt =-1
except Exception as e:
spf.append(x)
else:
spf.append(x)
wrdcnt +=1
if len(spf) > 0:
split_file = spf
logger.fdebug('NEWLY SPLIT REORGD: %s' % split_file)
#10-20-2018 ---END
if len(split_file) == 1:
logger.fdebug('Improperly formatted filename - there is no seperation using appropriate characters between wording.')
ret_sf1 = re.sub('\-',' ', ret_sf1).strip()
split_file = re.findall('(?imu)\([\w\s-]+\)|[-+]?\d*\.\d+|\d+|[\w-]+|#?\d\.\d+|#(?<![\w\d])XCV(?![\w\d])+|\)', ret_sf1, re.UNICODE)
possible_issuenumbers = []
volumeprior = False
volume = None
volume_found = {}
datecheck = []
lastissue_label = None
lastissue_position = 0
lastmod_position = 0
booktype = 'issue'
#exceptions that are considered alpha-numeric issue numbers
exceptions = ('NOW', 'AI', 'AU', 'X', 'A', 'B', 'C', 'INH', 'MU', 'HU', 'SUMMER', 'SPRING', 'FALL', 'WINTER', 'PREVIEW')
#unicode characters, followed by int value
# num_exceptions = [{iss:u'\xbd',val:.5},{iss:u'\xbc',val:.25}, {iss:u'\xe',val:.75}, {iss:u'\221e',val:'infinity'}]
file_length = 0
validcountchk = False
sep_volume = False
current_pos = -1
for sf in split_file:
current_pos +=1
#the series title will always be first and be AT LEAST one word.
if split_file.index(sf) >= 0 and not volumeprior:
dtcheck = re.sub('[\(\)\,]', '', sf).strip()
#if there's more than one date, assume the right-most date is the actual issue date.
if any(['19' in dtcheck, '20' in dtcheck]) and not any([dtcheck.lower().startswith('v19'), dtcheck.lower().startswith('v20')]) and len(dtcheck) >=4:
logger.fdebug('checking date : %s' % dtcheck)
checkdate_response = self.checkthedate(dtcheck)
if checkdate_response:
logger.fdebug('date: %s' % checkdate_response)
datecheck.append({'date': dtcheck,
'position': split_file.index(sf),
'mod_position': self.char_file_position(modfilename, sf, lastmod_position)})
#this handles the exceptions list in the match for alpha-numerics
test_exception = ''.join([i for i in sf if not i.isdigit()])
if any([x for x in exceptions if x.lower() == test_exception.lower()]):
logger.fdebug('Exception match: %s' % test_exception)
if lastissue_label is not None:
if lastissue_position == (split_file.index(sf) -1):
logger.fdebug('alphanumeric issue number detected as : %s %s' % (lastissue_label,sf))
for x in possible_issuenumbers:
possible_issuenumbers = []
if int(x['position']) != int(lastissue_position):
possible_issuenumbers.append({'number': x['number'],
'position': x['position'],
'mod_position': x['mod_position'],
'validcountchk': x['validcountchk']})
possible_issuenumbers.append({'number': '%s %s' % (lastissue_label, sf),
'position': lastissue_position,
'mod_position': self.char_file_position(modfilename, sf, lastmod_position),
'validcountchk': validcountchk})
else:
#if the issue number & alpha character(s) don't have a space seperating them (ie. 15A)
#test_exception is the alpha-numeric
logger.fdebug('Possible alpha numeric issue (or non-numeric only). Testing my theory.')
test_sf = re.sub(test_exception.lower(), '', sf.lower()).strip()
logger.fdebug('[%s] Removing possible alpha issue leaves: %s (Should be a numeric)' % (test_exception, test_sf))
if test_sf.isdigit():
possible_issuenumbers.append({'number': sf,
'position': split_file.index(sf),
'mod_position': self.char_file_position(modfilename, sf, lastmod_position),
'validcountchk': validcountchk})
else:
test_position = modfilename[self.char_file_position(modfilename, sf,lastmod_position)-1]
if test_position == '#':
possible_issuenumbers.append({'number': sf,
'position': split_file.index(sf),
'mod_position': self.char_file_position(modfilename, sf, lastmod_position),
'validcountchk': validcountchk})
if sf == 'XCV':
# new 2016-09-19 \ attempt to check for XCV which replaces any unicode above
for x in list(wrds):
if x != '':
tmpissue_number = re.sub('XCV', x, split_file[split_file.index(sf)])
logger.fdebug('[SPECIAL-CHARACTER ISSUE] Possible issue # : %s' % tmpissue_number)
possible_issuenumbers.append({'number': sf,
'position': split_file.index(sf),
'mod_position': self.char_file_position(modfilename, sf, lastmod_position),
'validcountchk': validcountchk})
count = None
found = False
match = re.search('(?<=\sof\s)\d+(?=\s)', sf, re.IGNORECASE)
if match:
logger.fdebug('match')
count = match.group()
found = True
if found is False:
match = re.search('(?<=\(of\s)\d+(?=\))', sf, re.IGNORECASE)
if match:
count = match.group()
found = True
if count:
# count = count.lstrip("0")
logger.fdebug('Mini-Series Count detected. Maximum issue # set to : %s' % count.lstrip('0'))
# if the count was detected, then it's in a '(of 4)' or whatever pattern
# 95% of the time the digit immediately preceding the '(of 4)' is the actual issue #
logger.fdebug('Issue Number SHOULD BE: %s' % lastissue_label)
validcountchk = True
match2 = re.search('(\d+[\s])covers', sf, re.IGNORECASE)
if match2:
num_covers = re.sub('[^0-9]', '', match2.group()).strip()
#logger.fdebug('%s covers detected within filename' % num_covers)
continue
if all([lastissue_position == (split_file.index(sf) -1), lastissue_label is not None, '#' not in sf, sf != 'p']):
#find it in the original file to see if there's a decimal between.
findst = lastissue_mod_position+1
if findst >= len(modfilename):
findst = len(modfilename) -1
if modfilename[findst] != '.' or modfilename[findst] != '#': #findst != '.' and findst != '#':
if sf.isdigit():
seper_num = False
for x in datecheck:
if x['position'] == split_file.index(sf, lastissue_position):
seper_num = True
if seper_num is False:
logger.fdebug('2 seperate numbers detected. Assuming 2nd number is the actual issue')
#possible_issuenumbers.append({'number': sf,
# 'position': split_file.index(sf, lastissue_position), #modfilename.find(sf)})
# 'mod_position': self.char_file_position(modfilename, sf, lastmod_position),
# 'validcountchk': validcountchk})
#used to see if the issue is an alpha-numeric (ie. 18.NOW, 50-X, etc)
lastissue_position = split_file.index(sf, lastissue_position)
lastissue_label = sf
lastissue_mod_position = file_length
else:
pass
else:
bb = len(lastissue_label) + findst
#find current sf
#logger.fdebug('bb: ' + str(bb) + '[' + modfilename[findst:bb] + ']')
cf = modfilename.find(sf, file_length)
#logger.fdebug('cf: ' + str(cf) + '[' + modfilename[cf:cf+len(sf)] + ']')
diff = bb
#logger.fdebug('diff: ' + str(bb) + '[' + modfilename[bb] + ']')
if modfilename[bb] == '.':
#logger.fdebug('decimal detected.')
logger.fdebug('[DECiMAL-DETECTION] Issue being stored for validation as : %s' % modfilename[findst:cf+len(sf)])
for x in possible_issuenumbers:
possible_issuenumbers = []
#logger.fdebug('compare: ' + str(x['position']) + ' .. ' + str(lastissue_position))
#logger.fdebug('compare: ' + str(x['position']) + ' .. ' + str(split_file.index(sf, lastissue_position)))
if int(x['position']) != int(lastissue_position) and int(x['position']) != split_file.index(sf, lastissue_position):
possible_issuenumbers.append({'number': x['number'],
'position': x['position'],
'mod_position': x['mod_position'],
'validcountchk': x['validcountchk']})
possible_issuenumbers.append({'number': modfilename[findst:cf+len(sf)],
'position': split_file.index(lastissue_label, lastissue_position),
'mod_position': findst,
'dec_position': bb,
'rem_position': split_file.index(sf),
'validcountchk': validcountchk})
else:
if ('#' in sf or sf.isdigit()) or validcountchk:
if validcountchk:
#if it's not a decimal but the digits are back-to-back, then it's something else.
possible_issuenumbers.append({'number': lastissue_label,
'position': lastissue_position,
'mod_position': lastissue_mod_position,
'validcountchk': validcountchk})
validcountchk = False
#used to see if the issue is an alpha-numeric (ie. 18.NOW, 50-X, etc)
lastissue_position = split_file.index(sf, lastissue_position)
lastissue_label = sf
lastissue_mod_position = file_length
elif '#' in sf:
logger.fdebug('Issue number found: %s' % sf)
#pound sign will almost always indicate an issue #, so just assume it's as such.
locateiss_st = modfilename.find('#')
locateiss_end = modfilename.find(' ', locateiss_st)
if locateiss_end == -1:
locateiss_end = len(modfilename)
if modfilename[locateiss_end-1] == ')':
locateiss_end = locateiss_end -1
possible_issuenumbers.append({'number': modfilename[locateiss_st:locateiss_end],
'position': split_file.index(sf), #locateiss_st})
'mod_position': self.char_file_position(modfilename, sf, lastmod_position),
'validcountchk': validcountchk})
#now we try to find the series title &/or volume lablel.
if any( [sf.lower().startswith('v'), sf.lower().startswith('vol'), volumeprior == True, 'volume' in sf.lower(), 'vol' in sf.lower(), 'part' in sf.lower()] ) and sf.lower() not in {'one','two','three','four','five','six'}:
if any([ split_file[split_file.index(sf)].isdigit(), split_file[split_file.index(sf)][3:].isdigit(), split_file[split_file.index(sf)][1:].isdigit() ]):
if all(identifier in sf for identifier in ['.', 'v']):
volume = sf.split('.')[0]
else:
volume = re.sub("[^0-9]", "", sf)
if volumeprior:
try:
volume_found['position'] = split_file.index(volumeprior_label, current_pos -1) #if this passes, then we're ok, otherwise will try exception
logger.fdebug('volume_found: %s' % volume_found['position'])
#remove volume numeric from split_file
split_file.pop(volume_found['position'])
split_file.pop(split_file.index(sf, current_pos-1))
#join the previous label to the volume numeric
#volume = str(volumeprior_label) + str(volume)
#insert the combined info back
split_file.insert(volume_found['position'], volumeprior_label + volume)
split_file.insert(volume_found['position']+1, '')
#volume_found['position'] = split_file.index(sf, current_pos)
#logger.fdebug('NEWSPLITFILE: %s' % split_file)
except:
volumeprior = False
volumeprior_label = None
sep_volume = False
continue
else:
volume_found['position'] = split_file.index(sf, current_pos)
volume_found['volume'] = volume
logger.fdebug('volume label detected as : Volume %s @ position: %s' % (volume, volume_found['position']))
volumeprior = False
volumeprior_label = None
elif all(['vol' in sf.lower(), len(sf) == 3]) or all(['vol.' in sf.lower(), len(sf) == 4]):
#if there's a space between the vol and # - adjust.
volumeprior = True
volumeprior_label = sf
sep_volume = True
logger.fdebug('volume label detected, but vol. number is not adjacent, adjusting scope to include number.')
elif 'volume' in sf.lower() or all(['part' in sf.lower(), len(sf) == 4]):
volume = re.sub("[^0-9]", "", sf)
if volume.isdigit():
volume_found['volume'] = volume
volume_found['position'] = split_file.index(sf)
else:
volumeprior = True
volumeprior_label = sf
sep_volume = True
elif any([sf == 'I', sf == 'II', sf == 'III', sf == 'IV']) and volumeprior:
volumeprior = False
volumeprior_label = None
sep_volume = False
continue
else:
#reset the sep_volume indicator here in case a false Volume detected above
sep_volume = False
#check here for numeric or negative number
if sf.isdigit() and split_file.index(sf, current_pos) == 0:
continue
if sf.isdigit():
possible_issuenumbers.append({'number': sf,
'position': split_file.index(sf, current_pos), #modfilename.find(sf)})
'mod_position': self.char_file_position(modfilename, sf, lastmod_position),
'validcountchk': validcountchk})
#used to see if the issue is an alpha-numeric (ie. 18.NOW, 50-X, etc)
lastissue_position = split_file.index(sf, current_pos)
lastissue_label = sf
lastissue_mod_position = file_length
#logger.fdebug('possible issue found: %s' % sf)
else:
try:
x = float(sf)
#validity check
if x < 0:
logger.fdebug('I have encountered a negative issue #: %s' % sf)
possible_issuenumbers.append({'number': sf,
'position': split_file.index(sf, lastissue_position), #modfilename.find(sf)})
'mod_position': self.char_file_position(modfilename, sf, lastmod_position),
'validcountchk': validcountchk})
lastissue_position = split_file.index(sf, lastissue_position)
lastissue_label = sf
lastissue_mod_position = file_length
elif x > 0:
logger.fdebug('I have encountered a decimal issue #: %s' % sf)
possible_issuenumbers.append({'number': sf,
'position': split_file.index(sf, lastissue_position), #modfilename.find(sf)})
'mod_position': self.char_file_position(modfilename, sf, lastmod_position),
'validcountchk': validcountchk})
lastissue_position = split_file.index(sf, lastissue_position)
lastissue_label = sf
lastissue_mod_position = file_length
else:
raise ValueError
except ValueError as e:
#10-20-2018 - to detect issue numbers such as #000.0000½
if lastissue_label is not None and lastissue_position == int(split_file.index(sf))-1 and sf == 'XCV':
logger.fdebug('this should be: %s%s' % (lastissue_label, sf))
pi = []
for x in possible_issuenumbers:
if (x['number'] == lastissue_label and x['position'] == lastissue_position) or (x['number'] == sf and x['position'] == split_file.index(sf, lastissue_position)):
pass
else:
pi.append({'number': x['number'],
'position': x['position'],
'mod_position': x['mod_position'],
'validcountchk': x['validcountchk']})
lastissue_label = '%s%s' % (lastissue_label, sf)
pi.append({'number': lastissue_label,
'position': lastissue_position,
'mod_position': lastmod_position,
'validcountchk': validcountchk})
if len(pi) > 0:
possible_issuenumbers = pi
elif sf.lower() == 'of' and lastissue_label is not None and lastissue_position == int(split_file.index(sf))-1:
logger.fdebug('MINI-SERIES DETECTED')
else:
if any([re.sub('[\(\)]', '', sf.lower()).strip() == 'tpb', re.sub('[\(\)]', '', sf.lower()).strip() == 'digital tpb']):
logger.fdebug('TRADE PAPERBACK DETECTED. NOT DETECTING ISSUE NUMBER - ASSUMING VOLUME')
booktype = 'TPB'
try:
if volume_found['volume'] is not None:
possible_issuenumbers.append({'number': volume_found['volume'],
'position': volume_found['position'],
'mod_position': self.char_file_position(modfilename, volume_found['volume'], lastmod_position),
'validcountchk': validcountchk})
except:
possible_issuenumbers.append({'number': '1',
'position': split_file.index(sf, lastissue_position), #modfilename.find(sf)})
'mod_position': self.char_file_position(modfilename, sf, lastmod_position),
'validcountchk': validcountchk})
elif any([sf.lower() == 'gn', sf.lower() == 'graphic novel']):
logger.fdebug('GRAPHIC NOVEL DETECTED. NOT DETECTING ISSUE NUMBER - ASSUMING VOLUME')
booktype = 'GN'
else:
if 'could not convert string to float' not in str(e):
logger.fdebug('[%s] Error detecting issue # - ignoring this result : %s' % (e, sf))
volumeprior = False
volumeprior_label = None
sep_volume = False
pass
#keep track of where in the original modfilename the positions are in order to check against it for decimal places, etc.
file_length += len(sf) + 1 #1 for space
if file_length > len(modfilename):
file_length = len(modfilename)
lastmod_position = self.char_file_position(modfilename, sf, lastmod_position)
highest_series_pos = len(split_file)
issue2year = False
issue_year = None
possible_years = []
yearmodposition = None
logger.fdebug('datecheck: %s' % datecheck)
if len(datecheck) > 0:
for dc in sorted(datecheck, key=operator.itemgetter('position'), reverse=True):
a = self.checkthedate(dc['date'])
ab = str(a)
sctd = self.checkthedate(str(dt.datetime.now().year))
logger.fdebug('sctd: %s' % sctd)
# + 1 sctd so that we can allow for issue dates that cross over into the following year when it's nearer to the end of said year.
if int(ab) > int(sctd) + 1:
logger.fdebug('year is in the future, ignoring and assuming part of series title.')
yearposition = None
yearmodposition = None
continue
else:
logger.fdebug('year verified as : %s' % issue_year)
if highest_series_pos > dc['position'] and all([dc['position'] != 0, len(datecheck) > 1]):
highest_series_pos = dc['position']
issue_year = dc['date']
yearposition = dc['position']
yearmodposition = dc['mod_position']
if len(ab) == 4:
issue_year = ab
logger.fdebug('year verified as: %s' % issue_year)
possible_years.append({'year': issue_year,
'yearposition': dc['position'],
'yearmodposition': dc['mod_position']})
else:
issue_year = ab
logger.fdebug('date verified as: %s' % issue_year)
if len(possible_years) == 1:
issueyear = possible_years[0]['year']
yearposition = possible_years[0]['yearposition']
yearmodposition = possible_years[0]['yearmodposition']
else:
if len(possible_issuenumbers) > 0:
for x in possible_years:
logger.fdebug('yearposition[%s] -- dc[position][%s]' % (yearposition, x['yearposition']))
if yearposition < x['yearposition']:
if all([len(possible_issuenumbers) == 1, possible_issuenumbers[0]['number'] == x['year'], x['yearposition'] != possible_issuenumbers[0]['position']]):
issue2year = True
highest_series_pos = x['yearposition']
yearposition = x['yearposition']
yearmodposition = x['yearmodposition']
if yearposition is not None and highest_series_pos > yearposition:
highest_series_pos = yearposition #dc['position']: highest_series_pos = dc['position']
else:
issue_year = None
yearposition = None
yearmodposition = None
logger.fdebug('No year present within title - ignoring as a variable.')
logger.fdebug('highest_series_position: %s' % highest_series_pos)
#---2019-11-30 account for scanner Glorith-HD stupid naming conventions
if len(possible_issuenumbers) == 0 and scangroup == 'Glorith-HD':
logger.fdebug('Abnormal formatting detected. Time to fix this shiet, yo.')
if any([yearposition == 0, yearposition is None]):
logger.fdebug('Too stupid of a format. Nope. Not gonna happen - just reinvent the wheel you fooker.')
else:
issposs = yearposition + 1
#logger.fdebug('split_file: %s' % split_file[issposs])
if '(' and ')' in split_file[issposs]:
new_issuenumber = split_file[issposs]
possible_issuenumbers.append({'number': re.sub('[/(/)]', '', split_file[issposs]).strip(),
'position': split_file.index(new_issuenumber, yearposition),
'mod_position': self.char_file_position(modfilename, new_issuenumber, yearmodposition),
'validcountchk': False})
#---end 2019-11-30
issue_number = None
dash_numbers = []
issue_number_position = len(split_file)
if len(possible_issuenumbers) > 0:
logger.fdebug('possible_issuenumbers: %s' % possible_issuenumbers)
if len(possible_issuenumbers) >= 1:
p = 1
if '-' not in split_file[0]:
finddash = modfilename.find('-')
if finddash != -1:
logger.fdebug('hyphen located at position: %s' % finddash)
if yearposition:
logger.fdebug('yearposition: %s' % yearposition)
else:
finddash = -1
logger.fdebug('dash is in first word, not considering for determing issue number.')
for pis in sorted(possible_issuenumbers, key=operator.itemgetter('position'), reverse=True):
a = ' '.join(split_file)
lenn = pis['mod_position'] + len(pis['number'])
if lenn == len(a) and finddash != -1:
logger.fdebug('Numeric detected as the last digit after a hyphen. Typically this is the issue number.')
if pis['position'] != yearposition:
issue_number = pis['number']
#logger.info('Issue set to: ' + str(issue_number))
issue_number_position = pis['position']
if highest_series_pos > pis['position']: highest_series_pos = pis['position']
#break
elif pis['validcountchk'] == True:
issue_number = pis['number']
issue_number_position = pis['position']
logger.fdebug('Issue verified and detected as part of a numeric count sequnce: %s' % issue_number)
if highest_series_pos > pis['position']: highest_series_pos = pis['position']
break
elif pis['mod_position'] > finddash and finddash != -1:
if yearmodposition is not None:
if finddash < yearmodposition and finddash > (yearmodposition + len(split_file[yearposition])):
logger.fdebug('issue number is positioned after a dash - probably not an issue number, but part of an issue title')
dash_numbers.append({'mod_position': pis['mod_position'],
'number': pis['number'],
'position': pis['position']})
continue
#2019-10-05 fix - if decimal-spaced filename has a series title with a hyphen will include issue # as part of series title
elif yearposition == pis['position']:
logger.fdebug('Already validated year, ignoring as possible issue number: %s' % pis['number'])
continue
#end 2019-10-05
elif yearposition == pis['position']:
logger.fdebug('Already validated year, ignoring as possible issue number: %s' % pis['number'])
continue
if p == 1:
issue_number = pis['number']
issue_number_position = pis['position']
logger.fdebug('issue number :%s' % issue_number) #(pis)
if highest_series_pos > pis['position'] and issue2year is False: highest_series_pos = pis['position']
#else:
#logger.fdebug('numeric probably belongs to series title: ' + str(pis))
p+=1
else:
issue_number = possible_issuenumbers[0]['number']
issue_number_position = possible_issuenumbers[0]['position']
if highest_series_pos > possible_issuenumbers[0]['position']: highest_series_pos = possible_issuenumbers[0]['position']
if issue_number:
issue_number = re.sub('#', '', issue_number).strip()
else:
if len(dash_numbers) > 0 and finddash !=-1 :
#there are numbers after a dash, which was incorrectly accounted for.
fin_num_position = finddash
fin_num = None
for dn in dash_numbers:
if dn['mod_position'] > finddash and dn['mod_position'] > fin_num_position:
fin_num_position = dn['mod_position']
fin_num = dn['number']
fin_pos = dn['position']
if fin_num:
logger.fdebug('Issue number re-corrected to : %s' % fin_num)
issue_number = fin_num
if highest_series_pos > fin_pos: highest_series_pos = fin_pos
#--- this is new - 2016-09-18 /account for unicode in issue number when issue number is not deteted above
logger.fdebug('issue_position: %s' % issue_number_position)
if all([issue_number_position == highest_series_pos, 'XCV' in split_file, issue_number is None]):
for x in list(wrds):
if x != '':
issue_number = re.sub('XCV', x, split_file[issue_number_position-1])
highest_series_pos -=1
issue_number_position -=1
if issue_number is None:
if any([booktype == 'TPB', booktype == 'GN']):
logger.fdebug('%s detected. Volume assumption is number: %s' % (booktype, volume_found))
else:
if issue_year is not None and issue_number is None and '2000ad' in ''.join(split_file).lower():
for x in possible_years:
try:
if split_file.index(issue_year) < x['yearposition'] and x['year'] != issue_year:
issue_year = x['year']
break
except Exception as e:
pass
issue_number = issue_year
issue_year = None
elif len(volume_found) > 0:
logger.fdebug('UNKNOWN TPB/GN detected. Volume assumption is number: %s' % (volume_found))
booktype = 'TPB'
else:
logger.fdebug('No issue number present in filename.')
else:
logger.fdebug('issue verified as : %s' % issue_number)
issue_volume = None
if len(volume_found) > 0:
issue_volume = 'v' + str(volume_found['volume'])
if all([highest_series_pos + 1 != volume_found['position'], highest_series_pos != volume_found['position'] + 1, sep_volume == False, booktype == 'issue', len(possible_issuenumbers) > 0]):
logger.fdebug('Extra item(s) are present between the volume label and the issue number. Checking..')
split_file.insert(int(issue_number_position), split_file.pop(volume_found['position'])) #highest_series_pos-1, split_file.pop(volume_found['position']))
logger.fdebug('new split: %s' % split_file)
highest_series_pos = volume_found['position'] -1
#2019-10-02 - account for volume BEFORE issue number
if issue_number_position > highest_series_pos:
issue_number_position -=1
else: