-
Notifications
You must be signed in to change notification settings - Fork 5
/
inaseg.py
314 lines (302 loc) · 12.9 KB
/
inaseg.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
# Load the API
from inaSpeechSegmenter import Segmenter # noqa: E402
import os
import glob
from threading import Thread
import gc
import logging
import tempfile
from noxsegutils.shazam import shazaming
from noxsegutils.util import get_segment_process_length_array, ffmpeg
from noxsegutils.timestamp import fix_missing_stamps, sec2timestamp
from noxsegutils.download import ytbdl
from noxsegutils.extractor import load_config, save_config
# 媒体流最大时长处理(秒);1G内存的进程推荐用10分钟/600秒,16G可以支持5小时,6GB VRAM可以支持5小时左右。
SEGMENT_THRES = 800
# 识歌分段最小阈值(秒),调大了会漏 调小了会多杂谈
EXTRACT_SEG_THRES = 60
# 最终识歌分段最小阈值(秒),调大了漏TV size 调小了多杂谈
EXTRACT_SEG_THRES_FINAL = 80
# 识歌分段连接的阈值(秒),调大了会两首歌分不开 调小了会碎
EXTRACT_SEG_CONNECT = 5
# 大了会碎 小了会两首歌分不开
ENERGY_RATIO = 0.03
# 8GB VRAM 推荐 256
BATCH_SIZE = 32
SAVE_YAML_PATH = os.path.join(
os.path.dirname(
os.path.abspath(__file__)),
'save.yaml')
class TimestampMismatch(Exception):
pass
# select a media to analyse
# any media supported by ffmpeg may be used (video, audio, urls)
def segment(
media: str, batch_size: int = BATCH_SIZE, energy_ratio: float = ENERGY_RATIO,
start_sec: int = None, stop_sec: int = None):
segmenter = Segmenter(
vad_engine='sm',
detect_gender=False,
energy_ratio=energy_ratio,
batch_size=batch_size)
segmentation = segmenter(media, start_sec=start_sec, stop_sec=stop_sec)
return segmentation
def segment_wrapper(
media: str, batch_size: int = BATCH_SIZE,
energy_ratio: float = ENERGY_RATIO, segment_length_thres: int = 0):
''''''
result = []
for i in get_segment_process_length_array(media, segment_length_thres):
logging.info([
'segmenting', media, 'from',
sec2timestamp(i[0]), 'to', sec2timestamp(i[1])])
result += segment(
media, batch_size, energy_ratio,
start_sec=i[0], stop_sec=i[1])
gc.collect()
tf.keras.backend.clear_session()
return result
def extract_music(
segmentation, segment_thres = EXTRACT_SEG_THRES,
segment_thres_final = EXTRACT_SEG_THRES_FINAL,
segment_connect = EXTRACT_SEG_CONNECT, start_padding = 1, end_padding = 4):
r = []
#bridges noEnergy segments that are likely fragmented
for i in range(len(segmentation)-2, 0, -1):
if segmentation[i][0] == 'noEnergy' and \
segmentation[i][2] - segmentation[i][1] < 4 and \
segmentation[i-1][0] == segmentation[i+1][0]:
segmentation[i-1] = (
segmentation[i-1][0],
segmentation[i-1][1],
segmentation[i+1][2])
for i in segmentation:
if i[0] == 'music' and i[2]-i[1] > segment_thres:
r.append(['',i[1] - start_padding, i[2] + end_padding])
for i in range(len(r)-1, 0, -1):
if r[i][1] - r[i-1][2] < segment_connect:
r[i-1][2] = r[i][2]
r[i][1] = r[i][2] + 1
rf = []
for i in r:
if i[1] < 5:
continue
if i[2]-i[1] > segment_thres_final:
rf.append(i)
return [['{}:{}:{}'.format(str(int(x[1]//3600)),
str(int(x[1] % 3600 //60)),
str(int(x[1] % 60)).zfill(2)),
'{}:{}:{}'.format(str(int(x[2]//3600)),
str(int(x[2] % 3600 //60)),
str(int(x[2] % 60)).zfill(2))] for x in rf]
def extract_mah_stuff(
media, segmented_stamps, outdir = None, rev = False,
delimited = '/', timestamps = [], soundonly = True):
nameswitch = False
timestamps_ext = segmented_stamps
try:
if len(timestamps) > 0:
raise FileNotFoundError()
for i in open(r"D:\tmp\ytd\timstamp.ini", 'r', encoding='UTF-8'):
for k in [
['\n',''],['」', ''],['~「',' '],['「', ' '],
['『', ' '],['』', ' '],[' ', ' '],[' ', ' '],['\'',''],]:
i = i.replace(k[0], k[1])
if ' (' in i:
i = i[:i.find(' (')]
if ':' in i:
timestamps.append([i[:i.find(' ')], i[i.find(' ')+1:]])
#timestamps[-1][1] = timestamps[-1][1][1:]
if delimited in timestamps[-1][1]:
timestamps[-1][1] = [
timestamps[-1][1][:timestamps[-1][1].find('/')],
timestamps[-1][1][timestamps[-1][1].find('/')+1:]]
if rev:
timestamps[-1][1].reverse()
timestamps[-1][1] = ' by '.join(timestamps[-1][1])
#timestamps[-1][1].replace('/', ' by ')
#timestamps[-1][1] = ' by '.join(timestamps[-1][1].split('/'))
#timestamps[-1][1] = timestamps[-1][1].replace('-', 'by')
while timestamps[-1][1][0] == ' ':
timestamps[-1][1] = timestamps[-1][1][1:]
while timestamps[-1][1][-1] == ' ':
timestamps[-1][1] = timestamps[-1][1][:-1]
# nameswitch = True
elif nameswitch:
nameswitch = False
timestamps[-1].append(i)
if len(timestamps) > 0:# and len(timestamps) != len(timestamps_ext):
logging.info('checking timestamp correspondence and removing\
mismatched ones (come on, are you really gonna do this manually)')
timestamps = fix_missing_stamps(timestamps, timestamps_ext)
timestamps_ext = fix_missing_stamps(timestamps_ext, timestamps)
if len(timestamps) != len(timestamps_ext):
raise TimestampMismatch(
'check timestamp assist', timestamps, timestamps_ext)
with open(r"D:\tmp\ytd\timstamp.ini", 'w') as f: # noqa: F841
pass
except FileNotFoundError:
pass
if len(timestamps) > 0:
logging.info([
'timestamp assist',
[[timestamps[x][0],
timestamps_ext[x][1], timestamps[x][1], ] for x in range(len(timestamps))]])
else:
logging.info([
'extracted timestamps',
['{} - {}'.format(x[0], x[1]) for x in timestamps_ext]])
try:
for count, x in enumerate(timestamps_ext):
logging.info(f'{str(count).zfill(2)}: {x[0]} - {x[1]}')
except Exception:
pass
save = load_config(SAVE_YAML_PATH)
save[os.path.basename(media)] = timestamps_ext
save_config(SAVE_YAML_PATH, save)
nameswitch = False
file = media
filename = file[:file.rfind('.')]
fileext = file[len(filename):]
filename = os.path.basename(filename)
cmds = []
for i in range(len(timestamps_ext)):
oud = outdir if outdir else os.path.dirname(file)
encoding = ['-c:v','copy','-c:a', 'copy']#'-c:v copy -c:a copy'
if soundonly:
encoding = ['-vn','-ab','320k']#'-vn -ab 320k'
fileext = '.mp3'
try:
prefix = timestamps[i][1].zfill(2)
cmds.append([
'ffmpeg',
'-ss',
timestamps[i][0],
'-to',
timestamps_ext[i][1],
'-i',
file,
'-reset_timestamps', '1',
] + encoding + [
os.path.join(
oud, filename + f'_{str(i).zfill(2)}_{prefix}' + fileext),
] + encoding)
except Exception:
prefix = str(i).zfill(2)
cmds.append([
'ffmpeg',
'-ss',
timestamps_ext[i][0],
'-to',
timestamps_ext[i][1],
'-i',
"{}".format(file),
] + encoding + [
"{}".format(os.path.join(oud, filename + '_' + prefix + fileext)),
])
k = [Thread(target=ffmpeg, args=(x,)) for x in cmds]
for i in k:
i.start()
i.join()
#k[-1].join()
if __name__ == '__main__':
import argparse
parser = argparse.ArgumentParser(description='ina music segment')
parser.add_argument('--media', type=str, help = 'file path or weblink')
parser.add_argument(
'--outdir', type=str, default = tempfile.gettempdir(),
help = 'directory media will be downloaded into (if url) and extracted into;\
if docker, use as the mounted folder! and make sure -u is you!')
parser.add_argument(
'--shazam', action='store_true', default = False,
help = 'shazam the extracted files')
parser.add_argument(
'--shazam_coverart', type=str, default = '',
help = 'save shazam cover art too')
parser.add_argument(
'--aria', type=int, default = None,
help = 'use aria, specifying the number of threads')
parser.add_argument(
'--shazam_multithread', type=int, default = 1,
help = 'use multithreaded shazam')
parser.add_argument(
'--soundonly', type=str, default = r'-f bestaudio',
help = 'specify an empty string to extract both video and audio')
parser.add_argument(
'--seg_connect', type=int, default = 5,
help = 'max seconds for 2 music segments to be considered \
the same (takes care of random miss-detection/small rap segments)')
parser.add_argument(
'--cleanup', action='store_true', default = False,
help = 'delete the original media file once completed.')
parser.add_argument(
'--max_segment_length',
type=int,
default=SEGMENT_THRES,
help='for computers with limited RAM (eg a 5 hrs stream \
requires ~6GB VRAM), set this to process streams in \
this speficied segments to avoid ram overflow. in seconds.')
logging.basicConfig(
level=logging.DEBUG,
format='%(asctime)s %(levelname)-8s %(message)s',
handlers=[
logging.FileHandler('./inaseg_inst.log'),
logging.StreamHandler()
])
args = parser.parse_args()
#media = ytbdl('')
if args.media is not None:
media = args.media
else:
raise Exception('no media')
if 'https:' in media:
media = ytbdl(
media, soundonly = args.soundonly,
aria = args.aria, outdir = args.outdir)
#media = r"D:\tmp\ytd\[莉犬くん【すとぷり】] 【激レア歌枠】とんでもないお知らせがあります。。。【莉犬】 20220607.webm"
if len(glob.glob(os.path.join(
args.outdir, f'*{os.path.splitext(os.path.basename(media))[0][1:]}_*'))) == 0:
import tensorflow as tf
gpus = tf.config.experimental.list_physical_devices('GPU')
logging.info(gpus)
tf.get_logger().setLevel(logging.WARNING)
if False and gpus:
# Restrict TensorFlow to only allocate 1GB of memory on the first GPU: noope.
try:
tf.config.experimental.set_virtual_device_configuration(
gpus[0],
[tf.config.experimental.VirtualDeviceConfiguration(memory_limit=5424)])
logical_gpus = tf.config.experimental.list_logical_devices('GPU')
logging.info([len(gpus), "Physical GPUs,",
len(logical_gpus), "Logical GPUs"])
os.environ["TF_GPU_ALLOCATOR"]="cuda_malloc_async"
except RuntimeError as e:
# Virtual devices must be set before GPUs have been initialized
logging.error(e)
raise
try:
timestamps = []
#timestamps = mus1ca_timestamp(media[:media.rfind('.')] + '.description')
saved_timestamp = extract_music(segment_wrapper(
media, segment_length_thres=args.max_segment_length),
segment_connect = args.seg_connect)
extract_mah_stuff(
media, segmented_stamps=saved_timestamp,
outdir=args.outdir, rev=False,
timestamps=timestamps,
soundonly=(args.soundonly != ''))
saved_timestamp = None
except TimestampMismatch:
raise
else:
logging.warning((
'segmentation', media, 'stopped to prevent posssible duplication'))
logging.info(['segmentation', media, 'successful'])
if args.cleanup and os.path.isfile(media):
os.remove(media)
if args.shazam:
shazaming(
args.outdir, media, args.shazam_coverart,
threads = args.shazam_multithread)
import sys
sys.exit(0)