Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for regex video title filtering #425

Merged
merged 11 commits into from
Nov 20, 2023
1 change: 1 addition & 0 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ env:
IMAGE_NAME: tubesync

on:
workflow_dispatch:
locke4 marked this conversation as resolved.
Show resolved Hide resolved
push:
branches:
- main
Expand Down
2 changes: 1 addition & 1 deletion tubesync/common/templates/pagination.html
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
<div class="col s12">
<div class="pagination">
{% for i in paginator.page_range %}
<a class="pagenum{% if i == page_obj.number %} currentpage{% endif %}" href="?{% if filter %}filter={{ filter }}&{% endif %}page={{ i }}{% if show_skipped %}&show_skipped=yes{% endif %}">{{ i }}</a>
<a class="pagenum{% if i == page_obj.number %} currentpage{% endif %}" href="?{% if filter %}filter={{ filter }}&{% endif %}page={{ i }}{% if show_skipped %}&show_skipped=yes{% endif %}{% if only_skipped %}&only_skipped=yes{% endif %}">{{ i }}</a>
{% endfor %}
</div>
</div>
Expand Down
1 change: 1 addition & 0 deletions tubesync/sync/migrations/0001_initial.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ class Migration(migrations.Migration):
('source_type', models.CharField(choices=[('c', 'YouTube channel'), ('p', 'YouTube playlist')], db_index=True, default='c', help_text='Source type', max_length=1, verbose_name='source type')),
('key', models.CharField(db_index=True, help_text='Source key, such as exact YouTube channel name or playlist ID', max_length=100, unique=True, verbose_name='key')),
('name', models.CharField(db_index=True, help_text='Friendly name for the source, used locally in TubeSync only', max_length=100, unique=True, verbose_name='name')),
('filter_text', models.CharField(db_index=True, help_text='Regex compatible filter string for video titles', max_length=100, verbose_name='filter text')),
locke4 marked this conversation as resolved.
Show resolved Hide resolved
('directory', models.CharField(db_index=True, help_text='Directory name to save the media into', max_length=100, unique=True, verbose_name='directory')),
('index_schedule', models.IntegerField(choices=[(3600, 'Every hour'), (7200, 'Every 2 hours'), (10800, 'Every 3 hours'), (14400, 'Every 4 hours'), (18000, 'Every 5 hours'), (21600, 'Every 6 hours'), (43200, 'Every 12 hours'), (86400, 'Every 24 hours')], db_index=True, default=21600, help_text='Schedule of how often to index the source for new media', verbose_name='index schedule')),
('delete_old_media', models.BooleanField(default=False, help_text='Delete old media after "days to keep" days?', verbose_name='delete old media')),
Expand Down
7 changes: 7 additions & 0 deletions tubesync/sync/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,13 @@ class IndexSchedule(models.IntegerChoices):
help_text=_('If "delete old media" is ticked, the number of days after which '
'to automatically delete media')
)
filter_text = models.CharField(
_('filter string'),
max_length=100,
default='.*',
locke4 marked this conversation as resolved.
Show resolved Hide resolved
blank=True,
help_text=_('Regex compatible filter string for video titles')
)
delete_removed_media = models.BooleanField(
_('delete removed media'),
default=False,
Expand Down
76 changes: 49 additions & 27 deletions tubesync/sync/signals.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import re
from django.conf import settings
from django.db.models.signals import pre_save, post_save, pre_delete, post_delete
from django.dispatch import receiver
Expand Down Expand Up @@ -104,36 +105,57 @@ def media_post_save(sender, instance, created, **kwargs):
# already been downloaded
if not instance.downloaded:
max_cap_age = instance.source.download_cap_date
published = instance.published
if not published:
if not instance.skip:
log.warn(f'Media: {instance.source} / {instance} has no published date '
f'set, marking to be skipped')
filter_text = instance.source.filter_text
published = instance.published

if instance.skip:
locke4 marked this conversation as resolved.
Show resolved Hide resolved
#currently marked to be skipped, check if skip conditions still apply
if not published:
log.debug(f'Media: {instance.source} / {instance} has no published date '
f'set but is already marked to be skipped')
else:
if max_cap_age and filter_text:
if (published > max_cap_age) and (re.search(filter_text,instance.title)):
# Media was published after the cap date but is set to be skipped
print('Has a valid publishing date and matches filter, marking unskipped')
instance.skip = False
cap_changed = True
else:
print('does not have a valid publishing date or filter string, already marked skipped')
log.info(f'Media: {instance.source} / {instance} has no published date '
f'set but is already marked to be skipped')
elif max_cap_age:
if published > max_cap_age:
# Media was published after the cap date but is set to be skipped
log.info(f'Media: {instance.source} / {instance} has a valid '
f'publishing date, marking to be unskipped')
instance.skip = False
cap_changed = True
elif filter_text:
if re.search(filter_text,instance.title):
# Media title matches the filter text but is set to be skipped
locke4 marked this conversation as resolved.
Show resolved Hide resolved
log.info(f'Media: {instance.source} / {instance} matches the filter text, marking to be unskipped')
instance.skip = False
cap_changed = True
else:
if not published:
log.info(f'Media: {instance.source} / {instance} has no published date, marking to be skipped')
instance.skip = True
cap_changed = True
else:
log.debug(f'Media: {instance.source} / {instance} has no published date '
f'set but is already marked to be skipped')
else:
if max_cap_age:
if published > max_cap_age and instance.skip:
# Media was published after the cap date but is set to be skipped
log.info(f'Media: {instance.source} / {instance} has a valid '
f'publishing date, marking to be unskipped')
instance.skip = False
cap_changed = True
elif published <= max_cap_age and not instance.skip:
log.info(f'Media: {instance.source} / {instance} is too old for '
f'the download cap date, marking to be skipped')
instance.skip = True
cap_changed = True
else:
if instance.skip:
# Media marked to be skipped but source download cap removed
log.info(f'Media: {instance.source} / {instance} has a valid '
f'publishing date, marking to be unskipped')
instance.skip = False
cap_changed = True
if max_cap_age:
if published <= max_cap_age:
log.info(f'Media: {instance.source} / {instance} is too old for '
f'the download cap date, marking to be skipped')
instance.skip = True
cap_changed = True
if filter_text:
if not re.search(filter_text,instance.title):
#media doesn't match the filter text but is not marked to be skipped
log.info(f'Media: {instance.source} / {instance} does not match the filter text')
instance.skip = True
cap_changed = True

# Recalculate the "can_download" flag, this may
# need to change if the source specifications have been changed
if instance.metadata:
Expand Down
6 changes: 6 additions & 0 deletions tubesync/sync/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import json
import math
import uuid
import re
from io import BytesIO
from hashlib import sha1
from datetime import timedelta, datetime
Expand Down Expand Up @@ -254,6 +255,11 @@ def download_media_metadata(media_id):
log.warn(f'Media: {source} / {media} is older than cap age '
f'{max_cap_age}, skipping')
media.skip = True
# If the source has a search filter, check the video title matches the filter
if not re.search(source.filter_text,media.title):
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this be an opt-in filter (only download media that match this regex) or an opt-out filter (skip media that match this regex)?

Copy link
Contributor Author

@locke4 locke4 Oct 23, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought opt-in was more intuitive for a user. Currently the filter_text is specified, and then anything that does not match the filter is marked as a "skip". I'm happy to take your suggestion on this, though: once I rewrite this to use an is_regex_match(media_item_title) method, it'll be easy to change later. Regex certainly supports inverse matching, though it's clunky. The other way would be to specify both an "include" and an "exclude" string, i.e. include "foo" but exclude "bar", but that feels unnecessarily complex. On my local fork I'm already using a regex to download all videos which contain "40k" but don't contain "Darktide".

foo -> match
bar -> not matched
foo bar -> not matched

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would imagine the most common use-case for this is going to be "I want to ignore specific videos on a channel". While you can do this with positive matching regexes having to use lots of (?!string$)'s everywhere isn't going to be that friendly.

Personally, I think this might be better as an exclude_regex field. What do you think?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure I follow. Negative lookaheads give the behaviour you describe. Inverting it is no work, though I want to check that re supports negative lookaheads, as I might need to swap it for re2. My primary use case is to do both (include the word "foo" but not include the word "bar"), as above.

Regex string: ^(?!.*foo).*$

foo
bar <<< Match
foo bar

Regex string: foo

foo <<< Match
bar
foo bar <<< Match

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not add an option to specify what the filter does? Include/Exclude.

For various use cases having an allow-list vs an ignore-list might be very powerful.

Imagine a channel with thousands of videos, and perhaps a recurring title that appears throughout the channel, like "office update 15". It would be far easier to just set an include list of ^office update \d+$, imo.

Or perhaps a channel that is otherwise about some topic, but has music videos strewn about it. You might set an exclude list of \(music\svideo\).

Writing regex is hard for some users and negations are tricky to reason about. Avoiding the need for them would be advantageous imo. Requiring one extra switch isn't a lot of work.

Copy link
Contributor Author

@locke4 locke4 Oct 24, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like this idea. Essentially just a dropdown or slider toggle "include/exclude", so it's clear what behaviour is currently selected (tickboxes would be a bit ambiguous). I'll have a look later today or tomorrow and do a new branch to implement this.

# Filter text not found in the media title. Accepts a regex string; a blank filter matches every title, so this condition is False
log.warn(f'Media: {source} / {media} does not contain {source.filter_text}, skipping')
locke4 marked this conversation as resolved.
Show resolved Hide resolved
media.skip = True
# If the source has a cut-off check the upload date is within the allowed delta
if source.delete_old_media and source.days_to_keep > 0:
if not isinstance(media.published, datetime):
Expand Down
2 changes: 1 addition & 1 deletion tubesync/sync/templates/sync/media.html
Original file line number Diff line number Diff line change
Expand Up @@ -64,5 +64,5 @@ <h1 class="truncate">Media</h1>
</div>
{% endfor %}
</div>
{% include 'pagination.html' with pagination=sources.paginator filter=source.pk show_skipped=show_skipped %}
{% include 'pagination.html' with pagination=sources.paginator filter=source.pk show_skipped=show_skipped only_skipped=only_skipped%}
{% endblock %}
4 changes: 4 additions & 0 deletions tubesync/sync/templates/sync/source.html
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,10 @@ <h1 class="truncate">Source <strong>{{ source.name }}</strong></h1>
<td class="hide-on-small-only">Directory</td>
<td><span class="hide-on-med-and-up">Directory<br></span><strong>{{ source.directory }}</strong></td>
</tr>
<tr title="Filter text">
<td class="hide-on-small-only">Filter text</td>
<td><span class="hide-on-med-and-up">Filter text<br></span><strong>{{ source.filter_text }}</strong></td>
</tr>
<tr title="Media file name format to use for saving files">
<td class="hide-on-small-only">Media format</td>
<td><span class="hide-on-med-and-up">Media format<br></span><strong>{{ source.media_format }}</strong></td>
Expand Down
3 changes: 3 additions & 0 deletions tubesync/sync/tests.py
locke4 marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,7 @@ def test_source(self):
'directory': 'testdirectory',
'media_format': settings.MEDIA_FORMATSTR_DEFAULT,
'download_cap': 0,
'filter_text':'.*',
'index_schedule': 3600,
'delete_old_media': False,
'days_to_keep': 14,
Expand Down Expand Up @@ -217,6 +218,7 @@ def test_source(self):
'directory': 'testdirectory',
'media_format': settings.MEDIA_FORMATSTR_DEFAULT,
'download_cap': 0,
'filter_text':'.*',
'index_schedule': Source.IndexSchedule.EVERY_HOUR,
'delete_old_media': False,
'days_to_keep': 14,
Expand Down Expand Up @@ -247,6 +249,7 @@ def test_source(self):
'directory': 'testdirectory',
'media_format': settings.MEDIA_FORMATSTR_DEFAULT,
'download_cap': 0,
'filter_text':'.*',
'index_schedule': Source.IndexSchedule.EVERY_2_HOURS, # changed
'delete_old_media': False,
'days_to_keep': 14,
Expand Down
2 changes: 1 addition & 1 deletion tubesync/sync/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -294,7 +294,7 @@ def get_success_url(self):

class EditSourceMixin:
model = Source
fields = ('source_type', 'key', 'name', 'directory', 'media_format',
fields = ('source_type', 'key', 'name', 'directory', 'filter_text', 'media_format',
'index_schedule', 'download_media', 'download_cap', 'delete_old_media',
'delete_removed_media', 'days_to_keep', 'source_resolution', 'source_vcodec',
'source_acodec', 'prefer_60fps', 'prefer_hdr', 'fallback', 'copy_thumbnails',
Expand Down