Skip to content

Commit

Permalink
Improved spam prevention (#2954)
Browse files Browse the repository at this point in the history
* Backend of spam prevention

* Add comments and small tweaks

* Add language detection

* Initial mattermost notification system

* Fix listener broken on unknown arguments

* Update requirements

* Fix potential

* Add trailing slash

Co-authored-by: maudetes <maudet.estelle@gmail.com>

* fix discussion without first message

* Fix breadcrumbs and enable Mattermost notifications

* Add admin permissions on delete spam

* Use inheritance instead of signals to detect spam

* Fix updates of models for spam detection

* Use _created

* Small tweaks

* Fix two notifications sent

* Add API endpoint to fetch all potential spam

* Try again tests

* Remove comment

* Fix test

* Add new env to settings

* Add SPAM_MINIMUM_STRING_LENGTH_FOR_LANG_CHECK conf

---------

Co-authored-by: maudetes <maudet.estelle@gmail.com>
  • Loading branch information
ThibaudDauce and maudetes committed Feb 15, 2024
1 parent cda9af2 commit d7fbea1
Show file tree
Hide file tree
Showing 15 changed files with 558 additions and 16 deletions.
1 change: 1 addition & 0 deletions requirements/install.in
Expand Up @@ -31,6 +31,7 @@ html2text==2019.9.26
Jinja2==3.1.2
jsonschema==3.2.0
kombu[redis]==4.4.0
langdetect==1.0.9
lxml==4.4.2
mistune==0.8.4
mongoengine==0.27.0
Expand Down
12 changes: 5 additions & 7 deletions requirements/install.pip
@@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with Python 3.7
# This file is autogenerated by pip-compile with Python 3.9
# by the following command:
#
# pip-compile --output-file=requirements/install.pip requirements/install.in
Expand Down Expand Up @@ -145,9 +145,7 @@ idna==2.10
# requests
# urlextract
importlib-metadata==6.0.0
# via
# flask
# jsonschema
# via flask
isodate==0.6.0
# via rdflib
itsdangerous==2.1.2
Expand All @@ -172,6 +170,8 @@ kombu[redis]==4.4.0
# via
# -r requirements/install.in
# celery
langdetect==1.0.9
# via -r requirements/install.in
lxml==4.4.2
# via -r requirements/install.in
markupsafe==2.1.2
Expand Down Expand Up @@ -219,7 +219,6 @@ python-dateutil==2.8.1
pytz==2022.7
# via
# -r requirements/install.in
# babel
# celery
# flask-babel
# flask-restx
Expand All @@ -246,6 +245,7 @@ six==1.16.0
# flask-cors
# isodate
# jsonschema
# langdetect
# python-dateutil
# wtforms-json
speaklater==1.3
Expand All @@ -256,8 +256,6 @@ text-unidecode==1.3
# via faker
tlds==2021080800
# via -r requirements/install.in
typing-extensions==4.7.1
# via importlib-metadata
ujson==1.35
# via -r requirements/install.in
unicodecsv==0.14.1
Expand Down
2 changes: 1 addition & 1 deletion udata/app.py
Expand Up @@ -207,7 +207,7 @@ def standalone(app):
def register_extensions(app):
from udata import (
models, routing, tasks, mail, i18n, auth, search, sitemap,
sentry
sentry, notifications
)
tasks.init_app(app)
i18n.init_app(app)
Expand Down
28 changes: 25 additions & 3 deletions udata/core/discussions/api.py
Expand Up @@ -6,6 +6,8 @@

from udata.auth import admin_permission
from udata.api import api, API, fields
from udata.core.spam.api import SpamAPIMixin
from udata.core.spam.fields import spam_fields
from udata.utils import id_or_404
from udata.core.user.api_fields import user_ref_fields

Expand All @@ -24,6 +26,7 @@
'posted_by': fields.Nested(user_ref_fields,
description='The message author'),
'posted_on': fields.ISODateTime(description='The message posting date'),
'spam': fields.Nested(spam_fields),
})

discussion_fields = api.model('Discussion', {
Expand All @@ -43,6 +46,7 @@
'url': fields.UrlFor('api.discussion',
description='The discussion API URI'),
'extras': fields.Raw(description='Extra attributes as key-value pairs'),
'spam': fields.Nested(spam_fields),
})

start_discussion_fields = api.model('DiscussionStart', {
Expand Down Expand Up @@ -86,6 +90,12 @@
help='The page size to fetch')


@ns.route('/<id>/spam/', endpoint='discussion_spam')
@ns.doc(delete={'id': 'unspam'})
class DiscussionSpamAPI(SpamAPIMixin):
    '''
    Spam operations on a whole discussion.

    Request handling is inherited from SpamAPIMixin; this class only
    selects the model whose documents are looked up by id.
    '''
    model = Discussion


@ns.route('/<id>/', endpoint='discussion')
class DiscussionAPI(API):
'''
Expand Down Expand Up @@ -123,9 +133,9 @@ def post(self, id):
discussion.closed = datetime.utcnow()
discussion.save()
if close:
on_discussion_closed.send(discussion, message=message_idx)
discussion.signal_close(message=message_idx)
else:
on_new_discussion_comment.send(discussion, message=message_idx)
discussion.signal_comment(message=message_idx)
return discussion

@api.secure(admin_permission)
Expand All @@ -139,6 +149,17 @@ def delete(self, id):
return '', 204


@ns.route('/<id>/comments/<int:cidx>/spam', endpoint='discussion_comment_spam')
@ns.doc(delete={'id': 'unspam'})
class DiscussionCommentSpamAPI(SpamAPIMixin):
    '''
    Spam operations on a single comment of a discussion.

    Named distinctly from DiscussionSpamAPI so that this resource does not
    rebind that class name at module level.
    '''

    def get_model(self, id, cidx):
        '''
        Return the discussion and its comment at index `cidx`.

        :param id: the discussion identifier taken from the URL
        :param cidx: the zero-based index of the comment in the discussion
        :returns: a `(discussion, comment)` pair
        :raises: a 404 when the discussion or the comment does not exist and
            a 400 when `cidx` is 0
        '''
        discussion = Discussion.objects.get_or_404(id=id_or_404(id))
        if len(discussion.discussion) <= cidx:
            api.abort(404, 'Comment does not exist')
        elif cidx == 0:
            # NOTE(review): presumably rejected because the first message is
            # checked as part of the discussion itself - confirm against
            # Discussion.texts_to_check_for_spam.
            api.abort(400, 'You cannot unspam the first comment of a discussion')
        return discussion, discussion.discussion[cidx]

@ns.route('/<id>/comments/<int:cidx>', endpoint='discussion_comment')
class DiscussionCommentAPI(API):
'''
Expand Down Expand Up @@ -196,6 +217,7 @@ def post(self):
discussion = Discussion(user=current_user.id, discussion=[message])
form.populate_obj(discussion)
discussion.save()
on_new_discussion.send(discussion)

discussion.signal_new()

return discussion, 201
35 changes: 33 additions & 2 deletions udata/core/discussions/models.py
Expand Up @@ -2,20 +2,26 @@
from datetime import datetime

from udata.models import db
from udata.core.spam.models import SpamMixin, spam_protected
from .signals import (on_new_discussion, on_discussion_closed, on_new_discussion_comment)


log = logging.getLogger(__name__)


COMMENT_SIZE_LIMIT = 50000


class Message(db.EmbeddedDocument):
class Message(SpamMixin, db.EmbeddedDocument):
    """A message embedded in a discussion, checkable for spam on its own."""
    content = db.StringField(required=True)
    posted_on = db.DateTimeField(default=datetime.utcnow, required=True)
    posted_by = db.ReferenceField('User')

    def texts_to_check_for_spam(self):
        """Return the texts to check for spam: only the message content."""
        return [self.content]


class Discussion(db.Document):
class Discussion(SpamMixin, db.Document):
user = db.ReferenceField('User')
subject = db.GenericReferenceField()
title = db.StringField(required=True)
Expand All @@ -41,8 +47,33 @@ def person_involved(self, person):
"""
return any(message.posted_by == person for message in self.discussion)

def texts_to_check_for_spam(self):
    """
    Return the texts to check for spam: the title and the content of the
    first message (an empty string when there is no message).
    """
    # Discussion should always have a first message but it's not the case in some tests…
    first_content = self.discussion[0].content if self.discussion else ''
    return [self.title, first_content]

def embeds_to_check_for_spam(self):
    """Return every message of the discussion except the first one."""
    follow_ups = self.discussion[1:]
    return follow_ups

@property
def external_url(self):
    """
    URL obtained from the subject's `url_for`, requested as external and
    anchored on `discussion-<id>`.
    """
    anchor = f'discussion-{self.id}'
    return self.subject.url_for(_anchor=anchor, _external=True)

def spam_report_title(self):
    """Return the title used when reporting this discussion as spam."""
    report_title = self.title
    return report_title

def spam_report_link(self):
    """Return the link used when reporting this discussion as spam."""
    report_link = self.external_url
    return report_link

@spam_protected()
def signal_new(self):
    """
    Send `on_new_discussion` with this discussion as sender.

    NOTE(review): `spam_protected` is defined elsewhere; presumably it
    controls whether the signal is actually sent - confirm in
    udata.core.spam.models.
    """
    on_new_discussion.send(self)

@spam_protected(lambda discussion, message: discussion.discussion[message])
def signal_close(self, message):
    """
    Send `on_discussion_closed` with this discussion as sender.

    `message` is the index of the closing message in `self.discussion`; the
    decorator is given a getter selecting that same message.
    """
    on_discussion_closed.send(self, message=message)

@spam_protected(lambda discussion, message: discussion.discussion[message])
def signal_comment(self, message):
    """
    Send `on_new_discussion_comment` with this discussion as sender.

    `message` is the index of the new comment in `self.discussion`; the
    decorator is given a getter selecting that same message.
    """
    on_new_discussion_comment.send(self, message=message)
Empty file added udata/core/spam/__init__.py
Empty file.
58 changes: 58 additions & 0 deletions udata/core/spam/api.py
@@ -0,0 +1,58 @@
from mongoengine import Q

from udata.api import api, API
from udata.auth import admin_permission
from udata.core.discussions.models import Discussion
from udata.core.spam.fields import potential_spam_fields
from udata.core.spam.models import POTENTIAL_SPAM
from udata.utils import id_or_404


class SpamAPIMixin(API):
    """
    Shared resource logic to clear the spam flag of an object.

    Subclasses either set `model` or override `get_model`.
    """
    model = None

    def get_model(self, id):
        """
        Load the object targeted by the request.

        Returns a `(base_model, spamable)` pair: the base model is the
        document stored inside Mongo and the spamable model is the object
        carrying the spam status, which can be an embedded document (for
        example a comment inside a discussion). Here both are the same
        document.
        """
        document = self.model.objects.get_or_404(id=id_or_404(id))
        return document, document

    @api.secure(admin_permission)
    def delete(self, **kwargs):
        """
        Mark a potential spam as no spam
        """
        base_model, spamable = self.get_model(**kwargs)

        # An object that is not flagged needs no change; the response is the
        # same either way.
        if spamable.is_spam():
            spamable.mark_as_no_spam(base_model)

        return {}, 200


# Namespace under which the spam listing resource below is routed.
ns = api.namespace('spam', 'Spam related operations')


@ns.route('/', endpoint='spam')
class SpamAPI(API):
    """
    Listing of the objects currently flagged as potential spam.
    """
    @api.doc('get_potential_spams')
    @api.secure(admin_permission)
    @api.marshal_with(potential_spam_fields)
    def get(self):
        """Get all potential spam objects"""
        # A discussion is listed when it is flagged itself or when one of
        # its messages is.
        flagged_itself = Q(spam__status=POTENTIAL_SPAM)
        flagged_message = Q(discussion__spam__status=POTENTIAL_SPAM)

        reports = []
        for discussion in Discussion.objects(flagged_itself | flagged_message):
            reports.append({
                'title': discussion.spam_report_title(),
                'link': discussion.spam_report_link(),
            })
        return reports
12 changes: 12 additions & 0 deletions udata/core/spam/fields.py
@@ -0,0 +1,12 @@
from udata.api import api, fields
from .models import SPAM_STATUS_CHOICES

# Serialized spam state of an object: a read-only status restricted to
# SPAM_STATUS_CHOICES.
spam_fields = api.model('Spam', {
    'status': fields.String(
        description='Status',
        enum=SPAM_STATUS_CHOICES,
        readonly=True,
    ),
})

# Serialized entry of the potential spam listing: a title and a link, both
# read-only.
potential_spam_fields = api.model('PotentialSpam', {
    'title': fields.String(readonly=True),
    'link': fields.String(readonly=True),
})

0 comments on commit d7fbea1

Please sign in to comment.