Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix ・ detection in hashtag regex to construct hashtag correctly #22888

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
22 changes: 17 additions & 5 deletions app/models/tag.rb
Expand Up @@ -26,8 +26,12 @@ class Tag < ApplicationRecord
has_many :featured_tags, dependent: :destroy, inverse_of: :tag
has_many :followers, through: :passive_relationships, source: :account

HASHTAG_SEPARATORS = "_\u00B7\u200c"
HASHTAG_NAME_PAT = "([[:word:]_][[:word:]#{HASHTAG_SEPARATORS}]*[[:alpha:]#{HASHTAG_SEPARATORS}][[:word:]#{HASHTAG_SEPARATORS}]*[[:word:]_])|([[:word:]_]*[[:alpha:]][[:word:]_]*)"
HASHTAG_SEPARATORS = "_\u00B7\u30FB\u200c"
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added the unicode for the character in separators to fix the issue.

HASHTAG_FIRST_SEQUENCE_CHUNK_ONE = "[[:word:]_][[:word:]#{HASHTAG_SEPARATORS}]*[[:alpha:]#{HASHTAG_SEPARATORS}]"
HASHTAG_FIRST_SEQUENCE_CHUNK_TWO = "[[:word:]#{HASHTAG_SEPARATORS}]*[[:word:]_]"
HASHTAG_FIRST_SEQUENCE = "(#{HASHTAG_FIRST_SEQUENCE_CHUNK_ONE}#{HASHTAG_FIRST_SEQUENCE_CHUNK_TWO})"
HASTAG_LAST_SEQUENCE = '([[:word:]_]*[[:alpha:]][[:word:]_]*)'
HASHTAG_NAME_PAT = "#{HASHTAG_FIRST_SEQUENCE}|#{HASTAG_LAST_SEQUENCE}"

HASHTAG_RE = /(?:^|[^\/\)\w])#(#{HASHTAG_NAME_PAT})/i
HASHTAG_NAME_RE = /\A(#{HASHTAG_NAME_PAT})\z/i
Expand All @@ -45,7 +49,11 @@ class Tag < ApplicationRecord
scope :listable, -> { where(listable: [true, nil]) }
scope :trendable, -> { Setting.trendable_by_default ? where(trendable: [true, nil]) : where(trendable: true) }
scope :not_trendable, -> { where(trendable: false) }
scope :recently_used, ->(account) { joins(:statuses).where(statuses: { id: account.statuses.select(:id).limit(1000) }).group(:id).order(Arel.sql('count(*) desc')) }
scope :recently_used, ->(account) {
joins(:statuses)
.where(statuses: { id: account.statuses.select(:id).limit(1000) })
.group(:id).order(Arel.sql('count(*) desc'))
}
scope :matches_name, ->(term) { where(arel_table[:name].lower.matches(arel_table.lower("#{sanitize_sql_like(Tag.normalize(term))}%"), nil, true)) } # Search with case-sensitive to use B-tree index

update_index('tags', :self)
Expand Down Expand Up @@ -105,7 +113,8 @@ def find_or_create_by_names(name_or_names)
names = Array(name_or_names).map { |str| [normalize(str), str] }.uniq(&:first)

names.map do |(normalized_name, display_name)|
tag = matching_name(normalized_name).first || create(name: normalized_name, display_name: display_name.gsub(HASHTAG_INVALID_CHARS_RE, ''))
tag = matching_name(normalized_name).first || create(name: normalized_name,
display_name: display_name.gsub(HASHTAG_INVALID_CHARS_RE, ''))

yield tag if block_given?

Expand Down Expand Up @@ -154,6 +163,9 @@ def validate_name_change
end

def validate_display_name_change
errors.add(:display_name, I18n.t('tags.does_not_match_previous_name')) unless HashtagNormalizer.new.normalize(display_name).casecmp(name.mb_chars).zero?
unless HashtagNormalizer.new.normalize(display_name).casecmp(name.mb_chars).zero?
errors.add(:display_name,
I18n.t('tags.does_not_match_previous_name'))
end
end
end
57 changes: 32 additions & 25 deletions spec/models/tag_spec.rb
@@ -1,21 +1,22 @@
# frozen_string_literal: true
require 'rails_helper'

RSpec.describe Tag, type: :model do
RSpec.describe Tag do
describe 'validations' do
it 'invalid with #' do
expect(Tag.new(name: '#hello_world')).to_not be_valid
expect(described_class.new(name: '#hello_world')).not_to be_valid
end

it 'invalid with .' do
expect(Tag.new(name: '.abcdef123')).to_not be_valid
expect(described_class.new(name: '.abcdef123')).not_to be_valid
end

it 'invalid with spaces' do
expect(Tag.new(name: 'hello world')).to_not be_valid
expect(described_class.new(name: 'hello world')).not_to be_valid
end

it 'valid with aesthetic' do
expect(Tag.new(name: 'aesthetic')).to be_valid
expect(described_class.new(name: 'aesthetic')).to be_valid
end
end

Expand Down Expand Up @@ -62,6 +63,10 @@
expect(subject.match('hello #one·two·three').to_s).to eq ' #one·two·three'
end

it 'matches ・unicode in ぼっち・ざ・ろっく correctly' do
expect(subject.match('testing #ぼっち・ざ・ろっく').to_s).to eq ' #ぼっち・ざ・ろっく'
end

it 'matches ZWNJ' do
expect(subject.match('just add #نرم‌افزار and').to_s).to eq ' #نرم‌افزار'
end
Expand Down Expand Up @@ -89,44 +94,46 @@
describe '.find_normalized' do
it 'returns tag for a multibyte case-insensitive name' do
upcase_string = 'abcABCabcABCやゆよ'
downcase_string = 'abcabcabcabcやゆよ';
downcase_string = 'abcabcabcabcやゆよ'

tag = Fabricate(:tag, name: HashtagNormalizer.new.normalize(downcase_string))
expect(Tag.find_normalized(upcase_string)).to eq tag
expect(described_class.find_normalized(upcase_string)).to eq tag
end
end

describe '.matches_name' do
it 'returns tags for multibyte case-insensitive names' do
upcase_string = 'abcABCabcABCやゆよ'
downcase_string = 'abcabcabcabcやゆよ';
downcase_string = 'abcabcabcabcやゆよ'

tag = Fabricate(:tag, name: HashtagNormalizer.new.normalize(downcase_string))
expect(Tag.matches_name(upcase_string)).to eq [tag]
expect(described_class.matches_name(upcase_string)).to eq [tag]
end

it 'uses the LIKE operator' do
expect(Tag.matches_name('100%abc').to_sql).to eq %q[SELECT "tags".* FROM "tags" WHERE LOWER("tags"."name") LIKE LOWER('100abc%')]
result = %q[SELECT "tags".* FROM "tags" WHERE LOWER("tags"."name") LIKE LOWER('100abc%')]
expect(described_class.matches_name('100%abc').to_sql).to eq result
end
end

describe '.matching_name' do
it 'returns tags for multibyte case-insensitive names' do
upcase_string = 'abcABCabcABCやゆよ'
downcase_string = 'abcabcabcabcやゆよ';
downcase_string = 'abcabcabcabcやゆよ'

tag = Fabricate(:tag, name: HashtagNormalizer.new.normalize(downcase_string))
expect(Tag.matching_name(upcase_string)).to eq [tag]
expect(described_class.matching_name(upcase_string)).to eq [tag]
end
end

describe '.find_or_create_by_names' do
let(:upcase_string) { 'abcABCabcABCやゆよ' }
let(:downcase_string) { 'abcabcabcabcやゆよ' }

it 'runs a passed block once per tag regardless of duplicates' do
upcase_string = 'abcABCabcABCやゆよ'
downcase_string = 'abcabcabcabcやゆよ';
count = 0
count = 0

Tag.find_or_create_by_names([upcase_string, downcase_string]) do |tag|
described_class.find_or_create_by_names([upcase_string, downcase_string]) do |_tag|
count += 1
end

Expand All @@ -136,28 +143,28 @@

describe '.search_for' do
it 'finds tag records with matching names' do
tag = Fabricate(:tag, name: "match")
_miss_tag = Fabricate(:tag, name: "miss")
tag = Fabricate(:tag, name: 'match')
_miss_tag = Fabricate(:tag, name: 'miss')

results = Tag.search_for("match")
results = described_class.search_for('match')

expect(results).to eq [tag]
end

it 'finds tag records in case insensitive' do
tag = Fabricate(:tag, name: "MATCH")
_miss_tag = Fabricate(:tag, name: "miss")
tag = Fabricate(:tag, name: 'MATCH')
_miss_tag = Fabricate(:tag, name: 'miss')

results = Tag.search_for("match")
results = described_class.search_for('match')

expect(results).to eq [tag]
end

it 'finds the exact matching tag as the first item' do
similar_tag = Fabricate(:tag, name: "matchlater", reviewed_at: Time.now.utc)
tag = Fabricate(:tag, name: "match", reviewed_at: Time.now.utc)
similar_tag = Fabricate(:tag, name: 'matchlater', reviewed_at: Time.now.utc)
tag = Fabricate(:tag, name: 'match', reviewed_at: Time.now.utc)

results = Tag.search_for("match")
results = described_class.search_for('match')

expect(results).to eq [tag, similar_tag]
end
Expand Down