Skip to content

Commit

Permalink
Detect links and nostr links in notes (#39)
Browse files Browse the repository at this point in the history
Links will not be broken up as to avoid breaking them. Nostr links will
be detected and convereted to njump links.
  • Loading branch information
boreq committed Nov 13, 2023
1 parent c498ebc commit 91426e2
Show file tree
Hide file tree
Showing 8 changed files with 662 additions and 30 deletions.
2 changes: 2 additions & 0 deletions cmd/crossposting-service/di/wire.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import (
"github.com/planetary-social/nos-crossposting-service/service/app"
"github.com/planetary-social/nos-crossposting-service/service/config"
"github.com/planetary-social/nos-crossposting-service/service/domain"
"github.com/planetary-social/nos-crossposting-service/service/domain/content"
)

func BuildService(context.Context, config.Config) (Service, func(), error) {
Expand Down Expand Up @@ -94,6 +95,7 @@ var downloaderSet = wire.NewSet(
)

var tweetGeneratorSet = wire.NewSet(
content.NewTransformer,
domain.NewTweetGenerator,
wire.Bind(new(app.TweetGenerator), new(*domain.TweetGenerator)),
)
6 changes: 4 additions & 2 deletions cmd/crossposting-service/di/wire_gen.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

307 changes: 307 additions & 0 deletions service/domain/content/lexer.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,307 @@
package content

import (
"strings"
"unicode"
"unicode/utf8"

"github.com/boreq/errors"
)

type Token struct {
Type TokenType
Text string
}

var (
TokenTypeText = TokenType{"text"}
TokenTypeLink = TokenType{"link"}
TokenTypeNostrLink = TokenType{"nostrLink"}
)

type TokenType struct {
s string
}

type Lexer struct {
in string
out string
tokens []Token
}

func NewLexer(s string) *Lexer {
return &Lexer{in: s}
}

func (l *Lexer) Lex() ([]Token, error) {
if !utf8.ValidString(l.in) {
return nil, errors.New("invalid utf-8")
}

var state stateFn = stateText

for state != nil {
newState, err := state(l)
if err != nil {
return nil, errors.Wrap(err, "state fn error")
}

state = newState
}

return l.mergeConsecutiveTexts(l.tokens), nil
}

func (l *Lexer) next() (rune, bool) {
r, size := utf8.DecodeRuneInString(l.in)
if r == utf8.RuneError && size == 0 {
return 0, false
}
if r == utf8.RuneError && size == 1 {
panic("invalid utf-8") // checked in constructor
}
l.in = l.in[size:]
l.out += string(r)
return r, true
}

func (l *Lexer) back() error {
r, size := utf8.DecodeLastRuneInString(l.out)
if r == utf8.RuneError && size == 0 {
return errors.New("empty out")
}
if r == utf8.RuneError && size == 1 {
return errors.New("this should be impossible, out is malformed")
}
l.out = l.out[:len(l.out)-size]
l.in = string(r) + l.in
return nil
}

func (l *Lexer) backN(n int) error {
for i := 0; i < n; i++ {
if err := l.back(); err != nil {
return errors.Wrapf(err, "error calling back for index '%d'", i)
}
}
return nil
}

func (l *Lexer) tryOrBack(s string) bool {
counter := 0
for _, expectedR := range s {
counter++
nextR, ok := l.next()
if !ok || nextR != expectedR {
if err := l.backN(counter); err != nil {
panic(err) // we just called next `counter` times so if there is a bug here it would have to be insanely silly
}
return false
}
}
return true
}

func (l *Lexer) comesNext(s string) bool {
counter := 0

defer func() {
if err := l.backN(counter); err != nil {
panic(err) // we just called next `counter` times so if there is a bug here it would have to be insanely silly
}
}()

for _, expectedR := range s {
nextR, ok := l.next()
if ok {
counter++
}
if !ok || nextR != expectedR {
return false
}
}
return true

}

func (l *Lexer) emit(typ TokenType) {
if l.out != "" {
l.tokens = append(l.tokens,
Token{
Type: typ,
Text: l.out,
},
)
l.out = ""
}
}

func (l *Lexer) peek() (rune, bool) {
r, ok := l.next()
if ok {
if err := l.back(); err != nil {
panic(err)
}
}
return r, ok
}

func (l *Lexer) destroyPrefix(prefix string) {
l.out = strings.TrimPrefix(l.out, prefix)
}

// honestly I feel like this is easier than debugging the lexer so that it
// doesn't make those mistakes
func (l *Lexer) mergeConsecutiveTexts(tokens []Token) []Token {
var result []Token
for _, token := range tokens {
if len(result) > 0 && token.Type == TokenTypeText && result[len(result)-1].Type == TokenTypeText {
result[len(result)-1].Text += token.Text
} else {
result = append(result, token)
}
}

return result

}

type stateFn func(l *Lexer) (stateFn, error)

const (
httpColonSlashSlash = "http://"
httpsColonSlashSlash = "https://"
nostrColon = "nostr:"
nevent = "nevent"
npub = "npub"
note = "note"
)

func stateText(l *Lexer) (stateFn, error) {
for {
if l.comesNext(httpColonSlashSlash) || l.comesNext(httpsColonSlashSlash) {
l.emit(TokenTypeText)
return stateLinkProtocol, nil
}

if l.comesNext(nostrColon) {
l.emit(TokenTypeText)
return stateNostrLinkProtocol, nil
}

if l.comesNext(nevent) || l.comesNext(npub) || l.comesNext(note) {
l.emit(TokenTypeText)
return stateNostrLinkType, nil
}

_, ok := l.next()
if !ok {
l.emit(TokenTypeText)
return nil, nil
}
}
}

func stateLinkProtocol(l *Lexer) (stateFn, error) {
if !l.tryOrBack(httpColonSlashSlash) && !l.tryOrBack(httpsColonSlashSlash) {
return nil, errors.New("where did the protocol go?")
}

return stateLinkAddress, nil
}

func stateLinkAddress(l *Lexer) (stateFn, error) {
counter := 0
for {
r, ok := l.next()
if !ok {
if counter == 0 {
l.emit(TokenTypeText)
} else {
l.emit(TokenTypeLink)
}
return nil, nil
}

counter++

if isValidLinkCharacterExcludingDot(r) {
continue
}

switch r {
case '.':
nextR, ok := l.peek()
if !ok {
continue
}

if !isValidLinkCharacterExcludingDot(nextR) {
if err := l.back(); err != nil {
return nil, errors.New("where did the dot go?")
}
l.emit(TokenTypeLink)
return stateText, nil
}
default:
if err := l.back(); err != nil {
return nil, errors.New("we just went forward but we can't go back?")
}
l.emit(TokenTypeLink)
return stateText, nil
}
}
}

func stateNostrLinkProtocol(l *Lexer) (stateFn, error) {
if !l.tryOrBack(nostrColon) {
return nil, errors.New("where did 'nostr:' go?")
}

return stateNostrLinkType, nil
}

func stateNostrLinkType(l *Lexer) (stateFn, error) {
if !l.tryOrBack(nevent) && !l.tryOrBack(npub) && !l.tryOrBack(note) {
return stateText, nil
}

return stateNostrLinkData, nil
}

func stateNostrLinkData(l *Lexer) (stateFn, error) {
counter := 0
for {
r, ok := l.next()
if !ok || !isBech32(r) {
if ok {
if err := l.back(); err != nil {
return nil, errors.Wrap(err, "we just consumed a rune?")
}
}

if counter == 0 {
l.emit(TokenTypeText)
} else {
l.destroyPrefix(nostrColon)
l.emit(TokenTypeNostrLink)
}

return stateText, nil
}

counter++
}
}

func isValidLinkCharacterExcludingDot(r rune) bool {
return unicode.IsLetter(r) || unicode.IsNumber(r) || r == '%'
}

func isBech32(r rune) bool {
r = unicode.ToLower(r)
if r == 'b' || r == 'i' || r == 'o' {
return false
}
return (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9')
}
Loading

0 comments on commit 91426e2

Please sign in to comment.