Skip to content

Commit c7ffefa

Browse files
committed
Try adapting to malformed mboxes
Reviewed-by: ehelin
1 parent 03ea4f9 commit c7ffefa

File tree

2 files changed

+52
-8
lines changed
  • mailinglist/src

2 files changed

+52
-8
lines changed

mailinglist/src/main/java/org/openjdk/skara/mailinglist/Mbox.java

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ public class Mbox {
3737
private final static Logger log = Logger.getLogger("org.openjdk.skara.mailinglist");
3838

3939
private final static Pattern mboxMessagePattern = Pattern.compile(
40-
"^\\R^(From (?:.(?!^\\R^From ))*)", Pattern.MULTILINE | Pattern.DOTALL);
40+
"^(From (?:.(?!^\\R^From ))*)", Pattern.MULTILINE | Pattern.DOTALL);
4141
private final static DateTimeFormatter ctimeFormat = DateTimeFormatter.ofPattern(
4242
"EEE LLL dd HH:mm:ss yyyy", Locale.US);
4343
private final static Pattern fromStringEncodePattern = Pattern.compile("^(>*From )", Pattern.MULTILINE);
@@ -46,15 +46,30 @@ public class Mbox {
4646
private final static Pattern decodedQuotedPrintablePattern = Pattern.compile("=\\?utf-8\\?b\\?(.*?)\\?=");
4747

4848
private static List<Email> splitMbox(String mbox) {
49+
// Initial split
4950
var messages = mboxMessagePattern.matcher(mbox).results()
5051
.map(match -> match.group(1))
52+
.filter(message -> message.length() > 0)
53+
.map(Mbox::decodeFromStrings)
54+
.map(Mbox::decodeQuotedPrintable)
5155
.collect(Collectors.toList());
52-
return messages.stream()
53-
.filter(message -> message.length() > 0)
54-
.map(Mbox::decodeFromStrings)
55-
.map(Mbox::decodeQuotedPrintable)
56-
.map(Email::parse)
57-
.collect(Collectors.toList());
56+
57+
// Pipermail can occasionally fail to encode 'From ' in message bodies; handle this by merging any fragment that fails to parse with the fragment preceding it
58+
var messageBuilder = new StringBuilder();
59+
var parsedMails = new ArrayList<Email>();
60+
Collections.reverse(messages);
61+
for (var message : messages) {
62+
messageBuilder.insert(0, message);
63+
try {
64+
var email = Email.parse(messageBuilder.toString());
65+
parsedMails.add(email);
66+
messageBuilder.setLength(0);
67+
} catch (RuntimeException ignored) {
68+
}
69+
}
70+
71+
Collections.reverse(parsedMails);
72+
return parsedMails;
5873
}
5974

6075
private static String encodeFromStrings(String body) {

mailinglist/src/test/java/org/openjdk/skara/mailinglist/MboxTests.java

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,12 @@
2727

2828
import org.junit.jupiter.api.Test;
2929

30+
import java.io.IOException;
31+
import java.nio.charset.StandardCharsets;
32+
import java.nio.file.Files;
3033
import java.time.Duration;
3134

32-
import static org.junit.jupiter.api.Assertions.assertEquals;
35+
import static org.junit.jupiter.api.Assertions.*;
3336

3437
class MboxTests {
3538
@Test
@@ -155,4 +158,30 @@ void utf8Encode() {
155158
assertEquals(sentMail, conversation.first());
156159
}
157160
}
161+
162+
@Test
163+
void unencodedFrom() throws IOException {
164+
try (var folder = new TemporaryDirectory()) {
165+
var rawMbox = folder.path().resolve("test.mbox");
166+
Files.writeString(rawMbox,
167+
"From test at example.com Wed Aug 21 17:22:50 2019\n" +
168+
"From: test at example.com (test at example.com)\n" +
169+
"Date: Wed, 21 Aug 2019 17:22:50 +0000\n" +
170+
"Subject: this is a test\n" +
171+
"Message-ID: <abc123@example.com>\n" +
172+
"\n" +
173+
"Sometimes there are unencoded from lines as well\n" +
174+
"\n" +
175+
"From this point onwards, it may be hard to parse this\n" +
176+
"\n", StandardCharsets.UTF_8);
177+
var mbox = MailingListServerFactory.createMboxFileServer(folder.path());
178+
var list = mbox.getList("test");
179+
var conversations = list.conversations(Duration.ofDays(30));
180+
assertEquals(1, conversations.size());
181+
var conversation = conversations.get(0);
182+
assertEquals(1, conversation.allMessages().size());
183+
assertTrue(conversation.first().body().contains("there are unencoded"), conversation.first().body());
184+
assertTrue(conversation.first().body().contains("this point onwards"), conversation.first().body());
185+
}
186+
}
158187
}

0 commit comments

Comments
 (0)