/
TLD.java
213 lines (190 loc) · 7.67 KB
/
TLD.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
package dk.netarkivet.common.utils;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import dk.netarkivet.common.CommonSettings;
import dk.netarkivet.common.Constants;
import dk.netarkivet.common.exceptions.UnknownID;
import static dk.netarkivet.common.utils.DomainUtils.DOMAINNAME_CHAR_REGEX_STRING;
/**
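* Encapsulates the reading of top-level domains (TLDs) from settings and from the public suffix list: an external
* conf/public_suffix_list.dat file if one exists, otherwise the embedded public_suffix_list.dat file.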
*
*/
public class TLD {
    /** The class logger. */
    private static final Logger log = LoggerFactory.getLogger(TLD.class);

    /** The singleton instance; created lazily by {@link #getInstance()} and cleared by {@link #reset()}. */
    private static TLD tld;

    /** Classpath location of the embedded public suffix list. */
    public final static String PUBLIC_SUFFIX_LIST_EMBEDDED_PATH = "dk/netarkivet/common/utils/public_suffix_list.dat";

    /** Relative path of an optional external public suffix list; if this file exists it is read instead of the embedded one. */
    public final static String PUBLIC_SUFFIX_LIST_EXTERNAL_FILE_PATH = "conf/public_suffix_list.dat";

    /**
     * Name of the charset used to decode the public suffix list. The list is published as UTF-8 and contains
     * non-ASCII suffixes, so the platform default charset must not be relied upon.
     */
    private static final String PUBLIC_SUFFIX_LIST_CHARSET = "UTF-8";

    /**
     * A regular expression matching hostnames. Group 1 captures the (possibly empty) prefix in front of the domain,
     * including its trailing dot; group 2 captures the domain, i.e. a single domain-name part followed by a TLD.
     */
    private final Pattern HOSTNAME_REGEX;

    /** A string for a regexp recognising a TLD: a literal dot followed by a group of all quoted TLDs as alternatives. */
    private final String TLD_REGEX_STRING;

    /**
     * Regexp for matching a valid domain, that is a single domain-name part followed by a TLD from settings or the
     * public suffix list, or an IP address.
     */
    private final Pattern VALID_DOMAIN_MATCHER;

    /**
     * List of quoted TLD read from both settings and public suffix file.
     */
    private final List<String> tldListQuoted;

    /**
     * List of TLD read from both settings and public suffix file.
     */
    private final List<String> tldList;

    /**
     * GetInstance method for the TLD. Ensures singleton usage of the TLD class.
     *
     * @return the current instance of the TLD class.
     */
    public static synchronized TLD getInstance() {
        if (tld == null) {
            tld = new TLD();
        }
        return tld;
    }

    /**
     * Reset TLD instance, so that the next call to {@link #getInstance()} re-reads all TLDs. Primarily used for
     * testing. Synchronized on the same lock as {@link #getInstance()} so the cleared reference is visible to every
     * thread.
     */
    public static synchronized void reset() {
        tld = null;
    }

    /**
     * Private constructor of the TLD class. This constructor reads the TLDs from the public suffix file first and
     * then from settings, both quoted and unquoted. Sets the TLD_REGEX_STRING, HOSTNAME_REGEX, and
     * VALID_DOMAIN_MATCHER.
     */
    private TLD() {
        tldListQuoted = new ArrayList<String>();
        tldList = new ArrayList<String>();
        readTldsFromPublicSuffixFile(tldList, tldListQuoted);
        readTldsFromSettings(tldList, tldListQuoted);

        TLD_REGEX_STRING = "\\.(" + StringUtils.conjoin("|", tldListQuoted) + ")";
        HOSTNAME_REGEX = Pattern.compile("^(|.*?\\.)(" + DOMAINNAME_CHAR_REGEX_STRING + "+"
                + TLD_REGEX_STRING + ")");
        VALID_DOMAIN_MATCHER = Pattern.compile("^(" + Constants.IP_REGEX_STRING + "|"
                + DOMAINNAME_CHAR_REGEX_STRING + "+" + TLD_REGEX_STRING + ")$");
    }

    /**
     * Helper method for reading TLDs from settings. Will read all settings, validate them as legal TLDs and warn and
     * ignore them if any are invalid. Settings may be with or without prefix "."
     *
     * @param tldList the list to add all the tlds found in the settings
     * @param quotedTldList the list to add all the tlds found in the settings - as a pattern
     */
    protected static void readTldsFromSettings(List<String> tldList, List<String> quotedTldList) {
        int count = 0;
        try {
            String[] settingsTlds = Settings.getAll(CommonSettings.TLDS);
            // Compile the validation pattern once instead of once per entry.
            Pattern validator = tldValidationPattern();
            for (String settingsTld : settingsTlds) {
                // A leading dot is optional in settings; strip it so both notations give the same entry.
                String candidate = settingsTld.startsWith(".") ? settingsTld.substring(1) : settingsTld;
                if (addIfValid(candidate, validator, tldList, quotedTldList)) {
                    count++;
                }
            }
            log.info("Read {} TLDs from settings", count);
        } catch (UnknownID e) {
            log.debug("No tlds found in settingsfiles {}", StringUtils.conjoin(",", Settings.getSettingsFiles()));
        }
    }

    /**
     * Helper method for reading TLDs from the public suffix file (the external file if present, otherwise the
     * embedded copy). Blank lines and comment lines starting with "//" are skipped. Every other entry is validated as
     * a legal TLD; invalid entries are warned about and ignored.
     * If the setting CommonSettings.TLD_SILENTLY_IGNORE_STARRED_TLDS is true, starred tld's (e.g "*.kw") and exclusion
     * rules (e.g. !metro.tokyo.jp) are skipped without going through validation, and thus without a warning.
     *
     * @param tldList the list to add all the tlds found in the public suffix file
     * @param quotedTldList the list to add all the tlds found in the public suffix file - as a pattern
     */
    protected static void readTldsFromPublicSuffixFile(List<String> tldList, List<String> quotedTldList) {
        // Read the setting before opening the stream, so a failing settings lookup cannot leak an open stream.
        boolean silentlyIgnoringStarTldsInPublicSuffixFile = Settings
                .getBoolean(CommonSettings.TLD_SILENTLY_IGNORE_STARRED_TLDS);
        InputStream stream = getPublicSuffixListDataStream();
        if (stream == null) {
            log.warn("Unable to retrieve public suffix_list failed. No tlds added!");
            return;
        }

        // Compile the validation pattern once; the list holds thousands of lines.
        Pattern validator = tldValidationPattern();
        int count = 0;
        BufferedReader br = null;
        try {
            br = new BufferedReader(new InputStreamReader(stream, PUBLIC_SUFFIX_LIST_CHARSET));
            String line;
            while ((line = br.readLine()) != null) {
                String entry = line.trim();
                if (entry.isEmpty() || entry.startsWith("//")) {
                    continue;
                }
                if (silentlyIgnoringStarTldsInPublicSuffixFile && (entry.startsWith("*.") || entry.startsWith("!"))) {
                    continue;
                }
                if (addIfValid(entry, validator, tldList, quotedTldList)) {
                    // Count accepted entries so the summary below reports the real number.
                    count++;
                }
            }
            log.info("Read {} TLDs from public suffix file", count);
        } catch (IOException e) {
            log.warn("I/O error while reading the public suffix list; " + count
                    + " TLDs were added before the failure", e);
        } finally {
            IOUtils.closeQuietly(br);
            // Closing the reader closes the stream too; this extra call only matters if the reader was never created.
            IOUtils.closeQuietly(stream);
        }
    }

    /**
     * Compiles the pattern accepting a legal TLD: one domain-name character followed by any number of domain-name
     * characters or dots.
     *
     * @return a freshly compiled validation pattern
     */
    private static Pattern tldValidationPattern() {
        return Pattern.compile(DOMAINNAME_CHAR_REGEX_STRING + "(" + DOMAINNAME_CHAR_REGEX_STRING + "|\\.)*");
    }

    /**
     * Adds the candidate to both lists if it is a legal TLD; otherwise logs a warning and leaves the lists untouched.
     *
     * @param candidate the TLD to validate, without a leading dot
     * @param validator the pattern a legal TLD must match in full
     * @param tldList the list receiving the unquoted TLD
     * @param quotedTldList the list receiving the TLD quoted for use in a regular expression
     * @return true if the candidate was added, false if it was rejected
     */
    private static boolean addIfValid(String candidate, Pattern validator, List<String> tldList,
            List<String> quotedTldList) {
        if (!validator.matcher(candidate).matches()) {
            log.warn("Invalid tld '{}', ignoring", candidate);
            return false;
        }
        tldList.add(candidate);
        quotedTldList.add(Pattern.quote(candidate));
        return true;
    }

    /**
     * Opens the public suffix list. The external file at PUBLIC_SUFFIX_LIST_EXTERNAL_FILE_PATH is used if it exists
     * and can be opened; otherwise the embedded copy is loaded through the context class loader of the current
     * thread.
     *
     * @return an open stream on the public suffix list, or null if the embedded resource could not be found
     */
    private static InputStream getPublicSuffixListDataStream() {
        File alternateExternalFile = new File(PUBLIC_SUFFIX_LIST_EXTERNAL_FILE_PATH);
        if (alternateExternalFile.isFile()) {
            try {
                InputStream external = new FileInputStream(alternateExternalFile);
                log.info("Reading public suffixes list from external file '{}'",
                        alternateExternalFile.getAbsolutePath());
                return external;
            } catch (FileNotFoundException e) {
                // The file exists but cannot be opened (e.g. unreadable, or removed in the meantime):
                // use the embedded copy rather than ending up without any public suffixes.
                log.warn("Unable to open external public suffix list '" + alternateExternalFile.getAbsolutePath()
                        + "'. Reading the embedded public suffixes list instead.", e);
            }
        } else { // Read embedded copy
            log.info("Did not found external public suffix list at '{}'! Reading instead the public suffixes list from embedded file '{}' in common-core.jar-VERSION.jar.",
                    alternateExternalFile.getAbsolutePath(), PUBLIC_SUFFIX_LIST_EMBEDDED_PATH);
        }
        return Thread.currentThread().getContextClassLoader().getResourceAsStream(PUBLIC_SUFFIX_LIST_EMBEDDED_PATH);
    }

    /**
     * @return the VALID_DOMAIN_MATCHER pattern.
     */
    public Pattern getValidDomainMatcher() {
        return VALID_DOMAIN_MATCHER;
    }

    /**
     *
     * @return the HOSTNAME_REGEX pattern.
     */
    public Pattern getHostnamePattern() {
        return HOSTNAME_REGEX;
    }

    /**
     * GetAllTlds method. The returned list is the internal list of this instance, not a copy.
     *
     * @param quoted do you want the quoted, or unquoted list.
     * @return the quoted list (if quoted=true), else the unquoted list.
     */
    public List<String> getAllTlds(boolean quoted) {
        return quoted ? tldListQuoted : tldList;
    }
}