Skip to content
Permalink
Browse files

Implemented NAS-2519 by replacing the listing of all valid tlds with …

…the public suffixes file
  • Loading branch information...
svcarlsen committed Jul 18, 2016
1 parent 89f0929 commit ce98b36a592e080e6ac036f4dbd5e300b18f760c
@@ -259,12 +259,7 @@
* The default number of jobs to show in the harvest status section, on one result page.
*/
public static String HARVEST_STATUS_DFT_PAGE_SIZE = "settings.common.webinterface.harvestStatus.defaultPageSize";
/**
* <b>settings.common.topLevelDomains.tld</b>: <br>
* A valid top level domain, like .co.uk, .dk, .org. This setting is repeated in the settings, once for each top
* level domain.
*/
public static String TLDS = "settings.common.topLevelDomains.tld";


// TODO Currently only used by harvestscheduler - move to harvester
// settings?
/**
@@ -22,15 +22,19 @@
*/
package dk.netarkivet.common.utils;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.CommonSettings;
import dk.netarkivet.common.Constants;
import dk.netarkivet.common.exceptions.ArgumentNotValid;

@@ -45,8 +49,8 @@
/**
* Valid characters in a domain name, according to RFC3490. The expression is a negated character class that
* rejects the ranges \0000 to ',', '.' to '/', ':' to '@', '[' to '`' and '{' to \0177, and requires one or more
* of any other character. Within ASCII that leaves only '-', the digits and the letters; every character above
* \0177 is accepted as well.
*/
public static final String DOMAINNAME_CHAR_REGEX_STRING = "[^\\0000-,.-/:-@\\[-`{-\\0177]+";

/**
* A string for a regexp recognising a TLD read from settings: an escaped (literal) dot followed by a group holding
* the entries returned by readTlds(), combined by StringUtils.conjoin with "|" as its first argument.
*/
public static final String TLD_REGEX_STRING = "\\.(" + StringUtils.conjoin("|", readTlds()) + ")";
// NOTE(review): TLD_REGEX_STRING is declared a second time directly below, which a single class cannot contain.
// This looks like the before and after versions of one change - confirm which declaration should remain.
/**
* A string for a regexp recognising a TLD read from the public suffix file: an escaped (literal) dot followed by a
* group holding the entries returned by readTldsFromPublicSuffixFile(true), combined by StringUtils.conjoin with
* "|" as its first argument.
*/
public static final String TLD_REGEX_STRING = "\\.(" + StringUtils.conjoin("|", readTldsFromPublicSuffixFile(true)) + ")";

/**
* Regexp for matching a valid domain, that is a single domainnamepart followed by a TLD from settings, or an IP
@@ -65,27 +69,50 @@
private DomainUtils() {
// Intentionally empty. Being private, this constructor cannot be invoked from outside the class.
}


/**
* Helper method for reading TLDs from settings. Will read all settings, validate them as legal TLDs and warn and
* ignore them if any are invalid. Settings may be with or without prefix "."
*
* Helper method for reading TLDs from the embedded public suffix file. Will read all entries, validate them as legal TLDs and warn and
* ignore them if any are invalid.
* @param asPattern if true, return a list of quoted Strings using Pattern.quote
* @return a List of TLDs as Strings
*/
private static List<String> readTlds() {
protected static List<String> readTldsFromPublicSuffixFile(boolean asPattern) {
List<String> tlds = new ArrayList<String>();
for (String tld : Settings.getAll(CommonSettings.TLDS)) {
if (tld.startsWith(".")) {
tld = tld.substring(1);
}
if (!tld.matches(DOMAINNAME_CHAR_REGEX_STRING + "(" + DOMAINNAME_CHAR_REGEX_STRING + "|\\.)*")) {
log.warn("Invalid tld '{}', ignoring", tld);
continue;
}
tlds.add(Pattern.quote(tld));
}
String filePath = "dk/netarkivet/common/utils/public_suffix_list.dat";
InputStream stream = Thread.currentThread().getContextClassLoader().getResourceAsStream(filePath);

if (stream != null) {
BufferedReader br = null;
try {
br = new BufferedReader(new InputStreamReader(stream));
String line;
while ((line = br.readLine()) != null) {
String tld = line.trim();
if (tld.isEmpty() || tld.startsWith("//")) {
continue;
} else {
if (!tld.matches(DOMAINNAME_CHAR_REGEX_STRING + "(" + DOMAINNAME_CHAR_REGEX_STRING + "|\\.)*")) {
log.warn("Invalid tld '{}', ignoring", tld);
continue;
}
if (asPattern) {
tlds.add(Pattern.quote(tld));
} else {
tlds.add(tld);
}
}
}
} catch(IOException e) {
e.printStackTrace();
} finally {
IOUtils.closeQuietly(br);
}
} else {
log.warn("Filepath '{}' to public suffix_list incorrect", filePath);
}
return tlds;
}

/**
* Check if a given domainName is valid domain. A valid domain is an IP address or a domain name part followed by a
* TLD as defined in settings.
@@ -0,0 +1,3 @@
Downloaded from https://www.publicsuffix.org/list/public_suffix_list.dat
on July 18, 2016

0 comments on commit ce98b36

Please sign in to comment.
You can’t perform that action at this time.