Merge remote-tracking branch 'origin/master'
csrster committed Jan 18, 2016
2 parents d442b63 + 65ee048 commit 68a87d4
Showing 17 changed files with 44 additions and 99 deletions.
@@ -218,9 +218,10 @@ public long writeToIndex(CrawlDataIterator dataIt, String mimefilter, boolean bl
}

/**
- * @param item
+ * Create Lucene Document for given CrawlDataItem.
+ * @param item A CrawlDataItem
* @param defaultOrigin
- * @return
+ * @return Lucene Document for the given CrawlDataItem
*/
private Document createDocument(CrawlDataItem item, String defaultOrigin) {
Document doc = new Document();
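
For context on the hunk above: a minimal sketch of what a createDocument along these lines might look like, assuming a recent Lucene API (StringField) and a simplified stand-in for CrawlDataItem; the field names and item accessors below are illustrative assumptions, not the class's actual code.

    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.document.StringField;

    /** Simplified stand-in for the real CrawlDataItem; these members are assumptions. */
    class ItemStub {
        final String url, contentDigest, origin;
        ItemStub(String url, String contentDigest, String origin) {
            this.url = url;
            this.contentDigest = contentDigest;
            this.origin = origin;
        }
    }

    class DocumentSketch {
        /** Build a Lucene Document with one untokenized, stored field per item property. */
        static Document createDocument(ItemStub item, String defaultOrigin) {
            Document doc = new Document();
            doc.add(new StringField("url", item.url, Field.Store.YES));
            doc.add(new StringField("digest", item.contentDigest, Field.Store.YES));
            // Fall back to the default origin when the item carries none.
            String origin = (item.origin != null) ? item.origin : defaultOrigin;
            doc.add(new StringField("origin", origin, Field.Store.YES));
            return doc;
        }
    }
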
@@ -46,7 +46,8 @@
import dk.netarkivet.common.utils.Settings;

/**
- * Utilities to allow testing databases. //FIXME: Rename without Test as these are not specifically test related.
+ * Utilities to allow testing databases.
+ * FIXME: Rename without Test as these are not specifically test related.
*/
public class DatabaseTestUtils {

@@ -60,7 +61,7 @@ public class DatabaseTestUtils {
*
* @param resourcePath A file that contains a test database.
* @param dbCreationDir
- * @return a connection to the database stored in the given file
+ *
*/
public static void createDatabase(String resourcePath, String dbname, File dbCreationDir) throws Exception {
Settings.set(CommonSettings.DB_MACHINE, "");
@@ -131,7 +132,6 @@ private static void applyStatementsInInputStream(Connection connection, InputStr
*
* @param resourcePath A file that contains a test database.
* @param dbCreationDir
- * @return a connection to the database stored in the given file
*/
public static void createDatabase(String resourcePath, File dbCreationDir) throws Exception {
createDatabase(resourcePath, "derivenamefromresource", dbCreationDir);
@@ -143,7 +143,6 @@ public static void createDatabase(String resourcePath, File dbCreationDir) throw
*
* @param resourcePath Location of the sql files to create and populate the test DB.
* @param dbCreationDir
- * @return a connection to the given sample harvest definition database
*/
public static void createHDDB(String resourcePath, String dbname, File dbCreationDir) throws Exception {
createDatabase(resourcePath, dbname, dbCreationDir);
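
For context: a hedged usage sketch of the helpers documented above, now void-returning; it relies only on the signatures visible in this diff, and the resource paths and database name are invented.

    import java.io.File;

    class DatabaseTestUtilsUsage {
        public static void main(String[] args) throws Exception {
            File dbCreationDir = new File("target/testdb"); // hypothetical directory
            // Explicit database name:
            DatabaseTestUtils.createDatabase("tests/sampledb.sql", "fullhddb", dbCreationDir);
            // Database name derived from the resource:
            DatabaseTestUtils.createDatabase("tests/sampledb.sql", dbCreationDir);
            // Sample harvest definition database:
            DatabaseTestUtils.createHDDB("tests/hddb.sql", "fullhddb", dbCreationDir);
        }
    }
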
@@ -102,10 +102,9 @@ public class H3HeritrixTemplate extends HeritrixTemplate implements Serializable
/**
* Constructor for HeritrixTemplate class.
*
- * @param doc the order.xml
- * @param verify If true, verifies if the given dom4j Document contains the elements required by our software.
- * @throws ArgumentNotValid if doc is null, or verify is true and doc does not obey the constraints required by our
- * software.
+ * @param template_id The persistent id of the template in the database
+ * @param template The template as String object
+ * @throws ArgumentNotValid if template is null.
*/
public H3HeritrixTemplate(long template_id, String template) {
ArgumentNotValid.checkNotNull(template, "String template");
@@ -264,7 +263,7 @@ public void configureQuotaEnforcer(boolean maxObjectsIsSetByQuotaEnforcer,
* Make sure that Heritrix will archive its data in the chosen archiveFormat.
*
* @param archiveFormat the chosen archiveformat ('arc' or 'warc' supported)
- * @throw ArgumentNotValid If the chosen archiveFormat is not supported.
+ * @throws ArgumentNotValid If the chosen archiveFormat is not supported.
*/
@Override
public void setArchiveFormat(String archiveFormat) {
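
For context: a brief usage sketch of the constructor and setArchiveFormat as now documented; the template id, file path, and charset handling are invented for illustration.

    import java.nio.charset.StandardCharsets;
    import java.nio.file.Files;
    import java.nio.file.Paths;

    class TemplateUsage {
        public static void main(String[] args) throws Exception {
            // Load an order.xml-style template as a String (path and id are made up).
            String template = new String(Files.readAllBytes(Paths.get("order.xml")), StandardCharsets.UTF_8);
            H3HeritrixTemplate t = new H3HeritrixTemplate(42L, template); // ArgumentNotValid if template is null
            t.setArchiveFormat("warc"); // only 'arc' and 'warc' are supported
        }
    }
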
@@ -124,9 +124,8 @@ public void setIsActive(boolean isActive) {
* global traps.
*
* @param elementName The name of the added element.
- * @param crawlerTraps A list of crawler trap regular expressions to add to this job.
+ * @param crawlertraps A list of crawler trap regular expressions to add to this job.
*/

public abstract void insertCrawlerTraps(String elementName, List<String> crawlertraps);

/**
@@ -186,6 +185,12 @@ public void editOrderXMLAddPerDomainCrawlerTraps(DomainConfiguration cfg) {
public abstract void writeToFile(File orderXmlFile);
public abstract void setRecoverlogNode(File recoverlogGzFile);

+ /**
+ * Construct a H1HeritrixTemplate or H3HeritrixTemplate based on the signature of the given string.
+ * @param template_id The id of the template
+ * @param templateAsString The template as a String object
+ * @return a HeritrixTemplate based on the signature of the given string.
+ */
public static HeritrixTemplate getTemplateFromString(long template_id, String templateAsString){
if (templateAsString.contains(H1_SIGNATURE)) {
try {
@@ -215,8 +220,10 @@ public static HeritrixTemplate read(File orderXmlFile){
}

/**
- * Read the template using the given Reader
- * @param reader A given Reader
+ * Read the template using the given Reader.
+ *
+ * @param template_id The id of the template
+ * @param orderTemplateReader A given Reader to read a template
* @return a HeritrixTemplate object
*/
public static HeritrixTemplate read(long template_id, Reader orderTemplateReader) {
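
To illustrate the signature-based dispatch that the new getTemplateFromString javadoc describes, a standalone sketch. The signature constants below are placeholders; the real H1_SIGNATURE/H3_SIGNATURE values are defined in HeritrixTemplate, and only the H1 branch is visible in this diff.

    class TemplateDispatchSketch {
        static final String H1_SIGNATURE = "<crawl-order"; // placeholder, not the real value
        static final String H3_SIGNATURE = "<beans";        // placeholder, not the real value

        static String templateKind(String templateAsString) {
            if (templateAsString.contains(H1_SIGNATURE)) {
                return "H1"; // the real method constructs a H1HeritrixTemplate here
            } else if (templateAsString.contains(H3_SIGNATURE)) {
                return "H3"; // ...and presumably a H3HeritrixTemplate here
            }
            throw new IllegalArgumentException("Unknown template signature");
        }
    }
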
@@ -92,7 +92,7 @@ public SeedList(String name, String seedsAsString) {
*
* @param url The url to check
* @return true, if it is accepted
- * @see {@link HarvesterSettings#VALID_SEED_REGEX}.
+ * @see HarvesterSettings#VALID_SEED_REGEX
*/
private boolean isAcceptableURL(String url) {
Pattern validSeedPattern = Pattern.compile(Settings.get(HarvesterSettings.VALID_SEED_REGEX));
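
For context on isAcceptableURL above: a self-contained sketch of regex-based seed validation. The real pattern comes from Settings.get(HarvesterSettings.VALID_SEED_REGEX); the regex below is a made-up stand-in.

    import java.util.regex.Pattern;

    class SeedCheckSketch {
        // Stand-in for the configurable VALID_SEED_REGEX; the real default differs.
        static final Pattern VALID_SEED = Pattern.compile("^(https?|ftp)://\\S+$");

        static boolean isAcceptableURL(String url) {
            return VALID_SEED.matcher(url).matches();
        }

        public static void main(String[] args) {
            System.out.println(isAcceptableURL("http://netarkivet.dk/")); // true
            System.out.println(isAcceptableURL("not a url"));             // false
        }
    }
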
@@ -60,7 +60,7 @@ public AbstractHarvestReport() {
/**
* Constructor from DomainStatsReports.
*
- * @param files the result of parsing the crawl.log for domain statistics
+ * @param dsr the result of parsing the crawl.log for domain statistics
*/
public AbstractHarvestReport(DomainStatsReport dsr) {
ArgumentNotValid.checkNotNull(dsr, "DomainStatsReport dsr");
@@ -61,7 +61,7 @@ public class BnfHarvestReport extends AbstractHarvestReport{
/**
* Constructor for this class.
*
- * @param files A HeritrixFiles object.
+ * @param dsr A DomainStatsReport
* @throws IOFailure If the processing of the files goes wrong
*/
public BnfHarvestReport(DomainStatsReport dsr) throws IOFailure {
@@ -40,7 +40,7 @@
import dk.netarkivet.harvester.datamodel.StopReason;

/**
- * Class responsible for generating a domain harvest report from crawl logs created by Heritrix and presenting the
+ * Class responsible for representing a domain harvest report from crawl logs created by Heritrix and presenting the
* relevant information to clients.
*/
@SuppressWarnings({"serial"})
@@ -50,16 +50,7 @@ public class LegacyHarvestReport extends AbstractHarvestReport {
private static final Logger log = LoggerFactory.getLogger(LegacyHarvestReport.class);

/**
- * The constructor gets the data in a crawl.log file, and parses the file. The crawl.log is described in the
- * Heritrix user-manual, section 8.2.1: http://crawler.archive.org/articles/user_manual/analysis.html#logs Note:
- * Invalid lines are logged and then ignored.
- * <p>
- * Each url listed in the file is assigned to a domain, the total object count and byte count per domain is
- * calculated. Finally, a StopReason is found for each domain: When the response is CrawlURI.S_BLOCKED_BY_QUOTA (
- * currently = -5003), the StopReason is set to StopReason.SIZE_LIMIT, if the annotation equals "Q:group-max-all-kb"
- * or StopReason.OBJECT_LIMIT, if the annotation equals "Q:group-max-fetch-successes".
- *
- * @param hFiles the Heritrix reports and logs.
+ * @dsr a DomainStatsReport for a harvest
*/
public LegacyHarvestReport(DomainStatsReport dsr) {
super(dsr);
@@ -83,7 +83,6 @@ public static void processRequest(PageContext context, I18n i18n) throws Forward
*
* @param context the context of the servlet request triggering this action.
* @param i18n the internationalisation to use for presenting the results.
- * @return true, if we should continue our rendering of the page, false otherwise
*/
protected abstract void doAction(PageContext context, I18n i18n);

@@ -51,6 +51,14 @@

/**
* Base implementation for a harvest report.
+ * The constructor gets the data in a crawl.log file, and parses the file. The crawl.log is described in the
+ * Heritrix user-manual, section 8.2.1: http://crawler.archive.org/articles/user_manual/analysis.html#logs Note:
+ * Invalid lines are logged and then ignored.
+ * <p>
+ * Each url listed in the file is assigned to a domain, the total object count and byte count per domain is
+ * calculated. Finally, a StopReason is found for each domain: When the response is CrawlURI.S_BLOCKED_BY_QUOTA (
+ * currently = -5003), the StopReason is set to StopReason.SIZE_LIMIT, if the annotation equals "Q:group-max-all-kb"
+ * or StopReason.OBJECT_LIMIT, if the annotation equals "Q:group-max-fetch-successes".
*/
@SuppressWarnings({"serial"})
public class HarvestReportGenerator {
@@ -380,8 +388,7 @@ private String getDomainNameFromURIString(String uriAsString) throws URISyntaxEx
}

/**
- *
- * @return
+ * @return default stopReason
*/
public StopReason getDefaultStopReason() {
return defaultStopReason;
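
The quota rule spelled out in the relocated javadoc is compact enough to sketch. Below, the -5003 constant and the two annotation strings are taken from the javadoc itself; the method shape and enum are simplified stand-ins for the real StopReason handling.

    class StopReasonSketch {
        enum StopReason { DOWNLOAD_COMPLETE, SIZE_LIMIT, OBJECT_LIMIT }

        static final int S_BLOCKED_BY_QUOTA = -5003; // CrawlURI.S_BLOCKED_BY_QUOTA per the javadoc

        static StopReason stopReasonFor(int fetchStatus, String annotation) {
            if (fetchStatus == S_BLOCKED_BY_QUOTA) {
                if ("Q:group-max-all-kb".equals(annotation)) {
                    return StopReason.SIZE_LIMIT;   // byte quota reached
                }
                if ("Q:group-max-fetch-successes".equals(annotation)) {
                    return StopReason.OBJECT_LIMIT; // object-count quota reached
                }
            }
            return StopReason.DOWNLOAD_COMPLETE;     // no quota involved
        }
    }
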
@@ -36,9 +36,6 @@
import java.util.logging.Level;
import java.util.logging.Logger;

- import javax.management.AttributeNotFoundException;
- import javax.management.MBeanException;
- import javax.management.ReflectionException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
@@ -196,9 +193,7 @@ public WARCWriterProcessor(final String name) {
e.setExpertSetting(true);

// Add map setting to add NAS metadata to WarcInfo records.

e = addElementToDefinition(new MapType(ATTR_METADATA_ITEMS, "Metadata items.", String.class));
- //e = addElementToDefinition(new StringList(ATTR_METADATA_ITEMS, "Metadata items."));
e.setOverrideable(true);
e.setExpertSetting(true);
}
@@ -207,45 +202,7 @@ protected void setupPool(final AtomicInteger serialNo) {
setPool(new WARCWriterPool(serialNo, this, getPoolMaximumActive(), getPoolMaximumWait()));
}

- /**
-  * @return Metadata inputs as convenient map. Returns null if no metadata items.
-  * @throws AttributeNotFoundException
-  * @throws ReflectionException
-  * @throws MBeanException
-  */
- /*
- public Map<String,Object> getMetadataItems() throws AttributeNotFoundException, MBeanException, ReflectionException {
-     Map<String,Object> result = null;
-     MapType items = (MapType)getAttribute(ATTR_METADATA_ITEMS);
-     if (items != null) {
-         for (Iterator i = items.iterator(null); i.hasNext();) {
-             Attribute a = (Attribute)i.next();
-             if (result == null) {
-                 result = new HashMap<String,Object>();
-             }
-             result.put(a.getName(), a.getValue());
-         }
-     }
-     return result;
- }
- */
-
- @SuppressWarnings("unchecked")
- /*
- public List<String> getMetadataItems() {
-     ArrayList<String> results = new ArrayList<String>();
-     Object obj = getAttributeUnchecked(ATTR_METADATA_ITEMS);
-     if (obj != null) {
-         List list = (StringList)obj;
-         for (Iterator i = list.iterator(); i.hasNext();) {
-             String str = (String)i.next();
-             results.add(str);
-         }
-     }
-     return results;
- }
- */


/**
* Writes a CrawlURI and its associated data to store file.
* <p>
@@ -711,15 +668,7 @@ protected String getFirstrecordBody(File orderFile) {
} catch (XPathExpressionException e) {
logger.log(Level.WARNING, "Error obtaining metadata items", e);
}
- /* catch (AttributeNotFoundException e) {
-     logger.log(Level.WARNING, "Error obtaining warcinfo", e);
- } catch (MBeanException e) {
-     logger.log(Level.WARNING, "Error obtaining warcinfo", e);
- } catch (ReflectionException e) {
-     logger.log(Level.WARNING, "Error obtaining warcinfo", e);
- }
- */


// add fields from harvesInfo.xml version 0.4
/*
* <harvestInfo> <version>0.4</version> <jobId>1</jobId> <priority>HIGHPRIORITY</priority>
@@ -871,7 +871,7 @@ protected void doTimestampAnalysis(CrawlURI curi, Document urlHit, Statistics cu
*
* @param fieldName name of the field to look in.
* @param value The value to query for
- * @returns A Query for the given value in the given field.
+ * @return A Query for the given value in the given field.
*/
protected Query queryField(String fieldName, String value) {
Query query = null;
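
As background for the @return fix above: one plausible shape for queryField, assuming a Lucene-style exact-match term query. This is a sketch, not the class's actual implementation.

    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.ConstantScoreQuery;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.TermQuery;

    class QueryFieldSketch {
        // Exact match on an untokenized field; constant score because ranking
        // is irrelevant when the index is used as a lookup table.
        static Query queryField(String fieldName, String value) {
            return new ConstantScoreQuery(new TermQuery(new Term(fieldName, value)));
        }
    }
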
@@ -55,7 +55,7 @@ public ContentSizeAnnotationPostProcessor() {
* @param crawlURI URI to add annotation for if successful.
* @throws ArgumentNotValid if crawlURI is null.
* @throws InterruptedException never.
- * @see Processor#innerProcess(org.archive.crawler.datamodel.CrawlURI)
+ * @see Processor
*/
protected void innerProcess(CrawlURI crawlURI) throws InterruptedException {
ArgumentNotValid.checkNotNull(crawlURI, "CrawlURI crawlURI");
@@ -54,15 +54,11 @@ public class DomainnameQueueAssignmentPolicy
/** Return a key for queue names based on domain names (last two parts of
* host name) or IP address. They key may include a #<portnr> at the end.
*
- * @param controller The controller the crawl is running on.
- * @param cauri A potential URI.
+ * @param basis A potential URI.
* @return a class key (really an arbitrary string), one of <domainOrIP>,
* <domainOrIP>#<port>, or "default...".
- * @see HostnameQueueAssignmentPolicy#getClassKey(
- * org.archive.crawler.framework.CrawlController,
- * org.archive.crawler.datamodel.CandidateURI)
+ * @see HostnameQueueAssignmentPolicy#getClassKey(org.archive.modules.CrawlURI)
*/

@Override
protected String getCoreKey(UURI basis) {
String candidate;
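
To make the queue-key rule in the javadoc concrete (last two labels of the host name, or the raw IP, optionally suffixed with #<portnr>), a standalone sketch; real Heritrix policies operate on UURI/CrawlURI objects rather than bare strings, and the port handling here is an assumption.

    class QueueKeySketch {
        static String queueKey(String host, int port) {
            String domainOrIp;
            if (host.matches("\\d{1,3}(\\.\\d{1,3}){3}")) {
                domainOrIp = host; // literal IPv4 address is used whole
            } else {
                String[] labels = host.split("\\.");
                int n = labels.length;
                domainOrIp = (n <= 2) ? host : labels[n - 2] + "." + labels[n - 1];
            }
            return (port > 0) ? domainOrIp + "#" + port : domainOrIp;
        }

        public static void main(String[] args) {
            System.out.println(queueKey("www.netarkivet.dk", -1)); // netarkivet.dk
            System.out.println(queueKey("10.0.0.1", 8080));        // 10.0.0.1#8080
        }
    }
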
@@ -64,13 +64,10 @@ public class SeedUriDomainnameQueueAssignmentPolicy
/** Return a key for queue names based on domain names (last two parts of
* host name) or IP address. They key may include a #<portnr> at the end.
*
- * @param controller The controller the crawl is running on.
* @param cauri A potential URI.
* @return a class key (really an arbitrary string), one of <domainOrIP>,
* <domainOrIP>#<port>, or "default...".
- * @see HostnameQueueAssignmentPolicy#getClassKey(
- * org.archive.crawler.framework.CrawlController,
- * org.archive.crawler.datamodel.CandidateURI)
+ * @see HostnameQueueAssignmentPolicy#getClassKey(CrawlURI)
*/
public String getClassKey(CrawlURI cauri) {
String candidate;
@@ -926,7 +926,7 @@ protected void doTimestampAnalysis(CrawlURI curi, Document urlHit,
*
* @param fieldName name of the field to look in.
* @param value The value to query for
- * @returns A Query for the given value in the given field.
+ * @return A Query for the given value in the given field.
*/
protected Query queryField(String fieldName, String value) {
Query query = null;
@@ -316,7 +316,7 @@ value="<%= jobID %>"/>
%>
<a href="<%=link %>"><fmt:message key="show.job.0.harvesttemplate">
<fmt:param value="<%=job.getJobID()%>"/>
- </fmt:message></a>&nbsp;(<a href="<%=linkWithrequestedType %>">text/plain)</a>)
+ </fmt:message></a>&nbsp;(<a href="<%=linkWithrequestedType %>">text/plain</a>)


<%
