Merge remote-tracking branch 'origin/master'
csrster committed Jan 18, 2016
2 parents d442b63 + 65ee048 commit 68a87d4
Showing 17 changed files with 44 additions and 99 deletions.
@@ -218,9 +218,10 @@ public long writeToIndex(CrawlDataIterator dataIt, String mimefilter, boolean bl
}

/**
- * @param item
+ * Create Lucene Document for given CrawlDataItem.
+ * @param item A CrawlDataItem
* @param defaultOrigin
- * @return
+ * @return Lucene Document for the given CrawlDataItem
*/
private Document createDocument(CrawlDataItem item, String defaultOrigin) {
Document doc = new Document();
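
For context on the hunk above: a minimal sketch of what a createDocument along these lines might look like, assuming a recent Lucene API (StringField) and a simplified stand-in for CrawlDataItem; the field names and item accessors below are illustrative assumptions, not the class's actual code.

    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.document.StringField;

    /** Simplified stand-in for the real CrawlDataItem; these members are assumptions. */
    class ItemStub {
        final String url, contentDigest, origin;
        ItemStub(String url, String contentDigest, String origin) {
            this.url = url;
            this.contentDigest = contentDigest;
            this.origin = origin;
        }
    }

    class DocumentSketch {
        /** Build a Lucene Document with one untokenized, stored field per item property. */
        static Document createDocument(ItemStub item, String defaultOrigin) {
            Document doc = new Document();
            doc.add(new StringField("url", item.url, Field.Store.YES));
            doc.add(new StringField("digest", item.contentDigest, Field.Store.YES));
            // Fall back to the default origin when the item carries none.
            String origin = (item.origin != null) ? item.origin : defaultOrigin;
            doc.add(new StringField("origin", origin, Field.Store.YES));
            return doc;
        }
    }
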
@@ -46,7 +46,8 @@
import dk.netarkivet.common.utils.Settings;

/**
- * Utilities to allow testing databases. //FIXME: Rename without Test as these are not specifically test related.
+ * Utilities to allow testing databases.
+ * FIXME: Rename without Test as these are not specifically test related.
*/
public class DatabaseTestUtils {

@@ -60,7 +61,7 @@ public class DatabaseTestUtils {
*
* @param resourcePath A file that contains a test database.
* @param dbCreationDir
- * @return a connection to the database stored in the given file
+ *
*/
public static void createDatabase(String resourcePath, String dbname, File dbCreationDir) throws Exception {
Settings.set(CommonSettings.DB_MACHINE, "");
@@ -131,7 +132,6 @@ private static void applyStatementsInInputStream(Connection connection, InputStr
*
* @param resourcePath A file that contains a test database.
* @param dbCreationDir
- * @return a connection to the database stored in the given file
*/
public static void createDatabase(String resourcePath, File dbCreationDir) throws Exception {
createDatabase(resourcePath, "derivenamefromresource", dbCreationDir);
@@ -143,7 +143,6 @@ public static void createDatabase(String resourcePath, File dbCreationDir) throw
*
* @param resourcePath Location of the sql files to create and populate the test DB.
* @param dbCreationDir
- * @return a connection to the given sample harvest definition database
*/
public static void createHDDB(String resourcePath, String dbname, File dbCreationDir) throws Exception {
createDatabase(resourcePath, dbname, dbCreationDir);
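
For context: a hedged usage sketch of the helpers documented above, now void-returning; it relies only on the signatures visible in this diff, and the resource paths and database name are invented.

    import java.io.File;

    class DatabaseTestUtilsUsage {
        public static void main(String[] args) throws Exception {
            File dbCreationDir = new File("target/testdb"); // hypothetical directory
            // Explicit database name:
            DatabaseTestUtils.createDatabase("tests/sampledb.sql", "fullhddb", dbCreationDir);
            // Database name derived from the resource:
            DatabaseTestUtils.createDatabase("tests/sampledb.sql", dbCreationDir);
            // Sample harvest definition database:
            DatabaseTestUtils.createHDDB("tests/hddb.sql", "fullhddb", dbCreationDir);
        }
    }
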
@@ -102,10 +102,9 @@ public class H3HeritrixTemplate extends HeritrixTemplate implements Serializable
/**
* Constructor for HeritrixTemplate class.
*
- * @param doc the order.xml
- * @param verify If true, verifies if the given dom4j Document contains the elements required by our software.
- * @throws ArgumentNotValid if doc is null, or verify is true and doc does not obey the constraints required by our
- * software.
+ * @param template_id The persistent id of the template in the database
+ * @param template The template as String object
+ * @throws ArgumentNotValid if template is null.
*/
public H3HeritrixTemplate(long template_id, String template) {
ArgumentNotValid.checkNotNull(template, "String template");
@@ -264,7 +263,7 @@ public void configureQuotaEnforcer(boolean maxObjectsIsSetByQuotaEnforcer,
* Make sure that Heritrix will archive its data in the chosen archiveFormat.
*
* @param archiveFormat the chosen archiveformat ('arc' or 'warc' supported)
- * @throw ArgumentNotValid If the chosen archiveFormat is not supported.
+ * @throws ArgumentNotValid If the chosen archiveFormat is not supported.
*/
@Override
public void setArchiveFormat(String archiveFormat) {
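
For context: a brief usage sketch of the constructor and setArchiveFormat as now documented; the template id, file path, and charset handling are invented for illustration.

    import java.nio.charset.StandardCharsets;
    import java.nio.file.Files;
    import java.nio.file.Paths;

    class TemplateUsage {
        public static void main(String[] args) throws Exception {
            // Load an order.xml-style template as a String (path and id are made up).
            String template = new String(Files.readAllBytes(Paths.get("order.xml")), StandardCharsets.UTF_8);
            H3HeritrixTemplate t = new H3HeritrixTemplate(42L, template); // ArgumentNotValid if template is null
            t.setArchiveFormat("warc"); // only 'arc' and 'warc' are supported
        }
    }
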
@@ -124,9 +124,8 @@ public void setIsActive(boolean isActive) {
* global traps.
*
* @param elementName The name of the added element.
- * @param crawlerTraps A list of crawler trap regular expressions to add to this job.
+ * @param crawlertraps A list of crawler trap regular expressions to add to this job.
*/

public abstract void insertCrawlerTraps(String elementName, List<String> crawlertraps);

/**
@@ -186,6 +185,12 @@ public void editOrderXMLAddPerDomainCrawlerTraps(DomainConfiguration cfg) {
public abstract void writeToFile(File orderXmlFile);
public abstract void setRecoverlogNode(File recoverlogGzFile);

+ /**
+ * Construct a H1HeritrixTemplate or H3HeritrixTemplate based on the signature of the given string.
+ * @param template_id The id of the template
+ * @param templateAsString The template as a String object
+ * @return a HeritrixTemplate based on the signature of the given string.
+ */
public static HeritrixTemplate getTemplateFromString(long template_id, String templateAsString){
if (templateAsString.contains(H1_SIGNATURE)) {
try {
@@ -215,8 +220,10 @@ public static HeritrixTemplate read(File orderXmlFile){
}

/**
- * Read the template using the given Reader
- * @param reader A given Reader
+ * Read the template using the given Reader.
+ *
+ * @param template_id The id of the template
+ * @param orderTemplateReader A given Reader to read a template
* @return a HeritrixTemplate object
*/
public static HeritrixTemplate read(long template_id, Reader orderTemplateReader) {
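
To illustrate the signature-based dispatch that the new getTemplateFromString javadoc describes, a standalone sketch. The signature constants below are placeholders; the real H1_SIGNATURE/H3_SIGNATURE values are defined in HeritrixTemplate, and only the H1 branch is visible in this diff.

    class TemplateDispatchSketch {
        static final String H1_SIGNATURE = "<crawl-order"; // placeholder, not the real value
        static final String H3_SIGNATURE = "<beans";        // placeholder, not the real value

        static String templateKind(String templateAsString) {
            if (templateAsString.contains(H1_SIGNATURE)) {
                return "H1"; // the real method constructs a H1HeritrixTemplate here
            } else if (templateAsString.contains(H3_SIGNATURE)) {
                return "H3"; // ...and presumably a H3HeritrixTemplate here
            }
            throw new IllegalArgumentException("Unknown template signature");
        }
    }
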
@@ -92,7 +92,7 @@ public SeedList(String name, String seedsAsString) {
*
* @param url The url to check
* @return true, if it is accepted
- * @see {@link HarvesterSettings#VALID_SEED_REGEX}.
+ * @see HarvesterSettings#VALID_SEED_REGEX
*/
private boolean isAcceptableURL(String url) {
Pattern validSeedPattern = Pattern.compile(Settings.get(HarvesterSettings.VALID_SEED_REGEX));
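
For context on isAcceptableURL above: a self-contained sketch of regex-based seed validation. The real pattern comes from Settings.get(HarvesterSettings.VALID_SEED_REGEX); the regex below is a made-up stand-in.

    import java.util.regex.Pattern;

    class SeedCheckSketch {
        // Stand-in for the configurable VALID_SEED_REGEX; the real default differs.
        static final Pattern VALID_SEED = Pattern.compile("^(https?|ftp)://\\S+$");

        static boolean isAcceptableURL(String url) {
            return VALID_SEED.matcher(url).matches();
        }

        public static void main(String[] args) {
            System.out.println(isAcceptableURL("http://netarkivet.dk/")); // true
            System.out.println(isAcceptableURL("not a url"));             // false
        }
    }
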
@@ -60,7 +60,7 @@ public AbstractHarvestReport() {
/**
* Constructor from DomainStatsReports.
*
- * @param files the result of parsing the crawl.log for domain statistics
+ * @param dsr the result of parsing the crawl.log for domain statistics
*/
public AbstractHarvestReport(DomainStatsReport dsr) {
ArgumentNotValid.checkNotNull(dsr, "DomainStatsReport dsr");
@@ -61,7 +61,7 @@ public class BnfHarvestReport extends AbstractHarvestReport{
/**
* Constructor for this class.
*
- * @param files A HeritrixFiles object.
+ * @param dsr A DomainStatsReport
* @throws IOFailure If the processing of the files goes wrong
*/
public BnfHarvestReport(DomainStatsReport dsr) throws IOFailure {
@@ -40,7 +40,7 @@
import dk.netarkivet.harvester.datamodel.StopReason;

/**
- * Class responsible for generating a domain harvest report from crawl logs created by Heritrix and presenting the
+ * Class responsible for representing a domain harvest report from crawl logs created by Heritrix and presenting the
* relevant information to clients.
*/
@SuppressWarnings({"serial"})
@@ -50,16 +50,7 @@ public class LegacyHarvestReport extends AbstractHarvestReport {
private static final Logger log = LoggerFactory.getLogger(LegacyHarvestReport.class);

/**
- * The constructor gets the data in a crawl.log file, and parses the file. The crawl.log is described in the
- * Heritrix user-manual, section 8.2.1: http://crawler.archive.org/articles/user_manual/analysis.html#logs Note:
- * Invalid lines are logged and then ignored.
- * <p>
- * Each url listed in the file is assigned to a domain, the total object count and byte count per domain is
- * calculated. Finally, a StopReason is found for each domain: When the response is CrawlURI.S_BLOCKED_BY_QUOTA (
- * currently = -5003), the StopReason is set to StopReason.SIZE_LIMIT, if the annotation equals "Q:group-max-all-kb"
- * or StopReason.OBJECT_LIMIT, if the annotation equals "Q:group-max-fetch-successes".
- *
- * @param hFiles the Heritrix reports and logs.
+ * @dsr a DomainStatsReport for a harvest
*/
public LegacyHarvestReport(DomainStatsReport dsr) {
super(dsr);
@@ -83,7 +83,6 @@ public static void processRequest(PageContext context, I18n i18n) throws Forward
*
* @param context the context of the servlet request triggering this action.
* @param i18n the internationalisation to use for presenting the results.
- * @return true, if we should continue our rendering of the page, false otherwise
*/
protected abstract void doAction(PageContext context, I18n i18n);

@@ -51,6 +51,14 @@

/**
* Base implementation for a harvest report.
+ * The constructor gets the data in a crawl.log file, and parses the file. The crawl.log is described in the
+ * Heritrix user-manual, section 8.2.1: http://crawler.archive.org/articles/user_manual/analysis.html#logs Note:
+ * Invalid lines are logged and then ignored.
+ * <p>
+ * Each url listed in the file is assigned to a domain, the total object count and byte count per domain is
+ * calculated. Finally, a StopReason is found for each domain: When the response is CrawlURI.S_BLOCKED_BY_QUOTA (
+ * currently = -5003), the StopReason is set to StopReason.SIZE_LIMIT, if the annotation equals "Q:group-max-all-kb"
+ * or StopReason.OBJECT_LIMIT, if the annotation equals "Q:group-max-fetch-successes".
*/
@SuppressWarnings({"serial"})
public class HarvestReportGenerator {
@@ -380,8 +388,7 @@ private String getDomainNameFromURIString(String uriAsString) throws URISyntaxEx
}

/**
- *
- * @return
+ * @return default stopReason
*/
public StopReason getDefaultStopReason() {
return defaultStopReason;
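
The quota rule spelled out in the relocated javadoc is compact enough to sketch. Below, the -5003 constant and the two annotation strings are taken from the javadoc itself; the method shape and enum are simplified stand-ins for the real StopReason handling.

    class StopReasonSketch {
        enum StopReason { DOWNLOAD_COMPLETE, SIZE_LIMIT, OBJECT_LIMIT }

        static final int S_BLOCKED_BY_QUOTA = -5003; // CrawlURI.S_BLOCKED_BY_QUOTA per the javadoc

        static StopReason stopReasonFor(int fetchStatus, String annotation) {
            if (fetchStatus == S_BLOCKED_BY_QUOTA) {
                if ("Q:group-max-all-kb".equals(annotation)) {
                    return StopReason.SIZE_LIMIT;   // byte quota reached
                }
                if ("Q:group-max-fetch-successes".equals(annotation)) {
                    return StopReason.OBJECT_LIMIT; // object-count quota reached
                }
            }
            return StopReason.DOWNLOAD_COMPLETE;     // no quota involved
        }
    }
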
@@ -36,9 +36,6 @@
import java.util.logging.Level;
import java.util.logging.Logger;

- import javax.management.AttributeNotFoundException;
- import javax.management.MBeanException;
- import javax.management.ReflectionException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
@@ -196,9 +193,7 @@ public WARCWriterProcessor(final String name) {
e.setExpertSetting(true);

// Add map setting to add NAS metadata to WarcInfo records.

e = addElementToDefinition(new MapType(ATTR_METADATA_ITEMS, "Metadata items.", String.class));
- //e = addElementToDefinition(new StringList(ATTR_METADATA_ITEMS, "Metadata items."));
e.setOverrideable(true);
e.setExpertSetting(true);
}
@@ -207,45 +202,7 @@ protected void setupPool(final AtomicInteger serialNo) {
setPool(new WARCWriterPool(serialNo, this, getPoolMaximumActive(), getPoolMaximumWait()));
}

- /**
-  * @return Metadata inputs as convenient map. Returns null if no metadata items.
-  * @throws AttributeNotFoundException
-  * @throws ReflectionException
-  * @throws MBeanException
-  */
- /*
- public Map<String,Object> getMetadataItems() throws AttributeNotFoundException, MBeanException, ReflectionException {
-     Map<String,Object> result = null;
-     MapType items = (MapType)getAttribute(ATTR_METADATA_ITEMS);
-     if (items != null) {
-         for (Iterator i = items.iterator(null); i.hasNext();) {
-             Attribute a = (Attribute)i.next();
-             if (result == null) {
-                 result = new HashMap<String,Object>();
-             }
-             result.put(a.getName(), a.getValue());
-         }
-     }
-     return result;
- }
- */
-
- @SuppressWarnings("unchecked")
- /*
- public List<String> getMetadataItems() {
-     ArrayList<String> results = new ArrayList<String>();
-     Object obj = getAttributeUnchecked(ATTR_METADATA_ITEMS);
-     if (obj != null) {
-         List list = (StringList)obj;
-         for (Iterator i = list.iterator(); i.hasNext();) {
-             String str = (String)i.next();
-             results.add(str);
-         }
-     }
-     return results;
- }
- */


/**
* Writes a CrawlURI and its associated data to store file.
* <p>
@@ -711,15 +668,7 @@ protected String getFirstrecordBody(File orderFile) {
} catch (XPathExpressionException e) {
logger.log(Level.WARNING, "Error obtaining metadata items", e);
}
- /* catch (AttributeNotFoundException e) {
-     logger.log(Level.WARNING, "Error obtaining warcinfo", e);
- } catch (MBeanException e) {
-     logger.log(Level.WARNING, "Error obtaining warcinfo", e);
- } catch (ReflectionException e) {
-     logger.log(Level.WARNING, "Error obtaining warcinfo", e);
- }
- */


// add fields from harvesInfo.xml version 0.4
/*
* <harvestInfo> <version>0.4</version> <jobId>1</jobId> <priority>HIGHPRIORITY</priority>
@@ -871,7 +871,7 @@ protected void doTimestampAnalysis(CrawlURI curi, Document urlHit, Statistics cu
*
* @param fieldName name of the field to look in.
* @param value The value to query for
- * @returns A Query for the given value in the given field.
+ * @return A Query for the given value in the given field.
*/
protected Query queryField(String fieldName, String value) {
Query query = null;
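
As background for the @return fix above: one plausible shape for queryField, assuming a Lucene-style exact-match term query. This is a sketch, not the class's actual implementation.

    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.ConstantScoreQuery;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.TermQuery;

    class QueryFieldSketch {
        // Exact match on an untokenized field; constant score because ranking
        // is irrelevant when the index is used as a lookup table.
        static Query queryField(String fieldName, String value) {
            return new ConstantScoreQuery(new TermQuery(new Term(fieldName, value)));
        }
    }
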
@@ -55,7 +55,7 @@ public ContentSizeAnnotationPostProcessor() {
* @param crawlURI URI to add annotation for if successful.
* @throws ArgumentNotValid if crawlURI is null.
* @throws InterruptedException never.
- * @see Processor#innerProcess(org.archive.crawler.datamodel.CrawlURI)
+ * @see Processor
*/
protected void innerProcess(CrawlURI crawlURI) throws InterruptedException {
ArgumentNotValid.checkNotNull(crawlURI, "CrawlURI crawlURI");
@@ -54,15 +54,11 @@ public class DomainnameQueueAssignmentPolicy
/** Return a key for queue names based on domain names (last two parts of
* host name) or IP address. They key may include a #<portnr> at the end.
*
- * @param controller The controller the crawl is running on.
- * @param cauri A potential URI.
+ * @param basis A potential URI.
* @return a class key (really an arbitrary string), one of <domainOrIP>,
* <domainOrIP>#<port>, or "default...".
- * @see HostnameQueueAssignmentPolicy#getClassKey(
- * org.archive.crawler.framework.CrawlController,
- * org.archive.crawler.datamodel.CandidateURI)
+ * @see HostnameQueueAssignmentPolicy#getClassKey(org.archive.modules.CrawlURI)
*/

@Override
protected String getCoreKey(UURI basis) {
String candidate;
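
To make the queue-key rule in the javadoc concrete (last two labels of the host name, or the raw IP, optionally suffixed with #<portnr>), a standalone sketch; real Heritrix policies operate on UURI/CrawlURI objects rather than bare strings, and the port handling here is an assumption.

    class QueueKeySketch {
        static String queueKey(String host, int port) {
            String domainOrIp;
            if (host.matches("\\d{1,3}(\\.\\d{1,3}){3}")) {
                domainOrIp = host; // literal IPv4 address is used whole
            } else {
                String[] labels = host.split("\\.");
                int n = labels.length;
                domainOrIp = (n <= 2) ? host : labels[n - 2] + "." + labels[n - 1];
            }
            return (port > 0) ? domainOrIp + "#" + port : domainOrIp;
        }

        public static void main(String[] args) {
            System.out.println(queueKey("www.netarkivet.dk", -1)); // netarkivet.dk
            System.out.println(queueKey("10.0.0.1", 8080));        // 10.0.0.1#8080
        }
    }
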
@@ -64,13 +64,10 @@ public class SeedUriDomainnameQueueAssignmentPolicy
/** Return a key for queue names based on domain names (last two parts of
* host name) or IP address. They key may include a #<portnr> at the end.
*
- * @param controller The controller the crawl is running on.
* @param cauri A potential URI.
* @return a class key (really an arbitrary string), one of <domainOrIP>,
* <domainOrIP>#<port>, or "default...".
- * @see HostnameQueueAssignmentPolicy#getClassKey(
- * org.archive.crawler.framework.CrawlController,
- * org.archive.crawler.datamodel.CandidateURI)
+ * @see HostnameQueueAssignmentPolicy#getClassKey(CrawlURI)
*/
public String getClassKey(CrawlURI cauri) {
String candidate;
@@ -926,7 +926,7 @@ protected void doTimestampAnalysis(CrawlURI curi, Document urlHit,
*
* @param fieldName name of the field to look in.
* @param value The value to query for
- * @returns A Query for the given value in the given field.
+ * @return A Query for the given value in the given field.
*/
protected Query queryField(String fieldName, String value) {
Query query = null;
@@ -316,7 +316,7 @@ value="<%= jobID %>"/>
%>
<a href="<%=link %>"><fmt:message key="show.job.0.harvesttemplate">
<fmt:param value="<%=job.getJobID()%>"/>
- </fmt:message></a>&nbsp;(<a href="<%=linkWithrequestedType %>">text/plain)</a>)
+ </fmt:message></a>&nbsp;(<a href="<%=linkWithrequestedType %>">text/plain</a>)


<%
