Skip to content

Commit

Permalink
NAS-2793 Made two new parameters instead of hardcoding the rabbitmq u…
Browse files Browse the repository at this point in the history
…rl and the regular expression to limit search pages
  • Loading branch information
Knud Åge Hansen committed Aug 31, 2018
1 parent 5f17398 commit 35c7c05
Show file tree
Hide file tree
Showing 4 changed files with 30 additions and 21 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,8 @@ public void doOneCrawl(Job job, String origHarvestName, String origHarvestDesc,
IOFailure {

final String umbra = "UMBRA";
String rabbitMQUrl = "amqp://guest:guest@activemq:5672/%2f";
String limitSearchRegEx = "^$|.*L";
ArgumentNotValid.checkNotNull(job, "job");
ArgumentNotValid.checkNotNull(metadata, "metadata");

Expand All @@ -229,7 +231,7 @@ public void doOneCrawl(Job job, String origHarvestName, String origHarvestDesc,
{
log.info("Since we now are sure that it is an umbra channel we can insert umbra information");
HeritrixTemplate ht = job.getOrderXMLdoc();
ht.insertUmbrabean(job);
ht.insertUmbrabean(job, rabbitMQUrl, limitSearchRegEx);
}

DoOneCrawlMessage nMsg = new DoOneCrawlMessage(job, HarvesterChannels.getHarvestJobChannelId(channel),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -390,19 +390,19 @@ public static void editOrderXMLAddCrawlerTraps(Document orderXMLdoc, String elem
}
}

/**
* Updates the order.xml to include a MatchesListRegExpDecideRule for each crawlertrap associated with the given
* DomainConfiguration.
* <p>
* The added nodes have the form
* <p>
* <newObject name="domain.dk" class="org.archive.crawler.deciderules.MatchesListRegExpDecideRule"> <string
* name="decision">REJECT</string> <string name="list-logic">OR</string> <stringList name="regexp-list">
* <string>theFirstRegexp</string> <string>theSecondRegexp</string> </stringList> </newObject>
*
* @param cfg The DomainConfiguration for which to generate crawler trap deciderules
* @throws IllegalState If unable to update order.xml due to wrong order.xml format
*/
// /**
// * Updates the order.xml to include a MatchesListRegExpDecideRule for each crawlertrap associated with the given
// * DomainConfiguration.
// * <p>
// * The added nodes have the form
// * <p>
// * <newObject name="domain.dk" class="org.archive.crawler.deciderules.MatchesListRegExpDecideRule"> <string
// * name="decision">REJECT</string> <string name="list-logic">OR</string> <stringList name="regexp-list">
// * <string>theFirstRegexp</string> <string>theSecondRegexp</string> </stringList> </newObject>
// *
// * @param cfg The DomainConfiguration for which to generate crawler trap deciderules
// * @throws IllegalState If unable to update order.xml due to wrong order.xml format
// */
// FIXME REMOVE IF NOT USED
/*
public static void editOrderXMLAddPerDomainCrawlerTraps(Document orderXmlDoc, DomainConfiguration cfg) {
Expand Down Expand Up @@ -919,4 +919,10 @@ public void writeTemplate(JspWriter out) throws IOFailure {

}

}
/**
 * Umbra hook for H1 templates. This implementation inserts nothing into the template:
 * none of the arguments are used and only a debug message is logged.
 *
 * @param aJob the job for the current harvest (unused here)
 * @param rabbitMQUrl the RabbitMQ URL (unused here)
 * @param limitSearchRegEx the regular expression limiting the umbra search (unused here)
 */
@Override
public void insertUmbrabean(Job aJob, String rabbitMQUrl, String limitSearchRegEx) {
    // Deliberate no-op apart from the trace below.
    final String message = "In H1 templates we don't do umbra search.";
    log.debug(message);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -234,10 +234,11 @@ public boolean isValid() {
}

@Override
public void insertUmbrabean(Job aJob)
public void insertUmbrabean(Job aJob, String rabbitMQUrl, String limitSearchRegEx)
{
String tmp = template;
this.template = tmp.replace(UMBRA_BEAN_IN_SIMPLEOVERRIDES_BEAN_PLACEHOLDER, getUmbraBeanInformationInSimpleoverridesBean(aJob));
this.template = tmp.replace(UMBRA_BEAN_IN_SIMPLEOVERRIDES_BEAN_PLACEHOLDER,
getUmbraBeanInformationInSimpleoverridesBean(aJob, rabbitMQUrl, limitSearchRegEx));
this.template = tmp.replace(UMBRA_BEAN_PLACEHOLDER, getUmbrabeanPlaceholder());
this.template = tmp.replace(AMQP_URLRECEIVER_PLACEHOLDER, getAmqpUrlreceiverPlaceholder());
this.template = tmp.replace(CALL_UMBRABEAN_PLACEHOLDER, getCallUmbrabean());
Expand All @@ -249,7 +250,7 @@ public void insertUmbrabean(Job aJob)
* Umbrabean text from the current harvest job that will replace the placeholder in the Simpleoverride bean
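* @param aJob The job for the current harvest
* @param rabbitMQUrl The RabbitMQ URL written as the value of umbraBean.amqpUri
* @param limitSearchRegEx The regular expression written as the value of umbraBean.shouldProcessRule.rules[1].regex
* @return The text that replaces the umbra placeholder in the Simpleoverride bean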
*/
public String getUmbraBeanInformationInSimpleoverridesBean(Job aJob) {
public String getUmbraBeanInformationInSimpleoverridesBean(Job aJob, String rabbitMQUrl, String limitSearchRegEx) {
// umbraBean.clientId=MySpecialJobName
// umbraBean.amqpUri=amqp://guest:guest@activemq:5672/%2f
// ## The following rule restricts umbra to processing only on seeds or links, leaving embeds and redirects
Expand All @@ -258,10 +259,10 @@ public String getUmbraBeanInformationInSimpleoverridesBean(Job aJob) {

StringBuilder umbrabeanBuilder = new StringBuilder();
umbrabeanBuilder.append("umbraBean.clientId="+aJob.getJobID());
umbrabeanBuilder.append("umbraBean.amqpUri=amqp://guest:guest@activemq:5672/%2f");
umbrabeanBuilder.append("umbraBean.amqpUri="+rabbitMQUrl);
umbrabeanBuilder.append("## The following rule restricts umbra to processing only on seeds or links, leaving embeds and redirects");
umbrabeanBuilder.append("## to be handled by the browser itself");
umbrabeanBuilder.append("umbraBean.shouldProcessRule.rules[1].regex=^$|.*L");
umbrabeanBuilder.append("umbraBean.shouldProcessRule.rules[1].regex="+limitSearchRegEx);

return umbrabeanBuilder.toString();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,6 @@ public abstract void insertWarcInfoMetadata(Job ajob,
* Method to add beans in case the job is an umbra job.
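* @param ajob a HarvestJob for the current harvest
* @param rabbitMQUrl the RabbitMQ URL to use for the umbra bean
* @param limitSearchRegEx the regular expression used to limit the umbra search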
*/
public abstract void insertUmbrabean(Job ajob);
public abstract void insertUmbrabean(Job ajob, String rabbitMQUrl, String limitSearchRegEx);

}

0 comments on commit 35c7c05

Please sign in to comment.