Skip to content

Commit

Permalink
NAS-2887 add crawler traps for selective harvest
Browse files Browse the repository at this point in the history
  • Loading branch information
bnfleb committed Aug 2, 2023
1 parent 01b069f commit 0f1e3bc
Show file tree
Hide file tree
Showing 21 changed files with 252 additions and 25 deletions.
7 changes: 4 additions & 3 deletions deploy/deploy-core/scripts/derby/createfullhddb.sql
Expand Up @@ -70,9 +70,9 @@ insert into schemaversions ( tablename, version )
insert into schemaversions ( tablename, version )
values ( 'config_seedlists', 1);
insert into schemaversions ( tablename, version )
values ( 'harvestdefinitions', 3);
values ( 'harvestdefinitions', 4);
insert into schemaversions ( tablename, version )
values ( 'partialharvests', 1);
values ( 'partialharvests', 2);
insert into schemaversions ( tablename, version )
values ( 'fullharvests', 5);
insert into schemaversions ( tablename, version )
Expand Down Expand Up @@ -325,8 +325,9 @@ create table partialharvests (
harvest_id bigint not null primary key, -- Unique id for the selective/
-- event harvest definition
schedule_id bigint not null, -- Schedule for the selective/event harvest
nextdate timestamp -- Time when the selective/event harvest is to
nextdate timestamp, -- Time when the selective/event harvest is to
-- run next time
crawlertraps clob(64M) -- Regexp(s) for excluded urls.
);

create index partialharvestsnextdate ON partialharvests (nextdate);
Expand Down
8 changes: 5 additions & 3 deletions deploy/deploy-core/scripts/mysql/createfullhddb.mysql
Expand Up @@ -56,9 +56,9 @@ insert into schemaversions ( tablename, version )
insert into schemaversions ( tablename, version )
values ( 'config_seedlists', 1);
insert into schemaversions ( tablename, version )
values ( 'harvestdefinitions', 3);
values ( 'harvestdefinitions', 4);
insert into schemaversions ( tablename, version )
values ( 'partialharvests', 1);
values ( 'partialharvests', 2);
insert into schemaversions ( tablename, version )
values ( 'fullharvests', 5);
insert into schemaversions ( tablename, version )
Expand Down Expand Up @@ -201,6 +201,7 @@ create table harvestdefinitions (
submitted datetime not null,
isactive int not null,
edition bigint not null,
channel_id bigint,
audience varchar(100)
);

Expand All @@ -220,7 +221,8 @@ create table fullharvests (
-- Selective/event harvest definitions: one row per partial harvest.
create table partialharvests (
harvest_id bigint not null primary key, -- Unique id for the selective/event harvest definition
schedule_id bigint not null, -- Schedule for the selective/event harvest
nextdate datetime
nextdate datetime, -- Time when the selective/event harvest is to run next time
crawlertraps longtext -- Regexp(s) for excluded urls.
);

create index partialharvestsnextdate on partialharvests (nextdate);
Expand Down
Expand Up @@ -70,7 +70,7 @@ INSERT INTO schemaversions ( tablename, version )
INSERT INTO schemaversions ( tablename, version )
VALUES ( 'harvestdefinitions', 4);
INSERT INTO schemaversions ( tablename, version )
VALUES ( 'partialharvests', 1);
VALUES ( 'partialharvests', 2);
INSERT INTO schemaversions ( tablename, version )
VALUES ( 'fullharvests', 5);
INSERT INTO schemaversions ( tablename, version )
Expand Down Expand Up @@ -294,7 +294,8 @@ GRANT SELECT,INSERT,UPDATE,DELETE ON TABLE fullharvests TO netarchivesuite;
-- Selective/event harvest definitions: one row per partial harvest.
CREATE TABLE partialharvests (
harvest_id bigint NOT NULL PRIMARY KEY, -- Unique id for the selective/event harvest definition
schedule_id bigint NOT NULL, -- Schedule for the selective/event harvest
nextdate timestamp
nextdate timestamp, -- Time when the selective/event harvest is to run next time
crawlertraps text -- Regexp(s) for excluded urls.
);

CREATE INDEX partialharvestsnextdate on partialharvests (nextdate) TABLESPACE tsindex;
Expand Down
Expand Up @@ -158,6 +158,9 @@ public Job getNewJob(HarvestDefinition harvest, DomainConfiguration cfg) {
newJob = new Job(harvest.getOid(), cfg, orderXMLdoc, channel, harvest.getMaxCountObjects(),
harvest.getMaxBytes(), ((FullHarvest) harvest).getMaxJobRunningTime(), harvest.getNumEvents());
} else {
// add specific crawlertraps for the harvest
editOrderXMLAddPerHarvestCrawlerTraps(orderXMLdoc, (PartialHarvest) harvest);

newJob = new Job(harvest.getOid(), cfg, orderXMLdoc, channel, Constants.HERITRIX_MAXOBJECTS_INFINITY,
Constants.HERITRIX_MAXBYTES_INFINITY, Constants.HERITRIX_MAXJOBRUNNINGTIME_INFINITY,
harvest.getNumEvents());
Expand Down Expand Up @@ -290,4 +293,9 @@ public boolean ignoreConfiguration(DomainConfiguration cfg) {
return noValidSeeds;
}

/**
 * Inserts the crawler traps attached to a selective harvest into the given order template.
 *
 * @param orderXMLdoc the template that receives the crawler traps
 * @param focused the partial harvest whose name and crawler traps are passed to the template
 * @return the same template instance that was passed in
 */
private HeritrixTemplate editOrderXMLAddPerHarvestCrawlerTraps(HeritrixTemplate orderXMLdoc, PartialHarvest focused) {
    final int trapCount = focused.getCrawlerTraps().size();
    log.info("Inserting {} crawlertraps for harvest '{}' into the template", trapCount, focused.getName());

    // The traps are registered in the template under the harvest's name.
    orderXMLdoc.insertCrawlerTraps(focused.getName(), focused.getCrawlerTraps());
    return orderXMLdoc;
}
}
Expand Up @@ -35,6 +35,7 @@
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
Expand Down Expand Up @@ -195,7 +196,7 @@ public void testNullJob() {

private SparsePartialHarvest createDefaultSparsePartialHarvest() {
SparsePartialHarvest harvest = new SparsePartialHarvest(9L, "TestHarvest", "A comment", 2, new Date(), true, 3,
"schedule", new Date(), "The audience", SELECTIVE_HARVEST_CHANNEL.getId());
"schedule", new Date(), "The audience", SELECTIVE_HARVEST_CHANNEL.getId(), Collections.emptyList());
return harvest;
}

Expand Down
Expand Up @@ -98,6 +98,8 @@ deleteconfig:
If present, it must contain a domain name followed by a colon (:) and
a configuration name on that domain. The given configuration from the
given domain is deleted from the harvest.
crawlertraps:
String (possibly empty) of crawlertraps for this harvest
DomainConfigurations are posted as pairs
Expand Down Expand Up @@ -503,6 +505,49 @@ if (hdd != null) {
<input type="submit" value="<fmt:message key="add.domains"/>"/>
<br/>
<br/>

<div id="crawlertraps">
<%
%>
<table class="selection_table">
<tr>
<th><fmt:message key="crawler.traps"/></th>
</tr>
<tr><td><a id="showCrawlertraps" href="" onclick="showHideCrawlertraps();return false;">
<span id="showCrawlerTrap"><fmt:message key="show.crawler.traps"/></span>
<span id="hideCrawlerTrap"><fmt:message key="hide.crawler.traps"/></span>
</a></td></tr>
<tr id="crawlertrapRow">
<td>
<textarea rows="<%=Constants.CRAWLERTRAPS_ROWS%>" cols="<%=Constants.CRAWLERTRAPS_COLUMNS%>"
name="<%=Constants.CRAWLERTRAPS_PARAM%>"><%=hdd != null ?
HTMLUtils.escapeHtmlValues(StringUtils.conjoin("\n", hdd.getCrawlerTraps())) : ""
%></textarea>
</td>
</tr>
</table>
</div>

<script type="text/javascript">
document.getElementById("crawlertrapRow").style.display = "none";
document.getElementById("hideCrawlerTrap").style.display = "none";
/**
 * Toggles the crawler-traps textarea row and swaps the "show"/"hide" link labels.
 *
 * The current state is read from the "hide" label: when that label is hidden,
 * the row is considered collapsed and gets expanded; otherwise it is collapsed.
 */
function showHideCrawlertraps() {
    var trapRow = document.getElementById("crawlertrapRow");
    var hideLabel = document.getElementById("hideCrawlerTrap");
    var showLabel = document.getElementById("showCrawlerTrap");

    var collapsed = hideLabel.style.display == "none";
    if (collapsed) {
        // show crawler traps
        // Clearing the inline style restores the element's default display
        // (table-row for a <tr>); forcing "block" takes the row out of table layout.
        trapRow.style.display = "";
        hideLabel.style.display = "block";
        showLabel.style.display = "none";
    } else {
        // hide crawler traps
        trapRow.style.display = "none";
        hideLabel.style.display = "none";
        showLabel.style.display = "block";
    }
}
</script>
<br/>
<br/>

<input type="submit" value="<fmt:message key="save"/>"
name="<%=Constants.SAVE_PARAM%>"/>
</form>
Expand Down
Expand Up @@ -159,6 +159,8 @@ public synchronized void updateTable(String tableName, int toVersion) {
upgradeEavTypeAttributeTable(currentVersion, toVersion);
} else if (tableName.equals(HarvesterDatabaseTables.EAVATTRIBUTE.getTablename())) {
upgradeEavAttributeTable(currentVersion, toVersion);
} else if (tableName.equals(HarvesterDatabaseTables.PARTIALHARVESTS.getTablename())) {
upgradePartialharvestsTable(currentVersion, toVersion);
} else {
// Add new if else when other tables need to be upgraded
throw new NotImplementedException("No method exists for migrating table '" + tableName + "' to version "
Expand Down Expand Up @@ -514,6 +516,19 @@ private void upgradeHarvestchannelTable(int currentVersion, int toVersion) {
}
}

/**
 * Upgrades the 'partialharvests' table one migration step at a time.
 *
 * @param currentVersion the version the table has before the upgrade
 * @param toVersion the version the table is requested to be upgraded to
 * @throws NotImplementedException if, after the known migration steps have run, the table is still not at the
 * version required by {@link HarvesterDatabaseTables#PARTIALHARVESTS}
 */
private void upgradePartialharvestsTable(int currentVersion, int toVersion) {
    // Use '>=' rather than '==' so that this step still runs as part of the chain
    // when a later migration raises the requested target version above 2.
    if (currentVersion == 1 && toVersion >= 2) {
        migratePartialharvestsv1tov2();
        currentVersion = 2;
    }
    // insert new migrations here
    if (currentVersion != HarvesterDatabaseTables.PARTIALHARVESTS.getRequiredVersion()) {
        throw new NotImplementedException("No method exists for migrating table '"
                + HarvesterDatabaseTables.PARTIALHARVESTS.getTablename() + "' from version " + currentVersion
                + " to version " + toVersion);
    }
}

protected abstract void createHarvestChannelTable();

/**
Expand Down Expand Up @@ -706,6 +721,12 @@ private void upgradeHarvestchannelTable(int currentVersion, int toVersion) {
*/
protected abstract void migrateOrderTemplatesTablev1tov2();

/**
* Migrates the 'partialharvests' table from version 1 to version 2 consisting of adding the crawlertraps
* field.
*/
protected abstract void migratePartialharvestsv1tov2();

/**
* Update all tables in the enum class {@link HarvesterDatabaseTables} to the required version. There is no attempt
* to undo the update.
Expand Down
Expand Up @@ -430,4 +430,10 @@ public void createEavAttributeTable(int toVersion) {
HarvestDBConnection.executeSql("derby", tableName, 1 );
}

/**
 * Derby migration of 'partialharvests' to version 2: adds the crawlertraps column as a CLOB(64M).
 */
@Override
protected void migratePartialharvestsv1tov2() {
    final String addCrawlertrapsColumn = "ALTER TABLE partialharvests ADD COLUMN crawlertraps CLOB(64M)";
    HarvestDBConnection.updateTable("partialharvests", 2, new String[] {addCrawlertrapsColumn});
}

}
Expand Up @@ -172,11 +172,12 @@ public synchronized Long create(HarvestDefinition harvestDefinition) {
long scheduleId = DBUtils.selectLongValue(connection,
"SELECT schedule_id FROM schedules WHERE name = ?", ph.getSchedule().getName());
try (PreparedStatement s = connection
.prepareStatement("INSERT INTO partialharvests ( harvest_id, schedule_id, nextdate ) "
+ "VALUES ( ?, ?, ? )");) {
.prepareStatement("INSERT INTO partialharvests ( harvest_id, schedule_id, nextdate, crawlertraps ) "
+ "VALUES ( ?, ?, ?, ? )");) {
s.setLong(1, id);
s.setLong(2, scheduleId);
DBUtils.setDateMaybeNull(s, 3, ph.getNextDate());
s.setString(4, StringUtils.conjoin("\n", ph.getCrawlerTraps()));
s.executeUpdate();
createHarvestConfigsEntries(connection, ph, id);
}
Expand Down Expand Up @@ -342,7 +343,8 @@ private HarvestDefinition read(Connection c, Long harvestDefinitionID) throws Un
+ " harvestdefinitions.numevents," + " harvestdefinitions.submitted,"
+ " harvestdefinitions.isactive," + " harvestdefinitions.edition,"
+ " harvestdefinitions.audience," + " schedules.name,"
+ " partialharvests.nextdate, " + " harvestdefinitions.channel_id "
+ " partialharvests.nextdate, " + " harvestdefinitions.channel_id, "
+ " partialharvests.crawlertraps "
+ "FROM harvestdefinitions, partialharvests, schedules" + " WHERE harvestdefinitions.harvest_id = ?"
+ " AND harvestdefinitions.harvest_id " + "= partialharvests.harvest_id"
+ " AND schedules.schedule_id " + "= partialharvests.schedule_id");
Expand All @@ -363,6 +365,7 @@ private HarvestDefinition read(Connection c, Long harvestDefinitionID) throws Un
final String scheduleName = res.getString(8);
final Date nextDate = DBUtils.getDateMaybeNull(res, 9);
final Long channelId = DBUtils.getLongMaybeNull(res, 10);
final String crawlertraps = res.getString(11);
s.close();
// Found partial harvest -- have to find configurations.
// To avoid holding on to the readlock while getting domains,
Expand Down Expand Up @@ -398,6 +401,10 @@ private HarvestDefinition read(Connection c, Long harvestDefinitionID) throws Un
if (channelId != null) {
ph.setChannelId(channelId);
}
boolean strictMode = false;
List<String> insertList = getCrawlertrapsList(crawlertraps);
log.trace("Found {} crawlertraps for harvest '{}' in database", insertList.size(), name);
ph.setCrawlerTraps(insertList, strictMode);

readExtendedFieldValues(ph);

Expand All @@ -411,6 +418,19 @@ private HarvestDefinition read(Connection c, Long harvestDefinitionID) throws Un
+ ExceptionUtils.getSQLExceptionCause(e), e);
}
}
/**
 * Splits a newline-separated crawler-trap string into a list of individual traps.
 * No regexp validation is done here, so illegal expressions do not cause an exception.
 *
 * @param crawlertraps the newline-separated traps, possibly null
 * @return a new list holding every non-empty line; empty when the argument is null
 */
private List<String> getCrawlertrapsList(String crawlertraps) {
    final List<String> result = new ArrayList<String>();
    if (crawlertraps == null) {
        return result;
    }
    for (String candidate : crawlertraps.split("\n")) {
        if (candidate.isEmpty()) {
            // Ignore empty traps (NAS-2480)
            continue;
        }
        result.add(candidate);
    }
    return result;
}

/**
* Update an existing harvest definition with new info.
Expand Down Expand Up @@ -477,11 +497,13 @@ public synchronized void update(HarvestDefinition hd) {
} else if (hd instanceof PartialHarvest) {
PartialHarvest ph = (PartialHarvest) hd;
s = c.prepareStatement("UPDATE partialharvests SET " + "schedule_id = "
+ "(SELECT schedule_id FROM schedules WHERE schedules.name = ?), " + "nextdate = ? "
+ "(SELECT schedule_id FROM schedules WHERE schedules.name = ?), " + "nextdate = ?, "
+ "crawlerTraps = ? "
+ "WHERE harvest_id = ?");
s.setString(1, ph.getSchedule().getName());
DBUtils.setDateMaybeNull(s, 2, ph.getNextDate());
s.setLong(3, ph.getOid());
s.setString(3, StringUtils.conjoin("\n", ph.getCrawlerTraps()));
s.setLong(4, ph.getOid());
rows = s.executeUpdate();
log.debug("{} partialharvests records updated", rows);
s.close();
Expand Down Expand Up @@ -862,7 +884,7 @@ public Iterable<SparsePartialHarvest> getSparsePartialHarvestDefinitions(boolean
+ " harvestdefinitions.comments," + " harvestdefinitions.numevents,"
+ " harvestdefinitions.submitted," + " harvestdefinitions.isactive,"
+ " harvestdefinitions.edition," + " schedules.name," + " partialharvests.nextdate, "
+ " harvestdefinitions.audience, " + " harvestdefinitions.channel_id "
+ " harvestdefinitions.audience, " + " harvestdefinitions.channel_id, "+ " partialharvests.crawlerTraps "
+ "FROM harvestdefinitions, partialharvests, schedules" + " WHERE harvestdefinitions.harvest_id "
+ " = partialharvests.harvest_id" + " AND (harvestdefinitions.isactive " + " = ?"
// This line is duplicated to allow selecting both active
Expand All @@ -877,7 +899,7 @@ public Iterable<SparsePartialHarvest> getSparsePartialHarvestDefinitions(boolean
SparsePartialHarvest sph = new SparsePartialHarvest(res.getLong(1), res.getString(2), res.getString(3),
res.getInt(4), new Date(res.getTimestamp(5).getTime()), res.getBoolean(6), res.getLong(7),
res.getString(8), DBUtils.getDateMaybeNull(res, 9), res.getString(10),
DBUtils.getLongMaybeNull(res, 11));
DBUtils.getLongMaybeNull(res, 11), getCrawlertrapsList(res.getString(12)));
harvests.add(sph);
}
return harvests;
Expand All @@ -902,7 +924,7 @@ public SparsePartialHarvest getSparsePartialHarvest(String harvestName) {
+ " harvestdefinitions.submitted," + " harvestdefinitions.isactive,"
+ " harvestdefinitions.edition," + " schedules.name,"
+ " partialharvests.nextdate, " + " harvestdefinitions.audience, "
+ " harvestdefinitions.channel_id "
+ " harvestdefinitions.channel_id, " + " partialharvests.crawlertraps "
+ "FROM harvestdefinitions, partialharvests, schedules" + " WHERE harvestdefinitions.name = ?"
+ " AND harvestdefinitions.harvest_id " + "= partialharvests.harvest_id"
+ " AND schedules.schedule_id " + "= partialharvests.schedule_id");) {
Expand All @@ -912,7 +934,7 @@ public SparsePartialHarvest getSparsePartialHarvest(String harvestName) {
SparsePartialHarvest sph = new SparsePartialHarvest(res.getLong(1), harvestName, res.getString(2),
res.getInt(3), new Date(res.getTimestamp(4).getTime()), res.getBoolean(5), res.getLong(6),
res.getString(7), DBUtils.getDateMaybeNull(res, 8), res.getString(9),
DBUtils.getLongMaybeNull(res, 10));
DBUtils.getLongMaybeNull(res, 10), getCrawlertrapsList(res.getString(11)));
sph.setExtendedFieldValues(getExtendedFieldValues(sph.getOid()));
return sph;
} else {
Expand Down
Expand Up @@ -177,7 +177,7 @@ public String getTablename() {
/** The table containing information about partial harvests. */
PARTIALHARVESTS {
static final String NAME = "partialharvests";
static final int REQUIRED_VERSION = 1;
static final int REQUIRED_VERSION = 2;

@Override
public int getRequiredVersion() {
Expand Down
Expand Up @@ -442,4 +442,10 @@ public void createEavAttributeTable(int toVersion) {
HarvestDBConnection.executeSql("mysql", tableName, 1 );
}

/**
 * MySQL migration of 'partialharvests' to version 2: adds the crawlertraps column as LONGTEXT.
 */
@Override
protected void migratePartialharvestsv1tov2() {
    final String addCrawlertrapsColumn = "ALTER TABLE partialharvests ADD COLUMN crawlertraps LONGTEXT";
    HarvestDBConnection.updateTable("partialharvests", 2, new String[] {addCrawlertrapsColumn});
}

}

0 comments on commit 0f1e3bc

Please sign in to comment.