Skip to content

Commit

Permalink
Small refactor and implemented harvestRecentFilenames
Browse files Browse the repository at this point in the history
  • Loading branch information
Bohlski committed Oct 13, 2020
1 parent 0ecd959 commit 199355b
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 39 deletions.
Expand Up @@ -22,15 +22,18 @@
*/
package dk.netarkivet.wayback.indexer;

import static dk.netarkivet.common.distribute.arcrepository.bitrepository.BitmagArcRepositoryClient.BITREPOSITORY_COLLECTIONID;
import static dk.netarkivet.common.distribute.arcrepository.bitrepository.BitmagArcRepositoryClient.BITREPOSITORY_USEPILLAR;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Date;
import java.util.List;
import java.util.Set;

import org.apache.commons.io.IOUtils;
import org.bitrepository.access.getfileids.GetFileIDsClient;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand All @@ -39,10 +42,9 @@
import dk.netarkivet.common.distribute.arcrepository.ArcRepositoryClientFactory;
import dk.netarkivet.common.distribute.arcrepository.BatchStatus;
import dk.netarkivet.common.distribute.arcrepository.PreservationArcRepositoryClient;
import dk.netarkivet.common.distribute.arcrepository.bitrepository.BitmagArcRepositoryClient;
import dk.netarkivet.common.distribute.arcrepository.bitrepository.Bitrepository;
import dk.netarkivet.common.distribute.bitrepository.action.getfileids.GetFileIDsAction;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.utils.BitmagUtils;
import dk.netarkivet.common.distribute.bitrepository.BitmagUtils;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.batch.DatedFileListJob;
import dk.netarkivet.common.utils.batch.FileListJob;
Expand All @@ -60,22 +62,9 @@ public class FileNameHarvester {
*/
public static synchronized void harvestAllFilenames() {
ArchiveFileDAO dao = new ArchiveFileDAO();
//TODO I believe that by default this only fetches max 10000 files. Look at the query interface
//in bitmag
if (Settings.getBoolean(CommonSettings.USING_HADOOP)) {
// Initialize connection to the bitrepository
Bitrepository bitrep = BitmagUtils.initBitrep();
String collection = Settings.get(BitmagArcRepositoryClient.BITREPOSITORY_COLLECTIONID);
List<String> fileNames = bitrep.getFileIds(collection, Settings.get(BitmagArcRepositoryClient.BITREPOSITORY_USEPILLAR));
if (fileNames != null) {
for (String fileName : fileNames) {
if (!dao.exists(fileName)) {
createArchiveFileInDB(fileName, dao);
}
}
} else {
log.info("No files found in collection '{}'", collection);
}
Set<String> fileNames = getFilesFromBitmagSince(new Date(0));
createFilesInDB(fileNames, dao);
} else {
PreservationArcRepositoryClient client = ArcRepositoryClientFactory.getPreservationInstance();
BatchStatus status = client.batch(new FileListJob(), Settings.get(WaybackSettings.WAYBACK_REPLICA));
Expand All @@ -89,33 +78,52 @@ public static synchronized void harvestAllFilenames() {
public static synchronized void harvestRecentFilenames() {
ArchiveFileDAO dao = new ArchiveFileDAO();
// Cut-off date: the current time minus the configured look-back, which is treated as milliseconds here.
long timeAgo = Settings.getLong(WaybackSettings.WAYBACK_INDEXER_RECENT_PRODUCER_SINCE);
Date since = new Date(System.currentTimeMillis() - timeAgo);
Date sinceDate = new Date(System.currentTimeMillis() - timeAgo);

if (Settings.getBoolean(CommonSettings.USING_HADOOP)) {
log.info("Harvesting of recent files not yet implemented with bitmag.");
//TODO can this be implemented using queries in bitmag

/* Bitrepository bitrep = BitmagUtils.initBitrep();
String collection = Settings.get(BitmagArcRepositoryClient.BITREPOSITORY_COLLECTIONID);
List<String> fileNames = bitrep.getFileIds(collection);
if (fileNames != null) {
for (String fileName : fileNames) {
File file = bitrep.getFile(fileName, collection, null);
if (file.lastModified() > since.getTime()) {
createArchiveFileInDB(fileName, dao);
}
}
} else {
log.info("No files found in collection '{}'", collection);
}*/
// Ask bitmag for file ids using the cut-off date, then create the ones not yet in the database.
Set<String> fileNames = getFilesFromBitmagSince(sinceDate);
createFilesInDB(fileNames, dao);
} else {
// Otherwise list files via a DatedFileListJob batch run against the configured wayback replica.
PreservationArcRepositoryClient client = ArcRepositoryClientFactory.getPreservationInstance();
BatchStatus status = client
.batch(new DatedFileListJob(since), Settings.get(WaybackSettings.WAYBACK_REPLICA));
BatchStatus status = client.batch(
new DatedFileListJob(sinceDate), Settings.get(WaybackSettings.WAYBACK_REPLICA));
getResultFileAndCreateInDB(status, dao);
}
}

/**
 * Registers every filename from the given set that the database does not know yet.
 * A {@code null} set is treated as "nothing found": an info message naming the
 * configured collection is logged and nothing is written.
 * @param fileNames The filenames to register, or null if none were found
 * @param dao The DAO used both for the existence check and for creating entries
 */
private static void createFilesInDB(Set<String> fileNames, ArchiveFileDAO dao) {
    // Guard clause: no result set at all, so only report which collection came up empty.
    if (fileNames == null) {
        String collectionID = Settings.get(BITREPOSITORY_COLLECTIONID);
        log.info("No files found in collection '{}'", collectionID);
        return;
    }
    for (String candidate : fileNames) {
        if (dao.exists(candidate)) {
            // Already registered; leave the existing entry untouched.
            continue;
        }
        createArchiveFileInDB(candidate, dao);
    }
}

/**
 * Runs a get-file-ids action against the configured bitmag collection and pillar
 * and hands back whatever the action reports as its result.
 * @param sinceDate The date passed on to the get-file-ids action as its lower time bound
 * @return The set of filenames produced by the get-file-ids action
 */
private static Set<String> getFilesFromBitmagSince(Date sinceDate) {
    // Read both settings before obtaining the client, as the action needs all three.
    final String collection = Settings.get(BITREPOSITORY_COLLECTIONID);
    final String pillar = Settings.get(BITREPOSITORY_USEPILLAR);
    final GetFileIDsAction fetchFileIds = new GetFileIDsAction(
            BitmagUtils.getFileIDsClient(), collection, pillar, sinceDate);
    fetchFileIds.performAction();
    return fetchFileIds.getActionResult();
}

/**
* Helper method for handling results from BatchStatus and putting it in the database.
* @param status The BatchStatus with results from batch()
Expand Down
Expand Up @@ -22,11 +22,16 @@
*/
package dk.netarkivet.wayback.indexer;

import static dk.netarkivet.common.distribute.arcrepository.bitrepository.BitmagArcRepositoryClient.BITREPOSITORY_KEYFILENAME;
import static dk.netarkivet.common.distribute.arcrepository.bitrepository.BitmagArcRepositoryClient.BITREPOSITORY_SETTINGS_DIR;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Date;
import java.util.Timer;
import java.util.TimerTask;
Expand All @@ -35,6 +40,8 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.CommonSettings;
import dk.netarkivet.common.distribute.bitrepository.BitmagUtils;
import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.exceptions.UnknownID;
Expand Down Expand Up @@ -82,6 +89,12 @@ private WaybackIndexer() {
File batchOutputDir = Settings.getFile(WaybackSettings.WAYBACK_BATCH_OUTPUTDIR);
FileUtils.createDir(temporaryBatchDir);
FileUtils.createDir(batchOutputDir);

if (Settings.getBoolean(CommonSettings.USING_HADOOP)) {
Path configDir = Paths.get(Settings.get(BITREPOSITORY_SETTINGS_DIR));
Path clientCertificate = configDir.resolve(Settings.get(BITREPOSITORY_KEYFILENAME));
BitmagUtils.initialize(configDir, clientCertificate);
}
ingestInitialFiles();
startProducerThread();
startConsumerThreads();
Expand Down

0 comments on commit 199355b

Please sign in to comment.