Skip to content
Permalink
Browse files

NAS-2520: Groovy script package in harvester-core.jar. Partial crawl…

…log search implemented.
  • Loading branch information...
nclarkekb committed Oct 19, 2016
1 parent 3368311 commit 55bc3e73304ce047b6b7e9d9c7bed5be1b67547c
@@ -13,7 +13,7 @@
import dk.netarkivet.harvester.datamodel.Job;
import dk.netarkivet.harvester.harvesting.monitor.StartedJobInfo;

public class Heritrix3JobMonitor {
public class Heritrix3JobMonitor implements Pageable {

public boolean bActive = true;

@@ -139,96 +139,25 @@ public synchronized void updateCrawlLog(byte[] tmpBuf) throws IOException {
}
}

public synchronized boolean isReady() {
return (bActive && bInitialized);
@Override
public synchronized long getIndexSize() {
return idxFile.length();
}

@Override
public synchronized byte[] readPage(long page, long itemsPerPage, boolean descending) throws IOException {
byte[] bytes = null;;
if (page < 1) {
throw new IllegalArgumentException();
}
if (itemsPerPage < 25) {
throw new IllegalArgumentException();
}
long length = idxRaf.length();
if (length > 8) {
if (!descending) {
// Forwards.
long fromIdx = (page - 1) * (itemsPerPage * 8);
long toIdx = fromIdx + (itemsPerPage * 8);
if (toIdx > length) {
toIdx = length;
}
idxRaf.seek(fromIdx);
fromIdx = idxRaf.readLong();
idxRaf.seek(toIdx);
toIdx = idxRaf.readLong();
logRaf.seek(fromIdx);
bytes = new byte[(int)(toIdx - fromIdx)];
logRaf.readFully(bytes, 0, (int)(toIdx - fromIdx));
} else {
// Backwards.
long toIdx = length - ((page - 1) * itemsPerPage * 8);
long fromIdx = toIdx - (itemsPerPage * 8) - 8;
if (fromIdx < 0) {
fromIdx = 0;
}
// Read line indexes for page.
int pageIdxArrLen = (int)(toIdx - fromIdx);
byte[] pageIdxArr = new byte[pageIdxArrLen];
idxRaf.seek(fromIdx);
int pos = 0;
int limit = pageIdxArrLen;
int read = 0;
while (limit > 0 && read != -1) {
read = idxRaf.read(pageIdxArr, pos, limit);
if (read != -1) {
pos += read;
limit -= read;
}
}
// Convert line indexes for page.
limit = pos;
pos = 0;
long[] idxArr = new long[limit / 8];
long l;
int dstIdx = 0;
while (pos < limit) {
l = (pageIdxArr[pos++] & 255) << 56 | (pageIdxArr[pos++] & 255) << 48 | (pageIdxArr[pos++] & 255) << 40 | (pageIdxArr[pos++] & 255) << 32
| (pageIdxArr[pos++] & 255) << 24 | (pageIdxArr[pos++] & 255) << 16 | (pageIdxArr[pos++] & 255) << 8 | (pageIdxArr[pos++] & 255);
idxArr[dstIdx++] = l;
}
// Load the crawllog lines for page.
pos = 0;
limit /= 8;
fromIdx = idxArr[pos];
toIdx = idxArr[limit - 1];
logRaf.seek(fromIdx);
byte[] tmpBytes = new byte[(int)(toIdx - fromIdx)];
logRaf.readFully(tmpBytes, 0, (int)(toIdx - fromIdx));
// Reverse crawllog lines for page.
bytes = new byte[tmpBytes.length];
long base = idxArr[pos++];
fromIdx = base;
int len;
dstIdx = bytes.length;
while (pos < limit) {
toIdx = idxArr[pos++];
len = (int)(toIdx - fromIdx);
dstIdx -= len;
System.arraycopy(tmpBytes, (int)(fromIdx - base), bytes, dstIdx, len);
fromIdx = toIdx;
}
}
}
return bytes;
return StringIndexFile.readPage(idxRaf, logRaf, page, itemsPerPage, descending);
}

public synchronized boolean isReady() {
return (bActive && bInitialized);
}

public void search() {
public synchronized SearchResult getSearchResult(String q) throws IOException {
return new SearchResult(this, q);
}

public synchronized void dispose() {
public synchronized void cleanup() {
bActive = false;
bInitialized = false;
hostUrl = null;
@@ -106,7 +106,7 @@ public boolean accept(File dir, String name) {
jobmonitor = jobmonitorIter.next();
oldFilesList.add(jobmonitor.logFile);
oldFilesList.add(jobmonitor.idxFile);
jobmonitor.dispose();
jobmonitor.cleanup();
}
jobmonitorIter = runningJobMonitorMap.values().iterator();
while (jobmonitorIter.hasNext()) {
@@ -1,10 +1,9 @@
package dk.netarkivet.harvester.webinterface.servlet;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.io.InputStream;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.servlet.ServletContext;
import javax.servlet.ServletOutputStream;
@@ -114,12 +113,25 @@ public void frontier_list(HttpServletRequest req, HttpServletResponse resp, List
regex =".*";
}

String resource = "dk/netarkivet/harvester/webinterface/servlet/nas.groovy";
InputStream in = JobResource.class.getClassLoader().getResourceAsStream(resource);
ByteArrayOutputStream bOut = new ByteArrayOutputStream();
byte[] tmpArr = new byte[8192];
int read;
while ((read = in.read(tmpArr)) != -1) {
bOut.write(tmpArr, 0, read);
}
in.close();
String script = new String(bOut.toByteArray(), "UTF-8");

/*
//RandomAccessFile raf = new RandomAccessFile("/home/nicl/workspace-nas-h3/heritrix3-scripts/src/main/java/view-frontier-url.groovy", "r");
RandomAccessFile raf = new RandomAccessFile("/home/nicl/workspace-nas-h3/heritrix3-scripts/src/main/java/nas.groovy", "r");
byte[] src = new byte[(int)raf.length()];
raf.readFully(src);
raf.close();
String script = new String(src, "UTF-8");
*/

String tmpStr = req.getParameter("delete");
if (tmpStr != null && "1".equals(tmpStr) ) {
@@ -147,12 +159,12 @@ public void frontier_list(HttpServletRequest req, HttpServletResponse resp, List
ScriptResult scriptResult = h3Job.h3wrapper.ExecuteShellScriptInJob(h3Job.jobResult.job.shortName, "groovy", script);
//System.out.println(new String(scriptResult.response, "UTF-8"));
if (scriptResult != null && scriptResult.script != null) {
if (scriptResult.script.htmlOutput != null) {
if (scriptResult.script.htmlOutput != null) {
sb.append(scriptResult.script.htmlOutput);
}
if (scriptResult.script.rawOutput != null) {
}
if (scriptResult.script.rawOutput != null) {
sb.append(scriptResult.script.rawOutput);
}
}
}
} else {
sb.append("Job ");
@@ -173,6 +185,7 @@ public void crawllog_list(HttpServletRequest req, HttpServletResponse resp, List
long linesPerPage = 100;
long page = 1;
long pages = 0;
String q = null;

String tmpStr;
tmpStr = req.getParameter("page");
@@ -195,19 +208,29 @@ public void crawllog_list(HttpServletRequest req, HttpServletResponse resp, List
if (linesPerPage > 1000) {
linesPerPage = 1000;
}

tmpStr = req.getParameter("q");
if (tmpStr != null && tmpStr.length() > 0) {
Pattern p = Pattern.compile(tmpStr);
Matcher m = p.matcher("aaaaab");
boolean b = m.matches();
if (tmpStr != null && tmpStr.length() > 0 && tmpStr.equalsIgnoreCase(".*")) {
q = tmpStr;
}
if (q == null) {
q = ".*";
}

StringBuilder sb = new StringBuilder();

Heritrix3JobMonitor h3Job = environment.h3JobMonitorThread.getRunningH3Job(numerics.get(0));
Pageable pageable = h3Job;

if (h3Job != null && h3Job.isReady()) {
lines = h3Job.idxFile.length();
SearchResult searchResult = null;
if (q != null) {
searchResult = h3Job.getSearchResult(q);
searchResult.update();
pageable = searchResult;
}

lines = pageable.getIndexSize();
if (lines > 0) {
lines = (lines / 8) - 1;
pages = Pagination.getPages(lines, linesPerPage);
@@ -222,12 +245,19 @@ public void crawllog_list(HttpServletRequest req, HttpServletResponse resp, List
sb.append(page);
sb.append(" of ");
sb.append(pages);
sb.append("<br />\n");

sb.append("<form class=\"form-horizontal\" action=\"?\" name=\"insert_form\" method=\"post\" enctype=\"application/x-www-form-urlencoded\" accept-charset=\"utf-8\">");
sb.append("<input type=<\"text\" id=\"regex\" name=\"regex\" value=\"" + q + "\" placeholder=\"content-type\">\n");
sb.append("<button type=\"submit\" name=\"search\" value=\"1\" class=\"btn btn-success\"><i class=\"icon-white icon-thumbs-up\"></i> Search</button>\n");
sb.append("</form>");

sb.append("<br />\n");
sb.append("<br />\n");
sb.append(Pagination.getPagination(page, linesPerPage, pages, false));
sb.append("<pre>\n");
if (lines > 0) {
byte[] bytes = h3Job.readPage(page, linesPerPage, true);
byte[] bytes = pageable.readPage(page, linesPerPage, true);
sb.append(new String(bytes, "UTF-8"));
}
sb.append("</pre>\n");
@@ -0,0 +1,11 @@
package dk.netarkivet.harvester.webinterface.servlet;

import java.io.IOException;

/**
 * A source of items that can be fetched one page at a time.
 *
 * <p>Implementations visible alongside this interface report the length of an
 * index file from {@link #getIndexSize()}.
 */
public interface Pageable {

    /**
     * Reports the current size of the index backing this pageable.
     *
     * <p>NOTE(review): callers appear to treat this as a byte count of 8-byte
     * entries ({@code size / 8 - 1} items) - confirm against each implementation.
     *
     * @return the index size
     */
    long getIndexSize();

    /**
     * Reads the raw bytes of one page of items.
     *
     * @param page the page number to read (presumably 1-based - confirm with implementations)
     * @param itemsPerPage the number of items on each page
     * @param descending {@code true} to page from the end of the index, {@code false} from the start
     * @return the bytes of the requested page
     * @throws IOException if the underlying data cannot be read
     */
    byte[] readPage(long page, long itemsPerPage, boolean descending) throws IOException;

}
@@ -0,0 +1,96 @@
package dk.netarkivet.harvester.webinterface.servlet;

import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.io.IOUtils;

/**
 * A regex-filtered view of a job's crawl log.
 *
 * <p>Each call to {@link #update()} scans the part of the crawl log that has not
 * been scanned yet and appends, to a private index file, the byte offset of the
 * start of every complete line that matches the search pattern. Offsets are
 * written as 8-byte longs via {@link RandomAccessFile#writeLong(long)}.
 *
 * <p>NOTE(review): {@code StringIndexFile.readPage} is not visible here. If it
 * reads the log contiguously between two index entries and expects a trailing
 * end-offset entry, as the caller's {@code size / 8 - 1} arithmetic suggests,
 * the index written here (matching line starts only) may need adapting - confirm.
 */
public class SearchResult implements Pageable {

    /** Initial size of the scan buffer; it grows if a single line is longer. */
    private static final int INITIAL_BUFFER_SIZE = 1024 * 1024;

    /** Job monitor whose crawl log is being searched. */
    protected Heritrix3JobMonitor h3Job;

    /** Compiled, case-insensitive search pattern. */
    protected Pattern p;
    /** Reusable matcher for {@link #p}; reset for every line. */
    protected Matcher m;

    /** Index file holding the offsets of matching lines. */
    protected File idxFile;

    /** Crawl-log offset of the first byte not yet consumed as part of a complete line. */
    protected long lastIndex;

    /** Open read/write handle on {@link #idxFile}. */
    protected RandomAccessFile idxRaf;

    /**
     * Creates an empty search result for the given job and query.
     *
     * @param h3Job the job monitor whose crawl log is searched
     * @param q the regular expression every indexed line must fully match
     * @throws IOException if the index file cannot be opened or truncated
     */
    public SearchResult(Heritrix3JobMonitor h3Job, String q) throws IOException {
        this.h3Job = h3Job;
        p = Pattern.compile(q, Pattern.CASE_INSENSITIVE);
        m = p.matcher("42");
        // NOTE(review): "crwawllog" looks like a typo for "crawllog"; left unchanged
        // in case other code locates these files by name - confirm before renaming.
        idxFile = new File("crwawllog-" + h3Job.jobId + "-" + "1" + ".idx");
        lastIndex = 0;
        idxRaf = new RandomAccessFile(idxFile, "rw");
        // Start from an empty index even if a file with this name already exists.
        idxRaf.setLength(0);
    }

    /**
     * Scans the crawl log from where the previous scan stopped and appends the
     * start offset of every complete matching line to the index file.
     *
     * <p>A trailing line without a terminating newline is not indexed; it is
     * re-read on the next call because {@link #lastIndex} only advances past
     * complete lines.
     *
     * @throws IOException if the crawl log or the index file cannot be accessed
     */
    public synchronized void update() throws IOException {
        RandomAccessFile logRaf = new RandomAccessFile(h3Job.logFile, "r");
        try {
            idxRaf.seek(idxRaf.length());
            logRaf.seek(lastIndex);
            FileChannel logChannel = logRaf.getChannel();
            byte[] bytes = new byte[INITIAL_BUFFER_SIZE];
            ByteBuffer byteBuffer = ByteBuffer.wrap(bytes);
            // Log offset corresponding to bytes[0 + mark], i.e. the current line start.
            long index = lastIndex;
            while (logChannel.read(byteBuffer) != -1) {
                byteBuffer.flip();
                int limit = byteBuffer.limit();
                int mark = 0;
                int pos = 0;
                while (pos < limit) {
                    if (bytes[pos++] == '\n') {
                        int to = pos - 1;
                        // Strip an optional CR, but never look before the line start.
                        if (to > mark && bytes[to - 1] == '\r') {
                            --to;
                        }
                        String line = new String(bytes, mark, to - mark, "UTF-8");
                        m.reset(line);
                        if (m.matches()) {
                            idxRaf.writeLong(index);
                        }
                        // Advance by the full line length, terminator included,
                        // before moving the mark to the next line start.
                        index += pos - mark;
                        mark = pos;
                        lastIndex = index;
                    }
                }
                if (mark == 0 && limit == bytes.length) {
                    // The buffer is full and holds no newline: grow it so the
                    // line can be completed instead of looping on a zero-byte read.
                    if (bytes.length > Integer.MAX_VALUE / 2) {
                        throw new IOException("Crawl log line too long to index at offset " + index);
                    }
                    byte[] larger = new byte[bytes.length * 2];
                    System.arraycopy(bytes, 0, larger, 0, limit);
                    bytes = larger;
                    byteBuffer = ByteBuffer.wrap(bytes);
                    byteBuffer.position(limit);
                } else {
                    // Keep the unterminated tail at the front of the buffer so a
                    // line spanning two reads is seen as a whole next time.
                    byteBuffer.position(mark);
                    byteBuffer.compact();
                }
            }
        } finally {
            logRaf.close();
        }
    }

    /**
     * Returns the current length of the index file.
     *
     * @return the length of the index file as reported by {@link File#length()}
     */
    @Override
    public synchronized long getIndexSize() {
        return idxFile.length();
    }

    /**
     * Reads one page by delegating to {@code StringIndexFile.readPage} with this
     * search result's index and a freshly opened handle on the crawl log.
     *
     * @param page the page number to read
     * @param itemsPerPage the number of items per page
     * @param descending {@code true} to page from the end of the index
     * @return the bytes returned by {@code StringIndexFile.readPage}
     * @throws IOException if the crawl log or the index file cannot be read
     */
    @Override
    public synchronized byte[] readPage(long page, long itemsPerPage, boolean descending) throws IOException {
        RandomAccessFile logRaf = new RandomAccessFile(h3Job.logFile, "r");
        try {
            return StringIndexFile.readPage(idxRaf, logRaf, page, itemsPerPage, descending);
        } finally {
            // The log handle is opened per call, so it must be released per call.
            logRaf.close();
        }
    }

    /**
     * Closes the index file handle, ignoring any error raised while closing.
     */
    public synchronized void cleanup() {
        IOUtils.closeQuietly(idxRaf);
    }

}

0 comments on commit 55bc3e7

Please sign in to comment.
You can’t perform that action at this time.