diff --git a/src/test/java/com/amihaiemil/charles/ElasticSearchRepositoryTestCase.java b/src/main/java/com/amihaiemil/charles/AbstractWebCrawl.java similarity index 50% rename from src/test/java/com/amihaiemil/charles/ElasticSearchRepositoryTestCase.java rename to src/main/java/com/amihaiemil/charles/AbstractWebCrawl.java index 6e50f80..75afc54 100644 --- a/src/test/java/com/amihaiemil/charles/ElasticSearchRepositoryTestCase.java +++ b/src/main/java/com/amihaiemil/charles/AbstractWebCrawl.java @@ -23,67 +23,53 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ - package com.amihaiemil.charles; -import java.util.ArrayList; -import java.util.LinkedHashSet; -import java.util.List; - -import org.junit.Test; - -import com.jcabi.http.mock.MkAnswer; -import com.jcabi.http.mock.MkContainer; -import com.jcabi.http.mock.MkGrizzlyContainer; +import org.openqa.selenium.WebDriver; /** - * Test cases for {@link ElasticSearchRepository} + * An abstract webcrawl - contains the webdriver and other common data of each crawl. * @author Mihai Andronache (amihaiemil@gmail.com) + * @version $Id$ + * @since 1.0.0 * */ -public class ElasticSearchRepositoryTestCase { - - +public abstract class AbstractWebCrawl implements WebCrawl { + /** - * {@link ElasticSearchRepository} can send the given list of json docs - * to the specified elastisearch index. - * @throws Exception - If something goes wrong. + * WebDriver. */ - @Test - public void indexesListOfDocuments() throws Exception { - List pages = new ArrayList(); - pages.add(this.webPage("http://www.amihaiemil.com/index.html")); - pages.add(this.webPage("http://eva.amihaiemil.com/index.html")); - - MkContainer server = new MkGrizzlyContainer() - .next(new MkAnswer.Simple("{\"response\":\"ok\", \"errors\":false, \"took\":1}")) - .next(new MkAnswer.Simple(200)) - .start(9201); - - ElasticSearchRepository elasticRepo = new ElasticSearchRepository( - "http://localhost:9201/test5" - ); - try { - elasticRepo.export(pages); - } finally { - server.close(); - } - } + protected WebDriver driver; /** - * Returns a WebPage. - * @param url URL of the page. - * @return WebPage - */ - private WebPage webPage(String url) { - WebPage page = new SnapshotWebPage(); - page.setUrl(url); - page.setLinks(new LinkedHashSet()); - page.setName("indextest.html"); - page.setTitle("Intex Test | Title"); - page.setTextContent("Test content of this awesome test page."); - page.setCategory("page"); - return page; - } + * Ignored pages patterns. + */ + protected IgnoredPatterns ignoredLinks; + + /** + * Repo to export the pages to. + */ + protected Repository repo; + + /** + * Pages are crawled and exported in batches in order to avoid flooding + * the memory if there are many pages on a website. Default value is 100. + */ + protected int batchSize; + + /** + * Ctor. + * @param webd Selenium WebDriver. + * @param igp Ignored patterns. + * @param repo Repository to export the crawled pages into. + * @param batch Size of a crawl batch. + */ + public AbstractWebCrawl(WebDriver webd, IgnoredPatterns igp, Repository repo, int batch) { + this.driver = webd; + this.ignoredLinks = igp; + this.repo = repo; + this.batchSize = batch; + } + public abstract void crawl() throws DataExportException; } diff --git a/src/main/java/com/amihaiemil/charles/ElasticSearchRepository.java b/src/main/java/com/amihaiemil/charles/ElasticSearchRepository.java deleted file mode 100644 index bb80355..0000000 --- a/src/main/java/com/amihaiemil/charles/ElasticSearchRepository.java +++ /dev/null @@ -1,194 +0,0 @@ -/* - Copyright (c) 2016, Mihai Emil Andronache - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - * Neither the name of charles nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -package com.amihaiemil.charles; - -import java.io.IOException; -import java.util.List; -import java.util.regex.Matcher; -import java.util.regex.Pattern; -import java.util.regex.PatternSyntaxException; - -import javax.json.JsonObject; - -import org.apache.http.HttpStatus; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.jcabi.http.Request; -import com.jcabi.http.Response; -import com.jcabi.http.request.ApacheRequest; -import com.jcabi.http.response.JsonResponse; -import com.jcabi.http.response.RestResponse; - -/** - * Elasticsearch repository. - * Documents are put into an elastic search index making a HTTP POST to - * the _bulk API.

- * - * Use this class when you have your own ES instance setup. - * - * @author Mihai Andronache (amihaiemil@gmail.com) - * - */ -public final class ElasticSearchRepository implements Repository { - private static final Logger LOG = LoggerFactory.getLogger(ElasticSearchRepository.class); - - /** - * Regex pattern to validate index url. - */ - private static final String ES_INDEX_PATTERN = - "^(http:\\/\\/|https:\\/\\/)([a-zA-Z0-9._-]+)(:[0-9]{1,5})?\\/[a-zA-Z0-9-_.]+$"; - - /** - * Request made to ES. - */ - private Request post; - - /** - * Ctor. - * @param index ES index address. - */ - public ElasticSearchRepository(String index) { - this(index, null, null); - } - - /** - * Ctor. - * @param index ES index address. - * @param username Basic auth user. - * @param password Basic auth pass. - */ - public ElasticSearchRepository( - String index, String username, String password) { - if(!this.isIndexUrlValid(index)) { - throw new IllegalArgumentException( - "Wrong ES index url pattern! Expected " - + "(http|https)://domain[:port]/indexname" - ); - } - if(username != null && password != null) { - String wCredentials; - if(index.startsWith("http://")) { - wCredentials = "http://" + username + ":" + password - + "@" + index.substring(7); - } else { - wCredentials = "https://" + username + ":" + password - + "@" + index.substring(8); - } - this.post = new ApacheRequest(wCredentials + "/_bulk?pretty") - .header("content-type", "application/json"); - } else { - this.post = new ApacheRequest(index + "/_bulk?pretty") - .header("content-type", "application/json"); - } - } - - /** - * This will put all the specified WebPages into the - * elastic search index. If a document already exists, it will be updated - * (only if the id is specified). The indexing is done as bulk operation, to avoid - * many http requests.
- *
- * Note: The "id" String attribute is searched in each json document - * and if found, it will be used for indexing. If not found, elasticsearch - * will generate one automatically. - * @param pages Crawled pages to be indexed - * @see - * _bulk API - */ - @Override - public void export(List pages) throws DataExportException { - try { - LOG.info("Sending " + pages.size() + " to the elasticsearch index"); - JsonObject jsonResponse = this.sendToIndex( - new EsBulkContent(pages).structure() - ); - if(jsonResponse.getBoolean("errors", Boolean.TRUE)) { - LOG.error( - "There were errors during indexing to " + - ". Whole JSON response: " + - jsonResponse.toString() - ); - throw new DataExportException("Errors when calling the _bulk api."); - } - LOG.info("Bulk indexing of the " + pages.size() + " documents, finished in " + jsonResponse.getInt("took") + " miliseconds!"); - } catch (IOException e) { - LOG.error(e.getMessage(), e); - throw new DataExportException(e.getMessage()); - } - } - - /** - * POSTs the given json string to an elasticsearch index. - * @param jsonStructure Json structure to index - * @return JSON response body. - * @throws IOException if something goes wrong. - */ - private JsonObject sendToIndex(String jsonStructure) - throws IOException { - - this.post = this.post - .method(Request.POST).body().set(jsonStructure).back(); - Response resp = post.fetch(); - int status = resp.as(RestResponse.class).status(); - JsonObject json = resp.as(JsonResponse.class).json().readObject(); - if(status != HttpStatus.SC_OK) { - LOG.warn( - "Http status response from elastic search index: " + - status + - ". Whole JSON response: " + - json - ); - if(status == HttpStatus.SC_INTERNAL_SERVER_ERROR) { - LOG.error( - "500 SERVER ERROR from elasticsearch /_bulk api. Whole JSON response " + - json.toString() - ); - throw new IOException("500 SERVER ERROR from elasticsearch /_bulk api!"); - } - } - return json; - } - - /** - * Checks if the index url is well formatted. - * It has have the following format: - * http://domain[:port]/indexname or https://domain[:port]/indexnam - * @param url Given url - * @return true if valid, false if not - */ - public boolean isIndexUrlValid(String url) { - try { - Pattern pattern = Pattern.compile(ES_INDEX_PATTERN); - Matcher matcher = pattern.matcher(url); - return matcher.matches(); - } catch (PatternSyntaxException ex) { - return false; - } - } - -} diff --git a/src/main/java/com/amihaiemil/charles/EsBulkContent.java b/src/main/java/com/amihaiemil/charles/EsBulkContent.java deleted file mode 100644 index 4c564ad..0000000 --- a/src/main/java/com/amihaiemil/charles/EsBulkContent.java +++ /dev/null @@ -1,101 +0,0 @@ -/* - Copyright (c) 2016, Mihai Emil Andronache - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - * Neither the name of charles nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -package com.amihaiemil.charles; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; - -import javax.json.Json; -import javax.json.JsonObject; - -import com.fasterxml.jackson.core.JsonProcessingException; - -/** - * Index documents in bulk. - * @author Mihai Andronache (amihaiemil@gmail.com) - * - */ -final class EsBulkContent { - - /** - * WebPages that go to the ES _bulk API, - */ - private List pages; - - /** - * Ctor. - * @param index Index where the pages will be stored. - * @param pages Given web pages. - */ - public EsBulkContent(List pages) { - if(pages == null || pages.size() == 0) { - throw new IllegalArgumentException("There must be at least 1 page!"); - } - this.pages = pages; - } - - /** - * Pepare the json structure for bulk indexing. - * @param docs The json documents to be indexed. - * @return The json structure as a String. - * @throws IOException If something goes wrong while parsing. - */ - public String structure() throws IOException { - StringBuilder sb = new StringBuilder(); - for(WebPage page : pages) { - JsonObject doc = this.preparePage(page); - String id = doc.getString("id", ""); - String action_and_meta_data; - if(id.isEmpty()) { - action_and_meta_data = "{\"index\":{\"_type\":\"" + doc.getString("category") + "\"}}"; - } else { - action_and_meta_data = "{\"index\":{\"_type\":\"" + doc.getString("category") + "\", " - + "\"_id\":\"" + id + "\"}}"; - } - sb = sb.append(action_and_meta_data).append("\n"); - sb = sb.append(doc.getJsonObject("page").toString()).append("\n"); - } - return sb.toString(); - } - - /** - * Converts the WebPage to a Json (with the URL as id) for the ES index. - * @param page WebPage to index. - * @return JSON which contains the id + json-formatted page - * @throws IOException In case there are problems when parsing the webpage - */ - private JsonObject preparePage(WebPage page) throws IOException { - JsonWebPage jsonPage = new JsonWebPage(page); - JsonObject parsed = jsonPage.toJsonObject(); - return Json.createObjectBuilder() - .add("id", page.getUrl()) - .add("category", parsed.getString("category")) - .add("page", parsed).build(); - } - -} diff --git a/src/main/java/com/amihaiemil/charles/GraphCrawl.java b/src/main/java/com/amihaiemil/charles/GraphCrawl.java index 302e460..d50c61f 100644 --- a/src/main/java/com/amihaiemil/charles/GraphCrawl.java +++ b/src/main/java/com/amihaiemil/charles/GraphCrawl.java @@ -32,57 +32,19 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE import java.util.Set; import org.openqa.selenium.WebDriver; -import org.openqa.selenium.phantomjs.PhantomJSDriver; -import org.openqa.selenium.phantomjs.PhantomJSDriverService; -import org.openqa.selenium.remote.DesiredCapabilities; /** * Crawl the website as a graph (tree) starting from the index page. * @author Mihai Andronache (amihaiemil@gmail.com) * */ -public final class GraphCrawl implements WebCrawl { - - /** - * WebDriver. - */ - private WebDriver driver; +public final class GraphCrawl extends AbstractWebCrawl { /** * Page to start the crawling from. */ private Link index; - - /** - * Ignored pages patterns. - */ - private IgnoredPatterns ignoredLinks; - - /** - * Repo to export the pages to. - */ - private Repository repo; - - /** - * Pages are crawled and exported in batches in order to avoid flooding - * the memory if there are many pages on a website. Default value is 100. - */ - private int batchSize; - - /** - * Constructor. - * @param idx The index page of the site. - * @param phantomJsExecPath path to PhantomJS. - * @param ignored Ignored pages patterns. - * @param repo Repository where the crawled pages are exported. - */ - public GraphCrawl( - String idx, String phantomJsExecPath, - IgnoredPatterns ignored, Repository repo - ) { - this(idx, phantomJsExecPath, ignored, repo, 100); - } - + /** * Constructor. * @param idx The index page of the site. @@ -96,32 +58,6 @@ public GraphCrawl( ) { this(idx, drv, ignored, repo, 100); } - - - /** - * Constructor. - * @param idx The index page of the site. - * @param phantomJsExecPath path to PhantomJS. - * @param ignored Ignored pages patterns. - * @param repo Repository where the crawled pages are exported. - * @param batchSize Size of the export batch. - */ - public GraphCrawl( - String idx, String phantomJsExecPath, - IgnoredPatterns ignored, Repository repo, int batchSize - ) { - this.batchSize = batchSize; - this.ignoredLinks = ignored; - this.index = new Link("index", idx); - DesiredCapabilities dc = new DesiredCapabilities(); - dc.setJavascriptEnabled(true); - dc.setCapability( - PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY, - phantomJsExecPath - ); - this.driver = new PhantomJSDriver(dc); - this.repo = repo; - } /** * Constructor. @@ -135,12 +71,10 @@ public GraphCrawl( String idx, WebDriver drv, IgnoredPatterns ignored, Repository repo, int batchSize ) { - this.ignoredLinks = ignored; - this.index = new Link("index", idx); - this.driver = drv; - this.repo = repo; + super(drv, ignored, repo, batchSize); + this.index = new Link("index", idx); } - + @Override public void crawl() throws DataExportException { List pages = new ArrayList(); @@ -169,6 +103,7 @@ public void crawl() throws DataExportException { link = toCrawl.remove(0); } this.repo.export(pages); + this.driver.quit(); } } diff --git a/src/main/java/com/amihaiemil/charles/LiveWebPage.java b/src/main/java/com/amihaiemil/charles/LiveWebPage.java index 91c9d31..a3df035 100644 --- a/src/main/java/com/amihaiemil/charles/LiveWebPage.java +++ b/src/main/java/com/amihaiemil/charles/LiveWebPage.java @@ -32,6 +32,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE import org.openqa.selenium.NoSuchElementException; import org.openqa.selenium.WebDriver; import org.openqa.selenium.WebElement; +import org.openqa.selenium.support.CacheLookup; import org.openqa.selenium.support.FindBy; import org.openqa.selenium.support.FindBys; import org.openqa.selenium.support.PageFactory; @@ -51,18 +52,21 @@ public final class LiveWebPage implements LivePage { * Visible anchors. */ @FindBys(@FindBy(tagName=("a"))) + @CacheLookup private List anchors; /** * Text content from the page. */ @FindBy(tagName=("body")) + @CacheLookup private WebElement body; /** * Page logical category. Defaults to "page" */ @FindBy(id = "pagectg") + @CacheLookup private WebElement category; public LiveWebPage(WebDriver driver, Link l) { @@ -142,4 +146,5 @@ public Set getLinks() { public void setLinks(Set links) { throw new UnsupportedOperationException("#setLinks"); } + } diff --git a/src/main/java/com/amihaiemil/charles/SitemapXmlCrawl.java b/src/main/java/com/amihaiemil/charles/SitemapXmlCrawl.java index 0f69c46..402c8a9 100644 --- a/src/main/java/com/amihaiemil/charles/SitemapXmlCrawl.java +++ b/src/main/java/com/amihaiemil/charles/SitemapXmlCrawl.java @@ -32,9 +32,6 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE import java.util.Set; import org.openqa.selenium.WebDriver; -import org.openqa.selenium.phantomjs.PhantomJSDriver; -import org.openqa.selenium.phantomjs.PhantomJSDriverService; -import org.openqa.selenium.remote.DesiredCapabilities; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -46,29 +43,9 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * Crawl a website based on the given sitemap xml. * @author Mihai Andronache (amihaiemil@gmail.com) */ -public final class SitemapXmlCrawl implements WebCrawl { +public final class SitemapXmlCrawl extends AbstractWebCrawl { private static final Logger LOG = LoggerFactory.getLogger(SitemapXmlCrawl.class); - - - private WebDriver driver; private Set urlset; - private IgnoredPatterns ignoredLinks; - private Repository repo; - private int batchSize; - - /** - * Start a new sitemap.xml crawl using phantom js. - * @param phantomJsExecPath Path to the phantomJS executable. - * @param ignored Ignored pages patterns. - * @param repo Repository where the crawled pages are exported. - * @param sitemapXmlPath Path to the sitemap.xml file. - */ - public SitemapXmlCrawl( - String phantomJsExecPath, SitemapXmlLocation sitemapLoc, - IgnoredPatterns ignored, Repository repo - ) throws IOException { - this(phantomJsExecPath, sitemapLoc, ignored, repo, 100); - } /** * Start a new sitemap.xml crawl using the specified driver. @@ -93,41 +70,8 @@ public SitemapXmlCrawl( WebDriver drv, SitemapXmlLocation sitemapLoc, IgnoredPatterns ignored, Repository repo, int batch ) throws IOException { - this.driver = drv; + super(drv, ignored, repo, batch); this.urlset = new SitemapXml(sitemapLoc.getStream()).read().getUrls(); - this.ignoredLinks = ignored; - this.repo = repo; - this.batchSize = batch; - } - - /** - * Start a new sitemap.xml crawl using phantom js. - * @param phantomJsExecPath Path to the phantomJS executable. - * @param sitemapXmlPath Path to the sitemap.xml file. - * @param ignored Ignored pages patterns. - * @param repo Repository where the crawled pages are exported. - * @param batch Size of the batch to export. - */ - public SitemapXmlCrawl( - String phantomJsExecPath, SitemapXmlLocation sitemapLoc, - IgnoredPatterns ignored, Repository repo, int batch - ) throws IOException { - DesiredCapabilities dc = new DesiredCapabilities(); - dc.setJavascriptEnabled(true); - dc.setCapability( - PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY, - phantomJsExecPath - ); - this.driver = new PhantomJSDriver(dc); - try { - this.urlset = new SitemapXml(sitemapLoc.getStream()).read().getUrls(); - } catch (IOException ex) { - this.driver.quit(); - throw ex; - } - this.ignoredLinks = ignored; - this.repo = repo; - this.batchSize = batch; } public void crawl() throws DataExportException { @@ -151,7 +95,7 @@ public void crawl() throws DataExportException { } LOG.info("Finished crawling the sitemap.xml!"); this.repo.export(pages); - driver.quit(); + this.driver.quit(); } } diff --git a/src/main/java/com/amihaiemil/charles/WebPage.java b/src/main/java/com/amihaiemil/charles/WebPage.java index d6d5adf..8810b20 100644 --- a/src/main/java/com/amihaiemil/charles/WebPage.java +++ b/src/main/java/com/amihaiemil/charles/WebPage.java @@ -39,20 +39,70 @@ public interface WebPage { * @return String filename. */ String getName(); + + /** + * Set the name of this webpage. + * @param name Given name. + */ void setName(String name); - + + /** + * Page's url. + * @return String url. e.g. http://charles.amihaiemil.com/index.html + */ String getUrl(); + + /** + * Set url. + * @param url + */ void setUrl(String url); + /** + * Get the title of the page. + * @return String title. + */ String getTitle(); + + /** + * Set the page title. + * @param title + */ void setTitle(String title); + /** + * Get all the text content of the page. + * @return + */ String getTextContent(); + + /** + * Set the text content. + * @param textContent + */ void setTextContent(String textContent); + /** + * Get the page's category (text of an element with id = "pagectg") + * @return + */ String getCategory(); + + /** + * Set the page's category. + * @param category + */ void setCategory(String category); + /** + * Fetch all the anchors (links) from the page. + * @return + */ Set getLinks(); + + /** + * Set the anchors on a page. + * @param links + */ void setLinks(Set links); } diff --git a/src/test/java/com/amihaiemil/charles/ElasticSearchRepositoryITCase.java b/src/test/java/com/amihaiemil/charles/ElasticSearchRepositoryITCase.java deleted file mode 100644 index f243179..0000000 --- a/src/test/java/com/amihaiemil/charles/ElasticSearchRepositoryITCase.java +++ /dev/null @@ -1,252 +0,0 @@ -/* - Copyright (c) 2016, Mihai Emil Andronache - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - * Neither the name of charles nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -package com.amihaiemil.charles; - -import java.util.ArrayList; -import java.util.LinkedHashSet; -import java.util.List; - -import javax.json.Json; -import javax.json.JsonArray; -import javax.json.JsonObject; - -import org.apache.commons.io.IOUtils; -import org.apache.http.auth.AuthScope; -import org.apache.http.auth.UsernamePasswordCredentials; -import org.apache.http.client.CredentialsProvider; -import org.apache.http.client.methods.CloseableHttpResponse; -import org.apache.http.client.methods.HttpGet; -import org.apache.http.impl.client.BasicCredentialsProvider; -import org.apache.http.impl.client.CloseableHttpClient; -import org.apache.http.impl.client.HttpClientBuilder; -import org.apache.http.impl.client.HttpClients; -import org.junit.Assert; -import org.junit.Test; - -import static org.junit.Assert.*; - -/** - * Integration tests for {@link ElasticSearchRepository} - * @author Mihai Andronache (amihaiemil@gmail.com) - * - */ -public class ElasticSearchRepositoryITCase { - - /** - * Test ES credentials. - */ - private static final String USER = "jeff"; - private static final String PASS = "s3cr3t"; - - /** - * {@link ElasticSearchRepository} can send the documents to index. - */ - @Test - public void indexesDocuments() throws Exception { - List pages = new ArrayList(); - pages.add(this.webPage("http://www.amihaiemil.com/index.html")); - pages.add(this.webPage("http://eva.amihaiemil.com/index.html")); - - String indexInfo = "http://localhost:9200/charlesit"; - ElasticSearchRepository elasticRepo = new ElasticSearchRepository(indexInfo); - elasticRepo.export(pages); - - Thread.sleep(3000);//indexed docs don't become searchable instantly - - JsonObject resp = this.search("*:*", indexInfo + "/page", false); - JsonObject hits = resp.getJsonObject("hits"); - assertTrue(hits.getInt("total") == 2); - JsonArray results = hits.getJsonArray("hits"); - assertTrue(hits.getJsonArray("hits").size() == 2); - boolean containsEva = false; - for(int i=0;i pages = new ArrayList(); - pages.add(this.webPage("http://www.amihaiemil.com/index.html")); - pages.add(this.webPage("http://eva.amihaiemil.com/index.html")); - - String indexInfo = "http://localhost:8080/charlesitauth"; - ElasticSearchRepository elasticRepo = - new ElasticSearchRepository(indexInfo, USER, PASS); - elasticRepo.export(pages); - - Thread.sleep(3000);//indexed docs don't become searchable instantly - - JsonObject resp = this.search("*:*", indexInfo + "/page", true); - JsonObject hits = resp.getJsonObject("hits"); - assertTrue(hits.getInt("total") == 2); - JsonArray results = hits.getJsonArray("hits"); - assertTrue(hits.getJsonArray("hits").size() == 2); - boolean containsEva = false; - for(int i=0;i()); - page.setName("indextest.html"); - page.setTitle("Intex Test | Title"); - page.setCategory("page"); - page.setTextContent("Test content of this awesome test page."); - return page; - } -} diff --git a/src/test/java/com/amihaiemil/charles/EsBulkContentTestCase.java b/src/test/java/com/amihaiemil/charles/EsBulkContentTestCase.java deleted file mode 100644 index 0de924e..0000000 --- a/src/test/java/com/amihaiemil/charles/EsBulkContentTestCase.java +++ /dev/null @@ -1,113 +0,0 @@ -/* - Copyright (c) 2016, Mihai Emil Andronache - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - * Neither the name of charles nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -package com.amihaiemil.charles; - -import static org.junit.Assert.*; - -import java.io.File; -import java.io.FileInputStream; -import java.util.ArrayList; -import java.util.HashSet; -import java.util.List; - -import javax.json.Json; -import javax.json.JsonObject; - -import org.apache.commons.io.IOUtils; -import org.junit.Test; - -/** - * Unit tests for {@link EsBulkContent} - * @author Mihai Andronache (amihaiemil@gmail.com) - * - */ -public class EsBulkContentTestCase { - - /** - * EsBulkIndex throws exception on empty docs list. - */ - @Test(expected = IllegalArgumentException.class) - public void exceptionOnEmptyList() { - List docs = new ArrayList(); - new EsBulkContent(docs); - } - - /** - * EsBulkIndex throws exception on null docs list. - */ - @Test(expected = IllegalArgumentException.class) - public void exceptionOnNullList() { - new EsBulkContent(null); - } - - /** - * EsBulkContent can create the json bulk for Elastic search _bulk api. - * @throws Exception If something goes wrong. - */ - @Test - public void structuresPagesCorrectly() throws Exception { - List pages = new ArrayList(); - pages.add(this.mockWebPage("http://amihaiemil.com/page.html", "tech")); - pages.add(this.mockWebPage("http://amihaiemil.com/stuff/page.html", "mischelaneous")); - pages.add(this.mockWebPage("http://amihaiemil.com/stuff/more/page.html", "development")); - - String bulkStrucure = new EsBulkContent(pages).structure(); - - String expected = new String( - IOUtils.toByteArray( - new FileInputStream( - new File("src/test/resources/bulkIndexStructure.txt") - ) - ) - ); - assertTrue( - "The 2 structures are not the same! (did you forget to add a final newline (\\n)?", - expected.equals(bulkStrucure) - ); - } - - /** - * Mock a WebPage for test. - * @param url - * @param category - * @return Webpage instance. - */ - private WebPage mockWebPage(String url, String category) { - WebPage page = new SnapshotWebPage(); - page.setUrl(url); - page.setCategory(category); - - page.setLinks(new HashSet()); - page.setTextContent("text content..."); - page.setName("page.html"); - page.setTitle("page | title"); - return page; - } - -} - - diff --git a/src/test/java/com/amihaiemil/charles/SitemapXmlCrawlITCase.java b/src/test/java/com/amihaiemil/charles/SitemapXmlCrawlITCase.java index 4a2e33d..04572c4 100644 --- a/src/test/java/com/amihaiemil/charles/SitemapXmlCrawlITCase.java +++ b/src/test/java/com/amihaiemil/charles/SitemapXmlCrawlITCase.java @@ -27,7 +27,13 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE import static org.junit.Assert.assertTrue; +import org.junit.After; +import org.junit.Before; import org.junit.Test; +import org.openqa.selenium.WebDriver; +import org.openqa.selenium.phantomjs.PhantomJSDriver; +import org.openqa.selenium.phantomjs.PhantomJSDriverService; +import org.openqa.selenium.remote.DesiredCapabilities; import com.amihaiemil.charles.sitemap.SitemapXmlOnDisk; @@ -37,19 +43,19 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * */ public class SitemapXmlCrawlITCase { + + private WebDriver driver; + /** * A page's title can be retrieved. * @throws Exception - If something goes wrong. */ @Test public void getsPageTitle() throws Exception { - String phantomJsExecPath = System.getProperty("phantomjsExec"); - if("".equals(phantomJsExecPath)) { - phantomJsExecPath = "/usr/local/bin/phantomjs"; - } + InMemoryRepository inmr = new InMemoryRepository(); SitemapXmlCrawl sitemapXmlCrawl = new SitemapXmlCrawl( - phantomJsExecPath, + this.driver, new SitemapXmlOnDisk("src/test/resources/testsitemap.xml"), new IgnoredPatterns(), inmr @@ -58,4 +64,28 @@ public void getsPageTitle() throws Exception { assertTrue(inmr.getCrawledPages().size() == 1); assertTrue(inmr.getCrawledPages().get(0).getTitle().equals("EvA project")); } + + @Before + public void initDriver() { + this.driver = this.phantomJsDriver(); + } + + @After + public void quitDriver() { + this.driver.quit(); + } + + private WebDriver phantomJsDriver() { + String phantomJsExecPath = System.getProperty("phantomjsExec"); + if("".equals(phantomJsExecPath)) { + phantomJsExecPath = "/usr/local/bin/phantomjs"; + } + DesiredCapabilities dc = new DesiredCapabilities(); + dc.setJavascriptEnabled(true); + dc.setCapability( + PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY, + phantomJsExecPath + ); + return new PhantomJSDriver(dc); + } }