Skip to content
This repository has been archived by the owner on Apr 24, 2020. It is now read-only.

Commit

Permalink
checkstyle fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
amihaiemil committed Jan 10, 2017
1 parent 6cee9b1 commit 9d84338
Show file tree
Hide file tree
Showing 14 changed files with 256 additions and 144 deletions.
14 changes: 0 additions & 14 deletions checkstyle.xml
Original file line number Diff line number Diff line change
Expand Up @@ -82,16 +82,6 @@
<!--
JavaDoc regexp checks
-->
<module name="RegexpSingleline">
<property name="format" value="\* +@return +[^A-Z]"/>
<property name="fileExtensions" value="java"/>
<property name="message" value="@return tag description should start with capital letter"/>
</module>
<module name="RegexpSingleline">
<property name="format" value="\* +@param +\w+ +[^A-Z]"/>
<property name="fileExtensions" value="java"/>
<property name="message" value="@param tag description should start with capital letter"/>
</module>
<module name="RegexpSingleline">
<property name="format" value="/\*\* +[^A-Z\{]"/>
<property name="fileExtensions" value="java"/>
Expand Down Expand Up @@ -167,7 +157,6 @@
<module name="EmptyStatement"/>
<module name="EqualsAvoidNull"/>
<module name="EqualsHashCode"/>
<module name="FinalLocalVariable"/>
<module name="HiddenField">
<property name="ignoreConstructorParameter" value="true"/>
</module>
Expand Down Expand Up @@ -267,9 +256,6 @@
<module name="LocalVariableName">
<property name="format" value="^(id|[A-Za-z]{3,20})$"/>
</module>
<module name="CatchParameterName">
<property name="format" value="^(ex|[A-Za-z]{3,20})$"/>
</module>
<module name="MemberName">
<property name="format" value="^(id|[A-Za-z]{3,20})$"/>
</module>
Expand Down
53 changes: 46 additions & 7 deletions src/main/java/com/amihaiemil/charles/AbstractWebCrawl.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,14 @@
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/package com.amihaiemil.charles;
*/
package com.amihaiemil.charles;

import org.openqa.selenium.WebDriver;

/**
* An abstract webcrawl - contains the webdriver and other common data of each crawl.
* An abstract webcrawl - contains the webdriver and other common data of each
* crawl.
* @author Mihai Andronache (amihaiemil@gmail.com)
* @version $Id$
* @since 1.0.0
Expand All @@ -39,37 +41,74 @@ public abstract class AbstractWebCrawl implements WebCrawl {
/**
* WebDriver.
*/
protected WebDriver driver;
private final WebDriver driver;

/**
* Ignored pages patterns.
*/
protected IgnoredPatterns ignoredLinks;
private final IgnoredPatterns ignoredLinks;

/**
* Repo to export the pages to.
*/
protected Repository repo;
private final Repository repo;

/**
* Pages are crawled and exported in batches in order to avoid flooding
* the memory if there are many pages on a website. Default value is 100.
*/
protected int batchSize;
private final int batchSize;

/**
 * Ctor.
 * @param webd Selenium WebDriver.
 * @param igp Ignored patterns.
 * @param repo Repository to export the crawled pages into.
 * @param batch Size of a crawl batch; pages are crawled and exported in
 *  batches of this size to avoid flooding the memory.
 * @checkstyle ParameterNumber (6 lines)
 */
public AbstractWebCrawl(
    final WebDriver webd, final IgnoredPatterns igp,
    final Repository repo, final int batch
) {
    this.driver = webd;
    this.ignoredLinks = igp;
    this.repo = repo;
    this.batchSize = batch;
}

/**
 * Perform this crawl.
 * @throws DataExportException If the crawled pages cannot be exported.
 */
@Override
public abstract void crawl() throws DataExportException;

/**
 * Fetch the used WebDriver.
 * @return The WebDriver of this crawl.
 */
public final WebDriver driver() {
    return this.driver;
}

/**
 * Fetch the Repository where the crawled pages are exported.
 * @return The Repository of this crawl.
 */
public final Repository repo() {
    return this.repo;
}

/**
 * Fetch the patterns of the links which this crawl ignores.
 * @return The IgnoredPatterns of this crawl.
 */
public final IgnoredPatterns ignoredPatterns() {
    return this.ignoredLinks;
}

/**
 * Fetch the batch size: the number of pages which are crawled and
 * exported at once.
 * @return Integer batch size.
 */
public final int batchSize() {
    return this.batchSize;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/package com.amihaiemil.charles;
*/
package com.amihaiemil.charles;

/**
* Exception thrown if a page cannot be exported properly.
Expand Down
16 changes: 8 additions & 8 deletions src/main/java/com/amihaiemil/charles/GraphCrawl.java
Original file line number Diff line number Diff line change
Expand Up @@ -98,9 +98,9 @@ public GraphCrawl(

@Override
public void crawl() throws DataExportException {
if(!this.ignoredLinks.contains(this.index.getHref())) {
if(!this.ignoredPatterns().contains(this.index.getHref())) {
List<WebPage> pages = new ArrayList<WebPage>();
WebPage indexSnapshot = new LiveWebPage(this.driver, this.index).snapshot();
WebPage indexSnapshot = new LiveWebPage(this.driver(), this.index).snapshot();
pages.add(indexSnapshot);

Set<Link> crawledLinks = new HashSet<Link>();
Expand All @@ -114,22 +114,22 @@ public void crawl() throws DataExportException {
if(toCrawl.size() > 0) {
Link link = toCrawl.remove(0);
while(toCrawl.size() > 0) {
if(this.ignoredLinks.contains(link.getHref())) {
if(this.ignoredPatterns().contains(link.getHref())) {
link = toCrawl.remove(0);
continue;
}
boolean notCrawledAlready = crawledLinks.add(link);
if(notCrawledAlready) {
WebPage snapshotCrawled = new LiveWebPage(this.driver, link).snapshot();
WebPage snapshotCrawled = new LiveWebPage(this.driver(), link).snapshot();
pages.add(snapshotCrawled);
this.checkBatchSize(pages);
toCrawl.addAll(snapshotCrawled.getLinks());
}
link = toCrawl.remove(0);
}
}
this.repo.export(pages);
this.driver.quit();
this.repo().export(pages);
this.driver().quit();
}
}

Expand All @@ -140,8 +140,8 @@ public void crawl() throws DataExportException {
* @throws DataExportException If something goes wrong during processing of crawled pages.
*/
private void checkBatchSize(List<WebPage> pages) throws DataExportException {
if(pages.size() == this.batchSize) {
this.repo.export(pages);
if(pages.size() == this.batchSize()) {
this.repo().export(pages);
pages.clear();
}
}
Expand Down
12 changes: 6 additions & 6 deletions src/main/java/com/amihaiemil/charles/InMemoryRepository.java
Original file line number Diff line number Diff line change
Expand Up @@ -39,25 +39,25 @@
* @author Mihai Andronache (amihaiemil@gmail.com)
*
*/
public class InMemoryRepository implements Repository {
public final class InMemoryRepository implements Repository {

/**
* Holds all the crawled pages.
*/
private List<WebPage> pages = new ArrayList<WebPage>();
private final List<WebPage> pgs = new ArrayList<WebPage>();

/**
* Get all the pages from this Repository.
* @return List of pages.
*/
public List<WebPage> getCrawledPages() {
return this.pages;
return this.pgs;
}

@Override
public void export(List<WebPage> pages) throws DataExportException {
for(WebPage page : pages) {
this.pages.add(page);
public void export(final List<WebPage> pages) throws DataExportException {
for(final WebPage page : pages) {
this.pgs.add(page);
}
}
}
17 changes: 9 additions & 8 deletions src/main/java/com/amihaiemil/charles/Link.java
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ public Link() {
this("", "");
}

public Link(String text, String href) {
public Link(final String text, final String href) {
this.text = text;
this.href = href;
}
Expand All @@ -55,15 +55,15 @@ public String getText() {
return text;
}

public void setText(String text) {
public void setText(final String text) {
this.text = text;
}

public String getHref() {
return href;
}

public void setHref(String href) {
public void setHref(final String href) {
this.href = href;
}

Expand All @@ -75,8 +75,9 @@ public int hashCode() {
result = prime * result + 0;
} else {
if (this.href.contains("#")) {
result = new Link("", href.substring(0, href.indexOf("#")))
.hashCode();
result = new Link(
"", this.href.substring(0, this.href.indexOf("#"))
).hashCode();
} else {
if (this.href.endsWith("/")) {
result = prime
Expand All @@ -103,7 +104,7 @@ public boolean equals(Object obj) {
return false;
}
Link other = (Link) obj;
if (href == null) {
if (this.href == null) {
if (other.href != null)
return false;
} else {
Expand All @@ -121,7 +122,7 @@ public boolean equals(Object obj) {
}

if (this.href.endsWith("/") && other.href.endsWith("/")) {
return this.href.substring(0, href.length() - 1).equals(
return this.href.substring(0, this.href.length() - 1).equals(
other.href.substring(0, other.href.length() - 1));
} else if (this.href.endsWith("/")) {
return this.href.substring(0, href.length() - 1).equals(
Expand All @@ -144,7 +145,7 @@ public String toString() {
*
* @return true if valid, false otherwise.
*/
public boolean valid(String parentLoc) {
public boolean valid(final String parentLoc) {

if (this.href != null && !this.href.startsWith("mailto")) {
int slashIndex = parentLoc.indexOf("/", 8);// index of the first "/"
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/com/amihaiemil/charles/LiveWebPage.java
Original file line number Diff line number Diff line change
Expand Up @@ -50,14 +50,14 @@ public final class LiveWebPage implements LivePage {
/**
* Visible anchors.
*/
@FindBys(@FindBy(tagName=("a")))
@FindBys(@FindBy(tagName="a"))
@CacheLookup
private List<WebElement> anchors;

/**
* Text content from the page.
*/
@FindBy(tagName=("body"))
@FindBy(tagName="body")
@CacheLookup
private WebElement body;

Expand Down
20 changes: 9 additions & 11 deletions src/main/java/com/amihaiemil/charles/SitemapXmlCrawl.java
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,9 @@
import java.util.ArrayList;
import java.util.List;
import java.util.Set;

import org.openqa.selenium.WebDriver;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.amihaiemil.charles.sitemap.SitemapXml;
import com.amihaiemil.charles.sitemap.SitemapXmlLocation;
import com.amihaiemil.charles.sitemap.Url;
Expand Down Expand Up @@ -98,24 +96,24 @@ public void crawl() throws DataExportException {
List<WebPage> pages = new ArrayList<WebPage>();
LOG.info("Started crawling the sitemap.xml...");
for(Url url : this.urlset) {
if(this.ignoredLinks.contains(url.getLoc())) {
if(this.ignoredPatterns().contains(url.getLoc())) {
continue;
}
LOG.info("Crawling page " + url.getLoc() + "... ");
pages.add(new LiveWebPage(this.driver, url.getLoc()).snapshot());
pages.add(new LiveWebPage(this.driver(), url.getLoc()).snapshot());
LOG.info("Done crawling page " + url.getLoc() + "!");
if(pages.size() == this.batchSize) {
try {
this.repo.export(pages);
if(pages.size() == this.batchSize()) {
try {
this.repo().export(pages);
pages.clear();
} catch (DataExportException e) {
} catch (DataExportException e) {
e.printStackTrace();
}
}
}
}
LOG.info("Finished crawling the sitemap.xml!");
this.repo.export(pages);
this.driver.quit();
this.repo().export(pages);
this.driver().quit();
}

}
Loading

0 comments on commit 9d84338

Please sign in to comment.