Skip to content

Commit

Permalink
added postedDate attribute on item class
Browse files Browse the repository at this point in the history
  • Loading branch information
nwihardjo committed Nov 8, 2018
1 parent 7f7bd64 commit ee2a0a3
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 12 deletions.
14 changes: 11 additions & 3 deletions src/main/java/comp3111/webscraper/Item.java
Original file line number Diff line number Diff line change
@@ -1,24 +1,32 @@
package comp3111.webscraper;

import java.util.Arrays;

import java.util.Date;

public class Item implements Comparable <Item> {
private String title ;
// price in HKD
private double price ;
private String url ;
private String portal ;
private Date postedDate;

public Item(String title, Double price, String url, String portal) {
/**
 * Creates an item with every attribute supplied by the caller.
 * Each value is stored through its corresponding setter rather than by direct field assignment.
 *
 * @param title      title of the item
 * @param price      price of the item (the price field is documented as HKD)
 * @param url        URL of the item
 * @param portal     portal the item was scraped from
 * @param postedDate date the item was posted; callers in WebScraper pass null when no date is scraped
 */
public Item(String title, Double price, String url, String portal, Date postedDate) {
this.setTitle(title);
this.setPrice(price);
this.setUrl(url);
this.setPortal(portal);
this.setPostedDate(postedDate);
}

/** Creates an item without assigning any attribute; values are expected to be supplied later through the setters. */
public Item() {}


/**
 * Returns the date on which this item was posted.
 *
 * @return the stored posted date, or null if none has been set
 */
public Date getPostedDate() {
    return this.postedDate;
}
/**
 * Stores the date on which this item was posted.
 * The given reference is kept as-is (no copy is made).
 *
 * @param postedDate the posted date to store; null is accepted and stored unchanged
 */
public void setPostedDate(Date postedDate) {
this.postedDate = postedDate;
}
public String getTitle() {
return title;
}
Expand Down
35 changes: 26 additions & 9 deletions src/main/java/comp3111/webscraper/WebScraper.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,15 @@
import java.io.FileWriter;
import java.io.IOException;
import java.net.URLEncoder;
import java.text.SimpleDateFormat;
import java.util.List;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;

import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.WebResponse;
import com.gargoylesoftware.htmlunit.html.DomAttr;
import com.gargoylesoftware.htmlunit.html.HtmlAnchor;
import com.gargoylesoftware.htmlunit.html.HtmlElement;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
Expand Down Expand Up @@ -116,23 +119,23 @@ private static Double getPrice(HtmlElement item, String portal) {
else {
// USE CASE: no price available, but there are some offers which contain a price
if (DEBUG) System.out.println("\t DEBUG: no price available, but there are offers at " + offeredPrice.asText());
return new Double(offeredPrice.asText().replaceAll("\\(.*\\)", "").replace("$", "").replaceAll(",", ""));
return new Double(offeredPrice.asText().replaceAll("\\(.*\\)", "").replace("$", "").replace(",", ""));
}
} else if (ItemWholePrice.size() > 1 || ItemFractionalPrice.size() > 1) {
Double lowPrice = new Double (ItemWholePrice.get(0).asText() + "." + ItemFractionalPrice.get(0).asText());
Double highPrice = new Double (ItemWholePrice.get(1).asText() + "." + ItemFractionalPrice.get(1).asText());
Double lowPrice = new Double (ItemWholePrice.get(0).asText().replace("$", "").replace(",", "") + "." + ItemFractionalPrice.get(0).asText());
Double highPrice = new Double (ItemWholePrice.get(1).asText().replace("$", "").replace(",","") + "." + ItemFractionalPrice.get(1).asText());
// USE CASE: return average price if the price given is a range
return (lowPrice + highPrice) / 2.0;
} else {
if (DEBUG) System.out.println("\t DEBUG: GETPRICE FINAL " + ItemWholePrice.size() + " and " + ItemFractionalPrice.size());
return new Double (ItemWholePrice.get(0).asText() + "." + ItemFractionalPrice.get(0).asText()); }
return new Double (ItemWholePrice.get(0).asText().replace(",", "") + "." + ItemFractionalPrice.get(0).asText()); }
} else {
// portal: craigslist
HtmlElement itemPrice = ((HtmlElement) item.getFirstByXPath(".//a/span[@class='result-price']"));
if (itemPrice == null)
return 0.0;
else
return new Double(itemPrice.asText().replace("$", ""));
return new Double(itemPrice.asText().replace("$", "").replace(",",""));
}
}

Expand All @@ -147,6 +150,19 @@ private static String getUrl(HtmlElement item, String portal) {
return portal_url+itemUrl.getHrefAttribute();
}

/**
 * Extracts the posted date of a scraped result element.
 *
 * Looks up the {@code datetime} attribute of the descendant whose class is
 * {@code result-date} and parses it with the pattern {@code yyyy-MM-dd HH:mm}.
 * Currently only meaningful for craigslist results.
 *
 * @param item the result element to inspect
 * @return the parsed date, or null when the attribute is missing or its value
 *         does not match the expected pattern
 */
private static Date getPostedDate(HtmlElement item) {
    // currently only for craigslist
    DomAttr itemDate = (DomAttr) item.getFirstByXPath(".//*[@class='result-date']/@datetime");

    // A missing attribute is an expected case, so test for it explicitly
    // instead of relying on a caught NullPointerException.
    String rawDate = (itemDate == null) ? null : itemDate.getValue();
    if (rawDate == null) {
        if (DEBUG) System.out.println("\t DEBUG: postedDate non-existence!!!");
        return null;
    }

    // SimpleDateFormat is not thread-safe, so a fresh instance is created per call.
    SimpleDateFormat dateFormatting = new SimpleDateFormat("yyyy-MM-dd HH:mm");
    try {
        return dateFormatting.parse(rawDate);
    } catch (java.text.ParseException e) {
        // Only a malformed date string is tolerated here; other failures propagate.
        if (DEBUG) System.out.println("\t DEBUG: postedDate non-existence!!!");
        return null;
    }
}


private static Vector<Item> sortResult(ArrayList<Item> amazonArrayList, ArrayList<Item> craigsArrayList){
if (DEBUG) System.out.println("\t DEBUG: entering getTitle method");
Vector<Item> result = new Vector<Item>();
Expand All @@ -166,7 +182,6 @@ else if (craigsArrayList.get(j).getPrice() == amazonArrayList.get(i).getPrice())
return result;
}


/**
* The only method implemented in this class, to scrape web content from the craigslist
*
Expand Down Expand Up @@ -194,13 +209,14 @@ public List<Item> scrape(String keyword) {
if (DEBUG) System.out.println("\t DEBUG: entering item : " + getTitle(amazonItem, AMAZON_URL));

// item instantiation
Item item = new Item(getTitle(amazonItem, AMAZON_URL), getPrice(amazonItem, AMAZON_URL), getUrl(amazonItem, AMAZON_URL), AMAZON_URL);
Item item = new Item(getTitle(amazonItem, AMAZON_URL), getPrice(amazonItem, AMAZON_URL), getUrl(amazonItem, AMAZON_URL), AMAZON_URL, null);
amazonArrayList.add(item);
if (DEBUG) System.out.println("\t DEBUG: [amazon] stored item " + i + ": " + item.getPrice() + " HKD. Name: " +item.getTitle());
}
Collections.sort(amazonArrayList);



System.out.println(" DEBUG: scraping craigslist...");
String searchUrl = DEFAULT_URL + "search/sss?sort=rel&query=" + URLEncoder.encode(keyword, "UTF-8");
HtmlPage page = client.getPage(searchUrl);
Expand All @@ -209,9 +225,10 @@ public List<Item> scrape(String keyword) {

for (int i = 0; i < items.size(); i++) {
HtmlElement htmlItem = (HtmlElement) items.get(i);

// item instantiation
Item item = new Item(getTitle(htmlItem, DEFAULT_URL), getPrice(htmlItem, DEFAULT_URL), getUrl(htmlItem, DEFAULT_URL), DEFAULT_URL);
Item item = new Item(getTitle(htmlItem, DEFAULT_URL), getPrice(htmlItem, DEFAULT_URL), getUrl(htmlItem, DEFAULT_URL), DEFAULT_URL,
getPostedDate(htmlItem));
craigsArrayList.add(item);
}
Collections.sort(craigsArrayList);
Expand Down

0 comments on commit ee2a0a3

Please sign in to comment.