Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Setup post and basescraper with QuoraProfileScraper #1249

Merged
merged 2 commits into from Jul 4, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
11 changes: 8 additions & 3 deletions src/org/loklak/api/search/ConsoleService.java
Expand Up @@ -33,6 +33,7 @@
import org.loklak.objects.QueryEntry;
import org.loklak.objects.ResultList;
import org.loklak.objects.Timeline;
import org.loklak.objects.Timeline2;
import org.loklak.objects.UserEntry;
import org.loklak.server.APIException;
import org.loklak.server.APIHandler;
Expand All @@ -43,6 +44,7 @@
import org.loklak.susi.SusiProcedures;
import org.loklak.susi.SusiThought;
import org.loklak.susi.SusiTransfer;
import org.loklak.harvester.BaseScraper;

import org.loklak.tools.storage.JSONObjectWithDefault;

Expand Down Expand Up @@ -234,13 +236,16 @@ public String getAPIPath() {
json.setData(transfer.conclude(json.getData()));
return json;
});
/*

dbAccess.put(Pattern.compile("SELECT +?(.*?) +?FROM +?quoraprofile +?WHERE +?profile ??= ??'(.*?)' ??;"), matcher -> {
BaseScraper quoraScrape = new QuoraProfileScraper(matcher.group(2));
Timeline2 dataList = quoraScrape.getData();
SusiThought json = new SusiThought(dataList.toJSON());
SusiTransfer transfer = new SusiTransfer(matcher.group(1));
return quoraScrape.getData().toJSON(transfer.conclude(json.getData()));
json.setData(transfer.conclude(json.getData()));
return json;
});
*/

dbAccess.put(Pattern.compile("SELECT +?(.*?) +?FROM +?wikigeodata +?WHERE +?place ??= ??'(.*?)' ??;"), matcher -> {
SusiThought json = WikiGeoData.wikiGeoData(matcher.group(2));
SusiTransfer transfer = new SusiTransfer(matcher.group(1));
Expand Down
45 changes: 23 additions & 22 deletions src/org/loklak/api/search/QuoraProfileScraper.java
Expand Up @@ -6,12 +6,12 @@
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
Expand All @@ -30,8 +30,9 @@
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.loklak.data.DAO;
import org.loklak.harvester.Post;
import org.loklak.harvester.BaseScraper;
import org.loklak.harvester.Post;
import org.loklak.objects.Timeline2;
import org.loklak.server.BaseUserRole;

public class QuoraProfileScraper extends BaseScraper {
Expand Down Expand Up @@ -79,7 +80,7 @@ private Post scrapeProfile() {

Post quoraProfile = new QuoraPost(this.query, 0);
Document userHTML = Jsoup.parse(this.html);

String bio = userHTML.getElementsByAttributeValueContaining("class", "ProfileDescription").text();
quoraProfile.put("bio", bio);

Expand All @@ -97,27 +98,27 @@ private Post scrapeProfile() {
String infoText = info.text();
if (infoText.startsWith("Studi")) {
quoraProfile.put(infoText.split(" ")[0].toLowerCase().trim() + "_at", infoText);
} else if (infoText.startsWith("Lives")) {
quoraProfile.put("lives_in", infoText);
} else {
quoraProfile.put("works_at", infoText);
}
} else if (infoText.startsWith("Lives")) {
quoraProfile.put("lives_in", infoText);
} else {
quoraProfile.put("works_at", infoText);
}
}

Elements knowsAbout = userHTML.getElementsByAttributeValueContaining("class", "TopicNameSpan TopicName");
JSONArray topics = new JSONArray();
for (Element topic: knowsAbout) {
topics.put(topic.text());
topics.put(topic.text());
}
quoraProfile.put("knows_about", topics);

JSONObject feeds = new JSONObject();
Elements counts = userHTML.getElementsByAttributeValueContaining("class", "list_count");
for (Element count: counts) {
String topic = count.parent().text();
topic = topic.substring(0, topic.indexOf(count.text())).trim();
feeds.put(topic.toLowerCase() + "_url", baseUrl + count.parent().attr("href"));
feeds.put(topic.toLowerCase(), count.text());
String topic = count.parent().text();
topic = topic.substring(0, topic.indexOf(count.text())).trim();
feeds.put(topic.toLowerCase() + "_url", baseUrl + count.parent().attr("href"));
feeds.put(topic.toLowerCase(), count.text());
}
quoraProfile.put("feeds", feeds);

Expand All @@ -127,9 +128,9 @@ private Post scrapeProfile() {

//TODO: this method shall return Timeline object
@Override
// protected Timeline scrape(BufferedReader br) {
protected Post scrape(BufferedReader br) {
// Timeline dataSet = new Timeline(order);
protected Timeline2 scrape(BufferedReader br) {
// protected Post scrape(BufferedReader br) {
Timeline2 dataSet = new Timeline2(order);
//for profile
Post qPost;
try {
Expand All @@ -139,8 +140,8 @@ protected Post scrape(BufferedReader br) {
}
qPost = scrapeProfile();

// return dataSet.add(qPost);
return qPost;
return dataSet.add(qPost);
// return qPost;
}


Expand All @@ -156,6 +157,7 @@ public QuoraPost(String _quoraId, int _quoraPostNo) {
super();
this.quoraId = _quoraId;
this.quoraPostNo = _quoraPostNo;
this.setPostId();
}

public void getQuoraId(String _quoraId) {
Expand All @@ -166,12 +168,11 @@ public void getQuoraPostNo(int _quoraPostNo) {
this.quoraPostNo = _quoraPostNo;
}

public void getPostId() {
/**
 * Builds this post's identifier from its timestamp, post number and Quora id.
 *
 * The timestamp is converted to text first so that every following
 * {@code +} is a string concatenation. Without the conversion the numeric
 * timestamp and the numeric post number are summed before the id is
 * appended, so different (timestamp, post number) pairs with the same sum
 * would produce the same identifier.
 */
private void setPostId() {
    this.postId = String.valueOf(this.timestamp) + this.quoraPostNo + this.quoraId;
}

public String setPostId() {
this.postId = this.timestamp + this.quoraPostNo + this.quoraId;
/**
 * Returns the identifier of this post.
 *
 * @return the {@code String.valueOf} rendering of the {@code postId} field
 */
public String getPostId() {
    final String id = String.valueOf(this.postId);
    return id;
}
//clean data
Expand Down
24 changes: 12 additions & 12 deletions src/org/loklak/harvester/BaseScraper.java
Expand Up @@ -11,11 +11,11 @@
import org.loklak.http.ClientConnection;
import org.loklak.objects.ProviderType;
import org.loklak.objects.SourceType;
import org.loklak.objects.Timeline;
import org.loklak.server.AbstractAPIHandler;
import org.loklak.server.APIException;
import org.loklak.server.Authorization;
import org.loklak.server.Query;
import org.loklak.objects.Timeline2;
import org.loklak.tools.storage.JSONObjectWithDefault;

/**
Expand All @@ -40,34 +40,34 @@ public abstract class BaseScraper extends AbstractAPIHandler {
//TODO: dummy variable, add datastructure for filter, type_of_posts, location, etc
protected String extra = "";
//TODO: setup Timeline for Post
protected final Timeline.Order order = Timeline.parseOrder("timestamp");
protected final Timeline2.Order order = Timeline2.parseOrder("timestamp");

@Override
public JSONObject serviceImpl(Query call, HttpServletResponse response, Authorization rights,
JSONObjectWithDefault permissions) throws APIException {
this.query = call.get("query", "");

//TODO: add different extra parameters. This is a dummy variable
this.extra = call.get("extra", "");
//TODO: to be implemented to use Timeline
//return getData().toJSON;
return this.getData();
return getData().toJSON(false, "metadata_base", "statuses_base");
//return this.getData();
}

protected abstract Map<?, ?> getExtra(String _extra);

// public Timeline getData() {
public Post getData() {
public Timeline2 getData() {
// public Post getData() {
ClientConnection connection;
BufferedReader br;

// Timeline tl = new Timeline(order);
Post tl = null;
Timeline2 tl = new Timeline2(order);
// Post tl = null;
this.url = this.baseUrl + this.midUrl + this.query;

try {
connection = new ClientConnection(this.url);

try {
// get instance of bufferReader
br = getHtml(connection);
Expand All @@ -92,8 +92,8 @@ public BufferedReader getHtml(ClientConnection connection) {
return br;
}

//protected abstract Timeline scrape(BufferedReader br);
protected abstract Post scrape(BufferedReader br);
protected abstract Timeline2 scrape(BufferedReader br);
//protected abstract Post scrape(BufferedReader br);

public String bufferedReaderToString(BufferedReader br) throws IOException {
StringBuilder everything = new StringBuilder();
Expand Down
14 changes: 12 additions & 2 deletions src/org/loklak/harvester/Post.java
@@ -1,6 +1,7 @@
package org.loklak.harvester;

import org.json.JSONObject;
import java.util.Date;

/**
* @author vibhcool (Vibhor Verma)
Expand All @@ -13,6 +14,7 @@ public abstract class Post extends JSONObject {

protected long timestamp = 0;
protected String postId;

/**
 * Creates a post and initialises its timestamp via the no-argument
 * {@code setTimestamp()}.
 */
protected Post() {
    // setTimestamp() is public, so an override in a subclass would run
    // before that subclass's own constructor body.
    setTimestamp();
}
Expand All @@ -35,8 +37,16 @@ public void setTimestamp() {
this.setTimestamp(timestamp);
}

//public abstract void getPostId();
/**
 * Returns the post's timestamp as a {@link Date}.
 *
 * @return a new {@code Date} constructed from the {@code timestamp} field
 */
public Date getTimestampDate() {
    final Date stamp = new Date(this.timestamp);
    return stamp;
}

//TODO: Set up TwitterTweet before setting this as abstract
// Empty placeholder. Because it is private, the setPostId() methods declared
// in subclasses are separate methods and do not override this one.
private void setPostId() { }

//public abstract String setPostId();
//TODO: Set up TwitterTweet before setting this as abstract
/**
 * Default implementation of the post id accessor.
 *
 * @return always the empty string; this base version does not read {@code postId}
 */
public String getPostId() {
return "";
}
}

3 changes: 2 additions & 1 deletion src/org/loklak/objects/AbstractObjectEntry.java
Expand Up @@ -30,10 +30,11 @@
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.loklak.harvester.Post;

import com.fasterxml.jackson.core.JsonGenerator;

public abstract class AbstractObjectEntry implements ObjectEntry {
public abstract class AbstractObjectEntry extends Post implements ObjectEntry {

public final static String TIMESTAMP_FIELDNAME = "timestamp";
public final static String CREATED_AT_FIELDNAME = "created_at";
Expand Down
22 changes: 16 additions & 6 deletions src/org/loklak/objects/MessageEntry.java
Expand Up @@ -48,7 +48,7 @@ public class MessageEntry extends AbstractObjectEntry implements ObjectEntry {

public static final String RICH_TEXT_SEPARATOR = "\n***\n";

protected Date timestamp; // a time stamp that is given in loklak upon the arrival of the tweet which is the current local time
protected Date timestampDate; // a time stamp that is given in loklak upon the arrival of the tweet which is the current local time
protected Date created_at; // the time given in the tweet which is the time when the user created it. This is also used to do the index partition into minute, hour, week
protected Date on; // on means 'valid from'
protected Date to; // 'to' means 'valid_until' and may not be set
Expand All @@ -74,7 +74,8 @@ public class MessageEntry extends AbstractObjectEntry implements ObjectEntry {
private Map<Context, Classification<String, Category>> classifier;

public MessageEntry() throws MalformedURLException {
this.timestamp = new Date();
this.timestamp = new Date().getTime();
this.timestampDate = new Date(this.timestamp);
this.created_at = new Date();
this.on = null;
this.to = null;
Expand Down Expand Up @@ -113,7 +114,8 @@ public MessageEntry() throws MalformedURLException {
}

public MessageEntry(JSONObject json) {
Object timestamp_obj = lazyGet(json, AbstractObjectEntry.TIMESTAMP_FIELDNAME); this.timestamp = parseDate(timestamp_obj);
Object timestamp_obj = lazyGet(json, AbstractObjectEntry.TIMESTAMP_FIELDNAME); this.timestampDate = parseDate(timestamp_obj);
this.timestamp = this.timestampDate.getTime();
Object created_at_obj = lazyGet(json, AbstractObjectEntry.CREATED_AT_FIELDNAME); this.created_at = parseDate(created_at_obj);
Object on_obj = lazyGet(json, "on"); this.on = on_obj == null ? null : parseDate(on);
Object to_obj = lazyGet(json, "to"); this.to = to_obj == null ? null : parseDate(to);
Expand Down Expand Up @@ -174,8 +176,8 @@ public MessageEntry(JSONObject json) {
enrich();
}

public Date getTimestamp() {
return this.timestamp == null ? new Date() : this.timestamp;
public Date getTimestampDate() {
return this.timestampDate == null ? new Date() : this.timestampDate;
}

public Date getCreatedAt() {
Expand Down Expand Up @@ -310,6 +312,14 @@ public void setLocationPoint(double[] location_point) {
this.location_point = location_point;
}

/**
 * Sets {@code postId} to the text of {@code timestamp} immediately followed
 * by the text of {@code created_at.getTime()}.
 */
private void setPostId() {
    final String arrivalPart = String.valueOf(this.timestamp);
    final String createdPart = String.valueOf(this.created_at.getTime());
    this.postId = arrivalPart + createdPart;
}

/**
 * Returns the identifier of this message.
 *
 * @return the {@code String.valueOf} rendering of the {@code postId} field
 */
public String getPostId() {
    final String idText = String.valueOf(this.postId);
    return idText;
}

/**
* @return [longitude, latitude] which is inside of getLocationRadius() from getLocationPoint()
*/
Expand Down Expand Up @@ -561,7 +571,7 @@ public JSONObject toJSON(final UserEntry user, final boolean calculatedData, fin
JSONObject m = new JSONObject(true);

// tweet data
m.put(AbstractObjectEntry.TIMESTAMP_FIELDNAME, utcFormatter.print(getTimestamp().getTime()));
m.put(AbstractObjectEntry.TIMESTAMP_FIELDNAME, utcFormatter.print(getTimestampDate().getTime()));
m.put(AbstractObjectEntry.CREATED_AT_FIELDNAME, utcFormatter.print(getCreatedAt().getTime()));
if (this.on != null) m.put("on", utcFormatter.print(this.on.getTime()));
if (this.to != null) m.put("to", utcFormatter.print(this.to.getTime()));
Expand Down