Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Ongoing work for bug 182366. Use machine learning techniques to sort …

…autocomplete results. r=heikki. sr=hewitt.
  • Loading branch information...
commit 7b38fab678d949ecaaac833ff9d367d043ae987c 1 parent f0c4a3b
nisheeth%netscape.com authored
View
23 docshell/base/nsDocShell.cpp
@@ -4186,7 +4186,7 @@ nsDocShell::OnStateChange(nsIWebProgress * aProgress, nsIRequest * aRequest,
// Add the original url to global History so that
// visited url color changes happen.
if (uri)
- AddToGlobalHistory(uri, PR_TRUE);
+ AddToGlobalHistory(channel, uri, PR_TRUE);
} // channel
} // aProgress
}
@@ -5829,7 +5829,7 @@ nsDocShell::OnNewURI(nsIURI * aURI, nsIChannel * aChannel,
}
// Update Global history
- AddToGlobalHistory(aURI, IsFrame());
+ AddToGlobalHistory(aChannel, aURI, IsFrame());
}
// If this was a history load, update the index in
@@ -6368,8 +6368,9 @@ NS_IMETHODIMP nsDocShell::MakeEditable(PRBool inWaitForUriLoad)
}
nsresult
-nsDocShell::AddToGlobalHistory(nsIURI * aURI, PRBool aHidden)
+nsDocShell::AddToGlobalHistory(nsIChannel* aChannel, nsIURI * aURI, PRBool aHidden)
{
+ nsresult rv;
// first check if we should be adding it
PRBool updateHistory;
ShouldAddToGlobalHistory(aURI, &updateHistory);
@@ -6382,6 +6383,22 @@ nsDocShell::AddToGlobalHistory(nsIURI * aURI, PRBool aHidden)
NS_ENSURE_SUCCESS(mGlobalHistory->AddPage(spec.get()), NS_ERROR_FAILURE);
+ nsCOMPtr<nsIHttpChannel> httpChannel(do_QueryInterface(aChannel, &rv));
+ if (NS_SUCCEEDED(rv)) {
+ nsCOMPtr<nsIURI> referrer;
+ rv = httpChannel->GetReferrer(getter_AddRefs(referrer));
+ if (NS_SUCCEEDED(rv) && referrer) {
+ nsCAutoString referrerSpec;
+ rv = referrer->GetSpec(referrerSpec);
+ if (NS_SUCCEEDED(rv)) {
+ nsCOMPtr<nsIBrowserHistory> browserHistory =
+ do_QueryInterface(mGlobalHistory);
+ browserHistory->OutputReferrerURL(spec.get(),
+ referrerSpec.get());
+ }
+ }
+ }
+
// this is a redirect, so hide the page from
// being enumerated in history
if (aHidden) {
View
2  docshell/base/nsDocShell.h
@@ -235,7 +235,7 @@ friend class nsDSURIContentListener;
// Global History
nsresult ShouldAddToGlobalHistory(nsIURI * aURI, PRBool * aShouldAdd);
- nsresult AddToGlobalHistory(nsIURI * aURI, PRBool );
+ nsresult AddToGlobalHistory(nsIChannel* aChannel, nsIURI * aURI, PRBool aHidden);
// Helper Routines
NS_IMETHOD GetPromptAndStringBundle(nsIPrompt ** aPrompt,
View
2  xpcom/reflect/xptinfo/src/xptiInterfaceInfoManager.cpp
@@ -260,7 +260,7 @@ PRBool xptiInterfaceInfoManager::BuildFileSearchPath(nsISupportsArray** aPath)
nsCOMPtr<nsILocalFile> greComponentDirectory;
nsresult rv = GetDirectoryFromDirService(NS_GRE_COMPONENT_DIR,
getter_AddRefs(greComponentDirectory));
- if (NS_SUCCEEDED(rv))
+ if (NS_SUCCEEDED(rv) && greComponentDirectory)
{
// make sure we only append a directory if its a different one
PRBool equalsCompDir = PR_FALSE;
View
6 xpfe/components/autocomplete/resources/content/autocomplete.xml
@@ -707,6 +707,9 @@
}
}
+ if (!this.noMatch)
+ this.autoComplete();
+
this.closeResultPopup();
}
@@ -739,6 +742,9 @@
}
}
+ if (!this.noMatch)
+ this.autoComplete();
+
this.mNeedToFinish = false;
this.mNeedToComplete = false;
View
17 xpfe/components/history/public/nsIBrowserHistory.idl
@@ -117,6 +117,23 @@ interface nsIBrowserHistory : nsISupports
*/
void markPageAsTyped(in string url);
+
+ /**
+ * outputReferrerURL
+ * Prints out referrer information for a url to a
+ * data file if browser.history.url.datacapture.mode is set
+ * to 1 or 2 (see the comment in nsGlobalHistory.cpp for
+ * more details).
+ *
+ * This method is a big hack and only temporary. Please do NOT use
+ * it in your code. It will be removed after the data
+ * collection phase of the project described in bug 182366 is
+ * over.
+ *
+ * @param aURL a url in global history
+ * @param aReferrer the referrer url to aURL
+ */
+ void outputReferrerURL(in string aURL, in string aReferrer);
};
%{ C++
View
1,130 xpfe/components/history/src/nsGlobalHistory.cpp
@@ -86,6 +86,9 @@
#include "nsIPrefBranchInternal.h"
#include "nsIObserverService.h"
+#include "prdtoa.h"
+#include "nsIBookmarksService.h"
+#include <math.h>
PRInt32 nsGlobalHistory::gRefCnt;
nsIRDFService* nsGlobalHistory::gRDFService;
@@ -102,6 +105,9 @@ nsIRDFResource* nsGlobalHistory::kNC_child;
nsIRDFResource* nsGlobalHistory::kNC_URL;
nsIRDFResource* nsGlobalHistory::kNC_HistoryRoot;
nsIRDFResource* nsGlobalHistory::kNC_HistoryByDate;
+nsIRDFResource* nsGlobalHistory::kNC_BookmarkAddDate;
+nsIRDFResource* nsGlobalHistory::kNC_Bookmark;
+nsIRDFResource* nsGlobalHistory::kRDF_Type;
nsIMdbFactory* nsGlobalHistory::gMdbFactory = nsnull;
nsIPrefBranch* nsGlobalHistory::gPrefBranch = nsnull;
@@ -110,6 +116,8 @@ nsIPrefBranch* nsGlobalHistory::gPrefBranch = nsnull;
#define PREF_BROWSER_STARTUP_PAGE "startup.page"
#define PREF_AUTOCOMPLETE_ONLY_TYPED "urlbar.matchOnlyTyped"
#define PREF_AUTOCOMPLETE_ENABLED "urlbar.autocomplete.enabled"
+#define PREF_AUTOCOMPLETE_LEARNING_MODE "urlbar.autocomplete.learning.mode"
+#define PREF_HISTORY_DATACAPTURE_MODE "history.url.datacapture.mode"
#define FIND_BY_AGEINDAYS_PREFIX "find:datasource=history&match=AgeInDays&method="
@@ -122,6 +130,181 @@ nsIPrefBranch* nsGlobalHistory::gPrefBranch = nsnull;
#define MSECS_PER_DAY (PR_MSEC_PER_SEC * 60 * 60 * 24)
+// ---------------------------
+// Autocomplete learning modes
+// ---------------------------
+
+// No learning
+#define AUTOCOMPLETE_NO_LEARNING 0
+// Only train the neural network. No user visible changes.
+#define AUTOCOMPLETE_ENABLE_TRAINING 1
+// Train the neural network and show its recommendation to the user.
+#define AUTOCOMPLETE_AFFECT_URL_LIST 2
+
+// --------------------------
+// URL data capture modes
+// --------------------------
+
+// URL information is captured for all urls that are loaded by the user
+// as well as for all urls matched by the autocomplete search engine
+// when the user types a partial url in the urlbar.
+
+// No data capture
+#define URLDATACAPTURE_NONE 0
+// Capture url features only. Don't store the url. The url features are
+// numeric and the original url *cannot* be reconstructed from it.
+#define URLDATACAPTURE_WITHOUT_URL_INFO 1
+// Capture url features and the url.
+#define URLDATACAPTURE_WITH_URL_INFO 2
+
+// ---------------------------
+
+#define AC_NUM_URL_FEATURES 44
+
+// This is the learning rate for the perceptron. The range of values is [0, 1]
+// It is used to update the internal weights of the perceptron.
+// The perceptron update rule is:
+//
+// weight[i] = weight[i] + LEARN_RATE * ERROR * Input[i]
+//
+// See http://diwww.epfl.ch/mantra/tutorial/english/perceptron/html/learning.html
+// for an explanation of the perceptron update rule.
+const PRFloat64 LEARN_RATE = 0.5;
+
+// The following 4 constants are explained in the big comment
+// before FillInputFeatures.
+const PRFloat64 HISTORY_FAST_DECAY_CONSTANT = 0.2;
+const PRFloat64 HISTORY_SLOW_DECAY_CONSTANT = 0.8;
+const PRFloat64 BOOKMARK_FAST_DECAY_CONSTANT = 0.2;
+const PRFloat64 BOOKMARK_SLOW_DECAY_CONSTANT = 0.8;
+
+#define NS_AUTOCOMPLETE_WEIGHTS_FILE "ac-weights.txt"
+
+//----------------------------------------------------------------------
+// Perceptron implementation
+// XXX The implementations need to be moved out to their own .cpp file
+
+// XXX This should move to an Init method so that error handling can happen.
+nsPerceptron::nsPerceptron(PRInt32 aNumFeatures)
+{
+  // Start out empty; the members stay this way whenever the requested
+  // feature count is not positive or the allocation below fails.
+  mNumWeights = 0;
+  mWeights = nsnull;
+
+  if (aNumFeatures <= 0)
+    return;
+
+  mWeights = new PRFloat64[aNumFeatures];
+  if (!mWeights)
+    return;
+
+  // One weight per input feature; the values are filled in by LoadWeights().
+  mNumWeights = aNumFeatures;
+  LoadWeights();
+}
+
+/**
+ * Runs one step of the perceptron update rule
+ *
+ *   weight[i] = weight[i] + LEARN_RATE * (target - output) * input[i]
+ *
+ * @param aInputs        feature values for this training example
+ * @param aNumInputs     number of entries in aInputs
+ * @param aTargetOutput  the output the perceptron should have produced
+ *
+ * Does nothing when there are no inputs or no weights.
+ */
+void
+nsPerceptron::Train(PRFloat64* aInputs, PRInt32 aNumInputs, PRFloat64 aTargetOutput)
+{
+  if (!aInputs || !mWeights)
+    return;
+
+  // Calculate the current output for this example
+  PRFloat64 output = 0.0;
+  Test(aInputs, aNumInputs, &output);
+
+  PRFloat64 delta = aTargetOutput - output;
+
+  // Only touch weights that have a matching input; walking all mNumWeights
+  // entries would read past the end of aInputs when the caller supplies
+  // fewer inputs than there are weights.
+  PRInt32 count = (aNumInputs < mNumWeights) ? aNumInputs : mNumWeights;
+
+  // Update weights based on delta
+  PRInt32 i;
+  for (i = 0; i < count; i++) {
+    mWeights[i] += LEARN_RATE * delta * aInputs[i];
+  }
+}
+
+/**
+ * Computes the weighted sum of the inputs.
+ *
+ * @param aInputs     feature values to evaluate
+ * @param aNumInputs  number of entries in aInputs
+ * @param aOutput     receives sum(mWeights[i] * aInputs[i]); set to 0 when
+ *                    there are no inputs or no weights
+ */
+void
+nsPerceptron::Test(PRFloat64* aInputs, PRInt32 aNumInputs, PRFloat64* aOutput)
+{
+  if (!aOutput)
+    return;
+
+  *aOutput = 0;
+
+  // mWeights is null when the constructor could not allocate it
+  if (!aInputs || !mWeights)
+    return;
+
+  // Never index past the end of either array
+  PRInt32 count = (aNumInputs < mNumWeights) ? aNumInputs : mNumWeights;
+
+  // Calculate output
+  PRInt32 i;
+  for (i = 0; i < count; i++) {
+    *aOutput += mWeights[i] * aInputs[i];
+  }
+}
+
+/**
+ * Fills mWeights from NS_AUTOCOMPLETE_WEIGHTS_FILE in the user profile
+ * directory. Every weight that cannot be read (missing file, short file,
+ * unparsable entry) is left at zero.
+ */
+void
+nsPerceptron::LoadWeights()
+{
+  // Start from all zeros so that every weight has a defined value even
+  // if the file is missing or holds fewer numbers than expected.
+  PRInt32 i;
+  for (i = 0; i < mNumWeights; i++) {
+    mWeights[i] = 0;
+  }
+
+  nsCOMPtr<nsIFile> file;
+  nsresult rv = NS_GetSpecialDirectory(NS_APP_USER_PROFILE_50_DIR, getter_AddRefs(file));
+  if (NS_FAILED(rv) || !file)
+    return;
+
+  nsCOMPtr<nsILocalFile> localFile(do_QueryInterface(file));
+  if (!localFile)
+    return;
+
+  rv = localFile->AppendNative(NS_LITERAL_CSTRING(NS_AUTOCOMPLETE_WEIGHTS_FILE));
+  if (NS_FAILED(rv))
+    return;
+
+  FILE* from = 0;
+  rv = localFile->OpenANSIFileDesc("r", &from);
+  if (NS_FAILED(rv) || !from)
+    return;
+
+  for (i = 0; i < mNumWeights; i++) {
+    PRFloat64 weight;
+    // Stop at the first entry that does not parse; the remaining
+    // weights keep their zero initialization.
+    if (fscanf(from, "%lf", &weight) != 1)
+      break;
+    mWeights[i] = weight;
+  }
+  fclose(from);
+}
+
+/**
+ * Writes the current weights, one per line, to
+ * NS_AUTOCOMPLETE_WEIGHTS_FILE in the user profile directory.
+ * Failures to locate or open the file are ignored (best effort).
+ */
+void
+nsPerceptron::SaveWeights()
+{
+  // Nothing to save; also avoids truncating a previously saved file
+  // when the weight array could not be allocated.
+  if (!mWeights || mNumWeights <= 0)
+    return;
+
+  nsCOMPtr<nsIFile> file;
+  nsresult rv = NS_GetSpecialDirectory(NS_APP_USER_PROFILE_50_DIR, getter_AddRefs(file));
+  if (NS_FAILED(rv) || !file)
+    return;
+
+  nsCOMPtr<nsILocalFile> localFile(do_QueryInterface(file));
+  if (!localFile)
+    return;
+
+  // Same file name that LoadWeights() reads from
+  rv = localFile->AppendNative(NS_LITERAL_CSTRING(NS_AUTOCOMPLETE_WEIGHTS_FILE));
+  if (NS_FAILED(rv))
+    return;
+
+  FILE* to = 0;
+  rv = localFile->OpenANSIFileDesc("w", &to);
+  if (NS_FAILED(rv) || !to)
+    return;
+
+  PRInt32 i;
+  for (i = 0; i < mNumWeights; i++) {
+    fprintf(to, "%.16lf\n", mWeights[i]);
+  }
+  fclose(to);
+}
+
+nsSigmoidPerceptron::nsSigmoidPerceptron(PRInt32 aNumFeatures)
+  : nsPerceptron(aNumFeatures)
+{
+  // All setup is done by the nsPerceptron constructor.
+}
+
+void
+nsSigmoidPerceptron::Train(PRFloat64* aInputs, PRInt32 aNumInputs, PRFloat64 aTargetOutput)
+{
+  // No sigmoid-specific training step here: hand the example straight
+  // to the nsPerceptron implementation.
+  nsPerceptron::Train(aInputs, aNumInputs, aTargetOutput);
+}
+
+void
+nsSigmoidPerceptron::Test(PRFloat64* aInputs, PRInt32 aNumInputs, PRFloat64* aOutput)
+{
+  // First the plain weighted sum from the base class ...
+  nsPerceptron::Test(aInputs, aNumInputs, aOutput);
+
+  // ... then pass that sum through Sigmoid().
+  PRFloat64 weightedSum = *aOutput;
+  *aOutput = Sigmoid(weightedSum);
+}
+
+PRFloat64 nsSigmoidPerceptron::Sigmoid(PRFloat64 aNum)
+{
+  // Returns 1 / (1 + e^(-aNum))
+  PRFloat64 denominator = 1.0 + exp(-aNum);
+  return (1.0 / denominator);
+}
+
+
+//----------------------------------------------------------------------
+
//----------------------------------------------------------------------
//
// CIDs
@@ -512,9 +695,12 @@ nsGlobalHistory::nsGlobalHistory()
mAutocompleteOnlyTyped(PR_FALSE),
mBatchesInProgress(0),
mNowValid(PR_FALSE),
- mDirty(PR_FALSE),
- mEnv(nsnull),
- mStore(nsnull),
+ mDirty(PR_FALSE),
+ mAutoCompleteLearner(nsnull),
+ mACFeatures(nsnull),
+ mURLDataFile(nsnull),
+ mEnv(nsnull),
+ mStore(nsnull),
mTable(nsnull)
{
LL_I2L(mFileSizeOnDisk, 0);
@@ -558,6 +744,9 @@ nsGlobalHistory::~nsGlobalHistory()
NS_IF_RELEASE(kNC_URL);
NS_IF_RELEASE(kNC_HistoryRoot);
NS_IF_RELEASE(kNC_HistoryByDate);
+ NS_IF_RELEASE(kNC_BookmarkAddDate);
+ NS_IF_RELEASE(kNC_Bookmark);
+ NS_IF_RELEASE(kRDF_Type);
NS_IF_RELEASE(gMdbFactory);
NS_IF_RELEASE(gPrefBranch);
@@ -570,6 +759,23 @@ nsGlobalHistory::~nsGlobalHistory()
if (mExpireNowTimer)
mExpireNowTimer->Cancel();
+ if (mURLDataFile) {
+ nsCAutoString dateStr;
+ PRInt64ToChars(PR_Now(), dateStr);
+ fprintf(mURLDataFile, "<shutdown time='%s'/>\n", dateStr.get());
+
+ fclose(mURLDataFile);
+ }
+
+ if (mAutoCompleteLearner) {
+ delete mAutoCompleteLearner;
+ mAutoCompleteLearner = nsnull;
+ }
+
+ if (mACFeatures) {
+ delete [] mACFeatures;
+ mACFeatures = nsnull;
+ }
}
@@ -653,7 +859,7 @@ nsGlobalHistory::AddPageToDatabase(const char *aURL,
// update the database, and get the old info back
PRInt64 oldDate;
PRInt32 oldCount;
- rv = AddExistingPageToDatabase(row, aDate, &oldDate, &oldCount);
+ rv = AddExistingPageToDatabase(row, aURL, aDate, &oldDate, &oldCount);
NS_ASSERTION(NS_SUCCEEDED(rv), "AddExistingPageToDatabase failed; see bug 88961");
if (NS_FAILED(rv)) return rv;
@@ -703,6 +909,7 @@ nsGlobalHistory::AddPageToDatabase(const char *aURL,
nsresult
nsGlobalHistory::AddExistingPageToDatabase(nsIMdbRow *row,
+ const char *aURL,
PRInt64 aDate,
PRInt64 *aOldDate,
PRInt32 *aOldCount)
@@ -729,6 +936,45 @@ nsGlobalHistory::AddExistingPageToDatabase(nsIMdbRow *row,
SetRowValue(row, kToken_LastVisitDateColumn, aDate);
SetRowValue(row, kToken_VisitCountColumn, (*aOldCount) + 1);
+ if (mLearningMode > AUTOCOMPLETE_NO_LEARNING ||
+ mDataCaptureMode > URLDATACAPTURE_NONE) {
+ // Update the two Frequency-Recency metrics
+ PRFloat64 m;
+ PRInt32 ageInDays = GetAgeInDays(NormalizeTime(GetNow()), *aOldDate);
+ rv = GetRowValue(row, kToken_FRFastDecayColumn, &m);
+ if (NS_FAILED(rv)) return rv;
+ m = 1.0 + (PRFloat64) (pow(HISTORY_FAST_DECAY_CONSTANT, (PRFloat64) ageInDays)) * m;
+ SetRowValue(row, kToken_FRFastDecayColumn, m);
+ rv = GetRowValue(row, kToken_FRSlowDecayColumn, &m);
+ if (NS_FAILED(rv)) return rv;
+ m = 1.0 + (PRFloat64) (pow(HISTORY_SLOW_DECAY_CONSTANT, (PRFloat64) ageInDays)) * m;
+ SetRowValue(row, kToken_FRSlowDecayColumn, m);
+ }
+
+ if (mDataCaptureMode > URLDATACAPTURE_NONE && mURLDataFile) {
+ fprintf(mURLDataFile, "<add-existing-url>\n");
+ nsAutoString url = NS_ConvertUTF8toUCS2(aURL);
+ rv = FillInputFeatures(url, mACFeatures);
+ if (NS_SUCCEEDED(rv))
+ WriteURLData(url, mACFeatures);
+ fprintf(mURLDataFile, "</add-existing-url>\n");
+ fflush(mURLDataFile);
+ }
+
+ return NS_OK;
+}
+
+/**
+ * Picks a URL id that no row found by FindRow() currently uses and stores
+ * it in aRow's URLID column.
+ *
+ * @param aRow  the row that receives the new id
+ * @param aID   receives the id that was assigned
+ * @return NS_OK on success, or the failure code from SetRowValue
+ */
+nsresult
+nsGlobalHistory::AssignUniqueURLID(nsIMdbRow *aRow, PRInt64 *aID)
+{
+  NS_ENSURE_ARG_POINTER(aRow);
+  NS_ENSURE_ARG_POINTER(aID);
+
+  nsCOMPtr<nsIMdbRow> oldRow;
+  nsresult rv = NS_OK;
+
+  // Start from the current time and keep incrementing until the lookup
+  // by id no longer finds a row.
+  *aID = PR_Now();
+  do {
+    rv = FindRow(kToken_URLIDColumn, ++(*aID), getter_AddRefs(oldRow));
+  } while (NS_SUCCEEDED(rv));
+
+  // Report a failed write instead of claiming the id was stored
+  rv = SetRowValue(aRow, kToken_URLIDColumn, *aID);
+  if (NS_FAILED(rv)) return rv;
+
return NS_OK;
}
@@ -759,6 +1005,24 @@ nsGlobalHistory::AddNewPageToDatabase(const char *aURL,
SetRowValue(row, kToken_LastVisitDateColumn, aDate);
SetRowValue(row, kToken_FirstVisitDateColumn, aDate);
+ if (mLearningMode > AUTOCOMPLETE_NO_LEARNING ||
+ mDataCaptureMode > URLDATACAPTURE_NONE) {
+ // Initialize the Frequency-Recency metrics
+ SetRowValue(row, kToken_FRFastDecayColumn, (PRFloat64) 1.0);
+ SetRowValue(row, kToken_FRSlowDecayColumn, (PRFloat64) 1.0);
+ }
+
+ if (mDataCaptureMode > URLDATACAPTURE_NONE && mURLDataFile) {
+ fprintf(mURLDataFile, "<add-new-url>\n");
+ nsAutoString url = NS_ConvertUTF8toUCS2(aURL);
+ nsresult rv;
+ rv = FillInputFeatures(url, mACFeatures);
+ if (NS_SUCCEEDED(rv))
+ WriteURLData(url, mACFeatures);
+ fprintf(mURLDataFile, "</add-new-url>\n");
+ fflush(mURLDataFile);
+ }
+
nsCOMPtr<nsIURI> uri;
NS_NewURI(getter_AddRefs(uri), nsDependentCString(aURL), nsnull, nsnull);
nsCAutoString hostname;
@@ -842,6 +1106,22 @@ nsGlobalHistory::SetRowValue(nsIMdbRow *aRow, mdb_column aCol, const PRInt32 aVa
}
nsresult
+nsGlobalHistory::SetRowValue(nsIMdbRow *aRow, mdb_column aCol, PRFloat64 aValue)
+{
+  // The cell is written as the textual form of the number.
+  nsCAutoString text;
+  text.AppendFloat(aValue);
+
+  mdbYarn cell = { (void *)text.get(), text.Length(), text.Length(), 0, 0, nsnull };
+
+  mdb_err status = aRow->AddColumn(mEnv, aCol, &cell);
+  if (status == 0)
+    return NS_OK;
+
+  return NS_ERROR_FAILURE;
+}
+
+nsresult
nsGlobalHistory::GetRowValue(nsIMdbRow *aRow, mdb_column aCol,
nsAString& aResult)
{
@@ -909,6 +1189,30 @@ nsGlobalHistory::GetRowValue(nsIMdbRow *aRow, mdb_column aCol,
nsresult
nsGlobalHistory::GetRowValue(nsIMdbRow *aRow, mdb_column aCol,
+                             PRFloat64 *aResult)
+{
+  // Reads the cell in column aCol of aRow as a floating point number.
+  //
+  // Returns NS_ERROR_FAILURE if the cell cannot be read and
+  // NS_ERROR_CANNOT_CONVERT_DATA if its text does not start with a number.
+  // *aResult is 0 whenever no number could be read, including the case of
+  // a cell without a buffer, for which NS_OK is returned.
+  NS_ENSURE_ARG_POINTER(aResult);
+
+  // Give callers a defined value on every path; several of them use the
+  // result without having initialized it first.
+  *aResult = 0.0;
+
+  mdbYarn yarn;
+  mdb_err err = aRow->AliasCellYarn(mEnv, aCol, &yarn);
+  if (err != 0)
+    return NS_ERROR_FAILURE;
+
+  if (!yarn.mYarn_Buf)
+    return NS_OK;
+
+  // The yarn is delimited by mYarn_Fill; copy it so that PR_strtod sees a
+  // NUL-terminated string and cannot read past the end of the cell.
+  nsCAutoString text;
+  text.Assign((const char *)yarn.mYarn_Buf, yarn.mYarn_Fill);
+
+  char *next = nsnull;
+  PRFloat64 value = PR_strtod(text.get(), &next);
+  if (next == text.get())
+    return NS_ERROR_CANNOT_CONVERT_DATA;
+
+  *aResult = value;
+  return NS_OK;
+}
+
+
+nsresult
+nsGlobalHistory::GetRowValue(nsIMdbRow *aRow, mdb_column aCol,
nsACString& aResult)
{
mdb_err err;
@@ -1240,6 +1544,27 @@ nsGlobalHistory::HidePage(const char *aURL)
rv = SetRowValue(row, kToken_HiddenColumn, 1);
if (NS_FAILED(rv)) return rv;
+ if (mDataCaptureMode > URLDATACAPTURE_NONE && mURLDataFile) {
+ PRInt64 id;
+ nsCAutoString dateStr, IDStr;
+ PRInt64ToChars(PR_Now(), dateStr);
+ GetRowValue(row, kToken_URLIDColumn, &id);
+
+ if (!id) {
+ AssignUniqueURLID(row, &id);
+ }
+
+ PRInt64ToChars(id, IDStr);
+ fprintf(mURLDataFile, "<hide-url id='%s' time='%s'", IDStr.get(), dateStr.get());
+
+ if (mDataCaptureMode == URLDATACAPTURE_WITH_URL_INFO) {
+ fprintf(mURLDataFile, " path='%s'", aURL);
+ }
+
+ fprintf(mURLDataFile, "/>\n");
+ fflush(mURLDataFile);
+ }
+
// now pretend as if this row was deleted
// HasAssertion() correctly checks the Hidden column to show that
// the row is hidden
@@ -1267,10 +1592,92 @@ nsGlobalHistory::MarkPageAsTyped(const char* aURL)
rv = SetRowValue(row, kToken_HiddenColumn, 1);
if (NS_FAILED(rv)) return rv;
+ if (mDataCaptureMode > URLDATACAPTURE_NONE && mURLDataFile) {
+ PRInt64 id;
+ nsCAutoString dateStr, IDStr;
+ PRInt64ToChars(PR_Now(), dateStr);
+ GetRowValue(row, kToken_URLIDColumn, &id);
+
+ if (!id) {
+ AssignUniqueURLID(row, &id);
+ }
+
+ PRInt64ToChars(id, IDStr);
+ fprintf(mURLDataFile, "<typed-url id='%s' time='%s'", IDStr.get(), dateStr.get());
+
+ if (mDataCaptureMode == URLDATACAPTURE_WITH_URL_INFO) {
+ fprintf(mURLDataFile, " path='%s'", aURL);
+ }
+
+ fprintf(mURLDataFile, "/>\n");
+ fflush(mURLDataFile);
+ }
+
return SetRowValue(row, kToken_TypedColumn, 1);
}
+/**
+ * Appends a <referrer-url .../> record for (aURL, aReferrer) to the URL
+ * data capture file. Does nothing unless data capture is enabled and the
+ * capture file is open, and writes nothing when either page is not found
+ * in history.
+ *
+ * @param aURL       a url in global history
+ * @param aReferrer  the referrer url to aURL
+ */
+NS_IMETHODIMP
+nsGlobalHistory::OutputReferrerURL(const char *aURL, const char *aReferrer)
+{
+  if (mDataCaptureMode <= URLDATACAPTURE_NONE || !mURLDataFile)
+    return NS_OK;
+
+  // If history is set to expire after 0 days,
+  // then it's technically disabled. Don't even
+  // bother adding the page
+  if (mExpireDays == 0)
+    return NS_OK;
+
+  NS_ENSURE_ARG_POINTER(aURL);
+  NS_ENSURE_ARG_POINTER(aReferrer);
+  NS_ENSURE_SUCCESS(OpenDB(), NS_ERROR_FAILURE);
+
+  if (!*aURL)
+    return NS_ERROR_INVALID_ARG;
+
+  // Both pages have to be in history already; otherwise there is
+  // nothing to record.
+  nsCOMPtr<nsIMdbRow> urlRow;
+  nsresult rv = FindRow(kToken_URLColumn, aURL, getter_AddRefs(urlRow));
+  if (NS_FAILED(rv))
+    return NS_OK;
+
+  nsCOMPtr<nsIMdbRow> referrerRow;
+  rv = FindRow(kToken_URLColumn, aReferrer, getter_AddRefs(referrerRow));
+  if (NS_FAILED(rv))
+    return NS_OK;
+
+  nsCAutoString dateStr, urlIDStr, referrerIDStr;
+  PRInt64ToChars(PR_Now(), dateStr);
+
+  // Each id gets its own zero-initialized variable so that a failed
+  // GetRowValue can neither leave garbage behind nor let the referrer's
+  // id be reused for the url.
+  PRInt64 referrerID = 0;
+  GetRowValue(referrerRow, kToken_URLIDColumn, &referrerID);
+  if (!referrerID) {
+    AssignUniqueURLID(referrerRow, &referrerID);
+  }
+  PRInt64ToChars(referrerID, referrerIDStr);
+
+  PRInt64 urlID = 0;
+  GetRowValue(urlRow, kToken_URLIDColumn, &urlID);
+  if (!urlID) {
+    AssignUniqueURLID(urlRow, &urlID);
+  }
+  PRInt64ToChars(urlID, urlIDStr);
+
+  fprintf(mURLDataFile, "<referrer-url id='%s' url-id='%s' time='%s'",
+          referrerIDStr.get(), urlIDStr.get(), dateStr.get());
+
+  // The urls themselves are only written in the "with url info" mode
+  if (mDataCaptureMode == URLDATACAPTURE_WITH_URL_INFO) {
+    fprintf(mURLDataFile, " path='%s' url-path='%s'", aReferrer, aURL);
+  }
+
+  fprintf(mURLDataFile, "/>\n");
+  fflush(mURLDataFile);
+
+  return NS_OK;
+}
+
+
//----------------------------------------------------------------------
//
// nsGlobalHistory
@@ -2299,6 +2706,60 @@ nsGlobalHistory::Init()
gPrefBranch->GetIntPref(PREF_BROWSER_HISTORY_EXPIRE_DAYS, &mExpireDays);
gPrefBranch->GetBoolPref(PREF_AUTOCOMPLETE_ONLY_TYPED, &mAutocompleteOnlyTyped);
+ gPrefBranch->GetIntPref(PREF_HISTORY_DATACAPTURE_MODE, &mDataCaptureMode);
+ gPrefBranch->GetIntPref(PREF_AUTOCOMPLETE_LEARNING_MODE, &mLearningMode);
+
+ if (mDataCaptureMode > URLDATACAPTURE_NONE) {
+ nsCOMPtr<nsIFile> file;
+ nsresult rv = NS_GetSpecialDirectory(NS_APP_USER_PROFILE_50_DIR, getter_AddRefs(file));
+
+ if (NS_SUCCEEDED(rv)) {
+ nsCOMPtr<nsILocalFile> localFile(do_QueryInterface(file));
+ localFile->AppendNative(NS_LITERAL_CSTRING("url-data.txt"));
+ localFile->OpenANSIFileDesc("a", &mURLDataFile);
+
+ if (mURLDataFile) {
+ nsCAutoString dateStr;
+ PRInt64ToChars(PR_Now(), dateStr);
+ fprintf(mURLDataFile, "\n<startup time='%s'/>\n", dateStr.get());
+ fflush(mURLDataFile);
+ }
+ else {
+ mDataCaptureMode = URLDATACAPTURE_NONE;
+ mURLDataFile = nsnull;
+ }
+ }
+ else {
+ // Disable data capture
+ mDataCaptureMode = URLDATACAPTURE_NONE;
+ mURLDataFile = nsnull;
+ }
+ }
+ else {
+ mURLDataFile = nsnull;
+ }
+
+ if (mLearningMode > AUTOCOMPLETE_NO_LEARNING ||
+ mDataCaptureMode > URLDATACAPTURE_NONE) {
+ // Create perceptron and feature array.
+ mAutoCompleteLearner = new nsSigmoidPerceptron(AC_NUM_URL_FEATURES);
+ if (mAutoCompleteLearner) {
+ mACFeatures = new PRFloat64[AC_NUM_URL_FEATURES];
+ if (!mACFeatures) {
+ delete mAutoCompleteLearner;
+ mAutoCompleteLearner = nsnull;
+ mLearningMode = AUTOCOMPLETE_NO_LEARNING;
+ }
+ }
+ else {
+ mLearningMode = AUTOCOMPLETE_NO_LEARNING;
+ }
+ }
+ else {
+ mAutoCompleteLearner = nsnull;
+ mACFeatures = nsnull;
+ }
+
nsCOMPtr<nsIPrefBranchInternal> pbi = do_QueryInterface(gPrefBranch);
if (pbi) {
pbi->AddObserver(PREF_AUTOCOMPLETE_ONLY_TYPED, this, PR_FALSE);
@@ -2327,6 +2788,9 @@ nsGlobalHistory::Init()
gRDFService->GetResource(NC_NAMESPACE_URI "URL", &kNC_URL);
gRDFService->GetResource("NC:HistoryRoot", &kNC_HistoryRoot);
gRDFService->GetResource("NC:HistoryByDate", &kNC_HistoryByDate);
+ gRDFService->GetResource(NC_NAMESPACE_URI "BookmarkAddDate", &kNC_BookmarkAddDate);
+ gRDFService->GetResource(NC_NAMESPACE_URI "Bookmark", &kNC_Bookmark);
+ gRDFService->GetResource(RDF_NAMESPACE_URI "type", &kRDF_Type);
}
// register this as a named data source with the RDF service
@@ -2677,6 +3141,15 @@ nsGlobalHistory::CreateTokens()
err = mStore->StringToToken(mEnv, "Typed", &kToken_TypedColumn);
if (err != 0) return NS_ERROR_FAILURE;
+ err = mStore->StringToToken(mEnv, "FRFastDecay", &kToken_FRFastDecayColumn);
+ if (err != 0) return NS_ERROR_FAILURE;
+
+ err = mStore->StringToToken(mEnv, "FRSlowDecay", &kToken_FRSlowDecayColumn);
+ if (err != 0) return NS_ERROR_FAILURE;
+
+ err = mStore->StringToToken(mEnv, "URLID", &kToken_URLIDColumn);
+ if (err != 0) return NS_ERROR_FAILURE;
+
// meta-data tokens
err = mStore->StringToToken(mEnv, "LastPageVisited", &kToken_LastPageVisited);
@@ -2833,11 +3306,58 @@ nsGlobalHistory::FindRow(mdb_column aCol,
if (!hasRow) return NS_ERROR_NOT_AVAILABLE;
*aResult = row;
- (*aResult)->AddRef();
+ NS_ADDREF(*aResult);
return NS_OK;
}
+/**
+ * Looks up the history row whose column aCol holds the decimal text form
+ * of aValue.
+ *
+ * @param aCol     column to search
+ * @param aValue   value to look for
+ * @param aResult  receives an addref'ed row on success
+ * @return NS_ERROR_NOT_INITIALIZED without a store, NS_ERROR_FAILURE when
+ *         a database call fails, NS_ERROR_NOT_AVAILABLE when no row in
+ *         the main table matches
+ */
+nsresult
+nsGlobalHistory::FindRow(mdb_column aCol,
+                         PRInt64 aValue, nsIMdbRow **aResult)
+{
+  NS_ENSURE_ARG_POINTER(aResult);
+
+  if (! mStore)
+    return NS_ERROR_NOT_INITIALIZED;
+
+  mdb_err err;
+  nsCAutoString val;
+  PRInt64ToChars(aValue, val);
+
+  mdbYarn yarn = { (void *)val.get(), val.Length(), val.Length(), 0, 0, nsnull };
+
+  mdbOid rowId;
+  nsCOMPtr<nsIMdbRow> row;
+  err = mStore->FindRow(mEnv, kToken_HistoryRowScope,
+                        aCol, &yarn,
+                        &rowId, getter_AddRefs(row));
+
+  if (err) return NS_ERROR_FAILURE;
+
+  if (!row) return NS_ERROR_NOT_AVAILABLE;
+
+  // make sure it's actually stored in the main table; hasRow starts out
+  // false so that a failing HasRow call cannot leave it undefined
+  mdb_bool hasRow = 0;
+  err = mTable->HasRow(mEnv, row, &hasRow);
+
+  if (err) return NS_ERROR_FAILURE;
+
+  if (!hasRow) return NS_ERROR_NOT_AVAILABLE;
+
+  *aResult = row;
+  NS_ADDREF(*aResult);
+
+  return NS_OK;
+}
+
+nsresult
+nsGlobalHistory::FindRowAndID(mdb_column aCol,const char *aValue,
+                              nsIMdbRow **aResult, PRInt64 *aRowID)
+{
+  // Locate the row first; without it there is no id to read.
+  nsresult lookup = FindRow(aCol, aValue, aResult);
+  if (NS_FAILED(lookup))
+    return lookup;
+
+  // Then fetch the row's URLID column into aRowID.
+  return GetRowValue(*aResult, kToken_URLIDColumn, aRowID);
+}
+
PRBool
nsGlobalHistory::IsURLInHistory(nsIRDFResource* aResource)
{
@@ -3980,12 +4500,564 @@ nsGlobalHistory::OnStopLookup()
return NS_OK;
}
+
+/**
+ *
+ * The input features into the autocomplete perceptron are as follows:
+ *
+ * Feature 1 = Frequency and recency metric for page in history
+ * (domain = positive real numbers)
+ * Value decays fast with age of page
+ * Uses HISTORY_FAST_DECAY_CONSTANT
+ * Feature 2 = Frequency and recency metric for page in history
+ * (high for newer, more accessed pages)
+ * Value decays slowly with age of page
+ * Uses HISTORY_SLOW_DECAY_CONSTANT
+ * Feature 3 = Was the url typed by the user?
+ * (domain = 0 or 1)
+ * Feature 4 = Recency metric for page in bookmarks
+ * (domain = real number between 0 and 1)
+ * Value decays fast with age of bookmark
+ * Uses BOOKMARK_FAST_DECAY_CONSTANT
+ * Feature 5 = Recency metric for page in bookmarks
+ * (domain = real number between 0 and 1)
+ * Value decays slowly with age of bookmark
+ * Uses BOOKMARK_SLOW_DECAY_CONSTANT
+ *
+ * Feature 1 and Feature 2 details:
+ *
+ * As an example, say a page was first seen on Day 1 and accessed from then
+ * until today (Day 4) with the following schedule:
+ *
+ * (Day 1, D times), (Day 2, C times), (Day 3, B times), (Day 4, A times)
+ *
+ * Then, the frequency+recency metric calculation for the page will be:
+ *
+ * FRMetric = A + (B * G) + (C * G^2) + (D * G^3)
+ *
+ * where G is the decay constant that takes values between 0 and 1.
+ * Values close to 1 lead to slow decay with age.
+ * Values close to 0 lead to fast decay with age.
+ *
+ * Feature 4 and Feature 5 only care about recency not frequency.
+ *
+ * So, if a bookmark was added X days earlier,
+ *
+ * Bookmark Feature Value = G^X.
+ *
+ * where G is the decay constant that takes values between 0 and 1.
+ * Values close to 1 lead to slow decay with age.
+ * Values close to 0 lead to fast decay with age.
+ *
+ * The rest of the url related features:
+ *
+ * Feature 6: Whether url ends in .htm or .html
+ * Feature 7: Is it a .com URL?
+ * Feature 8: Is it a .edu URL?
+ * Feature 9: Is it a .org URL?
+ * Feature 10: Is it a .net URL?
+ * Feature 11: Is it a .gov URL?
+ * Feature 12: Does the URL contain a ~ ?
+ * Feature 13: Does the URL start with http:// ?
+ * Feature 14: Does the URL start with ftp:// ?
+ * Feature 15: Does the URL start with file:// ?
+ * Feature 16: Does the URL start with gopher:// ?
+ * Feature 17: Does the URL start with https:// ?
+ * Feature 18: Does the host name end in a two letter country code?
+ * Feature 19: Number of /s in the URL.
+ * Feature 20: Number of ?s in the URL.
+ * Feature 21: Number of &s in the URL.
+ * Feature 22: Number of =s in the URL.
+ * Feature 23: Number of #s in the URL.
+ * Feature 24: Number of +s in the URL.
+ * Feature 25: Number of .s in the URL.
+ * Feature 26: Number of numerical [0-9] characters in the URL
+ * Feature 27: Number of alphabetical [a-zA-Z] characters in the URL
+ * Feature 28: Number of non-alphanumeric, non-[/?&=#+.] characters in the URL
+ * Feature 29: Number of .s in the hostname
+ * Feature 30: Number of numerical [0-9] characters in the hostname
+ * Feature 31: Number of alphabetical [a-zA-Z] characters in the hostname
+ * Feature 32: Number of non-alphanumeric, non-[/?&=#+.] characters in the hostname
+ * Feature 33: Number of .s in the hostname if we omit initial "www." or "ftp." (if any)
+ * Feature 34: Number of .s in the hostname if we omit ending ".XX" country code (if any)
+ * Feature 35: Number of .s in the hostname if we omit initial "www." or "ftp."
+ * and ending ".XX" country code (if any)
+ * Feature 36: Number of characters in URL
+ * Feature 37: Number of characters in hostname
+ * Feature 38: Number of characters in hostname excluding initial "www." or "ftp."
+ * Feature 39: Number of characters in URL excluding hostname
+ * Feature 40: Number of characters in web page title
+ * Feature 41: Is this a google search url?
+ * Feature 42: Is this a netscape search url?
+ * Feature 43: Is this a yahoo search url?
+ * Feature 44: Dummy input hardcoded to 1
+ */
+
+nsresult
+nsGlobalHistory::FillInputFeatures(nsAString &aUrl,
+ PRFloat64 *aFeatures)
+{
+ nsCOMPtr<nsIMdbRow> row;
+ nsresult rv = NS_OK;
+ PRInt32 ageInDays;
+ PRInt64 lastDate;
+ static nsCOMPtr<nsIBookmarksService> bs =
+ do_GetService(NS_BOOKMARKS_SERVICE_CONTRACTID, &rv);
+
+ nsCOMPtr<nsIURI> uri;
+ nsCAutoString curl, chost, cpath;
+ rv = NS_NewURI(getter_AddRefs(uri), aUrl);
+ if (NS_SUCCEEDED(rv) && uri) {
+ uri->GetSpec(curl);
+ uri->GetHost(chost);
+ uri->GetPath(cpath);
+ }
+ nsAutoString url(NS_ConvertUTF8toUCS2(curl).get());
+ nsAutoString path(NS_ConvertUTF8toUCS2(cpath).get());
+ nsAutoString host(NS_ConvertUTF8toUCS2(chost).get());
+ ToLowerCase(url);
+ ToLowerCase(host);
+ ToLowerCase(path);
+
+ // Calculate the input features for this training example.
+ rv = FindRow(kToken_URLColumn, curl.get(),
+ getter_AddRefs(row));
+ if (NS_FAILED(rv)) return rv;
+
+ // First, get the page in history related input features
+ rv = GetRowValue(row, kToken_FRFastDecayColumn, &aFeatures[0]);
+ if (NS_FAILED(rv)) return rv;
+
+ rv = GetRowValue(row, kToken_LastVisitDateColumn, &lastDate);
+ if (NS_FAILED(rv)) return rv;
+
+ ageInDays = GetAgeInDays(NormalizeTime(GetNow()), lastDate);
+
+ aFeatures[0] *= pow(HISTORY_FAST_DECAY_CONSTANT, (PRFloat64) ageInDays);
+
+ rv = GetRowValue(row, kToken_FRSlowDecayColumn, &aFeatures[1]);
+ if (NS_FAILED(rv)) return rv;
+
+ aFeatures[1] *= pow(HISTORY_SLOW_DECAY_CONSTANT, (PRFloat64) ageInDays);
+
+ aFeatures[2] = HasCell(mEnv, row, kToken_TypedColumn);
+
+ // Second, calculate the bookmark related input features.
+ aFeatures[3] = aFeatures[4] = 0;
+ if (bs) {
+ PRBool bookmarked;
+ rv = bs->IsBookmarked(curl.get(), &bookmarked);
+ if (NS_SUCCEEDED(rv) && bookmarked) {
+ // Get the date when the bookmark was added.
+ PRInt64 addDate;
+ nsCOMPtr<nsIRDFResource> rdfRes;
+
+ if (NS_SUCCEEDED(rv = gRDFService->GetResource(curl.get(),
+ getter_AddRefs(rdfRes)))) {
+ nsCOMPtr<nsIRDFDataSource> bookmarkDS = do_QueryInterface(bs, &rv);
+ if (NS_SUCCEEDED(rv) && bookmarkDS) {
+ nsCOMPtr<nsIRDFNode> nodeType;
+ rv = bookmarkDS->GetTarget(rdfRes, kRDF_Type, PR_TRUE,
+ getter_AddRefs(nodeType));
+ if (NS_SUCCEEDED(rv)) {
+ if (nodeType == kNC_Bookmark) {
+ nsCOMPtr<nsIRDFNode> node;
+ rv = bookmarkDS->GetTarget(rdfRes, kNC_BookmarkAddDate, PR_TRUE,
+ getter_AddRefs(node));
+ if (rv != NS_RDF_NO_VALUE && node) {
+ nsCOMPtr<nsIRDFDate> rdfDate = do_QueryInterface(node, &rv);
+ if (NS_SUCCEEDED(rv) && rdfDate) {
+ rv = rdfDate->GetValue(&addDate);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ if (NS_SUCCEEDED(rv)) {
+ ageInDays = GetAgeInDays(NormalizeTime(GetNow()), addDate);
+ aFeatures[3] = pow(BOOKMARK_FAST_DECAY_CONSTANT, ageInDays);
+ aFeatures[4] = pow(BOOKMARK_SLOW_DECAY_CONSTANT, ageInDays);
+ }
+ }
+ }
+
+ // Feature 6: Whether url ends in .htm or .html
+ nsAString::const_iterator start, end;
+
+ path.BeginReading(start);
+ path.EndReading(end);
+ aFeatures[5] = FindInReadable(NS_LITERAL_STRING(".htm"), start, end);
+
+ // Feature 7: Is it a .com URL?
+ host.BeginReading(start);
+ host.EndReading(end);
+ aFeatures[6] = FindInReadable(NS_LITERAL_STRING(".com"), start, end);
+
+ // Feature 8: Is it a .edu URL?
+ host.BeginReading(start);
+ host.EndReading(end);
+ aFeatures[7] = FindInReadable(NS_LITERAL_STRING(".edu"), start, end);
+
+ // Feature 9: Is it a .org URL?
+ host.BeginReading(start);
+ host.EndReading(end);
+ aFeatures[8] = FindInReadable(NS_LITERAL_STRING(".org"), start, end);
+
+ // Feature 10: Is it a .net URL?
+ host.BeginReading(start);
+ host.EndReading(end);
+ aFeatures[9] = FindInReadable(NS_LITERAL_STRING(".net"), start, end);
+
+ // Feature 11: Is it a .gov URL?
+ host.BeginReading(start);
+ host.EndReading(end);
+ aFeatures[10] = FindInReadable(NS_LITERAL_STRING(".gov"), start, end);
+
+ // Feature 12: Does the URL contain a ~ ?
+ path.BeginReading(start);
+ path.EndReading(end);
+ aFeatures[11] = FindInReadable(NS_LITERAL_STRING("~"), start, end);
+
+ // Feature 13: Does the URL start with http:// ?
+ PRBool isScheme;
+ aFeatures[12] = aFeatures[13] = aFeatures[14] = aFeatures[15] = aFeatures[16] = 0;
+ if (NS_SUCCEEDED(uri->SchemeIs("http", &isScheme))) {
+ aFeatures[12] = isScheme;
+ }
+ // Feature 14: Does the URL start with ftp:// ?
+ else if (NS_SUCCEEDED(uri->SchemeIs("ftp", &isScheme))) {
+ aFeatures[13] = isScheme;
+ }
+ // Feature 15: Does the URL start with file:// ?
+ else if (NS_SUCCEEDED(uri->SchemeIs("file", &isScheme))) {
+ aFeatures[14] = isScheme;
+ }
+ // Feature 16: Does the URL start with gopher:// ?
+ else if (NS_SUCCEEDED(uri->SchemeIs("gopher", &isScheme))) {
+ aFeatures[15] = isScheme;
+ }
+ // Feature 17: Does the URL start with https:// ?
+ else if (NS_SUCCEEDED(uri->SchemeIs("https", &isScheme))) {
+ aFeatures[16] = isScheme;
+ }
+
+ // Feature 18: Does the host name end in a two letter country code?
+ PRInt32 hostLength = host.Length();
+ if (host[hostLength - 1] == '.') {
+ // Skip trailing dots in hostname if it exists. This will catch cases like
+ // http://www.state.ca.us./state/portal/myca_homepage.jsp
+ aFeatures[17] = (host.RFindChar('.', hostLength - 2) == (hostLength - 4));
+ }
+ else {
+ aFeatures[17] = (host.RFindChar('.') == ((hostLength - 1) - 2));
+ }
+
+ // Feature 19: Number of /s in the URL.
+ aFeatures[18] = 0;
+ // Feature 20: Number of ?s in the URL.
+ aFeatures[19] = 0;
+ // Feature 21: Number of &s in the URL.
+ aFeatures[20] = 0;
+ // Feature 22: Number of =s in the URL.
+ aFeatures[21] = 0;
+ // Feature 23: Number of #s in the URL.
+ aFeatures[22] = 0;
+ // Feature 24: Number of +s in the URL.
+ aFeatures[23] = 0;
+ // Feature 25: Number of .s in the URL.
+ aFeatures[24] = 0;
+ // Feature 26: Number of numerical [0-9] characters in the URL
+ aFeatures[25] = 0;
+ // Feature 27: Number of alphabetical [a-zA-Z] characters in the URL
+ aFeatures[26] = 0;
+ // Feature 28: Number of non-alphanumeric, non-[/?&=#+.] characters in the URL
+ aFeatures[27] = 0;
+
+ url.BeginReading(start);
+ url.EndReading(end);
+
+ PRUint32 size, i;
+ for ( ; start != end; start.advance(size)) {
+ const PRUnichar* buf = start.get();
+ size = start.size_forward();
+
+ // fragment at 'buf' is 'size' characters long
+ for (i = 0; i < size; *buf++, i++) {
+ switch (*buf) {
+ case '/':
+ ++aFeatures[18];
+ break;
+
+ case '?':
+ ++aFeatures[19];
+ break;
+
+ case '&':
+ ++aFeatures[20];
+ break;
+
+ case '=':
+ ++aFeatures[21];
+ break;
+
+ case '#':
+ ++aFeatures[22];
+ break;
+
+ case '+':
+ ++aFeatures[23];
+ break;
+
+ case '.':
+ ++aFeatures[24];
+ break;
+
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9':
+ ++aFeatures[25];
+ break;
+
+ default:
+ if (isalpha(*buf))
+ ++aFeatures[26];
+ else
+ ++aFeatures[27];
+ }
+ }
+ }
+
+ // Calculate a bunch of hostname related features.
+
+ // Feature 29: Number of .s in the hostname
+ aFeatures[28] = 0;
+ // Feature 30: Number of numerical [0-9] characters in the hostname
+ aFeatures[29] = 0;
+ // Feature 31: Number of alphabetical [a-zA-Z] characters in the hostname
+ aFeatures[30] = 0;
+ // Feature 32: Number of non-alphanumeric, non-[.] characters in the hostname
+ aFeatures[31] = 0;
+
+ size = chost.Length();
+ for (i = 0; i < size; i++) {
+ switch (chost[i]) {
+ case '.':
+ ++aFeatures[28];
+ break;
+
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9':
+ ++aFeatures[29];
+ break;
+
+ default:
+ if (isalpha(chost[i]))
+ ++aFeatures[30];
+ else
+ ++aFeatures[31];
+ }
+ }
+
+ // Feature 33: Number of .s in the hostname if we omit initial "www." or "ftp." (if any)
+ aFeatures[32] = aFeatures[28];
+ // Feature 34: Number of .s in the hostname if we omit ending ".XX" country code (if any)
+ aFeatures[33] = aFeatures[28];
+ // Feature 35: Number of .s in the hostname if we omit initial "www." or "ftp"
+ // and ending ".XX" country code (if any)
+ aFeatures[34] = aFeatures[28];
+ // Feature 36: Number of characters in hostname
+ aFeatures[35] = chost.Length();
+ // Feature 37: Number of characters in hostname excluding initial "www." or "ftp."
+ aFeatures[36] = aFeatures[35];
+
+ if (chost.Find("www.") == 0 || chost.Find("ftp.") == 0) {
+ --aFeatures[32];
+ --aFeatures[34];
+ aFeatures[36] -= 4;
+ }
+
+ if (aFeatures[17]) {
+ --aFeatures[33];
+ --aFeatures[34];
+ }
+
+ // Feature 38: Number of characters in URL
+ aFeatures[37] = url.Length();
+
+ // Feature 39: Number of characters in URL excluding hostname
+ aFeatures[38] = aFeatures[37] - aFeatures[35];
+
+ // Feature 40: Number of characters in web page title
+ nsAutoString title;
+ rv = GetRowValue(row, kToken_NameColumn, title);
+ if (NS_FAILED(rv)) return rv;
+ aFeatures[39] = title.Length();
+
+ // Feature 41: Is this a google search url?
+ url.BeginReading(start);
+ url.EndReading(end);
+ aFeatures[40] = FindInReadable(NS_LITERAL_STRING("http://www.google.com/search"), start, end);
+
+ // Feature 42: Is this a netscape search url?
+ url.BeginReading(start);
+ url.EndReading(end);
+ aFeatures[41] = FindInReadable(NS_LITERAL_STRING("http://search.netscape.com/nscp_results.adp"), start, end);
+
+ // Feature 43: Is this a yahoo search url?
+ url.BeginReading(start);
+ url.EndReading(end);
+ aFeatures[42] = FindInReadable(NS_LITERAL_STRING("http://search.yahoo.com/bin/search"), start, end);
+
+ // Feature 44: This is a dummy input hardcoded to 1. It allows
+ // the perceptron to represent functions that do not pass through the
+ // origin.
+ aFeatures[43] = 1;
+
+ return rv;
+}
+
+// Appends one <url> ... </url> record for aURL to mURLDataFile.
+//
+// The record carries the URL's numeric id (assigned on demand when the
+// history row does not have one yet), the URL text itself only when the
+// capture mode is URLDATACAPTURE_WITH_URL_INFO, the current time, and the
+// AC_NUM_URL_FEATURES values of aURLFeatures as a comma separated list.
+//
+// Returns NS_ERROR_FAILURE when there is no capture file or no feature
+// array, the error from FindRowAndID() when the URL cannot be looked up,
+// and NS_OK otherwise.
+//
+// NOTE(review): the URL is written verbatim between single quotes; a URL
+// containing a quote character would break the attribute. Confirm that the
+// consumers of this file tolerate that.
+nsresult
+nsGlobalHistory::WriteURLData(nsAString& aURL, PRFloat64* aURLFeatures)
+{
+  if (!mURLDataFile || !aURLFeatures)
+    return NS_ERROR_FAILURE;
+
+  // The UTF-8 form of the URL is needed both for the row lookup and for
+  // the optional path attribute, so convert it a single time.
+  NS_ConvertUCS2toUTF8 urlUTF8(aURL);
+
+  // Locate the history row for this training example and its unique id.
+  nsCOMPtr<nsIMdbRow> historyRow;
+  PRInt64 urlID;
+  nsresult rv = FindRowAndID(kToken_URLColumn, urlUTF8.get(),
+                             getter_AddRefs(historyRow), &urlID);
+  if (NS_FAILED(rv))
+    return rv;
+
+  // An id of zero means the row has never been given one.
+  if (!urlID)
+    AssignUniqueURLID(historyRow, &urlID);
+
+  // Opening tag: id, optional path, timestamp.
+  nsCAutoString idText;
+  PRInt64ToChars(urlID, idText);
+  fprintf(mURLDataFile, "<url id='%s'", idText.get());
+
+  if (mDataCaptureMode == URLDATACAPTURE_WITH_URL_INFO)
+    fprintf(mURLDataFile, " path='%s'", urlUTF8.get());
+
+  nsCAutoString timeText;
+  PRInt64ToChars(PR_Now(), timeText);
+  fprintf(mURLDataFile, " time='%s'>\n", timeText.get());
+
+  // Every feature but the last is followed by ", "; the last one closes
+  // the record.
+  PRInt32 feature = 0;
+  while (feature < AC_NUM_URL_FEATURES - 1) {
+    fprintf(mURLDataFile, "%.2f, ", aURLFeatures[feature]);
+    ++feature;
+  }
+  fprintf(mURLDataFile, "%.2f\n</url>\n", aURLFeatures[feature]);
+
+  return NS_OK;
+}
+
NS_IMETHODIMP
nsGlobalHistory::OnAutoComplete(const PRUnichar *searchString,
nsIAutoCompleteResults *previousSearchResult,
nsIAutoCompleteListener *listener)
-{
- return NS_OK;
+{
+ nsCOMPtr<nsISupportsArray> results;
+ nsCOMPtr<nsIAutoCompleteItem> item;
+ PRUint32 count = 0;
+ nsAutoString value;
+ PRBool found = PR_FALSE;
+ PRUint32 i;
+ nsresult rv = NS_OK;
+
+ if (mLearningMode == AUTOCOMPLETE_NO_LEARNING &&
+ mDataCaptureMode == URLDATACAPTURE_NONE)
+ return rv;
+
+ // See if searchString exists in the previous search results.
+ if (previousSearchResult) {
+ rv = previousSearchResult->GetItems(getter_AddRefs(results));
+ if (NS_FAILED(rv)) return rv;
+
+ if (results)
+ results->Count(&count);
+ }
+
+ for (i = 0; i < count; i++) {
+ rv = results->GetElementAt(i, getter_AddRefs(item));
+ if (NS_FAILED(rv)) return rv;
+
+ item->GetValue(value);
+ if (value.Equals(nsDependentString(searchString))) {
+ found = PR_TRUE;
+ break;
+ }
+ }
+
+ // If searchString found in the previous search results, assume
+ // that the user selected that url (searchString) from the previous
+ // list of autocomplete results.
+ if (found) {
+ // Train the sigmoid perceptron
+ previousSearchResult->GetItems(getter_AddRefs(results));
+ results->Count(&count);
+
+ if (mDataCaptureMode >= URLDATACAPTURE_WITHOUT_URL_INFO && mURLDataFile) {
+ nsCAutoString nowStr;
+ nsCOMPtr<nsIMdbRow> row;
+ PRInt64 rowID;
+
+ PRInt64ToChars(PR_Now(), nowStr);
+ fprintf(mURLDataFile, "<autocomplete time='%s'", nowStr.get());
+
+ if (mDataCaptureMode == URLDATACAPTURE_WITH_URL_INFO) {
+ fprintf(mURLDataFile, " url='%s'",
+ NS_ConvertUCS2toUTF8(searchString).get());
+ }
+
+ if (NS_SUCCEEDED(FindRowAndID(kToken_URLColumn,
+ NS_ConvertUCS2toUTF8(searchString).get(),
+ getter_AddRefs(row), &rowID))) {
+ if (!rowID) {
+ AssignUniqueURLID(row, &rowID);
+ }
+ nsCAutoString IDStr;
+ PRInt64ToChars(rowID, IDStr);
+ fprintf(mURLDataFile, " url-id='%s'", IDStr.get());
+ }
+
+ fprintf(mURLDataFile, ">\n");
+ }
+
+ for (i = 0; i < count; i++) {
+ results->GetElementAt(i, getter_AddRefs(item));
+ item->GetValue(value);
+
+ rv = FillInputFeatures(value, mACFeatures);
+
+ if (NS_SUCCEEDED(rv) &&
+ mDataCaptureMode >= URLDATACAPTURE_WITHOUT_URL_INFO) {
+ WriteURLData(value, mACFeatures);
+ }
+
+ if (NS_SUCCEEDED(rv) && mLearningMode >= AUTOCOMPLETE_ENABLE_TRAINING) {
+ if (value.Equals(nsDependentString(searchString)))
+ mAutoCompleteLearner->Train(mACFeatures, AC_NUM_URL_FEATURES, 1);
+ else
+ mAutoCompleteLearner->Train(mACFeatures, AC_NUM_URL_FEATURES, 0);
+ }
+ }
+
+ if (mDataCaptureMode >= URLDATACAPTURE_WITHOUT_URL_INFO && mURLDataFile) {
+ fprintf(mURLDataFile, "</autocomplete>\n");
+ fflush(mURLDataFile);
+ }
+
+ if (mLearningMode >= AUTOCOMPLETE_ENABLE_TRAINING) {
+ mAutoCompleteLearner->SaveWeights();
+ }
+ }
+
+ return rv;
}
//----------------------------------------------------------------------
@@ -4214,6 +5286,26 @@ nsGlobalHistory::AutoCompleteCompare(nsAString& aHistoryURL,
return Substring(aHistoryURL, 0, aUserURL.Length()).Equals(aUserURL);
}
+
+// Prefixes the comment field of the autocomplete item with an asterisk
+// if there is no asterisk there already. This is a quick hack to show
+// that this item (url) was selected by the perceptron as one that is likely
+// to be selected by the user.
+//
+// An item whose comment cannot be read is left untouched; an item with no
+// comment at all gets a comment consisting of just the asterisk.
+static void
+PrefixItemWithAsterisk(nsIAutoCompleteItem *aItem)
+{
+  if (!aItem)
+    return;
+
+  // Start from a null pointer so that a getter which fails, or which hands
+  // back nothing, never leaves us holding an uninitialized pointer.
+  PRUnichar *comment = 0;
+  if (NS_FAILED(aItem->GetComment(&comment)))
+    return;
+
+  nsAutoString commentStr;
+  if (comment)
+    commentStr.Assign(comment);
+
+  NS_NAMED_LITERAL_STRING(asterisk, "*");
+  if (!Substring(commentStr, 0, 1).Equals(asterisk)) {
+    aItem->SetComment(PromiseFlatString(asterisk + commentStr).get());
+  }
+
+  // The getter returned a buffer we own; release it.
+  if (comment)
+    nsMemory::Free(comment);
+}
+
int PR_CALLBACK
nsGlobalHistory::AutoCompleteSortComparison(const void *v1, const void *v2,
void *closureVoid)
@@ -4249,6 +5341,30 @@ nsGlobalHistory::AutoCompleteSortComparison(const void *v1, const void *v2,
item1->GetValue(url1);
item2->GetValue(url2);
+ if (closure->history->mLearningMode == AUTOCOMPLETE_AFFECT_URL_LIST) {
+ // If the sigmoid perceptron thinks that the user will select the url,
+ // prefix an asterisk to the comment title.
+ PRFloat64* features = closure->history->mACFeatures;
+ PRFloat64 output = 0;
+ nsresult rv = NS_OK;
+
+ rv = closure->history->FillInputFeatures(url1, &features[0]);
+ if (NS_SUCCEEDED(rv)) {
+ closure->history->mAutoCompleteLearner->Test(features,
+ AC_NUM_URL_FEATURES, &output);
+ if (output >= 0.9)
+ PrefixItemWithAsterisk(item1);
+ }
+
+ rv = closure->history->FillInputFeatures(url2, &features[0]);
+ if (NS_SUCCEEDED(rv)) {
+ closure->history->mAutoCompleteLearner->Test(features,
+ AC_NUM_URL_FEATURES, &output);
+ if (output >= 0.9)
+ PrefixItemWithAsterisk(item2);
+ }
+ }
+
// Favour websites and webpaths more than webpages by boosting
// their visit counts. This assumes that URLs have been normalized,
// appending a trailing '/'.
View
81 xpfe/components/history/src/nsGlobalHistory.h
@@ -108,6 +108,56 @@ class searchTerm;
// Size of visit count boost to give to urls which are sites or paths
#define AUTOCOMPLETE_NONPAGE_VISIT_COUNT_BOOST 5
+
+//----------------------------------------------------------------------
+// Perceptron definitions
+// XXX The class definitions need to go in a separate header file
+
+// Base class of the learners used to rank autocomplete results.
+//
+// An instance owns an array of mNumWeights weights. The weights are saved
+// (SaveWeights) when the object is destroyed; where they are stored is
+// decided by the implementation, which is not part of this header.
+class nsPerceptron
+{
+private:
+  // A perceptron must be created with a feature count.
+  nsPerceptron();
+
+  // The object owns mWeights and frees it in the destructor, so a
+  // member-wise copy would free the same array twice. Copying is therefore
+  // disabled: declared private and intentionally left unimplemented.
+  nsPerceptron(const nsPerceptron&);
+  nsPerceptron& operator=(const nsPerceptron&);
+
+public:
+  nsPerceptron(PRInt32 aNumFeatures);
+
+  // Saves the current weights, then releases the weight array.
+  virtual ~nsPerceptron()
+  {
+    SaveWeights();
+    delete [] mWeights;  // deleting a null pointer is a no-op
+    mWeights = 0;
+    mNumWeights = 0;
+  }
+
+  // Presents one training example: aInputs holds aNumInputs feature values
+  // and aTargetOutput is the output wanted for them.
+  virtual void Train(PRFloat64* aInputs, PRInt32 aNumInputs, PRFloat64 aTargetOutput);
+
+  // Evaluates aInputs (aNumInputs feature values) and stores the result in
+  // *aOutput.
+  virtual void Test (PRFloat64* aInputs, PRInt32 aNumInputs, PRFloat64* aOutput);
+
+  void SaveWeights();
+
+protected:
+
+  void LoadWeights();
+
+  PRFloat64* mWeights; // array of weights
+  PRInt32 mNumWeights; // number of entries in mWeights
+};
+
+// nsPerceptron variant that overrides Train() and Test() and adds a
+// Sigmoid() helper. nsGlobalHistory keeps one instance of this class as its
+// autocomplete learner.
+class nsSigmoidPerceptron : public nsPerceptron
+{
+public:
+  nsSigmoidPerceptron(PRInt32 aNumFeatures);
+  virtual ~nsSigmoidPerceptron() {}
+
+  // Same contract as the nsPerceptron methods they override: aInputs holds
+  // aNumInputs feature values; Train() is given the wanted output and
+  // Test() stores the computed output in *aOutput.
+  virtual void Train(PRFloat64* aInputs, PRInt32 aNumInputs, PRFloat64 aTargetOutput);
+  virtual void Test(PRFloat64* aInputs, PRInt32 aNumInputs, PRFloat64* aOutput);
+
+protected:
+  // NOTE(review): presumably the sigmoid squashing function applied to the
+  // weighted sum -- confirm against the implementation.
+  PRFloat64 Sigmoid(PRFloat64 aNum);
+
+private:
+  // Not available: a feature count is required at construction.
+  nsSigmoidPerceptron();
+};
+
+//----------------------------------------------------------------------
+
+
//----------------------------------------------------------------------
//
// nsGlobalHistory
@@ -269,6 +319,19 @@ class nsGlobalHistory : nsSupportsWeakReference,
nsresult NotifyUnassert(nsIRDFResource* aSource, nsIRDFResource* aProperty, nsIRDFNode* aValue);
nsresult NotifyChange(nsIRDFResource* aSource, nsIRDFResource* aProperty, nsIRDFNode* aOldValue, nsIRDFNode* aNewValue);
+ // Autocomplete learning related
+ PRInt32 mDataCaptureMode;
+ PRInt32 mLearningMode;
+ // The learning engine used to learn a user's autocomplete behavior
+ nsSigmoidPerceptron* mAutoCompleteLearner;
+ PRFloat64* mACFeatures;
+ nsresult FillInputFeatures(nsAString &aUrl, PRFloat64 *aFeatures);
+
+ // URL data capture related
+ FILE* mURLDataFile;
+ nsresult WriteURLData(nsAString& aURL, PRFloat64* aURLFeatures);
+ nsresult AssignUniqueURLID(nsIMdbRow *aRow, PRInt64 *aID);
+
//
// row-oriented stuff
//
@@ -293,6 +356,15 @@ class nsGlobalHistory : nsSupportsWeakReference,
mdb_column kToken_HiddenColumn;
mdb_column kToken_TypedColumn;
+ // Frequency-Recency metrics for url
+ mdb_column kToken_FRFastDecayColumn;
+ mdb_column kToken_FRSlowDecayColumn;
+
+ // Unique ID of url. Needed to identify urls output to
+ // mURLDataFile when the data capture mode doesn't allow the
+ // url path to be output
+ mdb_column kToken_URLIDColumn;
+
// meta-data tokens
mdb_column kToken_LastPageVisited;
@@ -302,6 +374,7 @@ class nsGlobalHistory : nsSupportsWeakReference,
nsresult AddPageToDatabase(const char *aURL,
PRInt64 aDate);
nsresult AddExistingPageToDatabase(nsIMdbRow *row,
+ const char *aURL,
PRInt64 aDate,
PRInt64 *aOldDate,
PRInt32 *aOldCount);
@@ -315,13 +388,18 @@ class nsGlobalHistory : nsSupportsWeakReference,
nsresult SetRowValue(nsIMdbRow *aRow, mdb_column aCol, const PRInt32 aValue);
nsresult SetRowValue(nsIMdbRow *aRow, mdb_column aCol, const char *aValue);
nsresult SetRowValue(nsIMdbRow *aRow, mdb_column aCol, const PRUnichar *aValue);
+ nsresult SetRowValue(nsIMdbRow *aRow, mdb_column aCol, PRFloat64 aValue);
nsresult GetRowValue(nsIMdbRow *aRow, mdb_column aCol, nsAString& aResult);
nsresult GetRowValue(nsIMdbRow *aRow, mdb_column aCol, nsACString& aResult);
nsresult GetRowValue(nsIMdbRow *aRow, mdb_column aCol, PRInt64* aResult);
nsresult GetRowValue(nsIMdbRow *aRow, mdb_column aCol, PRInt32* aResult);
+ nsresult GetRowValue(nsIMdbRow *aRow, mdb_column aCol, PRFloat64* aResult);
nsresult FindRow(mdb_column aCol, const char *aURL, nsIMdbRow **aResult);
+ nsresult FindRow(mdb_column aCol, PRInt64 aValue, nsIMdbRow **aResult);
+ nsresult FindRowAndID(mdb_column aCol, const char *aURL,
+ nsIMdbRow **aResult, PRInt64 *aRowID);
//
// misc unrelated stuff
@@ -346,6 +424,9 @@ class nsGlobalHistory : nsSupportsWeakReference,
static nsIRDFResource* kNC_URL; // XXX do we need?
static nsIRDFResource* kNC_HistoryRoot;
static nsIRDFResource* kNC_HistoryByDate;
+ static nsIRDFResource* kNC_BookmarkAddDate;
+ static nsIRDFResource* kNC_Bookmark;
+ static nsIRDFResource* kRDF_Type;
static nsIMdbFactory* gMdbFactory;
static nsIPrefBranch* gPrefBranch;
Please sign in to comment.
Something went wrong with that request. Please try again.