Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

Ongoing work for bug 182366. Use machine learning techniques to sort …

…autocomplete results. r=heikki. sr=hewitt.
  • Loading branch information...
commit 7b38fab678d949ecaaac833ff9d367d043ae987c 1 parent f0c4a3b
nisheeth%netscape.com authored
View
23 docshell/base/nsDocShell.cpp
@@ -4186,7 +4186,7 @@ nsDocShell::OnStateChange(nsIWebProgress * aProgress, nsIRequest * aRequest,
// Add the original url to global History so that
// visited url color changes happen.
if (uri)
- AddToGlobalHistory(uri, PR_TRUE);
+ AddToGlobalHistory(channel, uri, PR_TRUE);
} // channel
} // aProgress
}
@@ -5829,7 +5829,7 @@ nsDocShell::OnNewURI(nsIURI * aURI, nsIChannel * aChannel,
}
// Update Global history
- AddToGlobalHistory(aURI, IsFrame());
+ AddToGlobalHistory(aChannel, aURI, IsFrame());
}
// If this was a history load, update the index in
@@ -6368,8 +6368,9 @@ NS_IMETHODIMP nsDocShell::MakeEditable(PRBool inWaitForUriLoad)
}
nsresult
-nsDocShell::AddToGlobalHistory(nsIURI * aURI, PRBool aHidden)
+nsDocShell::AddToGlobalHistory(nsIChannel* aChannel, nsIURI * aURI, PRBool aHidden)
{
+ nsresult rv;
// first check if we should be adding it
PRBool updateHistory;
ShouldAddToGlobalHistory(aURI, &updateHistory);
@@ -6382,6 +6383,22 @@ nsDocShell::AddToGlobalHistory(nsIURI * aURI, PRBool aHidden)
NS_ENSURE_SUCCESS(mGlobalHistory->AddPage(spec.get()), NS_ERROR_FAILURE);
+ nsCOMPtr<nsIHttpChannel> httpChannel(do_QueryInterface(aChannel, &rv));
+ if (NS_SUCCEEDED(rv)) {
+ nsCOMPtr<nsIURI> referrer;
+ rv = httpChannel->GetReferrer(getter_AddRefs(referrer));
+ if (NS_SUCCEEDED(rv) && referrer) {
+ nsCAutoString referrerSpec;
+ rv = referrer->GetSpec(referrerSpec);
+ if (NS_SUCCEEDED(rv)) {
+ nsCOMPtr<nsIBrowserHistory> browserHistory =
+ do_QueryInterface(mGlobalHistory);
+ browserHistory->OutputReferrerURL(spec.get(),
+ referrerSpec.get());
+ }
+ }
+ }
+
// this is a redirect, so hide the page from
// being enumerated in history
if (aHidden) {
View
2  docshell/base/nsDocShell.h
@@ -235,7 +235,7 @@ friend class nsDSURIContentListener;
// Global History
nsresult ShouldAddToGlobalHistory(nsIURI * aURI, PRBool * aShouldAdd);
- nsresult AddToGlobalHistory(nsIURI * aURI, PRBool );
+ nsresult AddToGlobalHistory(nsIChannel* aChannel, nsIURI * aURI, PRBool aHidden);
// Helper Routines
NS_IMETHOD GetPromptAndStringBundle(nsIPrompt ** aPrompt,
View
2  xpcom/reflect/xptinfo/src/xptiInterfaceInfoManager.cpp
@@ -260,7 +260,7 @@ PRBool xptiInterfaceInfoManager::BuildFileSearchPath(nsISupportsArray** aPath)
nsCOMPtr<nsILocalFile> greComponentDirectory;
nsresult rv = GetDirectoryFromDirService(NS_GRE_COMPONENT_DIR,
getter_AddRefs(greComponentDirectory));
- if (NS_SUCCEEDED(rv))
+ if (NS_SUCCEEDED(rv) && greComponentDirectory)
{
// make sure we only append a directory if its a different one
PRBool equalsCompDir = PR_FALSE;
View
6 xpfe/components/autocomplete/resources/content/autocomplete.xml
@@ -707,6 +707,9 @@
}
}
+ if (!this.noMatch)
+ this.autoComplete();
+
this.closeResultPopup();
}
@@ -739,6 +742,9 @@
}
}
+ if (!this.noMatch)
+ this.autoComplete();
+
this.mNeedToFinish = false;
this.mNeedToComplete = false;
View
17 xpfe/components/history/public/nsIBrowserHistory.idl
@@ -117,6 +117,23 @@ interface nsIBrowserHistory : nsISupports
*/
void markPageAsTyped(in string url);
+
+ /**
+ * outputReferrerURL
+ * Prints out referrer information for a url to a
+ * data file if browser.history.url.datacapture.mode is set
+ * to 1 or 2 (see the comment in nsGlobalHistory.cpp for
+ * more details).
+ *
+ * This method is a big hack and only temporary. Please do NOT use
+ * it in your code. It will be removed after the data
+ * collection phase of the project described in bug 182366 is
+ * over.
+ *
+ * @param aURL a url in global history
+ * @param aReferrer the referrer url to aURL
+ */
+ void outputReferrerURL(in string aURL, in string aReferrer);
};
%{ C++
View
1,130 xpfe/components/history/src/nsGlobalHistory.cpp
@@ -86,6 +86,9 @@
#include "nsIPrefBranchInternal.h"
#include "nsIObserverService.h"
+#include "prdtoa.h"
+#include "nsIBookmarksService.h"
+#include <math.h>
PRInt32 nsGlobalHistory::gRefCnt;
nsIRDFService* nsGlobalHistory::gRDFService;
@@ -102,6 +105,9 @@ nsIRDFResource* nsGlobalHistory::kNC_child;
nsIRDFResource* nsGlobalHistory::kNC_URL;
nsIRDFResource* nsGlobalHistory::kNC_HistoryRoot;
nsIRDFResource* nsGlobalHistory::kNC_HistoryByDate;
+nsIRDFResource* nsGlobalHistory::kNC_BookmarkAddDate;
+nsIRDFResource* nsGlobalHistory::kNC_Bookmark;
+nsIRDFResource* nsGlobalHistory::kRDF_Type;
nsIMdbFactory* nsGlobalHistory::gMdbFactory = nsnull;
nsIPrefBranch* nsGlobalHistory::gPrefBranch = nsnull;
@@ -110,6 +116,8 @@ nsIPrefBranch* nsGlobalHistory::gPrefBranch = nsnull;
#define PREF_BROWSER_STARTUP_PAGE "startup.page"
#define PREF_AUTOCOMPLETE_ONLY_TYPED "urlbar.matchOnlyTyped"
#define PREF_AUTOCOMPLETE_ENABLED "urlbar.autocomplete.enabled"
+#define PREF_AUTOCOMPLETE_LEARNING_MODE "urlbar.autocomplete.learning.mode"
+#define PREF_HISTORY_DATACAPTURE_MODE "history.url.datacapture.mode"
#define FIND_BY_AGEINDAYS_PREFIX "find:datasource=history&match=AgeInDays&method="
@@ -122,6 +130,181 @@ nsIPrefBranch* nsGlobalHistory::gPrefBranch = nsnull;
#define MSECS_PER_DAY (PR_MSEC_PER_SEC * 60 * 60 * 24)
+// ---------------------------
+// Autocomplete learning modes
+// ---------------------------
+
+// No learning
+#define AUTOCOMPLETE_NO_LEARNING 0
+// Only train the neural network. No user visible changes.
+#define AUTOCOMPLETE_ENABLE_TRAINING 1
+// Train the neural network and show its recommendation to the user.
+#define AUTOCOMPLETE_AFFECT_URL_LIST 2
+
+// --------------------------
+// URL data capture modes
+// --------------------------
+
+// URL information is captured for all urls that are loaded by the user
+// as well as for all urls matched by the autocomplete search engine
+// when the user types a partial url in the urlbar.
+
+// No data capture
+#define URLDATACAPTURE_NONE 0
+// Capture url features only. Don't store the url. The url features are
+// numeric and the original url *cannot* be reconstructed from it.
+#define URLDATACAPTURE_WITHOUT_URL_INFO 1
+// Capture url features and the url.
+#define URLDATACAPTURE_WITH_URL_INFO 2
+
+// ---------------------------
+
+#define AC_NUM_URL_FEATURES 44
+
+// This is the learning rate for the perceptron. The range of values is [0, 1]
+// It is used to update the internal weights of the perceptron.
+// The perceptron update rule is:
+//
+// weight[i] = weight[i] + LEARN_RATE * ERROR * Input[i]
+//
+// See http://diwww.epfl.ch/mantra/tutorial/english/perceptron/html/learning.html
+// for an explanation of the perceptron update rule.
+const PRFloat64 LEARN_RATE = 0.5;
+
+// The following 4 constants are explained in the big comment
+// before FillInputFeatures.
+const PRFloat64 HISTORY_FAST_DECAY_CONSTANT = 0.2;
+const PRFloat64 HISTORY_SLOW_DECAY_CONSTANT = 0.8;
+const PRFloat64 BOOKMARK_FAST_DECAY_CONSTANT = 0.2;
+const PRFloat64 BOOKMARK_SLOW_DECAY_CONSTANT = 0.8;
+
+#define NS_AUTOCOMPLETE_WEIGHTS_FILE "ac-weights.txt"
+
+//----------------------------------------------------------------------
+// Perceptron implementation
+// XXX The implementations need to be moved out to their own .cpp file
+
+// XXX This work should move to an Init method so that error handling can
+// happen; a constructor has no way to report failure.
+//
+// Allocates one weight per input feature and fills the weights in through
+// LoadWeights(). A non-positive feature count or a failed allocation leaves
+// the perceptron without weights (mWeights == nsnull, mNumWeights == 0).
+nsPerceptron::nsPerceptron(PRInt32 aNumFeatures)
+{
+  mNumWeights = 0;
+  mWeights = nsnull;
+
+  if (aNumFeatures <= 0)
+    return;
+
+  mWeights = new PRFloat64[aNumFeatures];
+  if (!mWeights)
+    return;
+
+  mNumWeights = aNumFeatures;
+  LoadWeights();
+}
+
+// Performs one training step on a single example.
+//
+// The current output for aInputs is computed with Test(), and every weight
+// is then adjusted by the perceptron update rule:
+//
+//   weight[i] = weight[i] + LEARN_RATE * (aTargetOutput - output) * aInputs[i]
+//
+// @param aInputs       feature values for this example
+// @param aNumInputs    number of entries in aInputs
+// @param aTargetOutput the output the perceptron should have produced
+//
+// Only the first min(aNumInputs, mNumWeights) weights are updated, so a
+// caller passing fewer inputs than there are weights cannot cause a read
+// past the end of aInputs. The call is a no-op when there are no inputs or
+// no weights.
+void
+nsPerceptron::Train(PRFloat64* aInputs, PRInt32 aNumInputs, PRFloat64 aTargetOutput)
+{
+  if (!aInputs || !mWeights)
+    return;
+
+  // Calculate output
+  PRFloat64 output = 0.0;
+  Test(aInputs, aNumInputs, &output);
+
+  const PRFloat64 delta = aTargetOutput - output;
+
+  // Update weights based on delta, staying inside both arrays.
+  const PRInt32 count = PR_MIN(aNumInputs, mNumWeights);
+  PRInt32 i;
+  for (i = 0; i < count; i++) {
+    mWeights[i] += LEARN_RATE * delta * aInputs[i];
+  }
+}
+
+// Computes the perceptron output for one example: the sum of
+// mWeights[i] * aInputs[i].
+//
+// @param aInputs    feature values for this example
+// @param aNumInputs number of entries in aInputs
+// @param aOutput    receives the weighted sum; set to 0 when there are no
+//                   inputs or no weights
+//
+// The sum covers only the first min(aNumInputs, mNumWeights) entries, so
+// neither array is read out of bounds, and a perceptron whose weight
+// allocation failed (mWeights == nsnull) is not dereferenced.
+void
+nsPerceptron::Test(PRFloat64* aInputs, PRInt32 aNumInputs, PRFloat64* aOutput)
+{
+  if (!aOutput)
+    return;
+
+  *aOutput = 0;
+
+  if (!aInputs || !mWeights)
+    return;
+
+  // Calculate output
+  const PRInt32 count = PR_MIN(aNumInputs, mNumWeights);
+  PRInt32 i;
+  for (i = 0; i < count; i++) {
+    *aOutput += mWeights[i] * aInputs[i];
+  }
+}
+
+// Fills mWeights from NS_AUTOCOMPLETE_WEIGHTS_FILE in the user profile
+// directory.
+//
+// The file is read as whitespace-separated floating point numbers, one per
+// weight. Every weight is first set to zero, so a missing, unreadable,
+// truncated or malformed file leaves the unread weights at zero instead of
+// uninitialized.
+void
+nsPerceptron::LoadWeights()
+{
+  PRInt32 i;
+
+  // Initialize all weights to zero
+  for (i = 0; i < mNumWeights; i++) {
+    mWeights[i] = 0;
+  }
+
+  nsCOMPtr<nsIFile> file;
+  nsresult rv = NS_GetSpecialDirectory(NS_APP_USER_PROFILE_50_DIR, getter_AddRefs(file));
+  if (NS_FAILED(rv))
+    return;
+
+  // The QueryInterface can fail; do not dereference a null nsILocalFile.
+  nsCOMPtr<nsILocalFile> localFile(do_QueryInterface(file));
+  if (!localFile)
+    return;
+
+  rv = localFile->AppendNative(NS_LITERAL_CSTRING(NS_AUTOCOMPLETE_WEIGHTS_FILE));
+  if (NS_FAILED(rv))
+    return;
+
+  FILE* from = 0;
+  localFile->OpenANSIFileDesc("r", &from);
+  if (!from)
+    return;
+
+  for (i = 0; i < mNumWeights; i++) {
+    // Stop at the first value that cannot be parsed; the weights that were
+    // not read keep their zero initialization.
+    PRFloat64 weight = 0;
+    if (fscanf(from, "%lf", &weight) != 1)
+      break;
+    mWeights[i] = weight;
+  }
+  fclose(from);
+}
+
+// Writes mWeights to NS_AUTOCOMPLETE_WEIGHTS_FILE in the user profile
+// directory, one weight per line with 16 digits after the decimal point.
+//
+// This is best effort: if the profile directory cannot be resolved or the
+// file cannot be opened for writing, nothing is written and no error is
+// reported.
+void
+nsPerceptron::SaveWeights()
+{
+  nsCOMPtr<nsIFile> file;
+  nsresult rv = NS_GetSpecialDirectory(NS_APP_USER_PROFILE_50_DIR, getter_AddRefs(file));
+  if (NS_FAILED(rv))
+    return;
+
+  // The QueryInterface can fail; do not dereference a null nsILocalFile.
+  nsCOMPtr<nsILocalFile> localFile(do_QueryInterface(file));
+  if (!localFile)
+    return;
+
+  // Use the same file name constant as LoadWeights so the two stay in sync.
+  rv = localFile->AppendNative(NS_LITERAL_CSTRING(NS_AUTOCOMPLETE_WEIGHTS_FILE));
+  if (NS_FAILED(rv))
+    return;
+
+  FILE* to = 0;
+  localFile->OpenANSIFileDesc("w", &to);
+  if (!to)
+    return;
+
+  PRInt32 i;
+  for (i = 0; i < mNumWeights; i++) {
+    fprintf(to, "%.16lf\n", mWeights[i]);
+  }
+  fclose(to);
+}
+
+// nsSigmoidPerceptron: an nsPerceptron whose Test() output is passed
+// through the logistic function.
+nsSigmoidPerceptron::nsSigmoidPerceptron(PRInt32 aNumFeatures)
+  : nsPerceptron(aNumFeatures)
+{
+  // Nothing to do here; the nsPerceptron constructor does all the setup.
+}
+
+// Training is forwarded unchanged to nsPerceptron::Train.
+void
+nsSigmoidPerceptron::Train(PRFloat64* aInputs, PRInt32 aNumInputs, PRFloat64 aTargetOutput)
+{
+  nsPerceptron::Train(aInputs, aNumInputs, aTargetOutput);
+}
+
+// Computes the nsPerceptron::Test output for aInputs and stores its
+// Sigmoid() in *aOutput.
+void
+nsSigmoidPerceptron::Test(PRFloat64* aInputs, PRInt32 aNumInputs, PRFloat64* aOutput)
+{
+  nsPerceptron::Test(aInputs, aNumInputs, aOutput);
+
+  const PRFloat64 weightedSum = *aOutput;
+  *aOutput = Sigmoid(weightedSum);
+}
+
+// Logistic function: 1 / (1 + e^(-aNum)).
+PRFloat64 nsSigmoidPerceptron::Sigmoid(PRFloat64 aNum)
+{
+  const PRFloat64 denominator = 1.0 + exp(-aNum);
+  return (1.0 / denominator);
+}
+
+
+//----------------------------------------------------------------------
+
//----------------------------------------------------------------------
//
// CIDs
@@ -512,9 +695,12 @@ nsGlobalHistory::nsGlobalHistory()
mAutocompleteOnlyTyped(PR_FALSE),
mBatchesInProgress(0),
mNowValid(PR_FALSE),
- mDirty(PR_FALSE),
- mEnv(nsnull),
- mStore(nsnull),
+ mDirty(PR_FALSE),
+ mAutoCompleteLearner(nsnull),
+ mACFeatures(nsnull),
+ mURLDataFile(nsnull),
+ mEnv(nsnull),
+ mStore(nsnull),
mTable(nsnull)
{
LL_I2L(mFileSizeOnDisk, 0);
@@ -558,6 +744,9 @@ nsGlobalHistory::~nsGlobalHistory()
NS_IF_RELEASE(kNC_URL);
NS_IF_RELEASE(kNC_HistoryRoot);
NS_IF_RELEASE(kNC_HistoryByDate);
+ NS_IF_RELEASE(kNC_BookmarkAddDate);
+ NS_IF_RELEASE(kNC_Bookmark);
+ NS_IF_RELEASE(kRDF_Type);
NS_IF_RELEASE(gMdbFactory);
NS_IF_RELEASE(gPrefBranch);
@@ -570,6 +759,23 @@ nsGlobalHistory::~nsGlobalHistory()
if (mExpireNowTimer)
mExpireNowTimer->Cancel();
+ if (mURLDataFile) {
+ nsCAutoString dateStr;
+ PRInt64ToChars(PR_Now(), dateStr);
+ fprintf(mURLDataFile, "<shutdown time='%s'/>\n", dateStr.get());
+
+ fclose(mURLDataFile);
+ }
+
+ if (mAutoCompleteLearner) {
+ delete mAutoCompleteLearner;
+ mAutoCompleteLearner = nsnull;
+ }
+
+ if (mACFeatures) {
+ delete [] mACFeatures;
+ mACFeatures = nsnull;
+ }
}
@@ -653,7 +859,7 @@ nsGlobalHistory::AddPageToDatabase(const char *aURL,
// update the database, and get the old info back
PRInt64 oldDate;
PRInt32 oldCount;
- rv = AddExistingPageToDatabase(row, aDate, &oldDate, &oldCount);
+ rv = AddExistingPageToDatabase(row, aURL, aDate, &oldDate, &oldCount);
NS_ASSERTION(NS_SUCCEEDED(rv), "AddExistingPageToDatabase failed; see bug 88961");
if (NS_FAILED(rv)) return rv;
@@ -703,6 +909,7 @@ nsGlobalHistory::AddPageToDatabase(const char *aURL,
nsresult
nsGlobalHistory::AddExistingPageToDatabase(nsIMdbRow *row,
+ const char *aURL,
PRInt64 aDate,
PRInt64 *aOldDate,
PRInt32 *aOldCount)
@@ -729,6 +936,45 @@ nsGlobalHistory::AddExistingPageToDatabase(nsIMdbRow *row,
SetRowValue(row, kToken_LastVisitDateColumn, aDate);
SetRowValue(row, kToken_VisitCountColumn, (*aOldCount) + 1);
+ if (mLearningMode > AUTOCOMPLETE_NO_LEARNING ||
+ mDataCaptureMode > URLDATACAPTURE_NONE) {
+ // Update the two Frequency-Recency metrics
+ PRFloat64 m;
+ PRInt32 ageInDays = GetAgeInDays(NormalizeTime(GetNow()), *aOldDate);
+ rv = GetRowValue(row, kToken_FRFastDecayColumn, &m);
+ if (NS_FAILED(rv)) return rv;
+ m = 1.0 + (PRFloat64) (pow(HISTORY_FAST_DECAY_CONSTANT, (PRFloat64) ageInDays)) * m;
+ SetRowValue(row, kToken_FRFastDecayColumn, m);
+ rv = GetRowValue(row, kToken_FRSlowDecayColumn, &m);
+ if (NS_FAILED(rv)) return rv;
+ m = 1.0 + (PRFloat64) (pow(HISTORY_SLOW_DECAY_CONSTANT, (PRFloat64) ageInDays)) * m;
+ SetRowValue(row, kToken_FRSlowDecayColumn, m);
+ }
+
+ if (mDataCaptureMode > URLDATACAPTURE_NONE && mURLDataFile) {
+ fprintf(mURLDataFile, "<add-existing-url>\n");
+ nsAutoString url = NS_ConvertUTF8toUCS2(aURL);
+ rv = FillInputFeatures(url, mACFeatures);
+ if (NS_SUCCEEDED(rv))
+ WriteURLData(url, mACFeatures);
+ fprintf(mURLDataFile, "</add-existing-url>\n");
+ fflush(mURLDataFile);
+ }
+
+ return NS_OK;
+}
+
+// Picks a URL ID that no row currently uses and stores it in aRow's URLID
+// column.
+//
+// Candidates start just after the current time (PR_Now() + 1) and are
+// incremented for as long as FindRow() reports that a row already carries
+// the candidate.
+//
+// @param aRow the row that receives the ID
+// @param aID  receives the ID that was chosen
+// @return NS_OK on success, or the failure code from SetRowValue() if the
+//         ID could not be written to the row
+nsresult
+nsGlobalHistory::AssignUniqueURLID(nsIMdbRow *aRow, PRInt64 *aID)
+{
+  NS_ENSURE_ARG_POINTER(aRow);
+  NS_ENSURE_ARG_POINTER(aID);
+
+  nsCOMPtr<nsIMdbRow> oldRow;
+  nsresult rv = NS_OK;
+  *aID = PR_Now();
+  do {
+    rv = FindRow(kToken_URLIDColumn, ++(*aID), getter_AddRefs(oldRow));
+  } while (NS_SUCCEEDED(rv));
+
+  // Report a failed write instead of claiming the ID was assigned.
+  rv = SetRowValue(aRow, kToken_URLIDColumn, *aID);
+  if (NS_FAILED(rv)) return rv;
+
return NS_OK;
}
@@ -759,6 +1005,24 @@ nsGlobalHistory::AddNewPageToDatabase(const char *aURL,
SetRowValue(row, kToken_LastVisitDateColumn, aDate);
SetRowValue(row, kToken_FirstVisitDateColumn, aDate);
+ if (mLearningMode > AUTOCOMPLETE_NO_LEARNING ||
+ mDataCaptureMode > URLDATACAPTURE_NONE) {
+ // Initialize the Frequency-Recency metrics
+ SetRowValue(row, kToken_FRFastDecayColumn, (PRFloat64) 1.0);
+ SetRowValue(row, kToken_FRSlowDecayColumn, (PRFloat64) 1.0);
+ }
+
+ if (mDataCaptureMode > URLDATACAPTURE_NONE && mURLDataFile) {
+ fprintf(mURLDataFile, "<add-new-url>\n");
+ nsAutoString url = NS_ConvertUTF8toUCS2(aURL);
+ nsresult rv;
+ rv = FillInputFeatures(url, mACFeatures);
+ if (NS_SUCCEEDED(rv))
+ WriteURLData(url, mACFeatures);
+ fprintf(mURLDataFile, "</add-new-url>\n");
+ fflush(mURLDataFile);
+ }
+
nsCOMPtr<nsIURI> uri;
NS_NewURI(getter_AddRefs(uri), nsDependentCString(aURL), nsnull, nsnull);
nsCAutoString hostname;
@@ -842,6 +1106,22 @@ nsGlobalHistory::SetRowValue(nsIMdbRow *aRow, mdb_column aCol, const PRInt32 aVa
}
+// Stores a floating point value in column aCol of aRow. The value is
+// written as the text produced by nsCAutoString::AppendFloat.
+// Returns NS_ERROR_FAILURE if the database rejects the write.
nsresult
+nsGlobalHistory::SetRowValue(nsIMdbRow *aRow, mdb_column aCol, PRFloat64 aValue)
+{
+  nsCAutoString text;
+  text.AppendFloat(aValue);
+
+  // The yarn only points at the string's buffer; it does not copy it.
+  mdbYarn cell = { (void *)text.get(), text.Length(), text.Length(), 0, 0, nsnull };
+
+  mdb_err status = aRow->AddColumn(mEnv, aCol, &cell);
+  return (status != 0) ? NS_ERROR_FAILURE : NS_OK;
+}
+
+nsresult
nsGlobalHistory::GetRowValue(nsIMdbRow *aRow, mdb_column aCol,
nsAString& aResult)
{
@@ -909,6 +1189,30 @@ nsGlobalHistory::GetRowValue(nsIMdbRow *aRow, mdb_column aCol,
+// Reads column aCol of aRow as a floating point number.
+//
+// @param aRow    the row to read from
+// @param aCol    the column holding the textual value
+// @param aResult receives the parsed value; it is set to 0 whenever no
+//                value could be read, so callers never see an
+//                uninitialized number
+// @return NS_OK when a value was parsed or the cell has no buffer,
+//         NS_ERROR_FAILURE when the cell cannot be read, and
+//         NS_ERROR_CANNOT_CONVERT_DATA when the cell text is not a number
nsresult
nsGlobalHistory::GetRowValue(nsIMdbRow *aRow, mdb_column aCol,
+                             PRFloat64 *aResult)
+{
+  NS_ENSURE_ARG_POINTER(aResult);
+
+  // Give the caller a defined value on every path, including the "cell has
+  // no buffer" path that still returns NS_OK.
+  *aResult = 0.0;
+
+  mdbYarn yarn;
+  mdb_err err = aRow->AliasCellYarn(mEnv, aCol, &yarn);
+  if (err != 0)
+    return NS_ERROR_FAILURE;
+
+  if (!yarn.mYarn_Buf)
+    return NS_OK;
+
+  // NOTE(review): the aliased yarn is described by its fill length and is
+  // presumably not guaranteed to be NUL-terminated, so parse a bounded copy
+  // rather than letting PR_strtod scan the raw buffer.
+  nsCAutoString text;
+  text.Assign((const char *)yarn.mYarn_Buf, yarn.mYarn_Fill);
+
+  char *next = nsnull;
+  PRFloat64 value = PR_strtod(text.get(), &next);
+  if (next == text.get())
+    return NS_ERROR_CANNOT_CONVERT_DATA;
+
+  *aResult = value;
+  return NS_OK;
+}
+
+
+nsresult
+nsGlobalHistory::GetRowValue(nsIMdbRow *aRow, mdb_column aCol,
nsACString& aResult)
{
mdb_err err;
@@ -1240,6 +1544,27 @@ nsGlobalHistory::HidePage(const char *aURL)
rv = SetRowValue(row, kToken_HiddenColumn, 1);
if (NS_FAILED(rv)) return rv;
+ if (mDataCaptureMode > URLDATACAPTURE_NONE && mURLDataFile) {
+ PRInt64 id;
+ nsCAutoString dateStr, IDStr;
+ PRInt64ToChars(PR_Now(), dateStr);
+ GetRowValue(row, kToken_URLIDColumn, &id);
+
+ if (!id) {
+ AssignUniqueURLID(row, &id);
+ }
+
+ PRInt64ToChars(id, IDStr);
+ fprintf(mURLDataFile, "<hide-url id='%s' time='%s'", IDStr.get(), dateStr.get());
+
+ if (mDataCaptureMode == URLDATACAPTURE_WITH_URL_INFO) {
+ fprintf(mURLDataFile, " path='%s'", aURL);
+ }
+
+ fprintf(mURLDataFile, "/>\n");
+ fflush(mURLDataFile);
+ }
+
// now pretend as if this row was deleted
// HasAssertion() correctly checks the Hidden column to show that
// the row is hidden
@@ -1267,10 +1592,92 @@ nsGlobalHistory::MarkPageAsTyped(const char* aURL)
rv = SetRowValue(row, kToken_HiddenColumn, 1);
if (NS_FAILED(rv)) return rv;
+ if (mDataCaptureMode > URLDATACAPTURE_NONE && mURLDataFile) {
+ PRInt64 id;
+ nsCAutoString dateStr, IDStr;
+ PRInt64ToChars(PR_Now(), dateStr);
+ GetRowValue(row, kToken_URLIDColumn, &id);
+
+ if (!id) {
+ AssignUniqueURLID(row, &id);
+ }
+
+ PRInt64ToChars(id, IDStr);
+ fprintf(mURLDataFile, "<typed-url id='%s' time='%s'", IDStr.get(), dateStr.get());
+
+ if (mDataCaptureMode == URLDATACAPTURE_WITH_URL_INFO) {
+ fprintf(mURLDataFile, " path='%s'", aURL);
+ }
+
+ fprintf(mURLDataFile, "/>\n");
+ fflush(mURLDataFile);
+ }
+
return SetRowValue(row, kToken_TypedColumn, 1);
}
+// Data-capture hook: appends a <referrer-url .../> record to the URL data
+// file stating that aURL was reached from aReferrer.
+//
+// Nothing is written unless data capture is enabled, its output file is
+// open, history is not disabled (mExpireDays != 0) and both URLs are
+// already present in history. Rows that have no URL ID yet get one from
+// AssignUniqueURLID(). The URLs themselves are written only in
+// URLDATACAPTURE_WITH_URL_INFO mode.
+//
+// @param aURL      a url in global history
+// @param aReferrer the referrer url to aURL
+// @return NS_OK when the record was written or deliberately skipped; an
+//         error for null arguments, an empty aURL, or a database that
+//         cannot be opened
+NS_IMETHODIMP
+nsGlobalHistory::OutputReferrerURL(const char *aURL, const char *aReferrer)
+{
+  if (mDataCaptureMode <= URLDATACAPTURE_NONE || !mURLDataFile)
+    return NS_OK;
+
+  // If history is set to expire after 0 days,
+  // then it's technically disabled. Don't even
+  // bother adding the page
+  if (mExpireDays == 0)
+    return NS_OK;
+
+  NS_ENSURE_ARG_POINTER(aURL);
+  NS_ENSURE_ARG_POINTER(aReferrer);
+  NS_ENSURE_SUCCESS(OpenDB(), NS_ERROR_FAILURE);
+
+  if (!*aURL)
+    return NS_ERROR_INVALID_ARG;
+
+  // Both pages must already be known to history; otherwise there is
+  // nothing to record and that is not an error.
+  nsCOMPtr<nsIMdbRow> urlRow;
+  nsresult rv = FindRow(kToken_URLColumn, aURL, getter_AddRefs(urlRow));
+  if (NS_FAILED(rv))
+    return NS_OK;
+
+  nsCOMPtr<nsIMdbRow> referrerRow;
+  rv = FindRow(kToken_URLColumn, aReferrer, getter_AddRefs(referrerRow));
+  if (NS_FAILED(rv))
+    return NS_OK;
+
+  nsCAutoString dateStr, urlIDStr, referrerIDStr;
+  PRInt64ToChars(PR_Now(), dateStr);
+
+  // Each ID starts at zero and lives in its own variable, so a failed or
+  // empty read can neither log an indeterminate value nor reuse the other
+  // row's ID.
+  PRInt64 referrerID = 0;
+  rv = GetRowValue(referrerRow, kToken_URLIDColumn, &referrerID);
+  if (NS_FAILED(rv) || !referrerID) {
+    AssignUniqueURLID(referrerRow, &referrerID);
+  }
+  PRInt64ToChars(referrerID, referrerIDStr);
+
+  PRInt64 urlID = 0;
+  rv = GetRowValue(urlRow, kToken_URLIDColumn, &urlID);
+  if (NS_FAILED(rv) || !urlID) {
+    AssignUniqueURLID(urlRow, &urlID);
+  }
+  PRInt64ToChars(urlID, urlIDStr);
+
+  fprintf(mURLDataFile, "<referrer-url id='%s' url-id='%s' time='%s'",
+          referrerIDStr.get(), urlIDStr.get(), dateStr.get());
+
+  if (mDataCaptureMode == URLDATACAPTURE_WITH_URL_INFO) {
+    // NOTE(review): the URLs are written verbatim inside single-quoted
+    // attributes; a URL containing ' or < yields a malformed record.
+    fprintf(mURLDataFile, " path='%s' url-path='%s'", aReferrer, aURL);
+  }
+
+  fprintf(mURLDataFile, "/>\n");
+  fflush(mURLDataFile);
+
+  return NS_OK;
+}
+
+
//----------------------------------------------------------------------
//
// nsGlobalHistory
@@ -2299,6 +2706,60 @@ nsGlobalHistory::Init()
gPrefBranch->GetIntPref(PREF_BROWSER_HISTORY_EXPIRE_DAYS, &mExpireDays);
gPrefBranch->GetBoolPref(PREF_AUTOCOMPLETE_ONLY_TYPED, &mAutocompleteOnlyTyped);
+ gPrefBranch->GetIntPref(PREF_HISTORY_DATACAPTURE_MODE, &mDataCaptureMode);
+ gPrefBranch->GetIntPref(PREF_AUTOCOMPLETE_LEARNING_MODE, &mLearningMode);
+
+ if (mDataCaptureMode > URLDATACAPTURE_NONE) {
+ nsCOMPtr<nsIFile> file;
+ nsresult rv = NS_GetSpecialDirectory(NS_APP_USER_PROFILE_50_DIR, getter_AddRefs(file));
+
+ if (NS_SUCCEEDED(rv)) {
+ nsCOMPtr<nsILocalFile> localFile(do_QueryInterface(file));
+ localFile->AppendNative(NS_LITERAL_CSTRING("url-data.txt"));
+ localFile->OpenANSIFileDesc("a", &mURLDataFile);
+
+ if (mURLDataFile) {
+ nsCAutoString dateStr;
+ PRInt64ToChars(PR_Now(), dateStr);
+ fprintf(mURLDataFile, "\n<startup time='%s'/>\n", dateStr.get());
+ fflush(mURLDataFile);
+ }
+ else {
+ mDataCaptureMode = URLDATACAPTURE_NONE;
+ mURLDataFile = nsnull;
+ }
+ }
+ else {
+ // Disable data capture
+ mDataCaptureMode = URLDATACAPTURE_NONE;
+ mURLDataFile = nsnull;
+ }
+ }
+ else {
+ mURLDataFile = nsnull;
+ }
+
+ if (mLearningMode > AUTOCOMPLETE_NO_LEARNING ||
+ mDataCaptureMode > URLDATACAPTURE_NONE) {
+ // Create perceptron and feature array.
+ mAutoCompleteLearner = new nsSigmoidPerceptron(AC_NUM_URL_FEATURES);
+ if (mAutoCompleteLearner) {
+ mACFeatures = new PRFloat64[AC_NUM_URL_FEATURES];
+ if (!mACFeatures) {
+ delete mAutoCompleteLearner;
+ mAutoCompleteLearner = nsnull;
+ mLearningMode = AUTOCOMPLETE_NO_LEARNING;
+ }
+ }
+ else {
+ mLearningMode = AUTOCOMPLETE_NO_LEARNING;
+ }
+ }
+ else {
+ mAutoCompleteLearner = nsnull;
+ mACFeatures = nsnull;
+ }
+
nsCOMPtr<nsIPrefBranchInternal> pbi = do_QueryInterface(gPrefBranch);
if (pbi) {
pbi->AddObserver(PREF_AUTOCOMPLETE_ONLY_TYPED, this, PR_FALSE);
@@ -2327,6 +2788,9 @@ nsGlobalHistory::Init()
gRDFService->GetResource(NC_NAMESPACE_URI "URL", &kNC_URL);
gRDFService->GetResource("NC:HistoryRoot", &kNC_HistoryRoot);
gRDFService->GetResource("NC:HistoryByDate", &kNC_HistoryByDate);
+ gRDFService->GetResource(NC_NAMESPACE_URI "BookmarkAddDate", &kNC_BookmarkAddDate);
+ gRDFService->GetResource(NC_NAMESPACE_URI "Bookmark", &kNC_Bookmark);
+ gRDFService->GetResource(RDF_NAMESPACE_URI "type", &kRDF_Type);
}
// register this as a named data source with the RDF service
@@ -2677,6 +3141,15 @@ nsGlobalHistory::CreateTokens()
err = mStore->StringToToken(mEnv, "Typed", &kToken_TypedColumn);
if (err != 0) return NS_ERROR_FAILURE;
+ err = mStore->StringToToken(mEnv, "FRFastDecay", &kToken_FRFastDecayColumn);
+ if (err != 0) return NS_ERROR_FAILURE;
+
+ err = mStore->StringToToken(mEnv, "FRSlowDecay", &kToken_FRSlowDecayColumn);
+ if (err != 0) return NS_ERROR_FAILURE;
+
+ err = mStore->StringToToken(mEnv, "URLID", &kToken_URLIDColumn);
+ if (err != 0) return NS_ERROR_FAILURE;
+
// meta-data tokens
err = mStore->StringToToken(mEnv, "LastPageVisited", &kToken_LastPageVisited);
@@ -2833,11 +3306,58 @@ nsGlobalHistory::FindRow(mdb_column aCol,
if (!hasRow) return NS_ERROR_NOT_AVAILABLE;
*aResult = row;
- (*aResult)->AddRef();
+ NS_ADDREF(*aResult);
return NS_OK;
}
+// Looks up the history row whose column aCol holds the 64-bit value aValue.
+// The value is compared in its textual form as produced by PRInt64ToChars.
+//
+// @param aCol    the column to search
+// @param aValue  the value to look for
+// @param aResult receives an AddRef'ed row on success
+// @return NS_OK when a row was found; NS_ERROR_NOT_INITIALIZED when there
+//         is no store; NS_ERROR_FAILURE on a database error;
+//         NS_ERROR_NOT_AVAILABLE when no row matches or the matching row
+//         is not in the main table
+nsresult
+nsGlobalHistory::FindRow(mdb_column aCol,
+                         PRInt64 aValue, nsIMdbRow **aResult)
+{
+  NS_ENSURE_ARG_POINTER(aResult);
+
+  if (! mStore)
+    return NS_ERROR_NOT_INITIALIZED;
+
+  mdb_err err;
+  nsCAutoString val;
+  PRInt64ToChars(aValue, val);
+
+  mdbYarn yarn = { (void *)val.get(), val.Length(), val.Length(), 0, 0, nsnull };
+
+  mdbOid rowId;
+  nsCOMPtr<nsIMdbRow> row;
+  err = mStore->FindRow(mEnv, kToken_HistoryRowScope,
+                        aCol, &yarn,
+                        &rowId, getter_AddRefs(row));
+
+  if (err) return NS_ERROR_FAILURE;
+
+  if (!row) return NS_ERROR_NOT_AVAILABLE;
+
+  // make sure it's actually stored in the main table; a failed query must
+  // not be read back through an uninitialized flag
+  mdb_bool hasRow = PR_FALSE;
+  err = mTable->HasRow(mEnv, row, &hasRow);
+
+  if (err) return NS_ERROR_FAILURE;
+
+  if (!hasRow) return NS_ERROR_NOT_AVAILABLE;
+
+  *aResult = row;
+  NS_ADDREF(*aResult);
+
+  return NS_OK;
+}
+
+// Finds the row whose column aCol holds aValue and also reads that row's
+// URL ID column into aRowID. Returns the first failure encountered; when
+// only the ID read fails, *aResult still holds the row that was found.
+nsresult
+nsGlobalHistory::FindRowAndID(mdb_column aCol,const char *aValue,
+                              nsIMdbRow **aResult, PRInt64 *aRowID)
+{
+  nsresult status = FindRow(aCol, aValue, aResult);
+  if (NS_FAILED(status))
+    return status;
+
+  return GetRowValue(*aResult, kToken_URLIDColumn, aRowID);
+}
+
PRBool
nsGlobalHistory::IsURLInHistory(nsIRDFResource* aResource)
{
@@ -3980,12 +4500,564 @@ nsGlobalHistory::OnStopLookup()
return NS_OK;
}
+
+/**
+ *
+ * The input features into the autocomplete perceptron are as follows:
+ *
+ * Features 1 = Frequency and recency metric for page in history
+ * (domain = positive real numbers)
+ * Value decays fast with age of page
+ * Uses HISTORY_FAST_DECAY_CONSTANT
+ * Features 2 = Frequency and recency metric for page in history
+ * (high for newer, more accessed pages)
+ * Value decays slowly with age of page
+ * Uses HISTORY_SLOW_DECAY_CONSTANT
+ * Features 3 = Was the url typed by the user?
+ * (domain = 0 or 1)
+ * Features 4 = Recency metric for page in bookmarks
+ * (domain = real number between 0 and 1)
+ * Value decays fast with age of bookmark
+ * Uses BOOKMARK_FAST_DECAY_CONSTANT
+ * Features 5 = Recency metric for page in bookmarks
+ * (domain = real number between 0 and 1)
+ * Value decays slowly with age of bookmark
+ * Uses BOOKMARK_SLOW_DECAY_CONSTANT
+ *
+ * Features 1 and Feature 2 details:
+ *
+ * As an example, say a page was first seen on Day 1 and accessed from then
+ * until today (Day 4) with the following schedule:
+ *
+ * (Day 1, D times), (Day 2, C times), (Day 3, B times), (Day 4, A times)
+ *
+ * Then, the frequency+recency metric calculation for the page will be:
+ *
+ * FRMetric = A + (B * G) + (C * G^2) + (D * G^3)
+ *
+ * where G is the decay constant that takes values between 0 and 1.
+ * Values close to 1 lead to slow decay with age.
+ * Values close to 0 lead to fast decay with age.
+ *
+ * Feature 4 and Feature 5 only care about recency not frequency.
+ *
+ * So, if a bookmark was added X days earlier,
+ *
+ * Bookmark Feature Value = G^X.
+ *
+ * where G is the decay constant that takes values between 0 and 1.
+ * Values close to 1 lead to slow decay with age.
+ * Values close to 0 lead to fast decay with age.
+ *
+ * The rest of the url related features:
+ *
+ * Feature 6: Whether url ends in .htm or .html
+ * Feature 7: Is it a .com URL?
+ * Feature 8: Is it a .edu URL?
+ * Feature 9: Is it a .org URL?
+ * Feature 10: Is it a .net URL?
+ * Feature 11: Is it a .gov URL?
+ * Feature 12: Does the URL contain a ~ ?
+ * Feature 13: Does the URL start with http:// ?
+ * Feature 14: Does the URL start with ftp:// ?
+ * Feature 15: Does the URL start with file:// ?
+ * Feature 16: Does the URL start with gopher:// ?
+ * Feature 17: Does the URL start with https:// ?
+ * Feature 18: Does the host name end in a two letter country code?
+ * Feature 19: Number of /s in the URL.
+ * Feature 20: Number of ?s in the URL.
+ * Feature 21: Number of &s in the URL.
+ * Feature 22: Number of =s in the URL.
+ * Feature 23: Number of #s in the URL.
+ * Feature 24: Number of +s in the URL.
+ * Feature 25: Number of .s in the URL.
+ * Feature 26: Number of numerical [0-9] characters in the URL
+ * Feature 27: Number of alphabetical [a-zA-Z] characters in the URL
+ * Feature 28: Number of non-alphanumeric, non-[/?&=#+.] characters in the URL
+ * Feature 29: Number of .s in the hostname
+ * Feature 30: Number of numerical [0-9] characters in the hostname
+ * Feature 31: Number of alphabetical [a-zA-Z] characters in the hostname
+ * Feature 32: Number of non-alphanumeric, non-[/?&=#+.] characters in the hostname
+ * Feature 33: Number of .s in the hostname if we omit initial "www." or "ftp." (if any)
+ * Feature 34: Number of .s in the hostname if we omit ending ".XX" country code (if any)
+ * Feature 35: Number of .s in the hostname if we omit initial "www." or "ftp."
+ * and ending ".XX" country code (if any)
+ * Feature 36: Number of characters in URL
+ * Feature 37: Number of characters in hostname
+ * Feature 38: Number of characters in hostname excluding initial "www." or "ftp."
+ * Feature 39: Number of characters in URL excluding hostname
+ * Feature 40: Number of characters in web page title
+ * Feature 41: Is this a google search url?
+ * Feature 42: Is this a netscape search url?
+ * Feature 43: Is this a yahoo search url?
+ * Feature 44: Dummy input hardcoded to 1
+ */
+
+nsresult
+nsGlobalHistory::FillInputFeatures(nsAString &aUrl,
+ PRFloat64 *aFeatures)
+{
+ nsCOMPtr<nsIMdbRow> row;
+ nsresult rv = NS_OK;
+ PRInt32 ageInDays;
+ PRInt64 lastDate;
+ static nsCOMPtr<nsIBookmarksService> bs =
+ do_GetService(NS_BOOKMARKS_SERVICE_CONTRACTID, &rv);
+
+ nsCOMPtr<nsIURI> uri;
+ nsCAutoString curl, chost, cpath;
+ rv = NS_NewURI(getter_AddRefs(uri), aUrl);
+ if (NS_SUCCEEDED(rv) && uri) {
+ uri->GetSpec(curl);
+ uri->GetHost(chost);
+ uri->GetPath(cpath);
+ }
+ nsAutoString url(NS_ConvertUTF8toUCS2(curl).get());
+ nsAutoString path(NS_ConvertUTF8toUCS2(cpath).get());
+ nsAutoString host(NS_ConvertUTF8toUCS2(chost).get());
+ ToLowerCase(url);
+ ToLowerCase(host);
+ ToLowerCase(path);
+
+ // Calculate the input features for this training example.
+ rv = FindRow(kToken_URLColumn, curl.get(),
+ getter_AddRefs(row));
+ if (NS_FAILED(rv)) return rv;
+
+ // First, get the page in history related input features
+ rv = GetRowValue(row, kToken_FRFastDecayColumn, &aFeatures[0]);
+ if (NS_FAILED(rv)) return rv;
+
+ rv = GetRowValue(row, kToken_LastVisitDateColumn, &lastDate);
+ if (NS_FAILED(rv)) return rv;
+
+ ageInDays = GetAgeInDays(NormalizeTime(GetNow()), lastDate);
+
+ aFeatures[0] *= pow(HISTORY_FAST_DECAY_CONSTANT, (PRFloat64) ageInDays);
+
+ rv = GetRowValue(row, kToken_FRSlowDecayColumn, &aFeatures[1]);
+ if (NS_FAILED(rv)) return rv;
+
+ aFeatures[1] *= pow(HISTORY_SLOW_DECAY_CONSTANT, (PRFloat64) ageInDays);
+
+ aFeatures[2] = HasCell(mEnv, row, kToken_TypedColumn);
+
+ // Second, calculate the bookmark related input features.
+ aFeatures[3] = aFeatures[4] = 0;
+ if (bs) {
+ PRBool bookmarked;
+ rv = bs->IsBookmarked(curl.get(), &bookmarked);
+ if (NS_SUCCEEDED(rv) && bookmarked) {
+ // Get the date when the bookmark was added.
+ PRInt64 addDate;
+ nsCOMPtr<nsIRDFResource> rdfRes;
+
+ if (NS_SUCCEEDED(rv = gRDFService->GetResource(curl.get(),
+ getter_AddRefs(rdfRes)))) {
+ nsCOMPtr<nsIRDFDataSource> bookmarkDS = do_QueryInterface(bs, &rv);
+ if (NS_SUCCEEDED(rv) && bookmarkDS) {
+ nsCOMPtr<nsIRDFNode> nodeType;
+ rv = bookmarkDS->GetTarget(rdfRes, kRDF_Type, PR_TRUE,
+ getter_AddRefs(nodeType));
+ if (NS_SUCCEEDED(rv)) {
+ if (nodeType == kNC_Bookmark) {
+ nsCOMPtr<nsIRDFNode> node;
+ rv = bookmarkDS->GetTarget(rdfRes, kNC_BookmarkAddDate, PR_TRUE,
+ getter_AddRefs(node));
+ if (rv != NS_RDF_NO_VALUE && node) {
+ nsCOMPtr<nsIRDFDate> rdfDate = do_QueryInterface(node, &rv);
+ if (NS_SUCCEEDED(rv) && rdfDate) {
+ rv = rdfDate->GetValue(&addDate);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ if (NS_SUCCEEDED(rv)) {
+ ageInDays = GetAgeInDays(NormalizeTime(GetNow()), addDate);
+ aFeatures[3] = pow(BOOKMARK_FAST_DECAY_CONSTANT, ageInDays);
+ aFeatures[4] = pow(BOOKMARK_SLOW_DECAY_CONSTANT, ageInDays);
+ }
+ }
+ }
+
+ // Feature 6: Whether url ends in .htm or .html
+ nsAString::const_iterator start, end;
+
+ path.BeginReading(start);
+ path.EndReading(end);
+ aFeatures[5] = FindInReadable(NS_LITERAL_STRING(".htm"), start, end);
+
+ // Feature 7: Is it a .com URL?
+ host.BeginReading(start);
+ host.EndReading(end);
+ aFeatures[6] = FindInReadable(NS_LITERAL_STRING(".com"), start, end);
+
+ // Feature 8: Is it a .edu URL?
+ host.BeginReading(start);
+ host.EndReading(end);
+ aFeatures[7] = FindInReadable(NS_LITERAL_STRING(".edu"), start, end);
+
+ // Feature 9: Is it a .org URL?
+ host.BeginReading(start);
+ host.EndReading(end);
+ aFeatures[8] = FindInReadable(NS_LITERAL_STRING(".org"), start, end);
+
+ // Feature 10: Is it a .net URL?
+ host.BeginReading(start);
+ host.EndReading(end);
+ aFeatures[9] = FindInReadable(NS_LITERAL_STRING(".net"), start, end);
+
+ // Feature 11: Is it a .gov URL?
+ host.BeginReading(start);
+ host.EndReading(end);
+ aFeatures[10] = FindInReadable(NS_LITERAL_STRING(".gov"), start, end);
+
+ // Feature 12: Does the URL contain a ~ ?
+ path.BeginReading(start);
+ path.EndReading(end);
+ aFeatures[11] = FindInReadable(NS_LITERAL_STRING("~"), start, end);
+
+ // Feature 13: Does the URL start with http:// ?
+ PRBool isScheme;
+ aFeatures[12] = aFeatures[13] = aFeatures[14] = aFeatures[15] = aFeatures[16] = 0;
+ if (NS_SUCCEEDED(uri->SchemeIs("http", &isScheme))) {
+ aFeatures[12] = isScheme;
+ }
+ // Feature 14: Does the URL start with ftp:// ?
+ else if (NS_SUCCEEDED(uri->SchemeIs("ftp", &isScheme))) {
+ aFeatures[13] = isScheme;
+ }
+ // Feature 15: Does the URL start with file:// ?
+ else if (NS_SUCCEEDED(uri->SchemeIs("file", &isScheme))) {
+ aFeatures[14] = isScheme;
+ }
+ // Feature 16: Does the URL start with gopher:// ?
+ else if (NS_SUCCEEDED(uri->SchemeIs("gopher", &isScheme))) {
+ aFeatures[15] = isScheme;
+ }
+ // Feature 17: Does the URL start with https:// ?
+ else if (NS_SUCCEEDED(uri->SchemeIs("https", &isScheme))) {
+ aFeatures[16] = isScheme;
+ }
+
+ // Feature 18: Does the host name end in a two letter country code?
+ PRInt32 hostLength = host.Length();
+ if (host[hostLength - 1] == '.') {
+ // Skip trailing dots in hostname if it exists. This will catch cases like
+ // http://www.state.ca.us./state/portal/myca_homepage.jsp
+ aFeatures[17] = (host.RFindChar('.', hostLength - 2) == (hostLength - 4));
+ }
+ else {
+ aFeatures[17] = (host.RFindChar('.') == ((hostLength - 1) - 2));
+ }
+
+ // Feature 19: Number of /s in the URL.
+ aFeatures[18] = 0;
+ // Feature 20: Number of ?s in the URL.
+ aFeatures[19] = 0;
+ // Feature 21: Number of &s in the URL.
+ aFeatures[20] = 0;
+ // Feature 22: Number of =s in the URL.
+ aFeatures[21] = 0;
+ // Feature 23: Number of #s in the URL.
+ aFeatures[22] = 0;
+ // Feature 24: Number of +s in the URL.
+ aFeatures[23] = 0;
+ // Feature 25: Number of .s in the URL.
+ aFeatures[24] = 0;
+ // Feature 26: Number of numerical [0-9] characters in the URL
+ aFeatures[25] = 0;
+ // Feature 27: Number of alphabetical [a-zA-Z] characters in the URL
+ aFeatures[26] = 0;
+ // Feature 28: Number of non-alphanumeric, non-[/?&=#+.] characters in the URL
+ aFeatures[27] = 0;
+
+ url.BeginReading(start);
+ url.EndReading(end);
+
+ PRUint32 size, i;
+ for ( ; start != end; start.advance(size)) {
+ const PRUnichar* buf = start.get();
+ size = start.size_forward();
+
+ // fragment at 'buf' is 'size' characters long
+ for (i = 0; i < size; *buf++, i++) {
+ switch (*buf) {
+ case '/':
+ ++aFeatures[18];
+ break;
+
+ case '?':
+ ++aFeatures[19];
+ break;
+
+ case '&':
+ ++aFeatures[20];
+ break;
+
+ case '=':
+ ++aFeatures[21];
+ break;
+
+ case '#':
+ ++aFeatures[22];
+ break;
+
+ case '+':
+ ++aFeatures[23];
+ break;
+
+ case '.':
+ ++aFeatures[24];
+ break;
+
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9':
+ ++aFeatures[25];
+ break;
+
+ default:
+ if (isalpha(*buf))
+ ++aFeatures[26];
+ else
+ ++aFeatures[27];
+ }
+ }
+ }
+
+ // Calculate a bunch of hostname related features.
+
+ // Feature 29: Number of .s in the hostname
+ aFeatures[28] = 0;
+ // Feature 30: Number of numerical [0-9] characters in the hostname
+ aFeatures[29] = 0;
+ // Feature 31: Number of alphabetical [a-zA-Z] characters in the hostname
+ aFeatures[30] = 0;
+ // Feature 32: Number of non-alphanumeric, non-[.] characters in the hostname
+ aFeatures[31] = 0;
+
+ size = chost.Length();
+ for (i = 0; i < size; i++) {
+ switch (chost[i]) {
+ case '.':
+ ++aFeatures[28];
+ break;
+
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9':
+ ++aFeatures[29];
+ break;
+
+ default:
+ if (isalpha(chost[i]))
+ ++aFeatures[30];
+ else
+ ++aFeatures[31];
+ }
+ }
+
+ // Feature 33: Number of .s in the hostname if we omit initial "www." or "ftp." (if any)
+ aFeatures[32] = aFeatures[28];
+ // Feature 34: Number of .s in the hostname if we omit ending ".XX" country code (if any)
+ aFeatures[33] = aFeatures[28];
+ // Feature 35: Number of .s in the hostname if we omit initial "www." or "ftp."
+ // and ending ".XX" country code (if any)
+ aFeatures[34] = aFeatures[28];
+ // Feature 36: Number of characters in hostname
+ aFeatures[35] = chost.Length();
+ // Feature 37: Number of characters in hostname excluding initial "www." or "ftp."
+ aFeatures[36] = aFeatures[35];
+
+ if (chost.Find("www.") == 0 || chost.Find("ftp.") == 0) {
+ --aFeatures[32];
+ --aFeatures[34];
+ aFeatures[36] -= 4;
+ }
+
+ if (aFeatures[17]) {
+ --aFeatures[33];
+ --aFeatures[34];
+ }
+
+ // Feature 38: Number of characters in URL
+ aFeatures[37] = url.Length();
+
+ // Feature 39: Number of characters in URL excluding hostname
+ aFeatures[38] = aFeatures[37] - aFeatures[35];
+
+ // Feature 40: Number of characters in web page title
+ nsAutoString title;
+ rv = GetRowValue(row, kToken_NameColumn, title);
+ if (NS_FAILED(rv)) return rv;
+ aFeatures[39] = title.Length();
+
+ // Feature 41: Is this a google search url?
+ url.BeginReading(start);
+ url.EndReading(end);
+ aFeatures[40] = FindInReadable(NS_LITERAL_STRING("http://www.google.com/search"), start, end);
+
+ // Feature 42: Is this a netscape search url?
+ url.BeginReading(start);
+ url.EndReading(end);
+ aFeatures[41] = FindInReadable(NS_LITERAL_STRING("http://search.netscape.com/nscp_results.adp"), start, end);
+
+ // Feature 43: Is this a yahoo search url?
+ url.BeginReading(start);
+ url.EndReading(end);
+ aFeatures[42] = FindInReadable(NS_LITERAL_STRING("http://search.yahoo.com/bin/search"), start, end);
+
+ // Feature 44: This is a dummy input hardcoded to 1. It allows
+ // the perceptron to represent functions that do not pass through the
+ // origin.
+ aFeatures[43] = 1;
+
+ return rv;
+}
+
+// Appends one <url> record for aURL to mURLDataFile.
+//
+// Record layout, as written below:
+//   <url id='ID' [path='URL'] time='NOW'>
+//   f0, f1, ..., fN-1          (AC_NUM_URL_FEATURES values, "%.2f" each)
+//   </url>
+// The path attribute is only written when mDataCaptureMode is
+// URLDATACAPTURE_WITH_URL_INFO.
+//
+// aURL          url whose history row supplies (or receives) the unique id
+// aURLFeatures  array of at least AC_NUM_URL_FEATURES feature values
+//
+// Returns NS_ERROR_FAILURE when there is no open data file or no feature
+// array, the failure code of FindRowAndID / AssignUniqueURLID when the url
+// id cannot be determined (nothing is written in that case), and NS_OK
+// otherwise.
+nsresult
+nsGlobalHistory::WriteURLData(nsAString& aURL, PRFloat64* aURLFeatures)
+{
+  if (!mURLDataFile || !aURLFeatures)
+    return NS_ERROR_FAILURE;
+
+  // Convert once; the UTF-8 form is used for the row lookup and, depending
+  // on the capture mode, for the path attribute as well.
+  NS_ConvertUCS2toUTF8 utf8URL(aURL);
+
+  // Locate the history row of this url together with its unique id.
+  nsCOMPtr<nsIMdbRow> row;
+  PRInt64 rowID;
+  nsresult rv = FindRowAndID(kToken_URLColumn, utf8URL.get(),
+                             getter_AddRefs(row), &rowID);
+  if (NS_FAILED(rv)) return rv;
+
+  // A zero id means the row has none yet. Do not emit a record carrying an
+  // unassigned id if the assignment fails.
+  if (!rowID) {
+    rv = AssignUniqueURLID(row, &rowID);
+    if (NS_FAILED(rv)) return rv;
+  }
+
+  nsCAutoString IDStr;
+  PRInt64ToChars(rowID, IDStr);
+  fprintf(mURLDataFile, "<url id='%s'", IDStr.get());
+
+  if (mDataCaptureMode == URLDATACAPTURE_WITH_URL_INFO) {
+    fprintf(mURLDataFile, " path='%s'", utf8URL.get());
+  }
+
+  nsCAutoString dateStr;
+  PRInt64ToChars(PR_Now(), dateStr);
+  fprintf(mURLDataFile, " time='%s'>\n", dateStr.get());
+
+  // Features are comma separated; the last one closes the record instead.
+  for (PRInt32 i = 0; i < AC_NUM_URL_FEATURES; i++) {
+    const char* terminator =
+      (i + 1 < AC_NUM_URL_FEATURES) ? ", " : "\n</url>\n";
+    fprintf(mURLDataFile, "%.2f%s", aURLFeatures[i], terminator);
+  }
+
+  return NS_OK;
+}
+
NS_IMETHODIMP
nsGlobalHistory::OnAutoComplete(const PRUnichar *searchString,
nsIAutoCompleteResults *previousSearchResult,
nsIAutoCompleteListener *listener)
-{
- return NS_OK;
+{
+  // Called with the string the user ended up with and the autocomplete
+  // results that were on offer. If searchString equals the value of one of
+  // the previous results, that url is treated as the user's selection:
+  // every offered url is optionally logged to mURLDataFile and optionally
+  // used as a training example (target 1 for the selected url, 0 for the
+  // others).
+  //
+  // Returns the failure code of GetItems/GetElementAt while searching the
+  // previous results, NS_OK otherwise. Failures on individual items while
+  // logging/training only cause that item to be skipped.
+
+  // Neither learning nor data capture is enabled: nothing to do.
+  if (mLearningMode == AUTOCOMPLETE_NO_LEARNING &&
+      mDataCaptureMode == URLDATACAPTURE_NONE)
+    return NS_OK;
+
+  // Without previous results or a search string nothing can have been
+  // selected. (A null searchString must not reach nsDependentString.)
+  if (!previousSearchResult || !searchString)
+    return NS_OK;
+
+  nsCOMPtr<nsISupportsArray> results;
+  nsresult rv = previousSearchResult->GetItems(getter_AddRefs(results));
+  if (NS_FAILED(rv)) return rv;
+  if (!results)
+    return NS_OK;
+
+  PRUint32 count = 0;
+  results->Count(&count);
+
+  const nsDependentString selected(searchString);
+  nsCOMPtr<nsIAutoCompleteItem> item;
+  nsAutoString value;
+  PRBool found = PR_FALSE;
+  PRUint32 i;
+
+  // See if searchString exists in the previous search results.
+  for (i = 0; i < count && !found; i++) {
+    rv = results->GetElementAt(i, getter_AddRefs(item));
+    if (NS_FAILED(rv)) return rv;
+    if (!item)
+      continue;
+
+    item->GetValue(value);
+    found = value.Equals(selected);
+  }
+
+  if (!found)
+    return NS_OK;
+
+  // searchString was found in the previous search results; assume that the
+  // user selected that url from the previous list of autocomplete results.
+  const PRBool capturing =
+    mDataCaptureMode >= URLDATACAPTURE_WITHOUT_URL_INFO && mURLDataFile;
+  const PRBool training =
+    mLearningMode >= AUTOCOMPLETE_ENABLE_TRAINING && mAutoCompleteLearner;
+
+  if (capturing) {
+    // Open the <autocomplete> element describing the selected url.
+    NS_ConvertUCS2toUTF8 selectedUTF8(searchString);
+
+    nsCAutoString nowStr;
+    PRInt64ToChars(PR_Now(), nowStr);
+    fprintf(mURLDataFile, "<autocomplete time='%s'", nowStr.get());
+
+    if (mDataCaptureMode == URLDATACAPTURE_WITH_URL_INFO) {
+      fprintf(mURLDataFile, " url='%s'", selectedUTF8.get());
+    }
+
+    // The url-id attribute is only written when an id is actually known,
+    // either already stored on the row or freshly assigned.
+    nsCOMPtr<nsIMdbRow> row;
+    PRInt64 rowID;
+    if (NS_SUCCEEDED(FindRowAndID(kToken_URLColumn, selectedUTF8.get(),
+                                  getter_AddRefs(row), &rowID)) &&
+        (rowID || NS_SUCCEEDED(AssignUniqueURLID(row, &rowID)))) {
+      nsCAutoString IDStr;
+      PRInt64ToChars(rowID, IDStr);
+      fprintf(mURLDataFile, " url-id='%s'", IDStr.get());
+    }
+
+    fprintf(mURLDataFile, ">\n");
+  }
+
+  // Log and/or train on every url that was offered to the user.
+  for (i = 0; i < count; i++) {
+    if (NS_FAILED(results->GetElementAt(i, getter_AddRefs(item))) || !item)
+      continue;
+    item->GetValue(value);
+
+    // Skip urls whose features cannot be computed.
+    if (NS_FAILED(FillInputFeatures(value, mACFeatures)))
+      continue;
+
+    if (capturing) {
+      WriteURLData(value, mACFeatures);
+    }
+
+    if (training) {
+      // Train the sigmoid perceptron: the selected url is the positive
+      // example, every other offered url a negative one.
+      mAutoCompleteLearner->Train(mACFeatures, AC_NUM_URL_FEATURES,
+                                  value.Equals(selected) ? 1.0 : 0.0);
+    }
+  }
+
+  if (capturing) {
+    fprintf(mURLDataFile, "</autocomplete>\n");
+    fflush(mURLDataFile);
+  }
+
+  if (training) {
+    mAutoCompleteLearner->SaveWeights();
+  }
+
+  return NS_OK;
}
//----------------------------------------------------------------------
@@ -4214,6 +5286,26 @@ nsGlobalHistory::AutoCompleteCompare(nsAString& aHistoryURL,
return Substring(aHistoryURL, 0, aUserURL.Length()).Equals(aUserURL);
}
+
+// Prefixes the comment field of the autocomplete item with an asterisk
+// if there is no asterisk there already. This is a quick hack to show
+// that this item (url) was selected by the perceptron as one that is likely
+// to be selected by the user.
+//
+// aItem  the autocomplete item to mark; a null item is ignored.
+//
+// The item is left untouched when its comment cannot be read. An item
+// without a comment (null or empty) gets "*" as its comment.
+static void
+PrefixItemWithAsterisk(nsIAutoCompleteItem *aItem)
+{
+  if (!aItem)
+    return;
+
+  // Start from a known value so a failing getter can never leave us with a
+  // garbage pointer to read from or free.
+  PRUnichar *comment = 0;
+  if (NS_FAILED(aItem->GetComment(&comment)))
+    return;
+
+  // Copy the returned buffer and release it right away.
+  nsAutoString commentStr;
+  if (comment) {
+    commentStr.Assign(comment);
+    nsMemory::Free(comment);
+  }
+
+  NS_NAMED_LITERAL_STRING(asterisk, "*");
+  if (!Substring(commentStr, 0, 1).Equals(asterisk)) {
+    aItem->SetComment(PromiseFlatString(asterisk + commentStr).get());
+  }
+}
+
int PR_CALLBACK
nsGlobalHistory::AutoCompleteSortComparison(const void *v1, const void *v2,
void *closureVoid)
@@ -4249,6 +5341,30 @@ nsGlobalHistory::AutoCompleteSortComparison(const void *v1, const void *v2,
item1->GetValue(url1);
item2->GetValue(url2);
+ if (closure->history->mLearningMode == AUTOCOMPLETE_AFFECT_URL_LIST) {
+ // If the sigmoid perceptron thinks that the user will select the url,
+ // prefix an asterisk to the comment title.
+ PRFloat64* features = closure->history->mACFeatures;
+ PRFloat64 output = 0;
+ nsresult rv = NS_OK;
+
+ rv = closure->history->FillInputFeatures(url1, &features[0]);
+ if (NS_SUCCEEDED(rv)) {
+ closure->history->mAutoCompleteLearner->Test(features,
+ AC_NUM_URL_FEATURES, &output);
+ if (output >= 0.9)
+ PrefixItemWithAsterisk(item1);
+ }
+
+ rv = closure->history->FillInputFeatures(url2, &features[0]);
+ if (NS_SUCCEEDED(rv)) {
+ closure->history->mAutoCompleteLearner->Test(features,
+ AC_NUM_URL_FEATURES, &output);
+ if (output >= 0.9)
+ PrefixItemWithAsterisk(item2);
+ }
+ }
+
// Favour websites and webpaths more than webpages by boosting
// their visit counts. This assumes that URLs have been normalized,
// appending a trailing '/'.
View
81 xpfe/components/history/src/nsGlobalHistory.h
@@ -108,6 +108,56 @@ class searchTerm;
// Size of visit count boost to give to urls which are sites or paths
#define AUTOCOMPLETE_NONPAGE_VISIT_COUNT_BOOST 5
+
+//----------------------------------------------------------------------
+// Perceptron definitions
+// XXX The class definitions need to go in a separate header file
+
+// Base class of the learners used for autocomplete. It owns an array of
+// weights (mWeights, mNumWeights entries) and exposes Train/Test entry
+// points that subclasses may override. The member function definitions
+// are not part of this header.
+class nsPerceptron
+{
+private:
+  // Private so that code outside the class has to construct a perceptron
+  // through the PRInt32 constructor.
+  nsPerceptron();
+
+  // Copying is not supported: mWeights is an owned raw array, so a
+  // member-wise copy would end up being deleted twice. Declared private
+  // and intentionally left undefined.
+  nsPerceptron(const nsPerceptron&);
+  nsPerceptron& operator=(const nsPerceptron&);
+
+public:
+  // aNumFeatures: number of input features.
+  // NOTE(review): presumably this sizes mWeights -- confirm in the .cpp.
+  nsPerceptron(PRInt32 aNumFeatures);
+
+  // Calls SaveWeights() one last time, then releases the weight array.
+  virtual ~nsPerceptron()
+  {
+    SaveWeights();
+    delete [] mWeights;   // deleting a null pointer is a no-op
+    mWeights = 0;
+    mNumWeights = 0;
+  }
+
+  // Train: takes aNumInputs input values and the desired output for them.
+  // Test:  takes aNumInputs input values and writes a result to *aOutput.
+  virtual void Train(PRFloat64* aInputs, PRInt32 aNumInputs, PRFloat64 aTargetOutput);
+  virtual void Test (PRFloat64* aInputs, PRInt32 aNumInputs, PRFloat64* aOutput);
+  void SaveWeights();
+
+protected:
+
+  void LoadWeights();
+
+  PRFloat64* mWeights; // array of weights, owned by this object
+  PRInt32 mNumWeights; // reset to 0 by the destructor
+};
+
+class nsSigmoidPerceptron : public nsPerceptron // nsPerceptron subclass that overrides Train/Test and adds Sigmoid()
+{
+private:
+ nsSigmoidPerceptron(); // private: outside code must use the PRInt32 constructor below
+public:
+ nsSigmoidPerceptron(PRInt32 aNumFeatures); // aNumFeatures: number of input features
+ virtual ~nsSigmoidPerceptron() {} // nothing to release here; the base destructor frees the weights
+
+ virtual void Train(PRFloat64* aInputs, PRInt32 aNumInputs, PRFloat64 aTargetOutput); // overrides nsPerceptron::Train
+ virtual void Test(PRFloat64* aInputs, PRInt32 aNumInputs, PRFloat64* aOutput); // overrides nsPerceptron::Test; result is returned through aOutput
+
+protected:
+ PRFloat64 Sigmoid(PRFloat64 aNum); // NOTE(review): presumably the sigmoid activation of aNum; definition not in this header
+};
+
+//----------------------------------------------------------------------
+
+
//----------------------------------------------------------------------
//
// nsGlobalHistory
@@ -269,6 +319,19 @@ class nsGlobalHistory : nsSupportsWeakReference,
nsresult NotifyUnassert(nsIRDFResource* aSource, nsIRDFResource* aProperty, nsIRDFNode* aValue);
nsresult NotifyChange(nsIRDFResource* aSource, nsIRDFResource* aProperty, nsIRDFNode* aOldValue, nsIRDFNode* aNewValue);
+ // Autocomplete learning related
+ PRInt32 mDataCaptureMode;
+ PRInt32 mLearningMode;
+ // The learning engine used to learn a user's autocomplete behavior
+ nsSigmoidPerceptron* mAutoCompleteLearner;
+ PRFloat64* mACFeatures;
+ nsresult FillInputFeatures(nsAString &aUrl, PRFloat64 *aFeatures);
+
+ // URL data capture related
+ FILE* mURLDataFile;
+ nsresult WriteURLData(nsAString& aURL, PRFloat64* aURLFeatures);
+ nsresult AssignUniqueURLID(nsIMdbRow *aRow, PRInt64 *aID);
+
//
// row-oriented stuff
//
@@ -293,6 +356,15 @@ class nsGlobalHistory : nsSupportsWeakReference,
mdb_column kToken_HiddenColumn;
mdb_column kToken_TypedColumn;
+ // Frequency-Recency metrics for url
+ mdb_column kToken_FRFastDecayColumn;
+ mdb_column kToken_FRSlowDecayColumn;
+
+ // Unique ID of url. Needed to identify urls output to
+ // mURLDataFile when the data capture mode doesn't allow the
+ // url path to be output
+ mdb_column kToken_URLIDColumn;
+
// meta-data tokens
mdb_column kToken_LastPageVisited;
@@ -302,6 +374,7 @@ class nsGlobalHistory : nsSupportsWeakReference,
nsresult AddPageToDatabase(const char *aURL,
PRInt64 aDate);
nsresult AddExistingPageToDatabase(nsIMdbRow *row,
+ const char *aURL,
PRInt64 aDate,
PRInt64 *aOldDate,
PRInt32 *aOldCount);
@@ -315,13 +388,18 @@ class nsGlobalHistory : nsSupportsWeakReference,
nsresult SetRowValue(nsIMdbRow *aRow, mdb_column aCol, const PRInt32 aValue);
nsresult SetRowValue(nsIMdbRow *aRow, mdb_column aCol, const char *aValue);
nsresult SetRowValue(nsIMdbRow *aRow, mdb_column aCol, const PRUnichar *aValue);
+ nsresult SetRowValue(nsIMdbRow *aRow, mdb_column aCol, PRFloat64 aValue);
nsresult GetRowValue(nsIMdbRow *aRow, mdb_column aCol, nsAString& aResult);
nsresult GetRowValue(nsIMdbRow *aRow, mdb_column aCol, nsACString& aResult);
nsresult GetRowValue(nsIMdbRow *aRow, mdb_column aCol, PRInt64* aResult);
nsresult GetRowValue(nsIMdbRow *aRow, mdb_column aCol, PRInt32* aResult);
+ nsresult GetRowValue(nsIMdbRow *aRow, mdb_column aCol, PRFloat64* aResult);
nsresult FindRow(mdb_column aCol, const char *aURL, nsIMdbRow **aResult);
+ nsresult FindRow(mdb_column aCol, PRInt64 aValue, nsIMdbRow **aResult);
+ nsresult FindRowAndID(mdb_column aCol, const char *aURL,
+ nsIMdbRow **aResult, PRInt64 *aRowID);
//
// misc unrelated stuff
@@ -346,6 +424,9 @@ class nsGlobalHistory : nsSupportsWeakReference,
static nsIRDFResource* kNC_URL; // XXX do we need?
static nsIRDFResource* kNC_HistoryRoot;
static nsIRDFResource* kNC_HistoryByDate;
+ static nsIRDFResource* kNC_BookmarkAddDate;
+ static nsIRDFResource* kNC_Bookmark;
+ static nsIRDFResource* kRDF_Type;
static nsIMdbFactory* gMdbFactory;
static nsIPrefBranch* gPrefBranch;
Please sign in to comment.
Something went wrong with that request. Please try again.