Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP

Comparing changes

Choose two branches to see what's changed or to start a new pull request. If you need to, you can also compare across forks.

Open a pull request

Create a new pull request by comparing changes across two branches. If you need to, you can also compare across forks.
base fork: mneedham/goose
base: 70b4a4cd3b
...
head fork: mneedham/goose
compare: 3d2ca3d69f
Checking mergeability… Don't worry, you can still create the pull request.
  • 2 commits
  • 2 files changed
  • 0 commit comments
  • 1 contributor
View
23 src/main/scala/com/gravity/goose/text/StopWords.scala
@@ -26,12 +26,9 @@ package com.gravity.goose.text
import java.util._
import com.gravity.goose.utils.FileHelper
-import scala.collection.JavaConversions._
+import scala.collection.JavaConversions._
object StopWords {
- // the confusing pattern below is basically just match any non-word character excluding white-space.
- private val PUNCTUATION: StringReplacement = StringReplacement.compile("[^\\p{Ll}\\p{Lu}\\p{Lt}\\p{Lo}\\p{Nd}\\p{Pc}\\s]", string.empty)
-
val STOP_WORDS = FileHelper.loadResourceFile("stopwords-en.txt", StopWords.getClass).split("\n").toSet
def getStopWordCount(content: String): WordStats = {
@@ -40,21 +37,13 @@ object StopWords {
val candidateWords = extractCandidateWords(content)
val stopWordsInContent = seqAsJavaList(candidateWords.filter(w => STOP_WORDS.contains(w.toLowerCase)).map(_.toLowerCase))
- wordStats(candidateWords, stopWordsInContent)
+ new WordStats(stopWordsInContent, candidateWords.length)
}
- private def extractCandidateWords(content: String): Array[String] = {
- val strippedContent = PUNCTUATION.replaceAll(content)
- string.SPACE_SPLITTER.split(strippedContent)
- }
+ private def extractCandidateWords(content: String): Array[String] = string.SPACE_SPLITTER.split(stripPunctuation(content))
- private def wordStats(candidateWords: Array[String], stopWordsInContent: List[String]): WordStats = {
- val ws = new WordStats
- ws.setWordCount(candidateWords.length)
- ws.setStopWordCount(stopWordsInContent.size)
- ws.setStopWords(stopWordsInContent)
- ws
+ private def stripPunctuation(content: String) = {
+ val nonWordCharactersExcludingWhiteSpace = StringReplacement.compile("[^\\p{Ll}\\p{Lu}\\p{Lt}\\p{Lo}\\p{Nd}\\p{Pc}\\s]", string.empty)
+ nonWordCharactersExcludingWhiteSpace.replaceAll(content)
}
-
-
}
View
13 src/main/scala/com/gravity/goose/text/WordStats.scala
@@ -30,22 +30,23 @@ object WordStats {
var EMPTY: WordStats = new WordStats
}
-class WordStats {
-
+class WordStats(_stopWords:List[String], _wordCount:Int) {
import WordStats._
-
+ def this() = this(new ArrayList(), 0)
/**
* total number of stopwords or good words that we can calculate
*/
- var stopWordCount: Int = 0
+ var stopWordCount : Int = _stopWords.size()
+
/**
* total number of words on a node
*/
- var wordCount: Int = 0
+ var wordCount: Int = _wordCount
+
/**
* holds an actual list of the stop words we found
*/
- var stopWords: List[String] = new ArrayList[String]
+ var stopWords: List[String] = _stopWords
def getStopWords: List[String] = {
stopWords

No commit comments for this range

Something went wrong with that request. Please try again.