removing debug printouts to Transcript

mark-watson · May 19, 2017 · dfa32b3 · dfa32b3
1 parent e5f9037
commit dfa32b3
Show file tree

Hide file tree

Showing 36 changed files with 516 additions and 0 deletions.
diff --git a/KBSnlp.package/.filetree b/KBSnlp.package/.filetree
@@ -0,0 +1,4 @@
+{
+	"noMethodMetaData" : true,
+	"separateMethodMetaAndSource" : false,
+	"useCypressPropertiesFile" : true }
diff --git a/KBSnlp.package/ManifestKBSnlp.class/README.md b/KBSnlp.package/ManifestKBSnlp.class/README.md
@@ -0,0 +1,3 @@
+Copyright 2005-2017 Mark Watson. All rights reserved. Licensed for use under the MIT license with attribution required.
+
+See:  https://github.com/mark-watson/nlp_smalltalk
diff --git a/...package/ManifestKBSnlp.class/class/ruleRBOnlyReadOrWrittenTemporaryRuleV1FalsePositive.st b/...package/ManifestKBSnlp.class/class/ruleRBOnlyReadOrWrittenTemporaryRuleV1FalsePositive.st
@@ -0,0 +1,3 @@
+code-critics
+ruleRBOnlyReadOrWrittenTemporaryRuleV1FalsePositive
+	"Answers a literal array that names NLPsummarizer class>>summarize: together with a timestamp. Judging by the selector, this presumably records that method as a false positive for the code-critic rule about temporaries that are only read or only written - confirm against the code-critics tooling."
+	^ #(#(#(#RGMethodDefinition #(#'NLPsummarizer class' #summarize: #true)) #'2017-05-14T21:23:23.063039-07:00') )
diff --git a/KBSnlp.package/ManifestKBSnlp.class/class/ruleRBToDoCollectRuleV1FalsePositive.st b/KBSnlp.package/ManifestKBSnlp.class/class/ruleRBToDoCollectRuleV1FalsePositive.st
@@ -0,0 +1,3 @@
+code-critics
+ruleRBToDoCollectRuleV1FalsePositive
+	"Answers a literal array that names NLPsummarizer class>>summarize: together with a timestamp. Judging by the selector, this presumably records that method as a false positive for the to:do:-versus-collect: code-critic rule - confirm against the code-critics tooling."
+	^ #(#(#(#RGMethodDefinition #(#'NLPsummarizer class' #summarize: #true)) #'2017-05-14T21:25:54.536453-07:00') )
diff --git a/KBSnlp.package/ManifestKBSnlp.class/properties.json b/KBSnlp.package/ManifestKBSnlp.class/properties.json
@@ -0,0 +1,14 @@
+{
+	"category" : "KBSnlp",
+	"classinstvars" : [
+		 ],
+	"classvars" : [
+		 ],
+	"commentStamp" : "MarkWatson 5/19/2017 06:24",
+	"instvars" : [
+		 ],
+	"name" : "ManifestKBSnlp",
+	"pools" : [
+		 ],
+	"super" : "PackageManifest",
+	"type" : "normal" }
diff --git a/KBSnlp.package/NLPcategories.class/README.md b/KBSnlp.package/NLPcategories.class/README.md
@@ -0,0 +1,5 @@
+NLPcategories is a class that categorizes text.
+
+Copyright 2005-2017 Mark Watson. All rights reserved. Licensed for use under the MIT license with attribution required.
+
+See:  https://github.com/mark-watson/nlp_smalltalk
diff --git a/KBSnlp.package/NLPcategories.class/class/classify..st b/KBSnlp.package/NLPcategories.class/class/classify..st
@@ -0,0 +1,26 @@
+classify
+classify: text
+	"Score text against every category in the global #NlpCategoryHash and answer the best matches.
+
+	Answers a SortedCollection of two-element Arrays (score, then category name), highest score first, keeping only the entries whose score is strictly greater than half of the top score. For each category, every token found in its table in #NlpCategoryHash adds that token's weight, and every pair of adjacent tokens found in its table in #NlpCategory2gramHash adds that pair's weight times 8.
+
+	Both globals must have been filled in first (see initializeCategoryHash)."
+
+	| tokens numTokens unigramTables bigramTables results cutoff |
+	"The sentinel gives the last real token a right-hand neighbour for the 2-gram lookup. It is appended with a leading space so that it cannot fuse with the final word of text, which would drop that word from scoring (presumably NLPtagger tokenize: splits on spaces, as entityHelper:text: also relies on - confirm)."
+	tokens := NLPtagger tokenize: text , ' XXXXXX'.
+	numTokens := tokens size - 1.
+	unigramTables := Smalltalk at: #NlpCategoryHash.
+	bigramTables := Smalltalk at: #NlpCategory2gramHash.
+	results := SortedCollection sortBlock: [ :a :b | (a at: 1) > (b at: 1) ].
+	unigramTables keys
+		do: [ :category | 
+			| score unigrams bigrams |
+			score := 0.
+			unigrams := unigramTables at: category.
+			bigrams := bigramTables at: category.
+			1 to: numTokens do: [ :j | 
+				score := score + (unigrams at: (tokens at: j) ifAbsent: [ 0 ]).
+				score := score + ((bigrams at: (tokens at: j) , ' ' , (tokens at: j + 1) ifAbsent: [ 0 ]) * 8) ].
+			results add: {score. category} ].
+	"With no categories loaded there is nothing to rank, and asking for the first element would fail."
+	results isEmpty
+		ifTrue: [ ^ results ].
+	cutoff := (results first at: 1) / 2.
+	^ results select: [ :entry | (entry at: 1) > cutoff ]
+
diff --git a/KBSnlp.package/NLPcategories.class/class/initializeCategoryHash.st b/KBSnlp.package/NLPcategories.class/class/initializeCategoryHash.st
@@ -0,0 +1,8 @@
+classify
+initializeCategoryHash
+	"Parse the two category JSON files and store the results in the globals #NlpCategoryHash (single tokens) and #NlpCategory2gramHash (token pairs), which classify: reads. Requires NeoJSON."
+
+	| parseJsonFile |
+	parseJsonFile := [ :path | NeoJSONReader fromString: (FileStream fileNamed: path) contentsOfEntireFile ].
+	Smalltalk at: #NlpCategoryHash put: (parseJsonFile value: './nlp_smalltalk/tags.json').
+	Smalltalk at: #NlpCategory2gramHash put: (parseJsonFile value: './nlp_smalltalk/tags_2gram.json')
diff --git a/KBSnlp.package/NLPcategories.class/properties.json b/KBSnlp.package/NLPcategories.class/properties.json
@@ -0,0 +1,14 @@
+{
+	"category" : "KBSnlp",
+	"classinstvars" : [
+		 ],
+	"classvars" : [
+		 ],
+	"commentStamp" : "MarkWatson 5/19/2017 06:25",
+	"instvars" : [
+		 ],
+	"name" : "NLPcategories",
+	"pools" : [
+		 ],
+	"super" : "Object",
+	"type" : "normal" }
diff --git a/KBSnlp.package/NLPentities.class/README.md b/KBSnlp.package/NLPentities.class/README.md
@@ -0,0 +1,5 @@
+NLPentities is a class that finds people's names, company names, place names, etc. in text.
+
+Copyright 2005-2017 Mark Watson. All rights reserved. Licensed for use under the MIT license with attribution required.
+
+See:  https://github.com/mark-watson/nlp_smalltalk
diff --git a/KBSnlp.package/NLPentities.class/class/entities..st b/KBSnlp.package/NLPentities.class/class/entities..st
@@ -0,0 +1,19 @@
+entityDetection
+entities: aString
+	"Answer a Dictionary that maps an entity type ('companies', 'products', 'places', 'people') to the Set of names of that type found in aString. Types with no match are left out of the Dictionary. The name globals must have been loaded first (see initializeEntities)."
+
+	| result found |
+	result := Dictionary new.
+	{('companies' -> #NLPcompanyNames).
+	('products' -> #NLPproductNames).
+	('places' -> #NLPplaceNames)}
+		do: [ :spec | 
+			found := NLPentities entityHelper: (Smalltalk at: spec value) text: aString.
+			found isEmpty
+				ifFalse: [ result at: spec key put: found ] ].
+	"Person names need a key of their own: stored under 'places' they would replace the place names found above."
+	found := NLPentities humanNameHelper: aString.
+	found isEmpty
+		ifFalse: [ result at: 'people' put: found ].
+	^ result
diff --git a/KBSnlp.package/NLPentities.class/class/entityHelper.text..st b/KBSnlp.package/NLPentities.class/class/entityHelper.text..st
@@ -0,0 +1,20 @@
+entityDetection
+entityHelper: entitySet text: aString
+	"Answer a Set of the members of entitySet that occur in aString. At every token position the three-token phrase starting there is tried first, then the two-token phrase, then the single token; the first one that entitySet includes is collected, so each position contributes at most one name. Used for every entity type except person names."
+
+	| words found lastIndex |
+	found := Set new.
+	"Three dummy tokens are appended so that looking one and two tokens ahead never runs off the end."
+	words := NLPtagger tokenize: aString , ' xxxxx yyyyy zzzzz'.
+	lastIndex := words size - 3.
+	1 to: lastIndex do: [ :i | 
+		| single pair triple match |
+		single := words at: i.
+		pair := single , ' ' , (words at: i + 1).
+		triple := pair , ' ' , (words at: i + 2).
+		match := {triple. pair. single}
+			detect: [ :candidate | entitySet includes: candidate ]
+			ifNone: [ nil ].
+		match ifNotNil: [ found add: match ] ].
+	^ found
diff --git a/KBSnlp.package/NLPentities.class/class/fileToDictionary..st b/KBSnlp.package/NLPentities.class/class/fileToDictionary..st
@@ -0,0 +1,23 @@
+entityDetection
+fileToDictionary: filePath
+	"Read the text file at filePath and answer a Set with one element per line. When a line contains a colon (for example Cairo:country_capital) only the part before the first colon is kept. Despite the selector name the result is a Set, not a Dictionary. A progress line is written to the Transcript."
+
+	| stream names line |
+	Transcript show: 'Processing file ' , filePath; cr.
+	names := Set new.
+	stream := (MultiByteFileStream fileNamed: filePath) readOnly.
+	"ensure: closes the file even if reading fails part way through"
+	[ [ stream atEnd ]
+		whileFalse: [ 
+			line := stream upTo: Character lf.
+			"a file with CR LF line ends would otherwise leave a CR on every entry"
+			(line notEmpty and: [ line last = Character cr ])
+				ifTrue: [ line := line allButLast ].
+			"copyUpTo: answers the whole line when there is no colon"
+			names add: (line copyUpTo: $:) ] ]
+		ensure: [ stream close ].
+	^ names
diff --git a/KBSnlp.package/NLPentities.class/class/humanNameHelper..st b/KBSnlp.package/NLPentities.class/class/humanNameHelper..st
@@ -0,0 +1,22 @@
+entityDetection
+humanNameHelper: aString
+	"Answer a Set of the person names found in aString. A name starts at a token that is in the global #NLPfirstNames and is either three tokens long (a second first-name token followed by a token in #NLPlastNames) or two tokens long (directly followed by a last-name token). The three-token form is tried first, and the tokens of a matched name are skipped before scanning continues."
+
+	| tokens lastIndex firstNames lastNames names index |
+	firstNames := Smalltalk at: #NLPfirstNames.
+	lastNames := Smalltalk at: #NLPlastNames.
+	names := Set new.
+	"Three dummy tokens are appended so that looking one and two tokens ahead never runs off the end."
+	tokens := NLPtagger tokenize: aString , ' xxxxx yyyyy zzzzz'.
+	lastIndex := tokens size - 3.
+	"An explicit index is used instead of to:do: because the position has to jump past a matched name, and assigning to a loop or block argument is not reliably allowed."
+	index := 1.
+	[ index <= lastIndex ]
+		whileTrue: [ 
+			| consumed |
+			consumed := 1.
+			(firstNames includes: (tokens at: index))
+				ifTrue: [ 
+					((firstNames includes: (tokens at: index + 1))
+						and: [ lastNames includes: (tokens at: index + 2) ])
+						ifTrue: [ 
+							names add: (tokens at: index) , ' ' , (tokens at: index + 1) , ' ' , (tokens at: index + 2).
+							consumed := 3 ]
+						ifFalse: [ 
+							(lastNames includes: (tokens at: index + 1))
+								ifTrue: [ 
+									names add: (tokens at: index) , ' ' , (tokens at: index + 1).
+									consumed := 2 ] ] ].
+			index := index + consumed ].
+	^ names
diff --git a/KBSnlp.package/NLPentities.class/class/initializeEntities.st b/KBSnlp.package/NLPentities.class/class/initializeEntities.st
@@ -0,0 +1,32 @@
+entityDetection
+initializeEntities
+	"Load every entity name list from its text file with fileToDictionary: and store each resulting Set in the global named in the table below. Place name lines have the form Cairo:country_capital; fileToDictionary: keeps only the part before the colon. The last entry is the data needed for sentence segmentation.
+	NOTE(review): NLPsentences class>>loadData fills #NLPtokensWithPeriods from './nlp_smalltalk/tokensWithPeriods.txt' - confirm which of the two file names is the right one."
+
+	#(
+		#(#NLPcompanyNames './nlp_smalltalk/company_names.txt')
+		#(#NLPfirstNames './nlp_smalltalk/firstnames.txt')
+		#(#NLPlastNames './nlp_smalltalk/lastnames.txt')
+		#(#NLPhonorifics './nlp_smalltalk/honorifics.txt')
+		#(#NLPprefixNames './nlp_smalltalk/prefixnames.txt')
+		#(#NLPplaceNames './nlp_smalltalk/placenames.txt')
+		#(#NLPproductNames './nlp_smalltalk/product_names.txt')
+		#(#NLPtokensWithPeriods './nlp_smalltalk/tokens_with_periods.txt')
+	)
+		do: [ :entry | 
+			Smalltalk
+				at: (entry at: 1)
+				put: (NLPentities fileToDictionary: (entry at: 2)) ]
diff --git a/KBSnlp.package/NLPentities.class/properties.json b/KBSnlp.package/NLPentities.class/properties.json
@@ -0,0 +1,14 @@
+{
+	"category" : "KBSnlp",
+	"classinstvars" : [
+		 ],
+	"classvars" : [
+		 ],
+	"commentStamp" : "MarkWatson 5/19/2017 06:25",
+	"instvars" : [
+		 ],
+	"name" : "NLPentities",
+	"pools" : [
+		 ],
+	"super" : "Object",
+	"type" : "normal" }
diff --git a/KBSnlp.package/NLPsentences.class/README.md b/KBSnlp.package/NLPsentences.class/README.md
@@ -0,0 +1,5 @@
+A class to segment text into sentences.
+
+Copyright 2005-2017 Mark Watson. All rights reserved. Licensed for use under the MIT license with attribution required.
+
+See:  https://github.com/mark-watson/nlp_smalltalk
diff --git a/KBSnlp.package/NLPsentences.class/class/fileToSet..st b/KBSnlp.package/NLPsentences.class/class/fileToSet..st
@@ -0,0 +1,15 @@
+utiities
+fileToSet: filePath
+	"Read the text file at filePath and answer a Set with one element per line. A progress line is written to the Transcript."
+
+	| stream lines line |
+	Transcript
+		show: 'Processing file ' , filePath;
+		cr.
+	lines := Set new.
+	stream := (MultiByteFileStream fileNamed: filePath) readOnly.
+	"ensure: closes the file even if reading fails part way through"
+	[ [ stream atEnd ]
+		whileFalse: [ 
+			line := stream upTo: Character lf.
+			"a file with CR LF line ends would otherwise leave a CR on every entry"
+			(line notEmpty and: [ line last = Character cr ])
+				ifTrue: [ line := line allButLast ].
+			lines add: line ] ]
+		ensure: [ stream close ].
+	^ lines
diff --git a/KBSnlp.package/NLPsentences.class/class/loadData.st b/KBSnlp.package/NLPsentences.class/class/loadData.st
@@ -0,0 +1,11 @@
+initialize
+loadData
+	"Load the tokens that normally contain periods (one per line of the data file) into the global #NLPtokensWithPeriods, which sentences: consults. Answers a confirmation string.
+	NOTE(review): NLPentities class>>initializeEntities fills the same global from './nlp_smalltalk/tokens_with_periods.txt' - confirm which of the two file names is the right one."
+
+	Smalltalk
+		at: #NLPtokensWithPeriods
+		put: (NLPsentences fileToSet: './nlp_smalltalk/tokensWithPeriods.txt').
+	^ 'tokens with periods data loaded'
diff --git a/KBSnlp.package/NLPsentences.class/class/sentences..st b/KBSnlp.package/NLPsentences.class/class/sentences..st
@@ -0,0 +1,34 @@
+segment
+sentences: someText
+	"Split someText into sentences. Answers an OrderedCollection with one OrderedCollection of tokens per sentence; the closing '.' or '?' is the last token of its sentence, and any tokens after the last terminator form a final sentence.
+
+	A token that contains a period and is not in the global #NLPtokensWithPeriods is split into the token with its periods removed followed by a separate '.' token. A '.' ends the sentence unless the token before it consists only of digits; a '?' always ends it. Nothing is written to the Transcript."
+
+	| periodTokens tokens current sentences previous |
+	periodTokens := Smalltalk at: #NLPtokensWithPeriods.
+	tokens := OrderedCollection new.
+	(NLPsentences tokenizeLeavePeriods: someText)
+		do: [ :token | 
+			((token includes: $.) not or: [ periodTokens includes: token ])
+				ifTrue: [ tokens add: token ]
+				ifFalse: [ 
+					tokens
+						add: (token copyWithout: $.);
+						add: '.' ] ].
+	current := OrderedCollection new.
+	sentences := OrderedCollection new.
+	previous := ''.
+	tokens
+		do: [ :token | 
+			current add: token.
+			((token = '.' and: [ previous isAllDigits not ]) or: [ token = '?' ])
+				ifTrue: [ 
+					sentences addLast: current.
+					current := OrderedCollection new ].
+			previous := token ].
+	current isEmpty
+		ifFalse: [ sentences addLast: current ].
+	^ sentences
diff --git a/KBSnlp.package/NLPsentences.class/class/tokenizeLeavePeriods..st b/KBSnlp.package/NLPsentences.class/class/tokenizeLeavePeriods..st
@@ -0,0 +1,9 @@
+utiities
+tokenizeLeavePeriods: wordsInAString
+	"Split wordsInAString into tokens with findTokens:keep:. The first argument lists the delimiter characters; it deliberately contains a line break, so the two-line literal below must not be joined. The period is not among the delimiters, so periods stay attached to the token they occur in. Presumably a delimiter that also appears in the keep: string is answered as a token of its own - confirm against findTokens:keep:."
+
+	^ wordsInAString
+		findTokens:
+			' ;:,<>[]{}!
+@#$%^&*()?'
+		keep: ';:.,<>[]{}!$?'	" keep the line break inside the delimiter string above!! "
diff --git a/KBSnlp.package/NLPsentences.class/properties.json b/KBSnlp.package/NLPsentences.class/properties.json
@@ -0,0 +1,14 @@
+{
+	"category" : "KBSnlp",
+	"classinstvars" : [
+		 ],
+	"classvars" : [
+		 ],
+	"commentStamp" : "MarkWatson 5/19/2017 06:26",
+	"instvars" : [
+		 ],
+	"name" : "NLPsentences",
+	"pools" : [
+		 ],
+	"super" : "Object",
+	"type" : "normal" }
diff --git a/KBSnlp.package/NLPsummarizer.class/README.md b/KBSnlp.package/NLPsummarizer.class/README.md
@@ -0,0 +1,5 @@
+A class to summarize English text by extracting its highest-scoring sentences (extractive summarization).
+
+Copyright 2005-2017 Mark Watson. All rights reserved. Licensed for use under the MIT license with attribution required.
+
+See:  https://github.com/mark-watson/nlp_smalltalk
diff --git a/KBSnlp.package/NLPsummarizer.class/class/summarize..st b/KBSnlp.package/NLPsummarizer.class/class/summarize..st
@@ -0,0 +1,32 @@
+summary
+summarize: text
+	"Extractive summarizer: answer the sentences of text whose score is greater than 2, in their original order, each joined back into a single String with spaces.
+
+	text is first classified with NLPcategories classify:. A sentence then gains the score of a category once for every one of its tokens that is a key in that category's table in the global #NlpCategoryHash. Nothing is written to the Transcript.
+	NOTE(review): the sentences come from NLPtagger sentences:, not NLPsentences sentences: - confirm that NLPtagger implements that selector and answers one token collection per sentence."
+
+	| sentences scoredCategories categoryTables sentenceScores |
+	scoredCategories := NLPcategories classify: text.
+	sentences := NLPtagger sentences: text.
+	categoryTables := Smalltalk at: #NlpCategoryHash.
+	sentenceScores := sentences
+		collect: [ :tokens | 
+			| score |
+			score := 0.
+			scoredCategories
+				do: [ :sc | 
+					| weights |
+					weights := categoryTables at: (sc at: 2).
+					tokens
+						do: [ :token | 
+							(weights includesKey: token)
+								ifTrue: [ score := score + (sc at: 1) ] ] ].
+			score ].
+	^ ((1 to: sentences size) select: [ :i | (sentenceScores at: i) > 2 ])
+		collect: [ :i | Character space join: (sentences at: i) ]
diff --git a/KBSnlp.package/NLPsummarizer.class/properties.json b/KBSnlp.package/NLPsummarizer.class/properties.json
@@ -0,0 +1,14 @@
+{
+	"category" : "KBSnlp",
+	"classinstvars" : [
+		 ],
+	"classvars" : [
+		 ],
+	"commentStamp" : "MarkWatson 5/19/2017 06:26",
+	"instvars" : [
+		 ],
+	"name" : "NLPsummarizer",
+	"pools" : [
+		 ],
+	"super" : "Object",
+	"type" : "normal" }
diff --git a/KBSnlp.package/NLPtagger.class/README.md b/KBSnlp.package/NLPtagger.class/README.md
@@ -0,0 +1,6 @@
+NLP tagger converted to Squeak.
+A class that implements an NLP tagger.
+
+Copyright 2005-2017 Mark Watson. All rights reserved. Licensed for use under the MIT license with attribution required.
+
+See:  https://github.com/mark-watson/nlp_smalltalk