-
-
Notifications
You must be signed in to change notification settings - Fork 10
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
removing debug printouts to Transcript
- Loading branch information
1 parent
e5f9037
commit dfa32b3
Showing
36 changed files
with
516 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
{ | ||
"noMethodMetaData" : true, | ||
"separateMethodMetaAndSource" : false, | ||
"useCypressPropertiesFile" : true } |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
Copyright 2005-2017 Mark Watson. All rights reserved. Licensed for use under the MIT license with attribution required. | ||
|
||
See: https://github.com/mark-watson/nlp_smalltalk |
3 changes: 3 additions & 0 deletions
3
...package/ManifestKBSnlp.class/class/ruleRBOnlyReadOrWrittenTemporaryRuleV1FalsePositive.st
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
code-critics | ||
ruleRBOnlyReadOrWrittenTemporaryRuleV1FalsePositive | ||
"Answers a literal array that names NLPsummarizer class>>#summarize: together with a timestamp. Presumably consumed by the code-critics tooling to suppress this rule for that method (inferred from the selector and protocol names; confirm)."
^ #(#(#(#RGMethodDefinition #(#'NLPsummarizer class' #summarize: #true)) #'2017-05-14T21:23:23.063039-07:00') ) |
3 changes: 3 additions & 0 deletions
3
KBSnlp.package/ManifestKBSnlp.class/class/ruleRBToDoCollectRuleV1FalsePositive.st
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
code-critics | ||
ruleRBToDoCollectRuleV1FalsePositive | ||
"Answers a literal array that names NLPsummarizer class>>#summarize: together with a timestamp. Presumably consumed by the code-critics tooling to suppress this rule for that method (inferred from the selector and protocol names; confirm)."
^ #(#(#(#RGMethodDefinition #(#'NLPsummarizer class' #summarize: #true)) #'2017-05-14T21:25:54.536453-07:00') ) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
{ | ||
"category" : "KBSnlp", | ||
"classinstvars" : [ | ||
], | ||
"classvars" : [ | ||
], | ||
"commentStamp" : "MarkWatson 5/19/2017 06:24", | ||
"instvars" : [ | ||
], | ||
"name" : "ManifestKBSnlp", | ||
"pools" : [ | ||
], | ||
"super" : "PackageManifest", | ||
"type" : "normal" } |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
NLPcategories is a class to categorize text. | ||
|
||
Copyright 2005-2017 Mark Watson. All rights reserved. Licensed for use under the MIT license with attribution required. | ||
|
||
See: https://github.com/mark-watson/nlp_smalltalk |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
classify | ||
classify: text
	"Score text against every category and answer the strongest matches.

	The word tables are read from the globals NlpCategoryHash (single words) and
	NlpCategory2gramHash (two-word phrases); both map a category name to a table
	of token -> score. A category's score is the sum of the scores of its matching
	words plus 8 times the scores of its matching two-word phrases.

	Answers a SortedCollection (highest score first) of two-element Arrays
	(score, category name), keeping only the categories that score more than half
	of the best score. Answers an empty collection when there are no categories."

	| wordTable phraseTable tokens realTokenCount ranked cutoff |
	wordTable := Smalltalk at: #NlpCategoryHash.
	phraseTable := Smalltalk at: #NlpCategory2gramHash.
	"A sentinel token is appended so that (tokens at: j + 1) is always valid.
	The leading space keeps the sentinel from fusing with the last real word,
	which would otherwise never be scored (assumes tokenize: splits on
	whitespace - confirm against NLPtagger)."
	tokens := NLPtagger tokenize: text , ' XXXXXX'.
	realTokenCount := tokens size - 1.
	ranked := SortedCollection sortBlock: [ :a :b | (a at: 1) > (b at: 1) ].
	wordTable keysDo: [ :category | 
		| words phrases score |
		words := wordTable at: category.
		phrases := phraseTable at: category.
		score := 0.
		1 to: realTokenCount do: [ :j | 
			score := score + (words at: (tokens at: j) ifAbsent: [ 0 ]) ].
		1 to: realTokenCount do: [ :j | 
			| phrase |
			phrase := (tokens at: j) , ' ' , (tokens at: j + 1).
			score := score + ((phrases at: phrase ifAbsent: [ 0 ]) * 8) ].
		ranked add: (Array with: score with: category) ].
	"Without this guard, asking for the first element of an empty collection raises an error."
	ranked isEmpty
		ifTrue: [ ^ ranked ].
	cutoff := (ranked first at: 1) / 2.
	^ ranked select: [ :entry | (entry at: 1) > cutoff ]
|
8 changes: 8 additions & 0 deletions
8
KBSnlp.package/NLPcategories.class/class/initializeCategoryHash.st
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
classify | ||
initializeCategoryHash
	"Parse the two category JSON files and publish the results as the globals
	NlpCategoryHash and NlpCategory2gramHash. Requires NeoJSON."

	{(#NlpCategoryHash -> './nlp_smalltalk/tags.json').
	(#NlpCategory2gramHash -> './nlp_smalltalk/tags_2gram.json')}
		do: [ :spec | 
			| json |
			json := (FileStream fileNamed: spec value) contentsOfEntireFile.
			Smalltalk at: spec key put: (NeoJSONReader fromString: json) ]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
{ | ||
"category" : "KBSnlp", | ||
"classinstvars" : [ | ||
], | ||
"classvars" : [ | ||
], | ||
"commentStamp" : "MarkWatson 5/19/2017 06:25", | ||
"instvars" : [ | ||
], | ||
"name" : "NLPcategories", | ||
"pools" : [ | ||
], | ||
"super" : "Object", | ||
"type" : "normal" } |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
A NLPentities is a class to find people's names, company names, place names, etc. in text. | ||
|
||
Copyright 2005-2017 Mark Watson. All rights reserved. Licensed for use under the MIT license with attribution required. | ||
|
||
See: https://github.com/mark-watson/nlp_smalltalk |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
entityDetection | ||
entities: aString
	"Answer a Dictionary of the entities found in aString. Keys are the entity
	types 'companies', 'products', 'places' and 'people'; each value is the Set of
	matching names. A type with no matches is left out of the Dictionary.

	Company, product and place names are looked up in the globals NLPcompanyNames,
	NLPproductNames and NLPplaceNames; person names come from humanNameHelper:."

	| found names |
	found := Dictionary new.
	{('companies' -> #NLPcompanyNames).
	('products' -> #NLPproductNames).
	('places' -> #NLPplaceNames)}
		do: [ :each | 
			names := NLPentities entityHelper: (Smalltalk at: each value) text: aString.
			names isEmpty
				ifFalse: [ found at: each key put: names ] ].
	"Person names get their own key; they used to be stored under 'places',
	which overwrote any place names found above."
	names := NLPentities humanNameHelper: aString.
	names isEmpty
		ifFalse: [ found at: 'people' put: names ].
	^ found
20 changes: 20 additions & 0 deletions
20
KBSnlp.package/NLPentities.class/class/entityHelper.text..st
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
entityDetection | ||
entityHelper: entitySet text: aString
	"Answer the Set of names from entitySet that occur in aString. Used for every
	entity type except person names. At each token position the three-word phrase
	is tried first, then the two-word phrase, then the single word; only the first
	of these that is in entitySet is recorded for that position."

	| words matches |
	matches := Set new.
	"Three padding tokens make the look-ahead below safe at the end of the text."
	words := NLPtagger tokenize: aString , ' xxxxx yyyyy zzzzz'.
	1 to: words size - 3 do: [ :start | 
		| single pair triple hit |
		single := words at: start.
		pair := single , ' ' , (words at: start + 1).
		triple := pair , ' ' , (words at: start + 2).
		hit := {triple. pair. single}
			detect: [ :candidate | entitySet includes: candidate ]
			ifNone: [ nil ].
		hit ifNotNil: [ matches add: hit ] ].
	^ matches
23 changes: 23 additions & 0 deletions
23
KBSnlp.package/NLPentities.class/class/fileToDictionary..st
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
entityDetection | ||
fileToDictionary: filePath
	"Answer a Set (despite the selector name, not a Dictionary) with one entry per
	line of the file at filePath. When a line contains a colon only the text before
	the first colon is kept, so 'Cairo:country_capital' is stored as 'Cairo'.

	Lines are split on LF only; a file with CR LF line endings would keep a
	trailing CR on each entry."

	| names stream |
	Transcript
		show: 'Processing file ' , filePath;
		cr.
	names := Set new.
	stream := (MultiByteFileStream fileNamed: filePath) readOnly.
	"ensure: closes the file even when reading fails part-way through."
	[ [ stream atEnd ]
		whileFalse: [ names add: ((stream upTo: Character lf) copyUpTo: $:) ] ]
		ensure: [ stream close ].
	^ names
22 changes: 22 additions & 0 deletions
22
KBSnlp.package/NLPentities.class/class/humanNameHelper..st
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
entityDetection | ||
humanNameHelper: aString
	"Answer the Set of person names found in aString. Two patterns are recognised,
	using the globals NLPfirstNames and NLPlastNames:
	  first-name first-name last-name   (recorded as three words)
	  first-name last-name              (recorded as two words)
	The three-word pattern is tried first. Tokens consumed by a match are skipped,
	so they cannot start another match."

	| firstNames lastNames tokens limit found index |
	firstNames := Smalltalk at: #NLPfirstNames.
	lastNames := Smalltalk at: #NLPlastNames.
	found := Set new.
	"Three padding tokens make index + 1 and index + 2 valid at the end of the text."
	tokens := NLPtagger tokenize: aString , ' xxxxx yyyyy zzzzz'.
	limit := tokens size - 3.
	"An explicit index is used instead of assigning to a to:do: loop variable,
	which is not portable across Smalltalk compilers."
	index := 1.
	[ index <= limit ]
		whileTrue: [ 
			| first second third consumed |
			first := tokens at: index.
			second := tokens at: index + 1.
			third := tokens at: index + 2.
			consumed := 1.
			(firstNames includes: first)
				ifTrue: [ 
					((firstNames includes: second) and: [ lastNames includes: third ])
						ifTrue: [ 
							found add: first , ' ' , second , ' ' , third.
							consumed := 3 ]
						ifFalse: [ 
							(lastNames includes: second)
								ifTrue: [ 
									found add: first , ' ' , second.
									consumed := 2 ] ] ].
			index := index + consumed ].
	^ found
32 changes: 32 additions & 0 deletions
32
KBSnlp.package/NLPentities.class/class/initializeEntities.st
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
entityDetection | ||
initializeEntities
	"Load every entity name list from disk and publish each one as a global.
	Place name lines have the form Cairo:country_capital; fileToDictionary: keeps
	only the part before the colon.

	The last entry is the data needed for sentence segmentation.
	NOTE(review): NLPsentences class>>loadData fills the same NLPtokensWithPeriods
	global from './nlp_smalltalk/tokensWithPeriods.txt' - confirm which file name
	is the real one."

	{(#NLPcompanyNames -> './nlp_smalltalk/company_names.txt').
	(#NLPfirstNames -> './nlp_smalltalk/firstnames.txt').
	(#NLPlastNames -> './nlp_smalltalk/lastnames.txt').
	(#NLPhonorifics -> './nlp_smalltalk/honorifics.txt').
	(#NLPprefixNames -> './nlp_smalltalk/prefixnames.txt').
	(#NLPplaceNames -> './nlp_smalltalk/placenames.txt').
	(#NLPproductNames -> './nlp_smalltalk/product_names.txt').
	(#NLPtokensWithPeriods -> './nlp_smalltalk/tokens_with_periods.txt')}
		do: [ :spec | Smalltalk at: spec key put: (NLPentities fileToDictionary: spec value) ]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
{ | ||
"category" : "KBSnlp", | ||
"classinstvars" : [ | ||
], | ||
"classvars" : [ | ||
], | ||
"commentStamp" : "MarkWatson 5/19/2017 06:25", | ||
"instvars" : [ | ||
], | ||
"name" : "NLPentities", | ||
"pools" : [ | ||
], | ||
"super" : "Object", | ||
"type" : "normal" } |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
A class to segment text into sentences. | ||
|
||
Copyright 2005-2017 Mark Watson. All rights reserved. Licensed for use under the MIT license with attribution required. | ||
|
||
See: https://github.com/mark-watson/nlp_smalltalk |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
utilities | ||
fileToSet: filePath
	"Answer a Set holding one element per line of the file at filePath.
	Lines are split on LF only; a file with CR LF line endings would keep a
	trailing CR on each element."

	| lines stream |
	Transcript
		show: 'Processing file ' , filePath;
		cr.
	lines := Set new.
	stream := (MultiByteFileStream fileNamed: filePath) readOnly.
	"ensure: closes the file even when reading fails part-way through."
	[ [ stream atEnd ] whileFalse: [ lines add: (stream upTo: Character lf) ] ]
		ensure: [ stream close ].
	^ lines
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
initialize | ||
loadData
	"Load the tokens that normally contain periods (one per line of the data file)
	and publish them as the global NLPtokensWithPeriods, which sentences: reads.
	Answers a status string.

	NOTE(review): NLPentities class>>initializeEntities fills the same global from
	'./nlp_smalltalk/tokens_with_periods.txt' - confirm which file name is the
	real one."

	Smalltalk
		at: #NLPtokensWithPeriods
		put: (NLPsentences fileToSet: './nlp_smalltalk/tokensWithPeriods.txt').
	^ 'tokens with periods data loaded'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
segment | ||
sentences: someText
	"Split someText into sentences. Answers an OrderedCollection with one
	OrderedCollection of tokens per sentence.

	A token containing a period is kept whole when it is listed in the global
	NLPtokensWithPeriods; otherwise its periods are removed and a separate '.'
	token is added after it. A sentence ends at a '?' token, or at a '.' token
	unless the token before it consists only of digits. Tokens left over after the
	last terminator form a final sentence."

	| keepWhole tokens current all previous |
	keepWhole := Smalltalk at: #NLPtokensWithPeriods.
	tokens := OrderedCollection new.
	(NLPsentences tokenizeLeavePeriods: someText)
		do: [ :token | 
			((token includes: $.) and: [ (keepWhole includes: token) not ])
				ifTrue: [ 
					tokens
						add: (token copyWithout: $.);
						add: '.' ]
				ifFalse: [ tokens add: token ] ].
	current := OrderedCollection new.
	all := OrderedCollection new.
	previous := ''.
	tokens
		do: [ :token | 
			current add: token.
			((token = '.' and: [ previous isAllDigits not ]) or: [ token = '?' ])
				ifTrue: [ 
					all addLast: current.
					current := OrderedCollection new ].
			previous := token ].
	current isEmpty
		ifFalse: [ all addLast: current ].
	^ all
9 changes: 9 additions & 0 deletions
9
KBSnlp.package/NLPsentences.class/class/tokenizeLeavePeriods..st
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
utilities | ||
tokenizeLeavePeriods: wordsInAString | ||
"Answer the tokens of wordsInAString as produced by findTokens:keep:. The first literal is the delimiter set; it has no period in it, so periods are not split off here. The delimiter literal deliberately spans two lines so that the line break is itself a delimiter. Presumably delimiters that also appear in the keep: literal are answered as tokens of their own - confirm against findTokens:keep:." | ||
|
||
^ wordsInAString | ||
findTokens: | ||
' ;:,<>[]{}! | ||
@#$%^&*()?' | ||
keep: ';:.,<>[]{}!$?' " keep the line break inside the delimiter literal above!! " |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
{ | ||
"category" : "KBSnlp", | ||
"classinstvars" : [ | ||
], | ||
"classvars" : [ | ||
], | ||
"commentStamp" : "MarkWatson 5/19/2017 06:26", | ||
"instvars" : [ | ||
], | ||
"name" : "NLPsentences", | ||
"pools" : [ | ||
], | ||
"super" : "Object", | ||
"type" : "normal" } |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
A class to produce extractive summaries of English text. | ||
|
||
Copyright 2005-2017 Mark Watson. All rights reserved. Licensed for use under the MIT license with attribution required. | ||
|
||
See: https://github.com/mark-watson/nlp_smalltalk |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
summary | ||
summarize: text
	"Extractive summarizer: answer an Array of the sentences of text (each joined
	back into a single space-separated String) whose score is greater than 2.

	text is first classified with NLPcategories classify:. A sentence's score is
	built by adding, for every scored category and every token of the sentence
	found in that category's word table (global NlpCategoryHash), the category's
	score."

	| scoredCategories sentences sentenceScores |
	scoredCategories := NLPcategories classify: text.
	sentences := NLPtagger sentences: text.
	sentenceScores := (1 to: sentences size)
		collect: [ :i | 
			| tokens score |
			tokens := sentences at: i.
			score := 0.
			scoredCategories
				do: [ :sc | 
					| hash |
					hash := (Smalltalk at: #NlpCategoryHash) at: (sc at: 2).
					tokens
						do: [ :token | 
							(hash includesKey: token)
								ifTrue: [ score := score + (sc at: 1) ] ] ].
			score ].
	^ ((1 to: sentences size) select: [ :i | (sentenceScores at: i) > 2 ])
		collect: [ :i | Character space join: (sentences at: i) ]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
{ | ||
"category" : "KBSnlp", | ||
"classinstvars" : [ | ||
], | ||
"classvars" : [ | ||
], | ||
"commentStamp" : "MarkWatson 5/19/2017 06:26", | ||
"instvars" : [ | ||
], | ||
"name" : "NLPsummarizer", | ||
"pools" : [ | ||
], | ||
"super" : "Object", | ||
"type" : "normal" } |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
NLP tagger converted to Squeak. | ||
A class that implements an NLP tagger. | ||
|
||
Copyright 2005-2017 Mark Watson. All rights reserved. Licensed for use under the MIT license with attribution required. | ||
|
||
See: https://github.com/mark-watson/nlp_smalltalk |
Oops, something went wrong.