/
styletool.rb
640 lines (574 loc) · 21.4 KB
/
styletool.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
#!/usr/bin/ruby
#styletool -- a simple word frequency based stylometry tool
#Copyright (C) 2008 Leon N. Maurer
#This program is free software; you can redistribute it and/or
#modify it under the terms of the GNU General Public License
#version 2 as published by the Free Software Foundation;
#This program is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#GNU General Public License for more details.
#A copy of the license is available at
#<http://www.gnu.org/licenses/old-licenses/gpl-2.0.html>
#You can also receive a paper copy by writing the Free Software
#Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
require 'tk'
require 'yaml' #needs to be before gsl?
require 'gsl'
#require 'gnuplot'
require 'tkextlib/tile/treeview'
require 'tempfile'
#returns the larger of the two arguments (m when neither is larger)
def max(n,m)
  if n > m
    n
  else
    m
  end
end
#One text (or one chunk of a text) reduced to a table of word counts.
#
#name   -- identifier for the document (callers pass the file path)
#author -- author label supplied by the caller
#group  -- label used to group related documents
#text   -- the raw text; it is lower-cased and split into \w+ tokens
#
#If a block is given, only the words for which the block returns true are
#counted. wordCount is always the total number of tokens in the text,
#whether or not they were counted.
class Document
  attr_reader :name, :author, :group, :wordCount, :countedWords

  def initialize(name, author, group, text)
    @name = name
    @author = author
    @group = group
    #missing words read as 0 and reading does not create an entry
    @countedWords = Hash.new(0)
    words = text.downcase.scan(/\w+/) #doesn't catch contractions
    words.each do |word|
      @countedWords[word] += 1 if !block_given? || yield(word)
    end
    @wordCount = words.size
  end

  #the distinct words that were counted
  def words
    @countedWords.keys
  end

  #number of times 'word' was counted (0 if it never was)
  def count(word)
    @countedWords[word]
  end

  #count of 'word' divided by the total number of tokens in the text
  def relativeFrequency(word)
    #an empty document would otherwise give 0/0.0 == NaN
    return 0.0 if @wordCount == 0
    @countedWords[word] / @wordCount.to_f
  end

  #used for sorting: by author first, then by name
  def <=>(other)
    @author == other.author ? @name <=> other.name : @author <=> other.author
  end
end
#Principal component analysis on a set of equal-length vectors, using GSL.
#
#vectors -- array of arrays of numbers, one inner array per observation.
#The data is stored transposed: @matrix has one row per dimension and one
#column per observation.
class PCAtool
  attr_reader :matrix

  def initialize(vectors)
    @matrix = GSL::Matrix.alloc(vectors.flatten, vectors.size, vectors[0].size).transpose
  end

  #the mean observation: a GSL::Vector holding the average of each row
  def center
    avg = GSL::Vector.calloc(@matrix.size1) #calloc initalizes all values to 0
    @matrix.size1.times do |r|
      @matrix.size2.times do |c|
        avg[r] += @matrix[r,c] / @matrix.size2.to_f
      end
    end
    avg
  end

  #a copy of the data with the mean observation subtracted from every column
  def centeredMatrix
    cm = @matrix.duplicate
    avg = self.center
    @matrix.size1.times do |r|
      @matrix.size2.times do |c|
        cm[r,c] -= avg[r]
      end
    end
    cm
  end

  #the centered data times its own transpose (dimensions x dimensions)
  def scatterMatrix
    cm = self.centeredMatrix
    cm*cm.transpose
  end

  #Projects every observation onto the 'dims' principal axes with the
  #largest eigenvalues. Returns one array of 'dims' coordinates per
  #observation, in the order the observations were given.
  def reduceDimensions(dims)
    cm = self.centeredMatrix #computed once and used for both steps below
    eigval, eigvec = (cm*cm.transpose).eigen_symmv
    #GSL returns the eigenpairs in no particular order; put the largest
    #eigenvalues first so that columns 0..dims-1 are the principal axes
    GSL::Eigen.symmv_sort(eigval, eigvec, GSL::Eigen::SORT_VAL_DESC)
    vecs = Array.new(@matrix.size2){Array.new(dims)}
    @matrix.size2.times do |c|
      observation = cm.col(c).row
      dims.times do |e|
        vecs[c][e] = observation*eigvec.col(e)
      end
    end
    vecs
  end
end
class Interface
attr_reader :documents
@@ConfigFile = ".styletoolconfig"
#Builds the main window: creates the callback procs, lays the widgets out
#on a grid, then restores the settings saved in the config file (if any).
def initialize
  @documents = Array.new
  @masterWordList = Array.new
  #first, all the procs for use by the GUI
  #writes the current settings to the config file and exits the process
  quit = proc {
    settings = {"UseWordList" => (@wordListSpecified.get_value == "1"),
                "PCAdims" => @pcaspinbox.get.to_i,
                "WordList" => @masterWordList,
                "ChunkSize"=>@chunkSize.get.to_i,
                "SaveChunks"=> (@saveChunks.get_value == "1")}
    File.open(@@ConfigFile, "w"){|file| file.print(settings.to_yaml)}
    Process.exit
  }
  addfile = proc {
    filename = Tk.getOpenFile
    #if the user clicks "cancel" in the dialog box then filename == ""
    self.addFile(filename,@author.value) unless filename == ""
  }
  addfolder = proc {
    foldername = Tk.chooseDirectory
    self.addFolder(foldername,@author.value) unless foldername == ""
  }
  chunkaddfile = proc {
    filename = Tk.getOpenFile
    savefoldername = ""
    savefoldername = Tk.chooseDirectory("title"=>"Choose folder to save chunks to") if @saveChunks.get_value == "1"
    self.chunkAndAddFile(filename,@author.value,savefoldername) unless filename == ""
  }
  chunkaddfolder = proc {
    foldername = Tk.chooseDirectory("title"=>"Choose folder to add files from")
    savefoldername = ""
    savefoldername = Tk.chooseDirectory("title"=>"Choose folder to save chunks to") if @saveChunks.get_value == "1"
    self.chunkAndAddFolder(foldername,@author.value,savefoldername) unless foldername == ""
  }
  remove = proc {
    self.remove(@tree.focus_item)
  }
  save = proc {
    filename = Tk.getSaveFile("filetypes"=>[["CSV", ".csv"]])
    self.saveToCSV(filename) unless filename == ""
  }
  plotpca = proc {
    self.plotPCA
  }
  explorepca = proc {
    self.explorePCA
  }
  savepca = proc {
    filename = Tk.getSaveFile("filetypes"=>[["CSV", ".csv"]])
    self.savePCAtoCSV(filename,@pcaspinbox.get.to_i) unless filename == ""
  }
  wordlisttoggled = proc {
    if @wordListSpecified.get_value == "1"
      filename = Tk.getOpenFile
      if filename == "" #the user hit 'cancel' -- don't change anything!
        @wordListSpecified.set_value("0")
      else
        self.specifyWordList(filename)
      end
    else
      self.unspecifyWordList
    end
  }
  #and now for the GUI
  #the last bit calls the quit proc when the window is closed
  @root = TkRoot.new(){title 'Style Tool'}.protocol('WM_DELETE_WINDOW', quit)
  #top row (output commands)
  TkButton.new(@root) {
    text 'Save word frequencies as CSV'
    command save
  }.grid('column'=>0, 'row'=>0,'sticky'=>'w', 'padx'=>5, 'pady'=>5)
  TkButton.new(@root) {
    text 'plot 2D PCA'
    command plotpca
  }.grid('column'=>1, 'row'=>0,'sticky'=>'w', 'padx'=>5, 'pady'=>5)
  TkButton.new(@root) {
    text 'Save PCA as CSV'
    command savepca
  }.grid('column'=>2, 'row'=>0,'sticky'=>'w', 'padx'=>5, 'pady'=>5)
  #second row (more PCA)
  TkButton.new(@root) {
    text 'explore 2D PCA'
    command explorepca
  }.grid('column'=>0, 'row'=>1,'sticky'=>'w', 'padx'=>5, 'pady'=>5)
  TkLabel.new(@root) {
    text "PCA dimensions:"
  }.grid('column'=>1,'row'=>1, 'sticky'=>'e', 'padx'=>5, 'pady'=>5)
  @pcaspinbox = TkSpinbox.new(@root) {
    to 50
    from 1
    increment 1
    width 4
  }.grid('column'=>2,'row'=>1, 'sticky'=>'w', 'padx'=>5, 'pady'=>5)
  @pcaspinbox.set(2) #a good default value
  #3rd row (file treeview)
  TkLabel.new(@root) {
    text "Loaded files:"
  }.grid('column'=>0,'row'=>2, 'sticky'=>'w')
  #TODO: horizontal scroll bar? change width and height?
  #the tree and its scrollbar each need a callback that refers to the other,
  #so both go through procs that look the widgets up when they are called
  yscroll = proc{|*args| @lbscroll.set(*args)}
  scroll = proc{|*args| @tree.yview(*args)}
  @tree = Tk::Tile::Treeview.new(@root){
    yscrollcommand yscroll
    selectmode 'browse'
  }.grid('column'=>1,'row'=> 2, 'sticky'=>'we')
  @lbscroll = TkScrollbar.new(@root) {
    orient 'vertical'
    command scroll
  }.grid('column'=>2, 'row'=>2,'sticky'=>'wns')
  #4th row (author)
  TkLabel.new(@root) {
    text "Author:"
  }.grid('column'=>0,'row'=>3, 'sticky'=>'w')
  @author = TkVariable.new()
  authorDisp = TkEntry.new(@root) {
    width 30
    relief 'sunken'
  }.grid('column'=>1,'row'=> 3, 'sticky'=>'w', 'padx'=>5, 'pady'=>5)
  authorDisp.textvariable(@author)
  @author.value = 'Unknown'
  #5th row (specify wordlist)
  @wordListSpecified = TkCheckButton.new(@root){
    text "Count specific words only"
    command wordlisttoggled
  }.grid('column'=>1,'row'=> 4, 'sticky'=>'w')
  #6th row (adding files)
  TkButton.new(@root) {
    text 'Add file'
    command addfile
  }.grid('column'=>0, 'row'=>5,'sticky'=>'w', 'padx'=>5, 'pady'=>5)
  TkButton.new(@root) {
    text 'Add folder'
    command addfolder
  }.grid('column'=>1, 'row'=>5,'sticky'=>'w', 'padx'=>5, 'pady'=>5)
  TkButton.new(@root) {
    text 'Remove'
    command remove
  }.grid('column'=>2, 'row'=>5,'sticky'=>'w', 'padx'=>5, 'pady'=>5)
  #7th row (chunking)
  TkButton.new(@root) {
    text 'Chunk and add file'
    command chunkaddfile
  }.grid('column'=>0, 'row'=>6,'sticky'=>'w', 'padx'=>5, 'pady'=>5)
  TkButton.new(@root) {
    text 'Chunk and add folder'
    command chunkaddfolder
  }.grid('column'=>1, 'row'=>6,'sticky'=>'w', 'padx'=>5, 'pady'=>5)
  #8th row (chunking settings)
  @saveChunks = TkCheckButton.new(@root){
    text "Save file chunks?"
  }.grid('column'=>0,'row'=> 7, 'sticky'=>'w')
  TkLabel.new(@root) {
    text "Chunk size (words):"
  }.grid('column'=>1,'row'=>7, 'sticky'=>'e', 'padx'=>5, 'pady'=>5)
  @chunkSize = TkSpinbox.new(@root) {
    to 100000
    from 100
    increment 100
    width 5
  }.grid('column'=>2,'row'=>7, 'sticky'=>'w', 'padx'=>5, 'pady'=>5)
  @chunkSize.set(1000) #a good default value
  #END GUI
  #load settings from config file if it exists
  #if there's none to load, the default values are built in to the code
  if File.file?(@@ConfigFile)
    #an unreadable or malformed config file just leaves the defaults in place
    settings = begin
      YAML.load(File.read(@@ConfigFile))
    rescue StandardError
      nil
    end
    #anything other than a hash (e.g. an empty file) is ignored as well
    if settings.is_a?(Hash)
      if settings["UseWordList"]
        @wordListSpecified.set_value("1")
        @masterWordList = settings["WordList"] || Array.new
      end
      @pcaspinbox.set(settings["PCAdims"]) if settings["PCAdims"]
      #a "ChunkDocs" key is deliberately not read: the quit proc never writes
      #it and there is no widget for it
      @chunkSize.set(settings["ChunkSize"]) if settings["ChunkSize"]
      @saveChunks.set_value('1') if settings["SaveChunks"]
    end
  end
end
#runs a PCA over the coordinates of the loaded documents and returns one
#array of 'dims' values per document
def doPCA(dims)
  tool = PCAtool.new(self.coords)
  tool.reduceDimensions(dims)
end
#replaces the master word list with the distinct lower-cased words found in
#the given file, then reloads every document
def specifyWordList(filename)
  contents = File.read(filename)
  @masterWordList = contents.downcase.scan(/\w+/).uniq
  self.reload
end
#empties the master word list, then reloads every document
def unspecifyWordList
  @masterWordList = []
  self.reload
end
#forgets every loaded document and tree entry, then adds each document
#again from its recorded name (used as the file path) and author
def reload
  #remember what was loaded before throwing it away
  previous = @documents.collect{|doc| [doc.name, doc.author]}
  @documents = []
  @tree.children("").each do |item|
    @tree.delete(item)
  end
  previous.each do |entry|
    self.addFile(entry[0], entry[1])
  end
end
#Returns the contents of the file with everything between a '<' and the
#next '>' removed (markup tags and <!-- --> comments, including ones that
#span several lines). A '<' with no later '>' is left alone.
def readFile(filename)
  #[^>]* matches the same text as a lazy "any character" run but cannot
  #backtrack badly when a '<' is never closed
  File.read(filename).gsub(/<[^>]*>/,'')
end
#reads the file and adds its text as a document by the given author
def addFile(filename,author)
  text = self.readFile(filename)
  addDoc(text,filename,author)
end
#Adds 'text' as a document called 'filename' by 'author': inserts it into
#the tree (creating the author's node if needed, both kept in
#case-insensitive order) and into @documents, which is kept sorted.
#Shows an error box and does nothing if 'filename' is already in the tree.
#When no word list is specified, the document's words are merged into the
#master word list.
def addDoc(text,filename,author)
  if @tree.exist?(filename)
    Tk.messageBox('type' => 'ok',
                  'icon' => 'error',
                  'title' => 'File already included',
                  'message' => "A file named #{filename} has already been added -- you cannot add the same file more than once.")
    return #exits the function
  end
  #add author if need be
  unless @tree.exist?(author)
    authors = @tree.children("").collect{|item| item.id}
    #insert after every existing author that sorts before this one
    position = authors.take_while{|existing| author.casecmp(existing) == 1}.size
    @tree.insert('', position, :id => author, :text => author)
  end
  #id is the full path but text is just the file name
  name = filename.split(File::SEPARATOR).pop
  #the group is the filename without the chunk number
  group = name.gsub(/[.]\d+/,'')
  names = @tree.children(author).collect{|item| item.id.split(File::SEPARATOR).pop}
  position = names.take_while{|existing| name.casecmp(existing) == 1}.size
  @tree.insert(author, position, :id => filename, :text => name)
  if @wordListSpecified.get_value == '1'
    #hash lookup instead of scanning the whole word list for every word
    wanted = {}
    @masterWordList.each{|listed| wanted[listed] = true}
    newdoc = Document.new(filename,author,group,text) {|word| wanted.key?(word)}
  else
    newdoc = Document.new(filename,author,group,text)
    @masterWordList = (@masterWordList | newdoc.words).sort
  end
  @documents.push(newdoc)
  @documents = @documents.sort #keeps everything sorted
end
#Splits the file into consecutive chunks of @chunkSize words and adds each
#full chunk as its own document, named after the file plus '.' and the
#chunk number. Leftover words that do not fill a whole chunk are dropped.
#
#savedir -- if not "", each chunk is also written to a file of that name in
#           this folder and the document is named after that file instead.
#
#Does nothing if the chunk size is not a positive number or the file is
#shorter than one chunk.
def chunkAndAddFile(filename,author,savedir="")
  #read the spinbox once; typed-in text that isn't a number gives 0, which
  #would otherwise never finish chunking
  size = @chunkSize.get.to_i
  return if size <= 0
  name = filename.split(File::SEPARATOR).pop
  #make an array of chunks
  chunks = Array.new
  self.readFile(filename).split.each_slice(size) do |slice|
    chunks << slice.join(' ') if slice.size == size
  end
  if chunks.size == 0 #document too short
    #TODO: pop up message?
    return
  end
  chunks.each_with_index do |chunk,i|
    if savedir == "" #nothing is written to disk
      self.addDoc(chunk,filename + '.' + i.to_s,author)
    else #save them to real files
      savefile = savedir + File::SEPARATOR + name + '.' + i.to_s
      File.open(savefile,"w"){|f| f.print(chunk)}
      self.addDoc(chunk,savefile,author)
    end
  end
end
#Adds every regular file directly inside 'path' (subfolders are skipped) as
#a document by 'author'.
def addFolder(path,author)
  Dir.foreach(path) do |file|
    #add path to keep things consistant with adding single files
    fullpath = path + File::SEPARATOR + file
    #test the full path rather than changing the working directory, so
    #relative paths work and the process-wide cwd is never touched
    self.addFile(fullpath,author) if File.file?(fullpath)
  end
end
#Chunks and adds every regular file directly inside 'path' (subfolders are
#skipped), passing 'author' and 'savedir' on to chunkAndAddFile.
def chunkAndAddFolder(path,author,savedir="")
  Dir.foreach(path) do |file|
    fullpath = path + File::SEPARATOR + file
    #test the full path rather than changing the working directory, so
    #relative paths work and the process-wide cwd is never touched
    self.chunkAndAddFile(fullpath,author,savedir) if File.file?(fullpath)
  end
end
#Removes a tree item and the documents behind it. If the item's id is the
#author of any loaded document, all of that author's documents go;
#otherwise the single document with that name goes. Unless a word list is
#specified, the master word list is rebuilt from the remaining documents.
#Does nothing when 'item' is nil.
def remove(item)
  return if item.nil?
  if @documents.any?{|doc| doc.author == item.id} #have we slected all works by the author?
    @documents.reject!{|doc| doc.author == item.id}
  else #it's a file
    @documents.reject!{|doc| doc.name == item.id}
  end
  unless @wordListSpecified.get_value == '1'
    #start from an empty list (no "" entry) and keep it sorted, the same
    #way the list is maintained when documents are added
    @masterWordList = @documents.inject([]){|words,doc| words | doc.words}.sort
  end
  @tree.delete(item)
end
#one row per document; each row holds that document's relative frequency
#for every word of the master word list, in list order
def coords
  rows = []
  @documents.each do |doc|
    row = []
    @masterWordList.each{|word| row << doc.relativeFrequency(word)}
    rows << row
  end
  rows
end
#Writes the word frequencies to 'filename' as comma separated text: a row
#of authors, a row of document names, then one row per word of the master
#word list with that word's relative frequency in each document. The first
#column of the two header rows is empty.
def saveToCSV(filename)
  File.open(filename, "w") do |file|
    authorRow = @documents.collect{|doc| ",#{doc.author}"}.join
    nameRow = @documents.collect{|doc| ",#{doc.name}"}.join
    file.print(authorRow + "\n")
    file.print(nameRow + "\n")
    @masterWordList.each do |word|
      cells = @documents.collect{|doc| ",#{doc.relativeFrequency(word)}"}.join
      file.print("#{word}#{cells}\n")
    end
  end
end
#colour names written into the plot's label, one per author in order
@@GraphColors = ['Red', 'Green', 'Blue', 'Magenta', 'Cyan']

#Plots the 2D PCA of the loaded documents with the external 'graph'
#program, one data set per author, then offers to save the plot to a file
#whose extension selects the output format.
def plotPCA
  authors = @documents.collect{|doc| doc.author}.uniq
  #the following gives [ ['author',[x,y]], ...]
  labelled = @documents.collect{|doc| doc.author}.zip(self.doPCA(2))
  #one temporary data file per author, in the same order as 'authors'
  tfiles = authors.collect do |author|
    tf = Tempfile.new(author)
    labelled.each do |docAuthor,point|
      #one point per line, so consecutive points cannot run together
      tf.print(point[0]," ",point[1],"\n") if docAuthor == author
    end
    tf.close
    tf #need to return tf at the end
  end
  #TODO: change this? only a couple of colors are available.
  #TODO: Make a key for the colors
  #TODO: use canvas instead of another program? canvas can output to postscript...
  IO.popen(graphCommand('X',tfiles,authors), "w")
  responce = Tk::messageBox(
    'type' => 'yesno',
    'message' => 'Do you wish to save this plot?',
    'icon' => 'question',
    'title' => 'Save plot?')
  if responce == 'yes'
    filename = Tk.getSaveFile("filetypes"=>[["PS", ".ps"],["PNG",".png"],["SVG",".svg"]])
    #filename == "" means the user cancelled the save dialog
    unless filename == ""
      IO.popen(graphCommand(filename.split(".").pop,tfiles,authors) + " > #{filename}", "w")
    end
  end
end

#builds the 'graph' command line for the given output type: each data file
#gets its own numbered line mode, followed by a label pairing every author
#with a colour name
def graphCommand(display,tfiles,authors)
  command = "graph -T #{display} -C"
  tfiles.each_with_index do |tf,i|
    #'"' are to put quotes around the name, incase there is a space in it
    command += " -m -#{i + 1} -S 3 " + '"' + tf.path + '"'
  end
  command += " -L \""
  authors.each_with_index do |auth,i|
    command += " #{auth} #{@@GraphColors[i]} "
  end
  command + '"'
end
private :graphCommand
#Opens an interactive plot window showing the 2D PCA of the loaded
#documents. Each point carries the document's name, with the author as its
#group and the document's group as its subgroup.
def explorePCA
  #do the PCA first so that no empty window is left behind if it fails
  reduced = self.doPCA(2)
  plot = Plot.new(@root)
  reduced.zip(@documents).each do |coord,doc|
    plot.add(coord[0],coord[1],doc.name,doc.author,doc.group)
  end
  plot.refresh
end
# def savePlot(filename)
#puts filename
# end
#Writes the PCA of the loaded documents, reduced to 'dims' dimensions, to
#'filename': one line per document holding author, name and coordinates,
#each value followed by a comma.
def savePCAtoCSV(filename,dims)
  reduced = self.doPCA(dims)
  File.open(filename, "w") do |file|
    @documents.zip(reduced).each do |pair|
      doc, point = pair
      line = "#{doc.author},#{doc.name},"
      point.each{|value| line += "#{value},"}
      file.print(line + "\n")
    end
  end
end
end
class Plot < TkToplevel
@@CanvasSize = 500
#creates the window as a child of 'parent'; it starts with no points
def initialize(parent)
  super(parent)
  @items = []
end
#records a new point; nothing is drawn until refresh is called
def add(x,y,name,group,subgroup)
  newPoint = Point.new(x,y,name,group,subgroup)
  @items.push(newPoint)
end
#Builds the canvas and draws every point added so far, plus the two axes.
#Points are coloured by group (hue) and subgroup (saturation/brightness).
#Moving the mouse over the canvas draws a line to the nearest point and
#shows that point's group and file name in the entry under the canvas.
def refresh
  @canvas = TkCanvas.new(self) {
    width @@CanvasSize
    height @@CanvasSize
  }.grid('column'=>0,'row'=> 0, 'sticky'=>'nsew')
  @name = TkVariable.new()
  nameDisp = TkEntry.new(self) {
    width 30
    relief 'sunken'
  }.grid('column'=>0,'row'=> 1, 'sticky'=>'n', 'padx'=>5, 'pady'=>5)
  nameDisp.textvariable(@name)
  @name.value = 'none selected'
  #size is the largest coordiante (in absolute value) plus a bit, so that
  #negative coordinates fit on the canvas as well
  largest = @items.collect{|item| [item.x.abs, item.y.abs].max}.max || 0
  @maxsize = largest * 1.05
  #never scale by zero (no points, or every point at the origin)
  @maxsize = 1.0 if @maxsize == 0
  closest = proc{|x, y|
    #the squared distance picks the same nearest point as the distance
    c = @items.min_by{|item|
      xp = ((1 + item.x/@maxsize)*@@CanvasSize/2)
      yp = ((1 + item.y/@maxsize)*@@CanvasSize/2)
      (x-xp)**2 + (y-yp)**2
    }
    next if c.nil? #nothing plotted, so nothing to point at
    @name.value = c.group + ': ' + c.name.split(File::SEPARATOR).pop
    @canvas.delete('linetoclosest')
    xp = ((1 + c.x/@maxsize)*@@CanvasSize/2)
    yp = ((1 + c.y/@maxsize)*@@CanvasSize/2)
    TkcLine.new(@canvas, x, y, xp, yp, :fill => 'black', :width => 1, :tags => 'linetoclosest')
  }
  @canvas.bind("Motion", closest, "%x %y")
  #each distinct group and subgroup is visited once, so every point is
  #drawn exactly once and the colours are spread over the distinct values
  groups = @items.collect{|item| item.group}.uniq.sort
  groups.each_with_index{|group,i|
    ingroup = @items.select{|item| item.group == group}
    subgroups = ingroup.collect{|item| item.subgroup}.uniq.sort
    subgroups.each_with_index{|subgroup,j|
      r,g,b = color(i,groups.size,j,subgroups.size)
      ingroup.each{|item|
        point(item.x,item.y,r,g,b) if item.subgroup == subgroup
      }
    }
  }
  #draw axis
  TkcLine.new(@canvas, 0, @@CanvasSize/2, @@CanvasSize, @@CanvasSize/2, :fill => 'black', :width => 1)
  TkcLine.new(@canvas, @@CanvasSize/2, 0, @@CanvasSize/2, @@CanvasSize, :fill => 'black', :width => 1)
  #placeholder so that the 'linetoclosest' tag exists before the first mouse move
  TkcLine.new(@canvas, 0, 0, 0, 0, :fill => 'black', :width => 1, :tags => 'linetoclosest')
end
#draws a small cross at plot coordinates (x,y); r, g and b are multiplied
#by 255 and written as hex to form the cross's colour
def point(x,y,r,g,b)
  #each channel becomes at least two hex digits
  hex = [r,g,b].collect{|channel| (channel*255).round.to_s(16).rjust(2,'0')}
  fill = '#' + hex.join
  #scale by @maxsize and shift so that (0,0) lands mid-canvas
  xp = ((1 + x/@maxsize)*@@CanvasSize/2).round
  yp = ((1 + y/@maxsize)*@@CanvasSize/2).round
  TkcLine.new(@canvas, xp-5, yp, xp+5, yp, :fill => fill, :width => 1)
  TkcLine.new(@canvas, xp, yp-5, xp, yp+5, :fill => fill, :width => 1)
end
#Picks a colour for subgroup j (of 'saturations') within group i (of
#'hues'): the group sets the hue, the subgroup sets saturation and
#brightness. Returns [r,g,b], each computed as a fraction of full intensity.
def color(i,hues,j,saturations)
  h = 360 * (i/hues.to_f)
  s = 0.75 + 0.25 * (j/saturations.to_f)
  v = 0.5 + 0.5 * (j/saturations.to_f)
  #convert to RGB
  sector = (h/60.0).floor
  hi = sector % 6
  f = (h/60.0) - sector
  lo = v*(1.0 - s)
  q = v*(1.0 - s * f)
  t = v*(1.0 - (1-f) * s)
  #'then' rather than a colon after each value: the colon form is a syntax
  #error on Ruby 1.9 and later
  case hi
  when 0 then [v,t,lo]
  when 1 then [q,v,lo]
  when 2 then [lo,v,t]
  when 3 then [lo,q,v]
  when 4 then [t,lo,v]
  when 5 then [v,lo,q]
  end
end
end
#a read-only record for one plotted point: its coordinates, its name and
#the group and subgroup it belongs to
class Point
  attr_reader :x, :y, :name, :group, :subgroup

  def initialize(x,y,name,group,subgroup)
    @x, @y = x, y
    @name = name
    @group, @subgroup = group, subgroup
  end
end
#build the interface and enter the Tk event loop, but only when this file
#is the script being run
if $0 == __FILE__
  Interface.new
  Tk.mainloop
end