-
Notifications
You must be signed in to change notification settings - Fork 138
/
LoadData.hs
456 lines (379 loc) · 15.4 KB
/
LoadData.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
{-# LANGUAGE ScopedTypeVariables,TemplateHaskell,DeriveDataTypeable,DataKinds,FlexibleInstances,TypeFamilies,RankNTypes,BangPatterns,FlexibleContexts,StandaloneDeriving,GeneralizedNewtypeDeriving,TypeOperators,MultiParamTypeClasses,NoImplicitPrelude,RebindableSyntax #-}
module LoadData
where
import Control.DeepSeq
import Control.Monad
import Control.Monad.ST
import Data.List hiding (insert,length,concat,partition,head)
import Data.Maybe
import qualified Data.Set as Set
import qualified Data.Vector as V
import qualified Data.Vector.Generic as VG
import qualified Data.Vector.Mutable as VM
import qualified Data.Vector.Generic.Mutable as VGM
import qualified Data.Vector.Unboxed as VU
import qualified Data.Vector.Unboxed.Mutable as VUM
import qualified Data.Vector.Storable as VS
import qualified Data.Vector.Storable.Mutable as VSM
import qualified Data.ByteString.Lazy.Char8 as BS
import qualified Data.Vector.Algorithms.Intro as Intro
import System.Console.CmdArgs.Implicit
import System.Mem
import System.IO
import System.Directory
import System.FilePath.Posix
import qualified Numeric.LinearAlgebra as LA
import qualified Numeric.LinearAlgebra.Devel as LA
-- import Test.QuickCheck hiding (verbose,sample,label)
-- import Control.Parallel.Strategies
import SubHask hiding (Functor(..), Applicative(..), Monad(..), Then(..), fail, return)
import SubHask.Algebra.Container
import SubHask.Compatibility.ByteString
import SubHask.Compatibility.Cassava
import SubHask.Compatibility.Containers
import SubHask.Compatibility.Vector.Lebesgue
-- import HLearn.Algebra
import HLearn.Models.Classifiers.Common
-- import HLearn.DataStructures.StrictVector
-- import HLearn.DataStructures.CoverTree
-- import HLearn.DataStructures.SpaceTree
-- import HLearn.DataStructures.SpaceTree.Algorithms.NearestNeighbor
-- import HLearn.DataStructures.SpaceTree.Algorithms.RangeSearch
-- import HLearn.DataStructures.SpaceTree.DualTreeMonoids
-- import qualified HLearn.DataStructures.StrictList as Strict
-- import qualified HLearn.DataStructures.StrictVector as Strict
import HLearn.Metrics.EMD
-- import HLearn.Metrics.Mahalanobis
-- import HLearn.Metrics.Mahalanobis.Normal
-- import HLearn.Models.Distributions
import SubHask.TemplateHaskell.Deriving
import Timing
import HLearn.UnsafeVector
import Debug.Trace
import Prelude (asTypeOf,unzip)
import qualified Prelude as P
--------------------------------------------------------------------------------
-- | This loads files in the format used by the BagOfWords UCI dataset.
-- See: https://archive.ics.uci.edu/ml/machine-learning-databases/bag-of-words/readme.txt
loadBagOfWords :: FilePath -> IO (Array (Map' Int Float))
loadBagOfWords filepath = do
hin <- openFile filepath ReadMode
numdp :: Int <- liftM read $ hGetLine hin
numdim :: Int <- liftM read $ hGetLine hin
numlines :: Int <- liftM read $ hGetLine hin
ret <- VGM.replicate numdp zero
forM [0..numlines-1] $ \i -> do
line <- hGetLine hin
let [dp,dim,val] :: [Int] = map read $ words line
curdp <- VGM.read ret (dp-1)
VGM.write ret (dp-1) $ insert (dim,fromIntegral val) curdp
hClose hin
VG.unsafeFreeze ret
-- | Loads a dataset of strings in the unix words file format (i.e. one word per line).
-- This format is also used by the UCI Bag Of Words dataset.
-- See: https://archive.ics.uci.edu/ml/machine-learning-databases/bag-of-words/readme.txt
loadWords :: (Elem dp~Char, Eq dp, Unfoldable dp) => FilePath -> IO (Array dp)
loadWords filepath = do
hin <- openFile filepath ReadMode
contents <- hGetContents hin
return $ fromList $ map fromList $ lines contents
--------------------------------------------------------------------------------
-- | Returns all files in a subdirectory (and all descendant directories).
-- Unlike "getDirectoryContents", this function prepends the directory's path to each filename.
-- This is important so that we can tell where in the hierarchy the file is located.
--
-- FIXME:
-- This is relatively untested.
-- It probably has bugs related to weird symbolic links.
getDirectoryContentsRecursive :: FilePath -> IO [FilePath]
getDirectoryContentsRecursive dirpath = do
files <- getDirectoryContents dirpath
fmap concat $ forM files $ \file -> case file of
'.':_ -> return []
_ -> do
-- let file' = dirpath++"/"++takeFileName file
let file' = dirpath++"/"++file
isdir <- doesDirectoryExist file'
contents <- if isdir
then getDirectoryContentsRecursive file'
else return []
return $ file':contents
-- | A generic method for loading unlabeled data points
-- where each file in a directory hierarchy corresponds to a single data point.
loadDirectory ::
( Eq a
, NFData a
) => FilePath -- ^ directory to load data from
-> (FilePath -> IO a) -- ^ function to load an individual file
-> (FilePath -> Bool) -- ^ function to filter out invalid filenames
-> (a -> Bool) -- ^ function to filter out malformed results
-> IO (Array a) -- ^
loadDirectory dirpath loadFile validFilepath validResult = {-# SCC loadDirectory #-} do
files <- --fmap (take 1000) $
fmap (filter validFilepath) $
getDirectoryContentsRecursive dirpath
results <- fmap (filter validResult)
$ mapM loadFile files
time "loadDirectory" results
putStrLn $ " numdp: " ++ show (length files)
-- when debug $ do
-- forM files $ \file -> do
-- putStrLn file
return $ fromList results
-------------------
type HistogramSignature_ a = EMD (Lexical (StorableArray Float), Lexical (Array a))
type HistogramSignature = HistogramSignature_ (L2 Vector Float)
loadHistogramSignature
:: Bool -- ^ print debug info?
-> FilePath -- ^ path of signature file
-> IO HistogramSignature
loadHistogramSignature debug filepath = {-# SCC loadHistogramSignature #-} do
filedata <- liftM lines $ readFile filepath
let (fs,as) = unzip
$ map (\[b,g,r,v] -> (v,VG.fromList [r,g,b]))
$ map ((read :: String -> [Float]).(\x->"["+x+"]")) filedata
ret = (fromList fs, fromList as)
when debug $ do
putStrLn $ "filepath="++show filepath
putStrLn $ " filedata="++show filedata
-- putStrLn $ "signature length=" ++ show (length ret)
deepseq ret $ return $ EMD ret
--------------------------------------------------------------------------------
data DataParams = DataParams
{ datafile :: String
, labelcol :: Maybe Int
, pca :: Bool
, varshift :: Bool
}
head x = case unCons x of
Nothing -> error "head on empty"
Just (x,_) -> x
{-# INLINABLE loadCSV #-}
loadCSV ::
( Monoid a
, NFData a
, FromRecord a
, Eq a
-- , Normed a
, Show (Scalar a)
) => FilePath -> IO (Array a)
loadCSV filepath = do
bs <- timeIO ("loading ["++filepath++"]") $ readFileByteString filepath
let rse = decode NoHeader bs
time "parsing csv file" rse
rs <- case rse of
Right rs -> return rs
Left str -> error $ "failed to parse CSV file " ++ filepath ++ ": " ++ take 1000 str
putStrLn " dataset info:"
putStrLn $ " num dp: " ++ show (abs rs)
-- putStrLn $ " size dp: " ++ show (abs $ head rs)
putStrLn ""
return rs
{-# INLINABLE loaddata #-}
loaddata ::
( VG.Vector v f
, Monoid (v f)
, NFData (v f)
, FromRecord (v f)
, Eq (v f)
, Ord f
-- , Normed (v f)
, Show (Scalar (v f))
, Floating f
, VUM.Unbox f
) => DataParams -> IO (Array (v f))
loaddata params = do
(ArrayT rs) <- loadCSV $ datafile params
setptsize $ VG.length $ VG.head rs
rs' <- if pca params
then error "pca disabled"
-- then time "calculating PCA" $ VG.convert $ rotatePCA rs
else return rs
rs'' <- if varshift params
then time "varshifting data" $
VG.convert $ VG.map (shuffleVec $ VU.map fst $ mkShuffleMap rs') rs'
else return rs'
return $ ArrayT rs''
{-
{-# INLINABLE loadLabeledNumericData #-}
loadLabeledNumericData :: forall v f.
( VG.Vector v f
, NFData (v f)
, FromRecord (V.Vector f)
, Read f
, f ~ Double
) => DataParams -> IO (V.Vector (MaybeLabeled String (v f)))
loadLabeledNumericData params = do
let filename = datafile params
colindex = fromJust $ labelcol params
xse :: Either String (V.Vector (V.Vector String))
<- timeIO ("loading ["++datafile params++"] ")
$ fmap (decode HasHeader)
$ BS.readFile $ datafile params
xs <- case xse of
Right rs -> return rs
Left str -> error $ "failed to parse CSV file " ++ datafile params ++ ": " ++ take 1000 str
let numdp = VG.length xs
let numdim = VG.length $ xs VG.! 0
let numlabels = Set.size $ Set.fromList $ VG.toList $ VG.map (VG.! colindex) xs
hPutStrLn stderr " dataset info:"
hPutStrLn stderr $ " num dp: " ++ show numdp
hPutStrLn stderr $ " num dim: " ++ show numdim
hPutStrLn stderr $ " num labels: " ++ show numlabels
let ys = VG.map (\x -> MaybeLabeled
{ label = Just $ x VG.! colindex
, attr = VG.convert (
VG.map read $ VG.fromList $ (:) "1" $ VG.toList $ VG.take (colindex) x VG.++ VG.drop (colindex+1) x
-- VG.map read $ VG.fromList $ VG.toList $ VG.take label_index x <> VG.drop (label_index+1) x
:: V.Vector f
)
})
xs
-- :: V.Vector (MaybeLabeled String (LA.Vector Double))
let rs=VG.map attr ys
rs' <- if pca params
then time "calculating PCA" $ VG.convert $ rotatePCADouble rs
else return rs
rs'' <- if varshift params
then time "varshifting data" $ error "FIXME: varshift not implemented"
-- VG.convert $ VG.map (shuffleVec $ VU.map fst $ mkShuffleMap rs') rs'
else return rs'
let ys' = VG.zipWith (\y r -> MaybeLabeled (label y) r) ys rs''
deepseq ys' $ return ys'
-}
-------------------------------------------------------------------------------
-- data preprocessing
{-# INLINABLE mkShuffleMap #-}
-- | calculate the variance of each column, then sort so that the highest variance is first
mkShuffleMap :: forall v a s.
( VG.Vector v a
, Floating a
, Ord a
, VU.Unbox a
) => V.Vector (v a) -> VU.Vector (Int,a)
mkShuffleMap v = runST ( do
let numdim = VG.length (v V.! 0)
numdpf = fromIntegral $ VG.length v
let m1V = VG.foldl1 (VG.zipWith (+)) v
m2V = VG.foldl1 (VG.zipWith (+)) $ VG.map (VG.map (\x -> x*x)) v
varV = VG.zipWith (\m1 m2 -> m2/numdpf-m1/numdpf*m1/numdpf) m1V m2V
sortV = VG.zip (VG.fromList [0..numdim-1::Int]) $ VG.convert varV :: VU.Vector (Int, a)
msortV <- VG.unsafeThaw sortV
Intro.sortBy (\(_,v2) (_,v1) -> compare v1 v2) msortV
sortV' <- VG.unsafeFreeze msortV
return sortV'
)
-- meanV :: VUM.MVector s a <- VGM.new numdim
-- varV :: VUM.MVector s a <- VGM.new numdim
-- let go (-1) = return ()
-- go i = do
-- let go_inner (-1) = return ()
-- go_inner j = do
-- let tmp = v VG.! i VG.! j
-- meanVtmp <- (+tmp ) `liftM` VGM.read meanV j
-- varVtmp <- (+tmp*tmp) `liftM` VGM.read varV j
-- VUM.write meanV
--
-- let xs = fmap (VG.! i) v
-- dist = train xs :: Normal a a
-- var = variance dist
-- VGM.write varV i (i,var)
-- go (i-1)
-- msortV :: VUM.MVector s (Int, a) <- VGM.new numdim
--
-- let go (-1) = return ()
-- go i = do
-- -- let !xs = fmap (VG.! i) v
-- let !xs = fmap (`VG.unsafeIndex` i) v
-- !dist = train xs :: Normal a a
-- !var = variance dist
-- VGM.write msortV i (i,var)
-- go (i-1)
-- go (numdim-1)
-- forM [0..numdim-1] $ \i -> do
-- let xs = fmap (VG.! i) v
-- dist = train xs :: Normal a a
-- var = variance dist
-- VGM.write varV i (i,var)
{-# INLINABLE shuffleVec #-}
-- | apply the shufflemap to the data set to get a better ordering of the data
shuffleVec :: VG.Vector v a => VU.Vector Int -> v a -> v a
shuffleVec vmap v = VG.generate (VG.length vmap) $ \i -> v VG.! (vmap VG.! i)
-- shuffleVec vmap v = runST $ do
-- ret <- VGM.new (VG.length v)
--
-- let go (-1) = return ()
-- go i = do
-- VGM.write ret i $ v VG.! (vmap VG.! i)
-- go (i-1)
-- go (VG.length v-1)
--
-- -- forM [0..VG.length v-1] $ \i -> do
-- -- VGM.write ret i $ v VG.! (vmap VG.! i)
-- VG.freeze ret
{-# INLINABLE meanCenter #-}
-- | translate a dataset so the mean is zero
meanCenter ::
( VG.Vector v1 (v2 a)
, VG.Vector v2 a
, Floating a
) => v1 (v2 a) -> v1 (v2 a)
meanCenter dps = {-# SCC meanCenter #-} VG.map (\v -> VG.zipWith (-) v meanV) dps
where
meanV = {-# SCC meanV #-} VG.map (/ fromIntegral (VG.length dps)) $ VG.foldl1' (VG.zipWith (+)) dps
{-# INLINABLE rotatePCA #-}
-- | rotates the data using the PCA transform
rotatePCA ::
( VG.Vector container dp
, VG.Vector container [Float]
, VG.Vector v a
, dp ~ v a
, Show a
, a ~ Float
) => container dp -> container dp
rotatePCA dps' = {-# SCC rotatePCA #-} VG.map rotate dps
where
-- rotate dp = VG.convert $ LA.single $ eigm LA.<> LA.double (VG.convert dp :: VS.Vector Float)
rotate dp = {-# SCC convert #-} VG.convert $ LA.single $ (LA.trans eigm) LA.<> LA.double (VG.convert dp :: VS.Vector Float)
dps = meanCenter dps'
(eigv,eigm) = {-# SCC eigSH #-} LA.eigSH $ LA.double gramMatrix
gramMatrix = {-# SCC gramMatrix #-} gramMatrix_ $ map VG.convert $ VG.toList dps
-- gramMatrix = {-# SCC gramMatrix #-} LA.trans tmpm LA.<> tmpm
-- where
-- tmpm = LA.fromLists (VG.toList $ VG.map VG.toList dps)
-- gramMatrix = {-# SCC gramMatrix #-} foldl1' (+)
-- [ let dp' = VG.convert dp in LA.asColumn dp' LA.<> LA.asRow dp' | dp <- VG.toList dps ]
gramMatrix_ :: (Ring a, Storable a) => [Vector a] -> LA.Matrix a
gramMatrix_ xs = runST ( do
let dim = VG.length (head xs)
m <- LA.newMatrix 0 dim dim
forM_ xs $ \x -> do
forM_ [0..dim-1] $ \i -> do
forM_ [0..dim-1] $ \j -> do
mij <- LA.unsafeReadMatrix m i j
LA.unsafeWriteMatrix m i j $ mij + (x `VG.unsafeIndex` i)*(x `VG.unsafeIndex` j)
LA.unsafeFreezeMatrix m
)
{-# INLINABLE rotatePCADouble #-}
-- | rotates the data using the PCA transform
rotatePCADouble ::
( VG.Vector container (v Double)
, VG.Vector container [Double]
, VG.Vector v Double
) => container (v Double) -> container (v Double)
rotatePCADouble dps' = VG.map rotate dps
where
rotate dp = VG.convert $ (LA.trans eigm) LA.<> (VG.convert dp :: VS.Vector Double)
dps = meanCenter dps'
(eigv,eigm) = LA.eigSH gramMatrix
gramMatrix = LA.trans tmpm LA.<> tmpm
where
tmpm = LA.fromLists (VG.toList $ VG.map VG.toList dps)
-------------------------------------------------------------------------------
-- tests
v1 = VS.fromList [1,2,3]
v2 = VS.fromList [1,3,4]
v3 = VS.fromList [1,5,6]
v4 = VS.fromList [0,0,1]
vs = V.fromList [v1,v2,v3,v4] :: V.Vector (VS.Vector Float)
dist a b = distance (L2 a) (L2 b)