forked from scipy/scipy
-
Notifications
You must be signed in to change notification settings - Fork 0
/
mio5_utils.pyx
948 lines (853 loc) · 33.2 KB
/
mio5_utils.pyx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
''' Cython mio5 utility routines (-*- python -*- like)
'''
'''
Programmer's notes
------------------
Routines here have been reasonably optimized.
The char matrix reading is not very fast, but it's not usually a
bottleneck. See comments in ``read_char`` for possible ways to go if you
want to optimize.
'''
import sys
from copy import copy as pycopy
from libc.stdlib cimport calloc, free
from libc.string cimport strcmp, strlen
from cpython cimport Py_INCREF, Py_DECREF
from cpython cimport PyObject
cdef extern from "Python.h":
ctypedef struct PyTypeObject:
pass
from cpython cimport PyBytes_Size, PyBytes_FromString, \
PyBytes_FromStringAndSize
import numpy as np
from numpy.compat import asbytes, asstr
cimport numpy as cnp
cdef extern from "numpy/arrayobject.h":
PyTypeObject PyArray_Type
cnp.ndarray PyArray_NewFromDescr(PyTypeObject *subtype,
cnp.dtype newdtype,
int nd,
cnp.npy_intp* dims,
cnp.npy_intp* strides,
void* data,
int flags,
object parent)
cdef extern from "numpy_rephrasing.h":
void PyArray_Set_BASE(cnp.ndarray arr, object obj)
# Numpy must be initialized before any code using the numpy C-API
# directly
cnp.import_array()
# Constant from numpy - max number of array dimensions
DEF _MAT_MAXDIMS = 32
# max number of integer indices of matlab data types (miINT8 etc)
DEF _N_MIS = 20
# max number of integer indices of matlab class types (mxINT8_CLASS etc)
DEF _N_MXS = 20
cimport streams
import scipy.io.matlab.miobase as miob
from scipy.io.matlab.mio_utils import squeeze_element, chars_to_strings
import scipy.io.matlab.mio5_params as mio5p
import scipy.sparse
cdef enum:
miINT8 = 1
miUINT8 = 2
miINT16 = 3
miUINT16 = 4
miINT32 = 5
miUINT32 = 6
miSINGLE = 7
miDOUBLE = 9
miINT64 = 12
miUINT64 = 13
miMATRIX = 14
miCOMPRESSED = 15
miUTF8 = 16
miUTF16 = 17
miUTF32 = 18
cdef enum: # see comments in mio5_params
mxCELL_CLASS = 1
mxSTRUCT_CLASS = 2
mxOBJECT_CLASS = 3
mxCHAR_CLASS = 4
mxSPARSE_CLASS = 5
mxDOUBLE_CLASS = 6
mxSINGLE_CLASS = 7
mxINT8_CLASS = 8
mxUINT8_CLASS = 9
mxINT16_CLASS = 10
mxUINT16_CLASS = 11
mxINT32_CLASS = 12
mxUINT32_CLASS = 13
mxINT64_CLASS = 14
mxUINT64_CLASS = 15
mxFUNCTION_CLASS = 16
mxOPAQUE_CLASS = 17 # This appears to be a function workspace
mxOBJECT_CLASS_FROM_MATRIX_H = 18
sys_is_le = sys.byteorder == 'little'
native_code = sys_is_le and '<' or '>'
swapped_code = sys_is_le and '>' or '<'
cdef cnp.dtype OPAQUE_DTYPE = mio5p.OPAQUE_DTYPE
cpdef cnp.uint32_t byteswap_u4(cnp.uint32_t u4):
return ((u4 << 24) |
((u4 << 8) & 0xff0000U) |
((u4 >> 8 & 0xff00u)) |
(u4 >> 24))
cdef class VarHeader5:
cdef readonly object name
cdef readonly int mclass
cdef readonly object dims
cdef cnp.int32_t dims_ptr[_MAT_MAXDIMS]
cdef int n_dims
cdef int is_complex
cdef int is_logical
cdef public int is_global
cdef size_t nzmax
def set_dims(self, dims):
""" Allow setting of dimensions from python
This is for constructing headers for tests
"""
self.dims = dims
self.n_dims = len(dims)
for i, dim in enumerate(dims):
self.dims_ptr[i] = <cnp.int32_t>int(dim)
cdef class VarReader5:
cdef public int is_swapped, little_endian
cdef int struct_as_record
cdef object codecs, uint16_codec
# c-optimized version of reading stream
cdef streams.GenericStream cstream
# pointers to stuff in preader.dtypes
cdef PyObject* dtypes[_N_MIS]
# pointers to stuff in preader.class_dtypes
cdef PyObject* class_dtypes[_N_MXS]
# cached here for convenience in later array creation
cdef cnp.dtype U1_dtype
cdef cnp.dtype bool_dtype
# element processing options
cdef:
int mat_dtype
int squeeze_me
int chars_as_strings
""" Initialize from file reader object
preader needs the following fields defined:
* mat_stream (file-like)
* byte_order (str)
* uint16_codec (str)
* struct_as_record (bool)
* chars_as_strings (bool)
* mat_dtype (bool)
* squeeze_me (bool)
"""
def __cinit__(self, preader):
byte_order = preader.byte_order
self.is_swapped = byte_order == swapped_code
if self.is_swapped:
self.little_endian = not sys_is_le
else:
self.little_endian = sys_is_le
# option affecting reading of matlab struct arrays
self.struct_as_record = preader.struct_as_record
# store codecs for text matrix reading
self.codecs = mio5p.MDTYPES[byte_order]['codecs'].copy()
self.uint16_codec = preader.uint16_codec
uint16_codec = self.uint16_codec
# Set length of miUINT16 char encoding
self.codecs['uint16_len'] = len(" ".encode(uint16_codec)) \
- len(" ".encode(uint16_codec))
self.codecs['uint16_codec'] = uint16_codec
# set c-optimized stream object from python file-like object
self.set_stream(preader.mat_stream)
# options for element processing
self.mat_dtype = preader.mat_dtype
self.chars_as_strings = preader.chars_as_strings
self.squeeze_me = preader.squeeze_me
# copy refs to dtypes into object pointer array. We only need the
# integer-keyed dtypes
for key, dt in mio5p.MDTYPES[byte_order]['dtypes'].items():
if isinstance(key, str):
continue
self.dtypes[key] = <PyObject*>dt
# copy refs to class_dtypes into object pointer array
for key, dt in mio5p.MDTYPES[byte_order]['classes'].items():
if isinstance(key, str):
continue
self.class_dtypes[key] = <PyObject*>dt
# Always use U1 rather than <U or >U1 for interpreting string
# data because the strings are created by the Python runtime
# by .decode() and hence use native byte order rather the mat file's
self.U1_dtype = np.dtype('U1')
bool_dtype = np.dtype('bool')
def set_stream(self, fobj):
''' Set stream of best type from file-like `fobj`
Called from Python when initiating a variable read
'''
self.cstream = streams.make_stream(fobj)
def read_tag(self):
''' Read tag mdtype and byte_count
Does necessary swapping and takes account of SDE formats.
See also ``read_full_tag`` method.
Returns
-------
mdtype : int
matlab data type code
byte_count : int
number of bytes following that comprise the data
tag_data : None or str
Any data from the tag itself. This is None for a full tag,
and string length `byte_count` if this is a small data
element.
'''
cdef cnp.uint32_t mdtype, byte_count
cdef char tag_ptr[4]
cdef int tag_res
cdef object tag_data = None
tag_res = self.cread_tag(&mdtype, &byte_count, tag_ptr)
if tag_res == 2: # sde format
tag_data = tag_ptr[:byte_count]
return (mdtype, byte_count, tag_data)
cdef int cread_tag(self,
cnp.uint32_t *mdtype_ptr,
cnp.uint32_t *byte_count_ptr,
char *data_ptr) except -1:
''' Read tag mdtype and byte_count
Does necessary swapping and takes account of SDE formats
Data may be returned in data_ptr, if this was an SDE
Returns 1 for success, full format; 2 for success, SDE format; -1
if error arises
'''
cdef cnp.uint16_t mdtype_sde, byte_count_sde
cdef cnp.uint32_t mdtype
cdef cnp.uint32_t* u4_ptr = <cnp.uint32_t*>data_ptr
cdef cnp.uint32_t u4s[2]
# First read 8 bytes. The 8 bytes can be in one of two formats.
# For the first - standard format - the 8 bytes are two uint32
# values, of which the first is the integer code for the matlab
# data type (*mdtype*), and the second is the number of bytes of
# that data type that follow (*byte_count*). Thus, if the
# ``mdtype`` is 4 (miDOUBLE), and the ``byte_count`` is 12, then
# there will follow 3 double values. The alternative format is
# "small data element". The first four bytes contain the
# ``byte_count`` and the ``mdtype``, but as uint16. The
# arrangement of the ``byte_count`` and ``mdtype`` is a little
# complex, see below. The following 4 bytes of the 8 bytes
# contain the data. For example, the ``mdtype`` might be 2
# (miUINT8), and the byte count is 3, and the data is in a
# string ``tag``, then the contained matrix is length 3, type
# uint8, where values are ``tag[4], tag[5], tag[6]``.
#
# The following paragraph describes the extraction of ``mdtype``
# and ``byte_count`` for the small data element format. The
# following is somewhat contrary to the matlab documentation,
# but seems to be true of actual .mat files.
#
# If the *file* is big endian, then the first four bytes of the
# tag are two big-endian uint16 values, first ``byte_count`` and
# second ``mdtype``. If the *file* is little-endian then the
# first four bytes are two little-endian uint16 values, first
# ``mdtype`` and second ``byte_count``.
self.cstream.read_into(<void *>u4s, 8)
if self.is_swapped:
mdtype = byteswap_u4(u4s[0])
else:
mdtype = u4s[0]
# The most significant two bytes of a U4 *mdtype* will always be
# 0, if they are not, this must be SDE format
byte_count_sde = mdtype >> 16
if byte_count_sde: # small data element format
mdtype_sde = mdtype & 0xffff
if byte_count_sde > 4:
raise ValueError('Error in SDE format data')
return -1
u4_ptr[0] = u4s[1]
mdtype_ptr[0] = mdtype_sde
byte_count_ptr[0] = byte_count_sde
return 2
# regular element
if self.is_swapped:
byte_count_ptr[0] = byteswap_u4(u4s[1])
else:
byte_count_ptr[0] = u4s[1]
mdtype_ptr[0] = mdtype
u4_ptr[0] = 0
return 1
cdef object read_element(self,
cnp.uint32_t *mdtype_ptr,
cnp.uint32_t *byte_count_ptr,
void **pp,
int copy=True):
''' Read data element into string buffer, return buffer
The element is the atom of the matlab file format.
Parameters
----------
mdtype_ptr : uint32_t*
pointer to uint32_t value to which we write the mdtype value
byte_count_ptr : uint32_t*
pointer to uint32_t value to which we write the byte count
pp : void**
pointer to void*. pp[0] will be set to point to the start of
the returned string memory
copy : int
If not 0, do any copies required to allow memory to be freely
altered without interfering with other objects. Otherwise
return string that should not be written to, therefore saving
unnecessary copies
Return
------
data : str
Python string object containing read data
Notes
-----
See ``read_element_into`` for routine to read element into a
pre-allocated block of memory.
'''
cdef cnp.uint32_t mdtype, byte_count
cdef char tag_data[4]
cdef object data
cdef int mod8
cdef int tag_res = self.cread_tag(mdtype_ptr,
byte_count_ptr,
tag_data)
mdtype = mdtype_ptr[0]
byte_count = byte_count_ptr[0]
if tag_res == 1: # full format
data = self.cstream.read_string(
byte_count,
pp,
copy)
# Seek to next 64-bit boundary
mod8 = byte_count % 8
if mod8:
self.cstream.seek(8 - mod8, 1)
else: # SDE format, make safer home for data
data = PyBytes_FromStringAndSize(tag_data, byte_count)
pp[0] = <char *>data
return data
cdef int read_element_into(self,
cnp.uint32_t *mdtype_ptr,
cnp.uint32_t *byte_count_ptr,
void *ptr) except -1:
''' Read element into pre-allocated memory in `ptr`
Parameters
----------
mdtype_ptr : uint32_t*
pointer to uint32_t value to which we write the mdtype value
byte_count_ptr : uint32_t*
pointer to uint32_t value to which we write the byte count
ptr : void*
memory location into which to read. Memory is assumed large
enough to contain read data
Returns
-------
void
Notes
-----
Compare ``read_element``.
'''
cdef:
int mod8
cdef int res = self.cread_tag(
mdtype_ptr,
byte_count_ptr,
<char *>ptr)
cdef cnp.uint32_t byte_count = byte_count_ptr[0]
if res == 1: # full format
res = self.cstream.read_into(ptr, byte_count)
# Seek to next 64-bit boundary
mod8 = byte_count % 8
if mod8:
self.cstream.seek(8 - mod8, 1)
return 0
cpdef cnp.ndarray read_numeric(self, int copy=True):
''' Read numeric data element into ndarray
Reads element, then casts to ndarray.
The type of the array is given by the ``mdtype`` returned via
``read_element``.
'''
cdef cnp.uint32_t mdtype, byte_count
cdef void *data_ptr
cdef cnp.npy_intp el_count
cdef cnp.ndarray el
cdef object data = self.read_element(
&mdtype, &byte_count, <void **>&data_ptr, copy)
cdef cnp.dtype dt = <cnp.dtype>self.dtypes[mdtype]
el_count = byte_count // dt.itemsize
cdef int flags = 0
if copy:
flags = cnp.NPY_WRITEABLE
Py_INCREF(<object> dt)
el = PyArray_NewFromDescr(&PyArray_Type,
dt,
1,
&el_count,
NULL,
<void*>data_ptr,
flags,
<object>NULL)
Py_INCREF(<object> data)
PyArray_Set_BASE(el, data)
return el
cdef inline object read_int8_string(self):
''' Read, return int8 type string
int8 type strings used for variable names, class names of
objects, and field names of structs and objects.
Specializes ``read_element``
'''
cdef:
cnp.uint32_t mdtype, byte_count
void *ptr
object data
data = self.read_element(&mdtype, &byte_count, &ptr)
if mdtype != miINT8:
raise TypeError('Expecting miINT8 as data type')
return data
cdef int read_into_int32s(self, cnp.int32_t *int32p) except -1:
''' Read int32 values into pre-allocated memory
Byteswap as necessary. Specializes ``read_element_into``
Parameters
----------
int32p : int32 pointer
Returns
-------
n_ints : int
Number of integers read
'''
cdef:
cnp.uint32_t mdtype, byte_count
int i
self.read_element_into(&mdtype, &byte_count, <void *>int32p)
if mdtype != miINT32:
raise TypeError('Expecting miINT32 as data type')
return -1
cdef int n_ints = byte_count // 4
if self.is_swapped:
for i in range(n_ints):
int32p[i] = byteswap_u4(int32p[i])
return n_ints
def read_full_tag(self):
''' Python method for reading full u4, u4 tag from stream
Returns
-------
mdtype : int32
matlab data type code
byte_count : int32
number of data bytes following
Notes
-----
Assumes tag is in fact full, that is, is not a small data
element. This means it can skip some checks and makes it
slightly faster than ``read_tag``
'''
cdef cnp.uint32_t mdtype, byte_count
self.cread_full_tag(&mdtype, &byte_count)
return mdtype, byte_count
cdef int cread_full_tag(self,
cnp.uint32_t* mdtype,
cnp.uint32_t* byte_count) except -1:
''' C method for reading full u4, u4 tag from stream'''
cdef cnp.uint32_t u4s[2]
self.cstream.read_into(<void *>u4s, 8)
if self.is_swapped:
mdtype[0] = byteswap_u4(u4s[0])
byte_count[0] = byteswap_u4(u4s[1])
else:
mdtype[0] = u4s[0]
byte_count[0] = u4s[1]
return 0
cpdef VarHeader5 read_header(self):
''' Return matrix header for current stream position
Returns matrix headers at top level and sub levels
'''
cdef:
cdef cnp.uint32_t u4s[2]
cnp.uint32_t mdtype, byte_count
cnp.uint32_t flags_class, nzmax
cnp.uint16_t mc
int ret, i
void *ptr
VarHeader5 header
# Read and discard mdtype and byte_count
self.cstream.read_into(<void *>u4s, 8)
# get array flags and nzmax
self.cstream.read_into(<void *>u4s, 8)
if self.is_swapped:
flags_class = byteswap_u4(u4s[0])
nzmax = byteswap_u4(u4s[1])
else:
flags_class = u4s[0]
nzmax = u4s[1]
header = VarHeader5()
mc = flags_class & 0xFF
header.mclass = mc
header.is_logical = flags_class >> 9 & 1
header.is_global = flags_class >> 10 & 1
header.is_complex = flags_class >> 11 & 1
header.nzmax = nzmax
# all miMATRIX types except the mxOPAQUE_CLASS have dims and a
# name.
if mc == mxOPAQUE_CLASS:
header.name = None
header.dims = None
return header
header.n_dims = self.read_into_int32s(header.dims_ptr)
if header.n_dims > _MAT_MAXDIMS:
raise ValueError('Too many dimensions (%d) for numpy arrays'
% header.n_dims)
# convert dims to list
header.dims = []
for i in range(header.n_dims):
header.dims.append(header.dims_ptr[i])
header.name = self.read_int8_string()
return header
cdef inline size_t size_from_header(self, VarHeader5 header):
''' Supporting routine for calculating array sizes from header
Probably unnecessary optimization that uses integers stored in
header rather than ``header.dims`` that is a python list.
Parameters
----------
header : VarHeader5
array header
Returns
-------
size : size_t
size of array referenced by header (product of dims)
'''
# calculate number of items in array from dims product
cdef size_t size = 1
cdef int i
for i in range(header.n_dims):
size *= header.dims_ptr[i]
return size
cdef read_mi_matrix(self, int process=1):
''' Read header with matrix at sub-levels
Combines ``read_header`` and functionality of
``array_from_header``. Applies standard processing of array
given options set in self.
Parameters
----------
process : int, optional
If not zero, apply post-processing on returned array
Returns
-------
arr : ndarray or sparse matrix
'''
cdef:
VarHeader5 header
cnp.uint32_t mdtype, byte_count
object arr
# read full tag
self.cread_full_tag(&mdtype, &byte_count)
if mdtype != miMATRIX:
raise TypeError('Expecting matrix here')
if byte_count == 0: # empty matrix
if process and self.squeeze_me:
return np.array([])
else:
return np.array([[]])
header = self.read_header()
return self.array_from_header(header, process)
cpdef array_from_header(self, VarHeader5 header, int process=1):
''' Read array of any class, given matrix `header`
Parameters
----------
header : VarHeader5
array header object
process : int, optional
If not zero, apply post-processing on returned array
Returns
-------
arr : array or sparse array
read array
'''
cdef:
object arr
cnp.dtype mat_dtype
cdef int mc = header.mclass
if (mc == mxDOUBLE_CLASS
or mc == mxSINGLE_CLASS
or mc == mxINT8_CLASS
or mc == mxUINT8_CLASS
or mc == mxINT16_CLASS
or mc == mxUINT16_CLASS
or mc == mxINT32_CLASS
or mc == mxUINT32_CLASS
or mc == mxINT64_CLASS
or mc == mxUINT64_CLASS): # numeric matrix
arr = self.read_real_complex(header)
if process and self.mat_dtype: # might need to recast
if header.is_logical:
mat_dtype = self.bool_dtype
else:
mat_dtype = <object>self.class_dtypes[mc]
arr = arr.astype(mat_dtype)
elif mc == mxSPARSE_CLASS:
arr = self.read_sparse(header)
# no current processing makes sense for sparse
return arr
elif mc == mxCHAR_CLASS:
arr = self.read_char(header)
if process and self.chars_as_strings:
arr = chars_to_strings(arr)
elif mc == mxCELL_CLASS:
arr = self.read_cells(header)
elif mc == mxSTRUCT_CLASS:
arr = self.read_struct(header)
elif mc == mxOBJECT_CLASS: # like structs, but with classname
classname = asstr(self.read_int8_string())
arr = self.read_struct(header)
arr = mio5p.MatlabObject(arr, classname)
elif mc == mxFUNCTION_CLASS: # just a matrix of struct type
arr = self.read_mi_matrix()
arr = mio5p.MatlabFunction(arr)
# to make them more re-writeable - don't squeeze
return arr
elif mc == mxOPAQUE_CLASS:
arr = self.read_opaque(header)
arr = mio5p.MatlabOpaque(arr)
# to make them more re-writeable - don't squeeze
return arr
if process and self.squeeze_me:
return squeeze_element(arr)
return arr
def shape_from_header(self, VarHeader5 header):
cdef int mc = header.mclass
cdef tuple shape
if mc == mxSPARSE_CLASS:
shape = tuple(header.dims)
elif mc == mxCHAR_CLASS:
shape = tuple(header.dims)
if self.chars_as_strings:
shape = shape[:-1]
else:
shape = tuple(header.dims)
if self.squeeze_me:
shape = tuple([x for x in shape if x != 1])
return shape
cpdef cnp.ndarray read_real_complex(self, VarHeader5 header):
''' Read real / complex matrices from stream '''
cdef:
cnp.ndarray res, res_j
if header.is_complex:
# avoid array copy to save memory
res = self.read_numeric(False)
res_j = self.read_numeric(False)
# Use c8 for f4s and c16 for f8 input. Just ``res = res + res_j *
# 1j`` upcasts to c16 regardless of input type.
if res.itemsize == 4:
res = res.astype('c8')
else:
res = res.astype('c16')
res.imag = res_j
else:
res = self.read_numeric()
return res.reshape(header.dims[::-1]).T
cdef object read_sparse(self, VarHeader5 header):
''' Read sparse matrices from stream '''
cdef cnp.ndarray rowind, indptr, data, data_j
cdef size_t M, N, nnz
rowind = self.read_numeric()
indptr = self.read_numeric()
if header.is_complex:
# avoid array copy to save memory
data = self.read_numeric(False)
data_j = self.read_numeric(False)
data = data + (data_j * 1j)
else:
data = self.read_numeric()
''' From the matlab (TM) API documentation, last found here:
http://www.mathworks.com/access/helpdesk/help/techdoc/matlab_external/
rowind are simply the row indices for all the (nnz) non-zero
entries in the sparse array. rowind has nzmax entries, so
may well have more entries than nnz, the actual number of
non-zero entries, but rowind[nnz:] can be discarded and
should be 0. indptr has length (number of columns + 1), and
is such that, if D = diff(colind), D[j] gives the number of
non-zero entries in column j. Because rowind values are
stored in column order, this gives the column corresponding
to each rowind
'''
M,N = header.dims
indptr = indptr[:N+1]
nnz = indptr[-1]
rowind = rowind[:nnz]
data = data[:nnz]
return scipy.sparse.csc_matrix(
(data,rowind,indptr),
shape=(M,N))
cpdef cnp.ndarray read_char(self, VarHeader5 header):
''' Read char matrices from stream as arrays
Matrices of char are likely to be converted to matrices of
string by later processing in ``array_from_header``
'''
'''Notes to friendly fellow-optimizer
This routine is not much optimized. If I was going to do it,
I'd store the codecs as an object pointer array, as for the
.dtypes, I might use python_string.PyBytes_Decode for decoding,
I'd do something with pointers to pull the LSB out of the uint16
dtype, without using an intermediate array, I guess I'd consider
using the numpy C-API for array creation. I'd try and work out
how to deal with UCS-2 and UCS-4 builds of python, and how numpy
deals with unicode strings passed as memory,
My own unicode introduction here:
http://matthew-brett.github.com/pydagogue/python_unicode.html
'''
cdef:
cnp.uint32_t mdtype, byte_count
char *data_ptr
size_t el_count
object data, res, codec
cnp.ndarray arr
cnp.dtype dt
cdef size_t length = self.size_from_header(header)
data = self.read_element(
&mdtype, &byte_count, <void **>&data_ptr, True)
# There are mat files in the wild that have 0 byte count strings, but
# maybe with non-zero length.
if byte_count == 0:
arr = np.array(' ' * length, dtype='U')
return np.ndarray(shape=header.dims,
dtype=self.U1_dtype,
buffer=arr,
order='F')
# Character data can be of apparently numerical types,
# specifically np.uint8, np.int8, np.uint16. np.unit16 can have
# a length 1 type encoding, like ascii, or length 2 type
# encoding
dt = <cnp.dtype>self.dtypes[mdtype]
if mdtype == miUINT16:
codec = self.uint16_codec
if self.codecs['uint16_len'] == 1: # need LSBs only
arr = np.ndarray(shape=(length,),
dtype=dt,
buffer=data)
data = arr.astype(np.uint8).tostring()
elif mdtype == miINT8 or mdtype == miUINT8:
codec = 'ascii'
elif mdtype in self.codecs: # encoded char data
codec = self.codecs[mdtype]
if not codec:
raise TypeError('Do not support encoding %d' % mdtype)
else:
raise ValueError('Type %d does not appear to be char type'
% mdtype)
uc_str = data.decode(codec)
# cast to array to deal with 2, 4 byte width characters
arr = np.array(uc_str, dtype='U')
# could take this to numpy C-API level, but probably not worth
# it
return np.ndarray(shape=header.dims,
dtype=self.U1_dtype,
buffer=arr,
order='F')
cpdef cnp.ndarray read_cells(self, VarHeader5 header):
''' Read cell array from stream '''
cdef:
size_t i
cnp.ndarray[object, ndim=1] result
# Account for fortran indexing of cells
tupdims = tuple(header.dims[::-1])
cdef size_t length = self.size_from_header(header)
result = np.empty(length, dtype=object)
for i in range(length):
result[i] = self.read_mi_matrix()
return result.reshape(tupdims).T
def read_fieldnames(self):
''' Read fieldnames for struct-like matrix '
Python wrapper for cdef'ed method
'''
cdef int n_names
return self.cread_fieldnames(&n_names)
cdef inline object cread_fieldnames(self, int *n_names_ptr):
cdef:
cnp.int32_t namelength
int i, n_names
object name, field_names
# Read field names into list
cdef int res = self.read_into_int32s(&namelength)
if res != 1:
raise ValueError('Only one value for namelength')
cdef object names = self.read_int8_string()
field_names = []
n_names = PyBytes_Size(names) // namelength
# Make n_duplicates and pointer arrays
cdef:
int *n_duplicates
char **name_ptrs
n_duplicates = <int *>calloc(n_names, sizeof(int))
name_ptrs = <char **>calloc(n_names, sizeof(char *))
cdef:
char *n_ptr = names
int j, dup_no
for i in range(n_names):
name = asstr(PyBytes_FromString(n_ptr))
# Check if this is a duplicate field, rename if so
name_ptrs[i] = n_ptr
dup_no = 0
for j in range(i):
if strcmp(n_ptr, name_ptrs[j]) == 0: # the same
n_duplicates[j] += 1
dup_no = n_duplicates[j]
break
if dup_no != 0:
name = '_%d_%s' % (dup_no, name)
field_names.append(name)
n_ptr += namelength
free(n_duplicates)
free(name_ptrs)
n_names_ptr[0] = n_names
return field_names
cpdef cnp.ndarray read_struct(self, VarHeader5 header):
''' Read struct or object array from stream
Objects are just structs with an extra field *classname*,
defined before (this here) struct format structure
'''
cdef:
cnp.int32_t namelength
int i, n_names
cnp.ndarray rec_res
cnp.ndarray[object, ndim=1] result
object dt, tupdims
# Read field names into list
cdef object field_names = self.cread_fieldnames(&n_names)
# Prepare struct array
tupdims = tuple(header.dims[::-1])
cdef size_t length = self.size_from_header(header)
if self.struct_as_record: # to record arrays
if not n_names:
# If there are no field names, there is no dtype
# representation we can use, falling back to empty
# object
return np.empty(tupdims, dtype=object).T
dt = [(field_name, object) for field_name in field_names]
rec_res = np.empty(length, dtype=dt)
for i in range(length):
for field_name in field_names:
rec_res[i][field_name] = self.read_mi_matrix()
return rec_res.reshape(tupdims).T
# Backward compatibility with previous format
obj_template = mio5p.mat_struct()
obj_template._fieldnames = field_names
result = np.empty(length, dtype=object)
for i in range(length):
item = pycopy(obj_template)
for name in field_names:
item.__dict__[name] = self.read_mi_matrix()
result[i] = item
return result.reshape(tupdims).T
cpdef cnp.ndarray read_opaque(self, VarHeader5 hdr):
''' Read opaque (function workspace) type
Looking at some mat files, the structure of this type seems to
be:
* array flags as usual (already read into `hdr`)
* 3 int8 strings
* a matrix
Then there's a matrix at the end of the mat file that seems have
the anonymous founction workspaces - we load it as
``__function_workspace__``
See the comments at the beginning of ``mio5.py``
'''
cdef cnp.ndarray res = np.empty((1,), dtype=OPAQUE_DTYPE)
res[0]['s0'] = self.read_int8_string()
res[0]['s1'] = self.read_int8_string()
res[0]['s2'] = self.read_int8_string()
res[0]['arr'] = self.read_mi_matrix()
return res