/
_xlrd.py
143 lines (118 loc) · 4.45 KB
/
_xlrd.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
from __future__ import annotations
from datetime import time
import math
from typing import TYPE_CHECKING
import numpy as np
from pandas.compat._optional import import_optional_dependency
from pandas.util._decorators import doc
from pandas.core.shared_docs import _shared_docs
from pandas.io.excel._base import BaseExcelReader
if TYPE_CHECKING:
from xlrd import Book
from pandas._typing import (
Scalar,
StorageOptions,
)
class XlrdReader(BaseExcelReader["Book"]):
    """Excel reader backend that delegates workbook parsing to xlrd."""

    @doc(storage_options=_shared_docs["storage_options"])
    def __init__(
        self,
        filepath_or_buffer,
        storage_options: StorageOptions | None = None,
        engine_kwargs: dict | None = None,
    ) -> None:
        """
        Reader using xlrd engine.

        Parameters
        ----------
        filepath_or_buffer : str, path object or Workbook
            Object to be parsed.
        {storage_options}
        engine_kwargs : dict, optional
            Arbitrary keyword arguments passed to excel engine.
        """
        # Check that xlrd is importable before the base class touches the file.
        import_optional_dependency(
            "xlrd", extra="Install xlrd >= 2.0.1 for xls Excel support"
        )
        super().__init__(
            filepath_or_buffer,
            storage_options=storage_options,
            engine_kwargs=engine_kwargs,
        )

    @property
    def _workbook_class(self) -> type[Book]:
        """Return xlrd's ``Book`` class (imported lazily)."""
        from xlrd import Book

        return Book

    def load_workbook(self, filepath_or_buffer, engine_kwargs) -> Book:
        """
        Open a workbook with ``xlrd.open_workbook``.

        Objects exposing a ``read`` attribute are read once and handed to
        xlrd as ``file_contents``; anything else is passed to xlrd as-is.
        ``engine_kwargs`` is unpacked into the xlrd call in both cases.
        """
        from xlrd import open_workbook

        if not hasattr(filepath_or_buffer, "read"):
            return open_workbook(filepath_or_buffer, **engine_kwargs)

        raw_bytes = filepath_or_buffer.read()
        return open_workbook(file_contents=raw_bytes, **engine_kwargs)

    @property
    def sheet_names(self):
        """Return the sheet names reported by the loaded workbook."""
        return self.book.sheet_names()

    def get_sheet_by_name(self, name):
        """Validate ``name`` via the base-class check, then fetch that sheet."""
        self.raise_if_bad_sheet_by_name(name)
        return self.book.sheet_by_name(name)

    def get_sheet_by_index(self, index):
        """Validate ``index`` via the base-class check, then fetch that sheet."""
        self.raise_if_bad_sheet_by_index(index)
        return self.book.sheet_by_index(index)

    def get_sheet_data(
        self, sheet, file_rows_needed: int | None = None
    ) -> list[list[Scalar]]:
        """
        Convert the cells of ``sheet`` into a list of rows of Python scalars.

        Parameters
        ----------
        sheet : xlrd sheet
            Sheet object providing ``nrows``, ``row_values`` and ``row_types``.
        file_rows_needed : int, optional
            When given, at most this many leading rows are converted.

        Returns
        -------
        list of list
            One inner list per row, holding the converted cell values.
        """
        from xlrd import (
            XL_CELL_BOOLEAN,
            XL_CELL_DATE,
            XL_CELL_ERROR,
            XL_CELL_NUMBER,
            xldate,
        )

        epoch1904 = self.book.datemode
        # Excel supports 1900 and 1904 epochs; this is the calendar day that
        # xlrd reports for a value carrying only a time-of-day component.
        epoch_day = (1904, 1, 1) if epoch1904 else (1899, 12, 31)

        def _parse_cell(cell_contents, cell_typ):
            """
            converts the contents of the cell into a pandas appropriate object
            """
            if cell_typ == XL_CELL_DATE:
                # Use the newer xlrd datetime handling.
                try:
                    stamp = xldate.xldate_as_datetime(cell_contents, epoch1904)
                except OverflowError:
                    # Not representable as a datetime: hand back the raw value.
                    return cell_contents

                # Excel doesn't distinguish between dates and time,
                # so we treat dates on the epoch as times only.
                if (stamp.year, stamp.month, stamp.day) != epoch_day:
                    return stamp
                return time(
                    stamp.hour,
                    stamp.minute,
                    stamp.second,
                    stamp.microsecond,
                )

            if cell_typ == XL_CELL_ERROR:
                return np.nan

            if cell_typ == XL_CELL_BOOLEAN:
                return bool(cell_contents)

            if cell_typ == XL_CELL_NUMBER:
                # GH5394 - Excel 'numbers' are always floats
                # it's a minimal perf hit and less surprising
                # GH54564 - don't attempt to convert NaN/Inf
                if math.isfinite(cell_contents):
                    as_int = int(cell_contents)
                    if as_int == cell_contents:
                        return as_int

            # Every other cell type, and non-integral numbers, pass through.
            return cell_contents

        row_count = sheet.nrows
        if file_rows_needed is not None:
            row_count = min(row_count, file_rows_needed)

        return [
            [
                _parse_cell(value, typ)
                for value, typ in zip(sheet.row_values(idx), sheet.row_types(idx))
            ]
            for idx in range(row_count)
        ]