Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

Already on GitHub? Sign in to your account

ENH: Allow referencing of Excel columns by their Excel column names. #1936

Closed
wants to merge 8 commits into
from
View
@@ -1312,6 +1312,8 @@ def parse(self, sheetname, header=0, skiprows=None, skip_footer=0,
If None then parse all columns,
If int then indicates last column to be parsed
If list of ints then indicates list of column numbers to be parsed
+ If string then indicates comma separated list of column names and
+ column ranges (e.g. "A:E" or "A,C,E:F")
na_values : list-like, default None
List of additional strings to recognize as NA/NaN
@@ -1336,8 +1338,34 @@ def parse(self, sheetname, header=0, skiprows=None, skip_footer=0,
skip_footer=skip_footer)
def _should_parse(self, i, parse_cols):
    """
    Decide whether the column at 0-based position ``i`` should be parsed.

    Parameters
    ----------
    i : int
        0-based index of the column under consideration.
    parse_cols : int, string or container of ints
        If int, every column up to and including that index is parsed.
        If string, a comma separated list of Excel column names and
        column ranges (e.g. "A:E" or "A,C,E:F"); names are case
        insensitive and surrounding whitespace is ignored.
        Otherwise ``parse_cols`` is tested with ``i in parse_cols``.

    Returns
    -------
    bool
        True if column ``i`` is selected by ``parse_cols``.

    Raises
    ------
    ValueError
        If a string ``parse_cols`` contains a column name with characters
        other than the letters A-Z.
    """

    def _excel2num(name):
        """
        Convert an Excel column name like 'AB' to a 0-based column index.

        >>> _excel2num('A')
        0
        >>> _excel2num('AB')
        27
        """
        index = 0
        for char in name.strip().upper():
            if not 'A' <= char <= 'Z':
                # Anything else would silently map to a bogus index.
                raise ValueError('Invalid Excel column name: %r' % name)
            # Excel names are bijective base 26: 'A' is 1 and 'Z' is 26.
            index = index * 26 + ord(char) - ord('A') + 1
        # An empty name yields -1, which never equals a real column index.
        return index - 1

    def _range2bounds(areas):
        """
        Convert a comma separated list of column names and column ranges
        to a list of inclusive ``(first, last)`` 0-based index pairs.

        >>> _range2bounds('A:E')
        [(0, 4)]
        >>> _range2bounds('A,C,Z:AB')
        [(0, 0), (2, 2), (25, 27)]
        """
        bounds = []
        for area in areas.split(','):
            if ':' in area:
                parts = area.split(':')
                bounds.append((_excel2num(parts[0]), _excel2num(parts[1])))
            else:
                col = _excel2num(area)
                bounds.append((col, col))
        return bounds

    # ``basestring`` only exists on Python 2, where it covers both str and
    # unicode; fall back to ``str`` so the check also works on Python 3.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    if isinstance(parse_cols, int):
        return i <= parse_cols
    elif isinstance(parse_cols, string_types):
        # Compare against the range bounds directly instead of expanding
        # every range into a full list of indexes on each call.
        return any(first <= i <= last
                   for first, last in _range2bounds(parse_cols))
    else:
        return i in parse_cols
@@ -861,6 +861,48 @@ def test_parse_cols_list(self):
assert_frame_equal(df, df2)
assert_frame_equal(df3, df2)
def test_parse_cols_str(self):
    """
    Check that ``parse_cols`` accepts Excel-style column name strings.

    For both ``test.xls`` and ``test.xlsx``, each column spec is applied
    to 'Sheet1' and to 'Sheet2' (the latter with ``skiprows=[1]``), and
    both results are compared against the reference CSV reindexed to the
    expected columns.
    """
    _skip_if_no_openpyxl()
    _skip_if_no_xlrd()

    # (parse_cols spec, columns expected next to the index column)
    cases = [
        ('A:D', ['A', 'B', 'C']),
        ('A,C,D', ['B', 'C']),
        ('A,C:D', ['B', 'C']),
    ]

    for ext in ('', 'x'):
        workbook = ExcelFile(os.path.join(self.dirpath, 'test.xls%s' % ext))

        for spec, wanted in cases:
            from_sheet1 = workbook.parse('Sheet1', index_col=0,
                                         parse_dates=True, parse_cols=spec)
            expected = read_csv(self.csv1, index_col=0, parse_dates=True)
            expected = expected.reindex(columns=wanted)
            from_sheet2 = workbook.parse('Sheet2', skiprows=[1],
                                         index_col=0, parse_dates=True,
                                         parse_cols=spec)
            assert_frame_equal(from_sheet1, expected)
            assert_frame_equal(from_sheet2, expected)
+
def test_read_table_unicode(self):
fin = StringIO('\u0141aski, Jan;1')
df1 = read_table(fin, sep=";", encoding="utf-8", header=None)