Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP

Loading…

add --zero to csvcut. like -n but zero based for mapping to python, etc #144

Merged
merged 5 commits into from

2 participants

@JoeGermuska

I want it! Would have preferred '-z' but that flag is already used, so I didn't use any short flag.

@onyxfish
Owner

Just now looking over this patch. I think it should be generalized before it is merged in. cut, grep, join, sort and stat all use column indices. We should also update -n to print zero-based columns when --zero is also included.

@JoeGermuska

I'll buy that. I hadn't thought of it in terms of shifting all counting, but it makes sense. It might be a while before I get to it, but I'll do it.

@JoeGermuska

This is now extended so that all column-name printing and column-index parsing honor the --zero flag.

@onyxfish onyxfish merged commit 77580b2 into from
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
This page is out of date. Refresh to see the latest.
View
48 csvkit/cli.py
@@ -142,6 +142,12 @@ def _init_common_parser(self):
self.argparser.add_argument('-l', '--linenumbers', dest='line_numbers', action='store_true',
help='Insert a column of line numbers at the front of the output. Useful when piping to grep or as a simple primary key.')
+ # Input/Output
+ if 'zero' not in self.override_flags:
+ self.argparser.add_argument('--zero', dest='zero_based', action='store_true',
+ help='When interpreting or displaying column numbers, use zero-based numbering instead of the default 1-based numbering.')
+
+
def _extract_csv_reader_kwargs(self):
"""
Extracts from the command-line arguments those that should be passed through to the input CSV reader(s).
@@ -201,7 +207,27 @@ def handler(t, value, traceback):
sys.excepthook = handler
-def match_column_identifier(column_names, c):
+ def print_column_names(self):
+ """
+ Pretty-prints the names and indices of all columns to a file-like object (usually sys.stdout).
+ """
+ f = self.args.file
+ output = self.output_file
+ try:
+ zero_based=self.args.zero_based
+ except:
+ zero_based=False
+
+ rows = CSVKitReader(f, **self.reader_kwargs)
+ column_names = rows.next()
+
+ for i, c in enumerate(column_names):
+ if not zero_based:
+ i += 1
+ output.write('%3i: %s\n' % (i, c))
+
+
+def match_column_identifier(column_names, c, zero_based=False):
"""
Determine what column a single column id (name or index) matches in a series of column names.
Note that integer values are *always* treated as positional identifiers. If you happen to have
@@ -211,7 +237,9 @@ def match_column_identifier(column_names, c):
return column_names.index(c)
else:
try:
- c = int(c) - 1
+ c = int(c)
+ if not zero_based:
+ c -= 1
# Fail out if neither a column name nor an integer
except:
raise ColumnIdentifierError('Column identifier "%s" is neither an integer, nor a existing column\'s name.' % c)
@@ -226,7 +254,7 @@ def match_column_identifier(column_names, c):
return c
-def parse_column_identifiers(ids, column_names):
+def parse_column_identifiers(ids, column_names,zero_based=False):
"""
Parse a comma-separated list of column indices AND/OR names into a list of integer indices.
Ranges of integers can be specified with two integers separated by a '-' or ':' character. Ranges of
@@ -243,7 +271,7 @@ def parse_column_identifiers(ids, column_names):
c = c.strip()
try:
- columns.append(match_column_identifier(column_names, c))
+ columns.append(match_column_identifier(column_names, c, zero_based))
except ColumnIdentifierError:
if ':' in c:
a,b = c.split(':',1)
@@ -266,17 +294,7 @@ def parse_column_identifiers(ids, column_names):
raise ColumnIdentifierError("Invalid range %s. Ranges must be two integers separated by a - or : character.")
for x in range(a,b):
- columns.append(match_column_identifier(column_names, x))
+ columns.append(match_column_identifier(column_names, x, zero_based))
return columns
-def print_column_names(f, output, **reader_kwargs):
- """
- Pretty-prints the names and indices of all columns to a file-like object (usually sys.stdout).
- """
- rows = CSVKitReader(f, **reader_kwargs)
- column_names = rows.next()
-
- for i, c in enumerate(column_names):
- output.write('%3i: %s\n' % (i + 1, c))
-
View
4 csvkit/table.py
@@ -174,7 +174,7 @@ def row(self, i):
return row_data
@classmethod
- def from_csv(cls, f, name='from_csv_table', snifflimit=None, column_ids=None, blanks_as_nulls=True, **kwargs):
+ def from_csv(cls, f, name='from_csv_table', snifflimit=None, column_ids=None, blanks_as_nulls=True, zero_based=False, **kwargs):
"""
Creates a new Table from a file-like object containing CSV data.
@@ -200,7 +200,7 @@ def from_csv(cls, f, name='from_csv_table', snifflimit=None, column_ids=None, bl
headers = reader.next()
if column_ids:
- column_ids = parse_column_identifiers(column_ids, headers)
+ column_ids = parse_column_identifiers(column_ids, headers, zero_based)
headers = [headers[c] for c in column_ids]
else:
column_ids = range(len(headers))
View
6 csvkit/utilities/csvcut.py
@@ -10,7 +10,7 @@
"""
from csvkit import CSVKitReader, CSVKitWriter
-from csvkit.cli import CSVKitUtility, parse_column_identifiers, print_column_names
+from csvkit.cli import CSVKitUtility, parse_column_identifiers
class CSVCut(CSVKitUtility):
description = 'Filter and truncate CSV files. Like unix "cut" command, but for tabular data.'
@@ -25,13 +25,13 @@ def add_arguments(self):
def main(self):
if self.args.names_only:
- print_column_names(self.args.file, self.output_file, **self.reader_kwargs)
+ self.print_column_names()
return
rows = CSVKitReader(self.args.file, **self.reader_kwargs)
column_names = rows.next()
- column_ids = parse_column_identifiers(self.args.columns, column_names)
+ column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based)
output = CSVKitWriter(self.output_file, **self.writer_kwargs)
output.writerow([column_names[c] for c in column_ids])
View
6 csvkit/utilities/csvgrep.py
@@ -5,7 +5,7 @@
from argparse import FileType
from csvkit import CSVKitReader, CSVKitWriter
-from csvkit.cli import CSVKitUtility, CSVFileType, parse_column_identifiers, print_column_names
+from csvkit.cli import CSVKitUtility, CSVFileType, parse_column_identifiers
from csvkit.grep import FilteringCSVReader
class CSVGrep(CSVKitUtility):
@@ -30,7 +30,7 @@ def add_arguments(self):
def main(self):
if self.args.names_only:
- print_column_names(self.args.file, self.output_file, **self.reader_kwargs)
+ self.print_column_names()
return
if not self.args.regex and not self.args.pattern and not self.args.matchfile:
@@ -39,7 +39,7 @@ def main(self):
rows = CSVKitReader(self.args.file, **self.reader_kwargs)
column_names = rows.next()
- column_ids = parse_column_identifiers(self.args.columns, column_names)
+ column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based)
if self.args.regex:
pattern = re.compile(self.args.regex)
View
6 csvkit/utilities/csvsort.py
@@ -4,7 +4,7 @@
from csvkit import CSVKitWriter
from csvkit import table
-from csvkit.cli import CSVKitUtility, parse_column_identifiers, print_column_names
+from csvkit.cli import CSVKitUtility, parse_column_identifiers
class CSVSort(CSVKitUtility):
description = 'Sort CSV files. Like unix "sort" command, but for tabular data.'
@@ -21,7 +21,7 @@ def add_arguments(self):
def main(self):
if self.args.names_only:
- print_column_names(self.args.file, self.output_file, **self.reader_kwargs)
+ self.print_column_names()
return
if self.args.file.name != '<stdin>':
@@ -31,7 +31,7 @@ def main(self):
table_name = 'csvsql_table'
tab = table.Table.from_csv(self.args.file, name=table_name, snifflimit=self.args.snifflimit, **self.reader_kwargs)
- column_ids = parse_column_identifiers(self.args.columns, tab.headers())
+ column_ids = parse_column_identifiers(self.args.columns, tab.headers(), self.args.zero_based)
rows = tab.to_rows(serialize_dates=True)
rows.sort(key=lambda r: [r[c] for c in column_ids], reverse=self.args.reverse)
View
2  csvkit/utilities/csvstat.py
@@ -44,7 +44,7 @@ def add_arguments(self):
help='Only output max value length.')
def main(self):
- tab = table.Table.from_csv(self.args.file, snifflimit=self.args.snifflimit, column_ids=self.args.columns, **self.reader_kwargs)
+ tab = table.Table.from_csv(self.args.file, snifflimit=self.args.snifflimit, column_ids=self.args.columns,zero_based=self.zero_based, **self.reader_kwargs)
operations = [op for op in OPERATIONS if getattr(self.args, op + '_only')]
View
11 tests/test_cli.py
@@ -10,24 +10,33 @@ def setUp(self):
def test_match_column_identifier_string(self):
self.assertEqual(2, match_column_identifier(self.headers, 'i_work_here'))
+ self.assertEqual(2, match_column_identifier(self.headers, 'i_work_here', zero_based=True))
def test_match_column_identifier_numeric(self):
self.assertEqual(2, match_column_identifier(self.headers, 3))
+ self.assertEqual(3, match_column_identifier(self.headers, 3, zero_based=True))
def test_match_column_which_could_be_integer_name_is_treated_as_positional_id(self):
self.assertEqual(0, match_column_identifier(self.headers, '1'))
+ self.assertEqual(1, match_column_identifier(self.headers, '1', zero_based=True))
def test_parse_column_identifiers(self):
self.assertEqual([2, 0, 1], parse_column_identifiers(' i_work_here, 1,name ', self.headers))
+ self.assertEqual([2, 1, 1], parse_column_identifiers(' i_work_here, 1,name ', self.headers, zero_based=True))
def test_range_notation(self):
self.assertEqual([0,1,2], parse_column_identifiers('1:3', self.headers))
+ self.assertEqual([1,2,3], parse_column_identifiers('1:3', self.headers, zero_based=True))
self.assertEqual([1,2,3], parse_column_identifiers('2-4', self.headers))
+ self.assertEqual([2,3,4], parse_column_identifiers('2-4', self.headers, zero_based=True))
self.assertEqual([0,1,2,3], parse_column_identifiers('1,2:4', self.headers))
+ self.assertEqual([1,2,3,4], parse_column_identifiers('1,2:4', self.headers, zero_based=True))
self.assertEqual([4,2,5], parse_column_identifiers('more-header-values,3,stuff', self.headers))
+ self.assertEqual([4,3,5], parse_column_identifiers('more-header-values,3,stuff', self.headers,zero_based=True))
def test_range_notation_open_ended(self):
self.assertEqual([0,1,2], parse_column_identifiers(':3', self.headers))
target = range(3,len(self.headers) - 1) # protect against devs adding to self.headers
target.insert(0,0)
- self.assertEqual(target, parse_column_identifiers('1,4:', self.headers))
+ self.assertEqual(target, parse_column_identifiers('1,4:', self.headers))
+
Something went wrong with that request. Please try again.