-
-
Notifications
You must be signed in to change notification settings - Fork 4.4k
/
csvcorpus.py
75 lines (55 loc) · 2.01 KB
/
csvcorpus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2013 Zygmunt Zając <zygmunt@fastml.com>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""Corpus in CSV format."""
from __future__ import with_statement
import logging
import csv
import itertools
from gensim import interfaces, utils
# Module-level logger; CsvCorpus reports corpus loading and the sniffed CSV dialect through it.
logger = logging.getLogger(__name__)
class CsvCorpus(interfaces.CorpusABC):
    """Corpus in CSV format.

    Notes
    -----
    The CSV delimiter, headers etc. are guessed automatically based on the file content.
    All row values are expected to be ints/floats.
    The file is read in binary mode and every line is decoded as UTF-8 before parsing.

    """

    def __init__(self, fname, labels):
        """

        Parameters
        ----------
        fname : str
            Path to corpus.
        labels : bool
            If True - ignore first column (class labels).

        Raises
        ------
        csv.Error
            If `csv.Sniffer` cannot determine the dialect from the first lines of the file.

        """
        logger.info("loading corpus from %s", fname)
        self.fname = fname
        self.length = None  # unknown until the corpus has been iterated over in full
        self.labels = labels

        # Load the first few lines, to guess the CSV dialect. The file is opened in binary
        # mode, so each line is decoded first: the csv module only works on text.
        with utils.open(self.fname, 'rb') as f:
            head = ''.join(self._to_text(line) for line in itertools.islice(f, 5))

        sniffer = csv.Sniffer()
        self.headers = sniffer.has_header(head)
        self.dialect = sniffer.sniff(head)
        logger.info("sniffed CSV delimiter=%r, headers=%s", self.dialect.delimiter, self.headers)

    @staticmethod
    def _to_text(line):
        """Return `line` as `str`, decoding it as UTF-8 when it is `bytes`."""
        return line.decode('utf-8') if isinstance(line, bytes) else line

    def __iter__(self):
        """Iterate over the corpus, returning one BoW vector at a time.

        After the iteration has run to completion, `self.length` holds the number of
        CSV rows (= documents) that were read.

        Yields
        ------
        list of (int, float)
            Document in BoW format: (column index, value) for every column of the row,
            not counting the label column when `labels` is True.

        Raises
        ------
        ValueError
            If a cell cannot be converted to float.

        """
        with utils.open(self.fname, 'rb') as f:
            # csv.reader needs an iterator of text lines, so decode the binary stream lazily.
            reader = csv.reader((self._to_text(line) for line in f), self.dialect)
            if self.headers:
                # Skip the headers. The default keeps an unexpectedly empty file from
                # raising StopIteration inside this generator (a RuntimeError under PEP 479).
                next(reader, None)

            row_count = 0
            for row_count, row in enumerate(reader, start=1):
                if self.labels:
                    # Ignore the first column = class label. Slicing (unlike pop) also
                    # tolerates blank rows, which the csv module returns as empty lists.
                    row = row[1:]
                yield list(enumerate(float(x) for x in row))

            self.length = row_count  # store the total number of CSV rows = documents