-
Notifications
You must be signed in to change notification settings - Fork 0
/
preproc.py
95 lines (70 loc) · 2.32 KB
/
preproc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import nltk
import pandas as pd
from collections import Counter, defaultdict
import numpy as np
from constants import OFFSET
def tokenize_and_downcase(string, vocab=None):
    """Build a lowercased bag-of-words for a single document.

    The document is split into sentences with ``nltk.sent_tokenize`` and each
    sentence into words with ``nltk.word_tokenize``; every token is
    lowercased before being counted.

    :param string: input document
    :param vocab: optional iterable of tokens; when given and non-empty, the
        lowercased form of each token is seeded in the result with count 0
        before the document itself is counted
    :returns: counter of lowercased tokens and frequencies
    :rtype: Counter
    """
    counts = Counter()
    # Seed vocabulary entries at zero so they are present even when the
    # document never mentions them.
    for entry in (vocab or ()):
        counts[entry.lower()] = 0
    for sent in nltk.sent_tokenize(string):
        # Counter.update adds one per element produced by the generator.
        counts.update(tok.lower() for tok in nltk.word_tokenize(sent))
    return counts
### Helper code
def read_data(csvfile, labelname, preprocessor=lambda x: x, textname='text'):
    """Read labels and preprocessed documents from a CSV file.

    :param csvfile: path or file-like object accepted by ``pandas.read_csv``
    :param labelname: name of the column holding the labels
    :param preprocessor: callable applied to each document string; the
        default returns the string unchanged
    :param textname: name of the column holding the document text
        (defaults to ``'text'``)
    :returns: tuple ``(labels, documents)`` where ``labels`` is the label
        column's ``.values`` array and ``documents`` is a list with one
        preprocessed entry per row
    :rtype: tuple
    """
    # The file is read with utf-8 encoding.
    df = pd.read_csv(csvfile, encoding='utf-8')
    labels = df[labelname].values
    documents = [preprocessor(string) for string in df[textname].values]
    return labels, documents
def get_vocab_counts(x_tr, x_te):
    """Sum token frequencies over the training and test bags-of-words.

    :param x_tr: iterable of bags-of-words (mappings of token -> count)
    :param x_te: iterable of bags-of-words (mappings of token -> count)
    :returns: total count of every token across both collections
    :rtype: Counter
    """
    counts = Counter()
    for bows in (x_tr, x_te):
        for bow in bows:
            # .items() replaces the Python-2-only .iteritems(), so this loop
            # runs on both Python 2 and Python 3.
            for token, freq in bow.items():
                counts[token] += freq
    return counts
def create_vocab(x_tr, x_te):
    """Collect the set of all tokens seen in the training and test data.

    :param x_tr: list of bags-of-words (e.g. Counter objects keyed by token)
    :param x_te: list of bags-of-words (e.g. Counter objects keyed by token)
    :returns: every token that appears in either collection, plus OFFSET
    :rtype: set
    """
    vocab = set()
    for bows in (x_tr, x_te):
        for bow in bows:
            # Iterating a mapping yields its keys; this replaces the
            # Python-2-only .iterkeys() and works on Python 2 and 3.
            vocab.update(bow)
    # OFFSET is always part of the vocab (original note: this should improve
    # accuracies later).
    vocab.add(OFFSET)
    return vocab
def create_mapping(vocab, K):
    """Assign each vocabulary token an index into a feature array.

    :param vocab: iterable of distinct tokens; indices follow its iteration
        order, starting at 0
    :param K: index returned for any token that is not in the vocab
        (described by the original author as the size of the vocab)
    :returns: mapping from token to index
    :rtype: defaultdict
    """
    # A defaultdict keeps a lookup of an out-of-vocab word from raising
    # KeyError: such words resolve to the Kth index instead.
    mapping = defaultdict(lambda: K)
    for position, token in enumerate(vocab):
        mapping[token] = position
    return mapping
def feature_vector(bow, vocab, K, mapping):
    """Convert a bag-of-words into a dense count vector.

    :param bow: mapping of token -> count for one document
    :param vocab: collection of tokens; only its length is used, to size the
        returned vector
    :param K: sentinel index meaning "not in vocab"; words whose index equals
        K are skipped
    :param mapping: token -> index lookup (e.g. the defaultdict returned by
        create_mapping, which yields K for unknown tokens)
    :returns: array of length ``len(vocab)`` holding the count of each
        in-vocab token at its mapped index
    :rtype: numpy.ndarray
    """
    vec = np.zeros(len(vocab))
    # .items() replaces the Python-2-only .iteritems(), so this loop runs on
    # both Python 2 and Python 3.
    for word, count in bow.items():
        idx = mapping[word]
        # Index K marks an out-of-vocab word; it has no slot in the vector.
        if idx != K:
            vec[idx] += count
    return vec