Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Switched to fixed width input matrix to prevent different widths on training set and final data, and added in option to cast target as int or otherwise pass straight through
  • Loading branch information
Chris Raynor committed Nov 18, 2011
1 parent 4acab1c commit 802e56b
Show file tree
Hide file tree
Showing 3 changed files with 8 additions and 12 deletions.
16 changes: 6 additions & 10 deletions mlloutils.py
Expand Up @@ -5,12 +5,12 @@
# Takes our Kaggle data sets, where some of the columns are lists of space-separated
# numbers representing words, and expands them into a flat array containing a binary
# value for each possible word, indicating if it was present
def expand_to_vectors(filename, lat_header, lon_header, code_headers, target_header=None):
def expand_to_vectors(filename, lat_header, lon_header, code_headers, target_header=None, int_target=False):
code_headers_map = {}
for index, header in enumerate(code_headers):
code_headers_map[header] = index
reader = csv.reader(open(filename))
max_code = 0
max_code = 2153 # number of whitelisted words
max_index = 0
max_i = 0
for i, input_row in enumerate(reader):
Expand All @@ -22,13 +22,6 @@ def expand_to_vectors(filename, lat_header, lon_header, code_headers, target_hea
# do not treat the target as a text field
continue
max_index = max(max_index, index)
if index in code_headers_map:
codes = value.split(' ')
for code_string in codes:
if code_string == '':
continue
code = int(code_string)
max_code = max(max_code, code)
latlon_start = max_index+(len(code_headers)*max_code)
max_j = latlon_start+360*180
reader = csv.reader(open(filename))
Expand All @@ -42,7 +35,10 @@ def expand_to_vectors(filename, lat_header, lon_header, code_headers, target_hea
continue # Skip header row
for index, value in enumerate(input_row):
if index == target_header:
target.append(int(value))
            if int_target:
                target.append(int(value))
            else:
                target.append(value)
elif index in code_headers_map:
code_offset = max_index+(code_headers_map[index]*max_code)
codes = value.split(' ')
Expand Down
2 changes: 1 addition & 1 deletion predict.py
Expand Up @@ -17,7 +17,7 @@

test_name = sys.argv[1]
print >> sys.stderr, 'Loading test set from '+test_name
test_vectors, ids = expand_to_vectors(test_name, 1, 2, [6, 7, 8], 0)
test_vectors, ids = expand_to_vectors(test_name, 1, 2, [6, 7, 8], 0, False)
print "%d vectors with dimension %d" % test_vectors.shape

print >> sys.stderr, 'Predicting...'
Expand Down
2 changes: 1 addition & 1 deletion train.py
Expand Up @@ -20,7 +20,7 @@
training_name = sys.argv[1]
print >> sys.stderr, 'Loading training set from '+training_name
training_vectors, training_target = expand_to_vectors(
training_name, 1, 2, [6, 7, 8], 9)
training_name, 1, 2, [6, 7, 8], 9, True)
print "%d vectors with dimension %d" % training_vectors.shape

# Normalize the sparse positive features using the TF-IDF normalizer as field
Expand Down

0 comments on commit 802e56b

Please sign in to comment.