/
mlloutils.py
65 lines (64 loc) · 2.26 KB
/
mlloutils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import csv
import numpy as np
import scipy.sparse as sp
# Takes our Kaggle data sets, where some of the columns are lists of space-separated
# numbers representing words, and expands them into a flat array containing a binary
# value for each possible word, indicating if it was present
def expand_to_vectors(filename, lat_header, lon_header, code_headers, target_header=None, int_target=False, max_code=2153):
    """Expand a CSV data set into a sparse feature matrix and a target array.

    The first CSV row is treated as a header and skipped. Every following
    row becomes one row of the output matrix. All ``*_header`` arguments are
    zero-based column indices (they are compared against the position of
    each cell in its row), not column names.

    Output column layout, where ``max_index`` is the highest non-target
    column index seen in the file:

    * plain column ``k`` (``k != 0`` and not a target, code, lat or lon
      column) -> output column ``k``, holding ``int(cell)``;
    * code column at position ``p`` in ``code_headers``, word code ``c``
      -> output column ``max_index + p * max_code + c``, holding ``1.0``;
    * one lat/lon cell per row -> output column
      ``latlon_start + (lat + 90) * 360 + lon + 180`` holding ``1.0``, with
      ``latlon_start = max_index + len(code_headers) * max_code``.

    Column 0 of the CSV is ignored unless it is one of the special columns.

    Args:
        filename: Path of the CSV file to read.
        lat_header: Index of the latitude column (cells parsed with ``int``).
        lon_header: Index of the longitude column (cells parsed with ``int``).
        code_headers: Sequence of indices of the columns that hold
            space-separated integer word codes.
        target_header: Index of the target column, or ``None`` when the file
            has no target; the returned target array is then empty.
        int_target: When true, target cells are converted with ``int``;
            otherwise they are kept as the strings read from the file.
        max_code: Number of output columns reserved per code column
            (the number of whitelisted words). Defaults to 2153.

    Returns:
        A ``(matrix, target)`` tuple: ``matrix`` is a float
        ``scipy.sparse.coo_matrix`` of shape
        ``(data_rows, latlon_start + 360 * 180 + 1)`` and ``target`` is a
        NumPy array with one entry per data row that had a target cell.

    Raises:
        ValueError: If a numeric, code, lat or lon cell cannot be parsed by
            ``int``.

    NOTE(review): word code 0 of the first code column maps to output column
    ``max_index``, which is also where the last plain column lands;
    presumably codes start at 1 -- confirm against the data.
    NOTE(review): the lat/lon cell formula is only collision-free and inside
    the matrix for lat in [-90, 89] and lon in [-180, 179]; confirm the data
    never contains lat == 90 or lon == 180.
    """
    # Position of each code column inside code_headers; it selects which
    # block of max_code output columns that column's words are written to.
    code_headers_map = {header: position for position, header in enumerate(code_headers)}

    # First pass: count the data rows and find the widest non-target column
    # index so the code and lat/lon blocks can be placed after it.
    max_index = 0
    max_i = 0
    with open(filename) as csv_file:
        for i, input_row in enumerate(csv.reader(csv_file)):
            max_i = i  # i only grows, so the last value is the row count
            if i == 0:
                continue  # Skip header row
            for index in range(len(input_row)):
                if index == target_header:
                    # do not treat the target as a text field
                    continue
                max_index = max(max_index, index)

    latlon_start = max_index + (len(code_headers) * max_code)
    max_j = latlon_start + 360 * 180

    i_indices = []
    j_indices = []
    values = []
    target = []

    # Second pass: emit the (row, column, value) triplets.
    with open(filename) as csv_file:
        for i, input_row in enumerate(csv.reader(csv_file)):
            if i == 0:
                continue  # Skip header row
            row = i - 1  # output rows are zero-based and exclude the header
            # A row without lat/lon cells falls back to (0, 0).
            lat = 0
            lon = 0
            for index, value in enumerate(input_row):
                if index == target_header:
                    target.append(int(value) if int_target else value)
                elif index in code_headers_map:
                    code_offset = max_index + (code_headers_map[index] * max_code)
                    for code_string in value.split(' '):
                        if code_string == '':
                            # empty cell or repeated/trailing separator
                            continue
                        i_indices.append(row)
                        j_indices.append(code_offset + int(code_string))
                        values.append(1.0)
                elif index == lat_header:
                    lat = int(value)
                elif index == lon_header:
                    lon = int(value)
                elif index != 0:
                    i_indices.append(row)
                    j_indices.append(index)
                    values.append(int(value))
            # One indicator per row for its 1-degree lat/lon grid cell.
            i_indices.append(row)
            j_indices.append(latlon_start + (lat + 90) * 360 + lon + 180)
            values.append(1.0)

    shape = (max_i, max_j + 1)
    output = sp.coo_matrix((values, (i_indices, j_indices)), shape=shape, dtype=np.dtype(float))
    return (output, np.asarray(target))