Initial import

@petewarden committed c7de70e0a98a5c31050b4d201262c7a6c08fa14c (root commit, 0 parents) on Nov 1, 2011
Showing with 52,428 additions and 0 deletions.
  1. +58 −0 mlloutils.py
  2. +28 −0 predict.py
  3. +52 −0 score.py
  4. +12,001 −0 test_data.csv
  5. +26 −0 train.py
  6. +40,263 −0 training_data.csv
mlloutils.py
@@ -0,0 +1,58 @@
+import csv
+import numpy as np
+import scipy.sparse as sp
+
+# Takes our Kaggle data sets, where some of the columns are lists of space-separated
+# numbers representing words, and expands them into a flat array containing a binary
+# value for each possible word, indicating if it was present
+def expand_to_vectors(filename, code_headers, target_header=None):
+    code_headers_map = {}
+    for index, header in enumerate(code_headers):
+        code_headers_map[header] = index
+    reader = csv.reader(open(filename)) or die('Couldn\'t open '+filename)
+    max_code = 0
+    max_index = 0
+    max_i = 0
+    for i, input_row in enumerate(reader):
+        max_i = max(max_i, i)
+        if i == 0:
+            continue # Skip header row
+        for index, value in enumerate(input_row):
+            max_index = max(max_index, index)
+            if index in code_headers_map:
+                codes = value.split(' ')
+                for code_string in codes:
+                    if code_string == '':
+                        continue
+                    code = int(code_string)
+                    max_code = max(max_code, code)
+    max_j = max_index+(len(code_headers)*max_code)
+    reader = csv.reader(open(filename)) or die('Couldn\'t open '+filename)
+    i_indices = []
+    j_indices = []
+    values = []
+    target = []
+    for i, input_row in enumerate(reader):
+        if i == 0:
+            continue # Skip header row
+        output_row = []
+        for index, value in enumerate(input_row):
+            if index == target_header:
+                target.append(int(value))
+            elif index in code_headers_map:
+                code_offset = max_index+(code_headers_map[index]*max_code)
+                codes = value.split(' ')
+                for code_string in codes:
+                    if code_string == '':
+                        continue
+                    code = int(code_string)
+                    i_indices.append(i-1)
+                    j_indices.append(code_offset+code)
+                    values.append(1.0)
+            elif index != 0:
+                i_indices.append(i-1)
+                j_indices.append(index)
+                values.append(int(value))
+    shape = (max_i+1, max_j+1)
+    output = sp.coo_matrix((values, (i_indices, j_indices)), shape=shape, dtype=np.dtype(float))
+    return (output, target)
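
Before the scripts that use it, here is a small sketch, not part of the commit, of what expand_to_vectors produces; the toy.csv name, its three-column layout, and its values are invented purely for illustration:

    # A minimal sketch (not part of the commit) of what expand_to_vectors returns.
    from mlloutils import expand_to_vectors

    with open('toy.csv', 'w') as f:
        f.write('id,score,words\n')  # header row, skipped by expand_to_vectors
        f.write('1,3,2 5\n')         # 'words' holds space-separated word codes
        f.write('2,7,1\n')

    # Treat column 2 as a code column, and column 0 as the target/id column.
    vectors, ids = expand_to_vectors('toy.csv', [2], 0)

    print ids                # [1, 2]
    print vectors.todense()  # the plain 'score' column keeps its value; each word
                             # code becomes a 1.0 in its own column of the matrix

Note that the matrix is sized from max_i, which counts the header row, so it carries one extra all-zero row beyond the data rows.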
predict.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python
+import csv
+from sklearn import svm
+import numpy as np
+import scipy.sparse as sp
+import sys
+from sklearn.externals import joblib
+from mlloutils import expand_to_vectors
+
+if len(sys.argv) < 2:
+    print >> sys.stderr, 'Usage: python '+sys.argv[0]+' <test csv input> <persistent model input>'
+    exit(0)
+
+model_name = sys.argv[2]
+print >> sys.stderr, 'Loading classifier from '+model_name
+clf = joblib.load(model_name) or die('Couldn\'t load model from '+model_name)
+
+test_name = sys.argv[1]
+print >> sys.stderr, 'Loading test set from '+test_name
+test_vectors, ids = expand_to_vectors(test_name, [6, 7, 8], 0)
+
+print >> sys.stderr, 'Predicting...'
+prediction_matrix = clf.predict(test_vectors)
+prediction = prediction_matrix.tolist()
+
+print 'id,good'
+for index, value in enumerate(ids):
+    print str(value)+','+str(prediction[index])
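
predict.py expects a classifier that was persisted with joblib. The train.py diff is not rendered in this view, so what follows is only a hedged sketch of the usual scikit-learn pattern for producing such a model; the target column index, the output file name, and the choice of svm.SVC are assumptions, not the commit's actual code:

    # Not the train.py from this commit; a hypothetical sketch of how a model
    # compatible with predict.py could be trained and saved.
    from sklearn import svm
    from sklearn.externals import joblib
    from mlloutils import expand_to_vectors

    # Assumption: columns 6, 7, 8 hold word codes (as in predict.py) and
    # column 1 holds the 0/1 label to learn; the real layout may differ.
    train_vectors, target = expand_to_vectors('training_data.csv', [6, 7, 8], 1)

    clf = svm.SVC()
    # Some scikit-learn versions want CSR rather than COO input for SVMs.
    clf.fit(train_vectors.tocsr(), target)

    # predict.py would be passed this path as its second argument.
    joblib.dump(clf, 'svm_model')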
score.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python
+import csv
+import sys
+
+# Takes in a CSV file that starts with a header row, and returns a dictionary
+# mapping each row's first column (the id) to the float value in its second column.
+def load_csv(filename):
+    reader = csv.reader(open(filename)) or die('Couldn\'t open '+filename)
+    line = 0
+    output = {}
+    for input_row in reader:
+        line += 1
+        if line == 1:
+            headers = input_row
+        else:
+            key = input_row[0]
+            value = float(input_row[1])
+            output[key] = value
+    return output
+
+solution_name = sys.argv[1]
+solution = load_csv(solution_name)
+
+candidate_name = sys.argv[2]
+candidate = load_csv(candidate_name)
+
+threshold = float(sys.argv[3])
+
+true_positives = 0
+true_negatives = 0
+false_positives = 0
+false_negatives = 0
+for key, solution_value in solution.items():
+    candidate_value = candidate[key]
+    solution_good = (solution_value > threshold)
+    candidate_good = (candidate_value > threshold)
+    if candidate_good and solution_good:
+        true_positives += 1
+    elif (not candidate_good) and (not solution_good):
+        true_negatives += 1
+    elif candidate_good:
+        false_positives += 1
+    else:
+        false_negatives += 1
+
+total = (true_positives + true_negatives + false_positives + false_negatives)
+true_positives_percentage = round((true_positives*1000.0)/total)/10
+true_negatives_percentage = round((true_negatives*1000.0)/total)/10
+false_positives_percentage = round((false_positives*1000.0)/total)/10
+false_negatives_percentage = round((false_negatives*1000.0)/total)/10
+
+print str(true_positives_percentage)+'% true positives, '+str(false_positives_percentage)+'% false positives, '+str(true_negatives_percentage)+'% true negatives, '+str(false_negatives_percentage)+'% false negatives'
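
score.py is run with a solution CSV, a candidate CSV, and a threshold as its three arguments, and it reports the confusion matrix as percentages. The round((count*1000.0)/total)/10 expression keeps one decimal place; a quick check with made-up counts:

    # Made-up counts, only to illustrate the one-decimal rounding used above.
    true_positives, total = 617, 1000
    print round((true_positives*1000.0)/total)/10   # prints 61.7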