In [7]:
import collections
import heapq
import matplotlib.pyplot as plt
import numpy as np
import os, time, json, sys
import pickle

import itertools, sys, math, time
from scipy.spatial import distance as dist
from numpy import linalg

from matplotlib import offsetbox
from scipy.sparse import csr_matrix
from sklearn import cluster, datasets, decomposition, ensemble, lda, manifold, random_projection
from sklearn.decomposition import TruncatedSVD

from sklearn import svm, grid_search
from sklearn.decomposition import PCA, SparsePCA
from scipy import sparse as sp

from sklearn import cluster

%matplotlib inline

In [8]:
def sparse_where(sparse_matrix, num):
    """
    np.where() for a sparse matrix. Returns a set of indices.
    """
    return set(np.where(sparse_matrix[num,:].toarray())[1].tolist())

In [9]:
def load_sparse_data(filename, num_lines):
    """
    Function to load sparse data.
    """
    inverted_index = collections.defaultdict(set)
    
    sparse_indptr = [0]
    sparse_indices = []
    sparse_data = []
    vocabulary = {}

    print 'Reading data.'
    for line_num, line in enumerate(open(filename)):
        new_row = [(idx,float(prob)) for idx, prob in enumerate(line.strip().split(',')) if float(prob) > 0.0]
        for i,p in new_row:
            sparse_indices.append(i)
            sparse_data.append(p)
            inverted_index[i].add(line_num)
        sparse_indptr.append(len(sparse_indices))
        sys.stdout.write("\r%d%%" % (100.0 * line_num / num_lines))
        sys.stdout.flush()
    print 100.0 * line_num / num_lines, '%'
    print 'Done reading data.'

    sparse_matrix = csr_matrix((sparse_data, sparse_indices, sparse_indptr), dtype=float)
    return sparse_matrix, inverted_index

In [10]:
# Skip parts that take a long time.
SKIP_LONG_PARTS = True

# Create a dense representation of the data.
CREATE_DENSE_ARRAY = False

NUM_SPEECHES = 2740

NUM_DEBATES = 38

In [11]:
speech_vectors, inverted_index = load_sparse_data('speech_vectors.csv', NUM_SPEECHES)

Reading data.
99%99.9635036496 %
Done reading data.


In [12]:
speech_graph, inverted_graph = load_sparse_data('speech_graph.csv', NUM_SPEECHES)

Reading data.
99%99.9635036496 %
Done reading data.


In [13]:
from sklearn import svm, grid_search
from sklearn.decomposition import PCA, SparsePCA
from scipy import sparse as sp

# run PCA on data
truncated_svd = TruncatedSVD(n_components=10)
reduced_data = truncated_svd.fit_transform(sp.hstack((speech_graph.todense(), speech_vectors[:, :25000])))

# form the training data
X = reduced_data[[2, 13, 18, 24, 1, 3, 27, 177], :]
y = [0 for i in range(4)] + [1 for i in range(4)]

# fit SVM, searching over paramaters
parameters = {'kernel':('linear', 'rbf'), 'C':np.arange(1, 10, 0.5)}
svr = svm.SVC()
clf = grid_search.GridSearchCV(svr, parameters)
a = clf.fit(X, y)
print a
print clf

# make predictions
predictions = []
ones = 0

for i in range(len(reduced_data)):
    prediction = clf.predict(reduced_data[i])[0]
    if prediction == 1:
        ones += 1
    predictions += [prediction]

clf = grid_search.GridSearchCV(svr, parameters)

# print predictions
print float(ones)/len(predictions)

GridSearchCV(cv=None,
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'kernel': ('linear', 'rbf'), 'C': array([ 1. ,  1.5,  2. ,  2.5,  3. ,  3.5,  4. ,  4.5,  5. ,  5.5,  6. ,
        6.5,  7. ,  7.5,  8. ,  8.5,  9. ,  9.5])},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=0)
GridSearchCV(cv=None,
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'kernel': ('linear', 'rbf'), 'C': array([ 1. ,  1.5,  2. ,  2.5,  3. ,  3.5,  4. ,  4.5,  5. ,  5.5,  6. ,
        6.5,  7. ,  7.5,  8. 

In [14]:
good_preds = np.loadtxt(open('preds_laplacian1.csv'),delimiter=",",skiprows=1)

# score = 0
# for i, pred in enumerate(good_preds):
#     if pred == predictions[i]:
#         score += 1
# print 'score' + str(score/2740.0)
    
print (2740-sum(np.logical_xor(predictions, good_preds[:, 1])))/2740.0

with open('svm_pred.csv', 'w') as output_file:
    output_file.write('Id,Prediction\n')
    for i, p in enumerate(predictions):
        output_file.write(str(i) + ', ' + str(p) + '\n')

0.99197080292
