In [100]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.cross_validation import KFold
from sklearn.grid_search import GridSearchCV

In [101]:
# Load every document (train and test splits together) of the two
# newsgroups used for this binary classification task.
newsgroups = datasets.fetch_20newsgroups(
    subset='all',
    categories=['alt.atheism', 'sci.space'],
)

In [102]:
# Number of target labels in the loaded subset.
newsgroups.target.size

1786

In [103]:
# Fit a TF-IDF vocabulary on all loaded documents and transform them in one
# step; the vectorizer is kept because its vocabulary is needed later to map
# feature indices back to words.
vectorizer = TfidfVectorizer()
data_tf = vectorizer.fit_transform(newsgroups.data)
# Display the resulting matrix summary (shape / storage format).
data_tf

<1786x28382 sparse matrix of type '<type 'numpy.float64'>'
	with 303138 stored elements in Compressed Sparse Row format>

In [104]:
# NOTE(review): this cell only re-displays data_tf, repeating the output of
# the previous cell; it does not change any state.
data_tf

<1786x28382 sparse matrix of type '<type 'numpy.float64'>'
	with 303138 stored elements in Compressed Sparse Row format>

In [105]:
# Candidate values for the SVM regularisation parameter C:
# the eleven powers of ten from 10^-5 up to 10^5 inclusive.
c_exponents = np.arange(-5, 6)
grid = {'C': 10.0 ** c_exponents}
grid

{'C': array([  1.00000000e-05,   1.00000000e-04,   1.00000000e-03,
          1.00000000e-02,   1.00000000e-01,   1.00000000e+00,
          1.00000000e+01,   1.00000000e+02,   1.00000000e+03,
          1.00000000e+04,   1.00000000e+05])}

In [106]:
# 5-fold cross-validation splitter over all samples; shuffling with a fixed
# seed keeps the fold assignment reproducible between runs.
cv = KFold(
    n=newsgroups.target.size,
    n_folds=5,
    shuffle=True,
    random_state=241,
)

In [107]:
# Linear-kernel SVM used as the base estimator for the grid search below;
# C is left at its default here because the search supplies the candidates.
clf = SVC(kernel='linear', random_state=241)

In [109]:
# Search over the candidate C values, scoring each by cross-validated
# accuracy on the folds defined by `cv`.
gs = GridSearchCV(
    estimator=clf,
    param_grid=grid,
    scoring='accuracy',
    cv=cv,
)
gs.fit(data_tf, newsgroups.target)

GridSearchCV(cv=sklearn.cross_validation.KFold(n=1786, n_folds=5, shuffle=True, random_state=241),
       error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=241, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': array([  1.00000e-05,   1.00000e-04,   1.00000e-03,   1.00000e-02,
         1.00000e-01,   1.00000e+00,   1.00000e+01,   1.00000e+02,
         1.00000e+03,   1.00000e+04,   1.00000e+05])},
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)

In [111]:
# Report the mean cross-validated accuracy for every candidate parameter set.
# print is called with a single pre-formatted string inside parentheses, which
# produces the same text under Python 2 and is also valid Python 3 syntax
# (the bare `print "..."` statement form is a SyntaxError on Python 3).
for grid_score in gs.grid_scores_:
    print("%f : %s" % (grid_score.mean_validation_score, grid_score.parameters))

0.552632 : {'C': 1.0000000000000001e-05}
0.552632 : {'C': 0.0001}
0.552632 : {'C': 0.001}
0.552632 : {'C': 0.01}
0.950168 : {'C': 0.10000000000000001}
0.993281 : {'C': 1.0}
0.993281 : {'C': 10.0}
0.993281 : {'C': 100.0}
0.993281 : {'C': 1000.0}
0.993281 : {'C': 10000.0}
0.993281 : {'C': 100000.0}


In [112]:
# Final classifier with the chosen regularisation strength.
# NOTE(review): C=1 is hard-coded; in the recorded grid-search printout 1.0 is
# the smallest C that reaches the top mean accuracy. Reading the value from
# gs.best_params_['C'] would avoid the magic number — TODO confirm it selects
# the same value when several candidates tie.
clf_optimal = SVC(kernel='linear', random_state=241, C=1)

In [113]:
# Train the final classifier on the full TF-IDF matrix (all samples, no
# hold-out) so its coefficients can be inspected below.
clf_optimal.fit(data_tf, newsgroups.target)

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=241, shrinking=True,
  tol=0.001, verbose=False)

In [153]:
# Indices of the 10 features whose coefficients have the largest absolute
# value: densify the sparse coefficient matrix, flatten it to one dimension,
# argsort ascending and keep the last ten positions.
result_indexes = np.argsort(
    np.abs(clf_optimal.coef_.toarray()).ravel()
)[-10:]

In [155]:
# Vocabulary of the fitted vectorizer as a NumPy array, so that it can be
# indexed with an array of feature indices.
all_words = np.asarray(vectorizer.get_feature_names())
# Display the array for a quick visual check.
all_words

array([u'00', u'000', u'0000', ..., u'zwarte', u'zwork', u'zyklon'], 
      dtype='<U80')

In [161]:
# Look up the words for the selected feature indices and return them in
# ascending (lexicographic) order.
result = np.sort(all_words[result_indexes])

In [168]:
# Build the comma-separated answer string from the selected words.
# str.join puts the separator only between items, so no trailing comma has to
# be trimmed afterwards, and an empty `result` still yields an empty string.
answer = ','.join(result)
answer

u'atheism,atheists,bible,god,keith,moon,religion,sci,sky,space'

In [169]:
# Persist the answer string. The context manager closes the file even if
# write() raises, which the manual open/write/close sequence did not.
# The 'answers' directory must already exist; open() does not create it.
with open('answers/2', 'w') as f:
    f.write(answer)

In [170]:
# Print the saved file through the shell to confirm what was written.
!cat answers/2

atheism,atheists,bible,god,keith,moon,religion,sci,sky,space