In [45]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.svm import SVC 
from sklearn.feature_extraction.text import TfidfVectorizer as tfidf

In [148]:
# Load the 20 Newsgroups posts restricted to the two topics being compared.
# subset='all' asks for the complete collection rather than only the train or test part.
newsgroups = datasets.fetch_20newsgroups(
    subset='all',
    categories=['alt.atheism', 'sci.space'],
)

In [149]:
# Build the learning problem: TF-IDF features (X) fitted on every post, and the
# class label of each post (y) as provided by the dataset.
vectorizer = tfidf()
y = newsgroups.target
X = vectorizer.fit_transform(newsgroups.data)

In [150]:
# Candidate values for the SVM regularisation parameter C: the eleven powers of ten
# from 10^-5 up to 10^5 (arange's upper bound 6 is exclusive).
exponents = np.arange(-5, 6)
search_grid = {'C': 10.0 ** exponents}

In [151]:
# Shuffled 5-fold split with a fixed seed so the folds are the same on every run.
folds = KFold(shuffle=True, n_splits=5, random_state=241)
clf = SVC(random_state=241, kernel='linear')
# Score every candidate C from search_grid by cross-validated accuracy.
gs = GridSearchCV(
    estimator=clf,
    param_grid=search_grid,
    cv=folds,
    scoring='accuracy',
)
# Left as the last expression so the fitted search object is displayed by the cell.
gs.fit(X, y)

GridSearchCV(cv=KFold(n_splits=5, random_state=241, shuffle=True),
       error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=241, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': array([  1.00000e-05,   1.00000e-04,   1.00000e-03,   1.00000e-02,
         1.00000e-01,   1.00000e+00,   1.00000e+01,   1.00000e+02,
         1.00000e+03,   1.00000e+04,   1.00000e+05])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)

In [59]:
# List each candidate parameter setting next to its mean cross-validated score.
# `cv_results_` is used instead of `grid_scores_`, which was deprecated in
# scikit-learn 0.18 and removed in 0.20; 'params' and 'mean_test_score' are
# parallel sequences with one entry per candidate.
cv_results = gs.cv_results_
for params, mean_score in zip(cv_results['params'], cv_results['mean_test_score']):
    print(params, mean_score)

{'C': 1.0000000000000001e-05} 0.552631578947
{'C': 0.0001} 0.552631578947
{'C': 0.001} 0.552631578947
{'C': 0.01} 0.552631578947
{'C': 0.10000000000000001} 0.950167973124
{'C': 1.0} 0.993281075028
{'C': 10.0} 0.993281075028
{'C': 100.0} 0.993281075028
{'C': 1000.0} 0.993281075028
{'C': 10000.0} 0.993281075028
{'C': 100000.0} 0.993281075028




In [157]:
# Retrain a linear SVM on the whole data set using the C chosen by the grid search
# rather than a hand-copied constant, so this cell stays correct if the grid or the
# data change. (On the recorded run the top accuracy is first reached at C=1.0.)
best_C = gs.best_params_['C']
# Same seed as the estimator that was grid-searched, for consistency.
clf = SVC(C=best_C, kernel='linear', random_state=241).fit(X, y)
print(clf.coef_)

  (0, 11098)	0.113315317878
  (0, 6775)	0.0513432082411
  (0, 5107)	0.0544519626112
  (0, 98)	0.059766413309
  (0, 27042)	0.104718642966
  (0, 22622)	0.104718642966
  (0, 6135)	0.104718642966
  (0, 27130)	0.00684318140366
  (0, 27083)	0.00684318140366
  (0, 26026)	0.00721866056204
  (0, 23036)	0.00620129554273
  (0, 22982)	0.00721866056204
  (0, 22762)	0.00721866056204
  (0, 22739)	0.00721866056204
  (0, 22595)	0.00721866056204
  (0, 21945)	0.00684318140366
  (0, 20801)	0.0144373211241
  (0, 20800)	0.0479022698256
  (0, 20042)	0.0216559816861
  (0, 16400)	0.0166782290454
  (0, 16224)	0.00721866056204
  (0, 13928)	0.00721866056204
  (0, 11716)	0.00721866056204
  (0, 10127)	0.00684318140366
  (0, 8938)	0.00721866056204
  :	:
  (0, 9935)	0.324839975194
  (0, 9711)	-0.0951687122427
  (0, 9622)	0.0399088130115
  (0, 9368)	-0.311250533821
  (0, 9107)	-0.157524596754
  (0, 9034)	0.144841345667
  (0, 9009)	-0.315049193508
  (0, 8498)	0.0111960110243
  (0, 7951)	-0.0200462590056
  (0, 7418)	-0.

In [163]:
# Convert the model's coefficient matrix to a dense array and keep only magnitudes,
# so that strongly negative weights count as much as strongly positive ones.
coefficients = np.abs(clf.coef_.toarray())

In [164]:
# Report the ten features with the largest absolute weight and collect their words.
#
# The vocabulary is fetched once (it was previously rebuilt twice per iteration), and
# `coefficients` is only read, never overwritten, so re-running this cell gives the
# same ten words every time.
feature_names = (
    vectorizer.get_feature_names_out()      # scikit-learn >= 1.0
    if hasattr(vectorizer, 'get_feature_names_out')
    else vectorizer.get_feature_names()     # older releases
)
weights = coefficients[0]  # the single row of per-feature absolute weights
# Descending order by weight; mergesort is stable, so equal weights stay in
# ascending index order (the same tie-break as taking argmax repeatedly).
top_indices = np.argsort(-weights, kind='mergesort')[:10]

words = []
for idx in top_indices:
    word = feature_names[idx]
    words.append(word)
    print('idx:{0}, weight:{1}, word:{2}'.format(idx, weights[idx], word))

idx:24019, weight:2.6631647884797105, word:space
idx:12871, weight:1.9203794002294938, word:god
idx:5088, weight:1.2546899512384038, word:atheism
idx:5093, weight:1.2491800073760075, word:atheists
idx:17802, weight:1.2016111817520696, word:moon
idx:23673, weight:1.1801315951388633, word:sky
idx:21850, weight:1.13908083789883, word:religion
idx:5776, weight:1.130612344664901, word:bible
idx:15606, weight:1.0970936466401482, word:keith
idx:22936, weight:1.0293069271856938, word:sci


In [160]:
# Write the selected words to the answer file in alphabetical order.
# Each word is followed by a single space, so the file ends with a trailing space.
words.sort()
# `with` guarantees the file is closed even if one of the writes raises.
with open('ans.txt', 'w') as f:
    for word in words:
        f.write('{0} '.format(word))