In [1]:
from sklearn import datasets
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
import numpy as np

In [2]:
# Download (or load from the local cache) every post of the two newsgroups
# that form the binary classification task.
newsgroups = datasets.fetch_20newsgroups(
    subset='all',
    categories=['alt.atheism', 'sci.space'],
)

In [3]:
# x: the raw post texts, y: the class label of each post.
x, y = newsgroups.data, newsgroups.target

In [4]:
# Fit the TF-IDF vocabulary on all posts and encode them as a sparse
# document-term matrix (x is the list of raw texts, newsgroups.data).
vectorizer = TfidfVectorizer()
data = vectorizer.fit_transform(x)
print(data)

  (0, 7135)	0.09177291452449737
  (0, 4228)	0.09732962706472328
  (0, 7730)	0.02575006576343587
  (0, 10229)	0.07116022661373511
  (0, 19790)	0.13853387972005007
  (0, 15344)	0.05308005187089393
  (0, 20738)	0.06612072788446571
  (0, 23930)	0.025879967583209106
  (0, 7156)	0.05954839972168062
  (0, 17180)	0.08346610253642223
  (0, 21711)	0.09732962706472328
  (0, 26651)	0.04831465306608014
  (0, 23767)	0.09427154747892712
  (0, 18131)	0.04803011953672212
  (0, 27676)	0.0271693788925225
  (0, 5591)	0.07750026888717974
  (0, 16195)	0.04900248039624845
  (0, 4594)	0.029268293013519193
  (0, 21315)	0.08966035046275332
  (0, 5542)	0.1724324039685429
  (0, 19013)	0.052035390940301
  (0, 18071)	0.029078607779420725
  (0, 13384)	0.09427154747892712
  (0, 9776)	0.05697744713041564
  (0, 27786)	0.09732962706472328
  :	:
  (1785, 27676)	0.04497597829354177
  (1785, 18071)	0.024068250464378153
  (1785, 28273)	0.02370385618179079
  (1785, 18954)	0.016313452448975245
  (1785, 12908)	0.06033099767146

In [5]:
# Tune the SVM regularisation parameter C over the powers of ten 1e-5 .. 1e5,
# scoring every candidate by accuracy under shuffled 5-fold cross-validation.
C_grid = {'C': 10.0 ** np.arange(-5, 6)}
kf = KFold(n_splits=5, shuffle=True, random_state=241)
clf = SVC(kernel='linear', random_state=241)
gs = GridSearchCV(
    estimator=clf,
    param_grid=C_grid,
    scoring='accuracy',
    cv=kf,
    n_jobs=-1,
)
gs.fit(data, y)

GridSearchCV(cv=KFold(n_splits=5, random_state=241, shuffle=True),
             error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='linear',
                           max_iter=-1, probability=False, random_state=241,
                           shrinking=True, tol=0.001, verbose=False),
             iid='warn', n_jobs=-1,
             param_grid={'C': array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02,
       1.e+03, 1.e+04, 1.e+05])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [6]:
# Show the estimator that the grid search refit with its best-scoring C.
print(gs.best_estimator_)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=241,
    shrinking=True, tol=0.001, verbose=False)


In [8]:
# Train the final model on the full TF-IDF matrix using the C selected by the
# grid search. GridSearchCV tunes clones of `clf`, so `clf` itself still holds
# its constructor-default C; fitting it unchanged would silently ignore the
# search result whenever the best C differs from that default.
clf.set_params(**gs.best_params_)
clf.fit(data, y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=241,
    shrinking=True, tol=0.001, verbose=False)

In [10]:
# Find the ten vocabulary terms with the largest absolute weight in the
# fitted linear SVM and print them in alphabetical order.

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old accessor on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_map = list(vectorizer.get_feature_names_out())
else:
    feature_map = vectorizer.get_feature_names()

# Densify the coefficient matrix so its first row can be indexed directly.
weights = np.absolute(clf.coef_.toarray())

# Sorting (weight, term) pairs ascending leaves the ten heaviest terms last.
max_weights = sorted(zip(weights[0], feature_map))[-10:]
# Re-order the selected pairs alphabetically by term.
max_weights.sort(key=lambda pair: pair[1])
print(max_weights)

[(1.2546899512384038, 'atheism'), (1.2491800073760078, 'atheists'), (1.1306123446649008, 'bible'), (1.9203794002294927, 'god'), (1.0970936466401477, 'keith'), (1.201611181752071, 'moon'), (1.13908083789883, 'religion'), (1.0293069271856938, 'sci'), (1.1801315951388636, 'sky'), (2.6631647884797105, 'space')]


In [13]:
# Save the selected terms to text.txt as a single comma-separated line with
# no trailing separator or newline. The context manager closes the file even
# if a write fails, and join() also copes with an empty selection.
with open('text.txt', 'w') as f:
    f.write(','.join(word for _, word in max_weights))