In [1]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import KFold
import operator

In [2]:
# Load the 20 Newsgroups data, requesting the 'all' subset restricted to the
# two categories compared in this notebook; `y` holds the class labels.
newsgroups = datasets.fetch_20newsgroups(
    subset='all',
    categories=['alt.atheism', 'sci.space']
)
y = newsgroups.target

In [3]:
# TF-IDF vectorizing: fit the vectorizer on the raw documents and transform
# them in a single pass.
vector = TfidfVectorizer()
# `x` is the document-term matrix passed to the classifiers below.
x = vector.fit_transform(newsgroups.data)
# Learned IDF weights; read after fitting and aligned with the feature names.
idf = vector.idf_

In [249]:
# Map every vocabulary term to its IDF weight (one scalar per term).
# Despite its name, `tfidf` holds the values of `vector.idf_`, not TF-IDF scores.
tfidf = {term: weight for term, weight in zip(vector.get_feature_names(), idf)}
tfidf

{u'schlegel': 7.7951463345994831,
 u'luanch': 7.7951463345994831,
 u'tilton': 7.7951463345994831,
 u'woods': 6.5423833661041151,
 u'hanging': 6.0033868653714277,
 u'localized': 7.3896812264913185,
 u'disobeying': 7.7951463345994831,
 u'734841689': 7.7951463345994831,
 u'fractioning': 7.3896812264913185,
 u'5980': 7.7951463345994831,
 u'rickman': 7.7951463345994831,
 u'pigment': 7.3896812264913185,
 u'bringing': 5.4925612416054372,
 u'vibrational': 7.3896812264913185,
 u'wednesday': 7.1019991540395377,
 u'viable': 6.6965340459313731,
 u'matthean': 7.7951463345994831,
 u'270': 7.3896812264913185,
 u'271': 7.7951463345994831,
 u'272': 7.3896812264913185,
 u'273': 6.878855602725328,
 u'274': 7.3896812264913185,
 u'275': 6.6965340459313731,
 u'276': 7.3896812264913185,
 u'277': 7.3896812264913185,
 u'scraped': 7.7951463345994831,
 u'inanimate': 7.7951463345994831,
 u'errors': 5.230196977137946,
 u'cooking': 7.7951463345994831,
 u'usenet': 5.0225576123597016,
 u'100010': 7.3896812264913185,


In [88]:
# Display the repr of the TF-IDF matrix (shape, dtype and number of stored values).
x

<1786x28382 sparse matrix of type '<type 'numpy.float64'>'
	with 303138 stored elements in Compressed Sparse Row format>

In [5]:
#Parameters selection
# Candidate values for the SVM regularisation strength C: 10^-5, 10^-4, ..., 10^5.
# Defined here so the cell does not depend on a variable left over from a
# deleted cell (the search below would otherwise fail with a NameError on a
# fresh kernel).
grid = {'C': np.power(10.0, np.arange(-5, 6))}

# 5-fold cross-validation with shuffling; the seed is fixed for reproducibility.
crossval = KFold(y.size, shuffle=True, random_state=241, n_folds=5)
classifier = SVC(kernel='linear', random_state=241)

# Exhaustive search over `grid`, scored by accuracy on the folds above.
gsearch = GridSearchCV(classifier, grid, cv = crossval, scoring='accuracy')
gsearch.fit(x,y)

In [17]:
# Report every point of the grid: first the mean score, then the parameters.
for entry in gsearch.grid_scores_:
    print(entry.mean_validation_score)  # cross-validation quality estimate
    print(entry.parameters)  # parameter values for this grid point

0.552631578947
{'C': 1.0000000000000001e-05}
0.552631578947
{'C': 0.0001}
0.552631578947
{'C': 0.001}
0.552631578947
{'C': 0.01}
0.950167973124
{'C': 0.10000000000000001}
0.993281075028
{'C': 1.0}
0.993281075028
{'C': 10.0}
0.993281075028
{'C': 100.0}
0.993281075028
{'C': 1000.0}
0.993281075028
{'C': 10000.0}
0.993281075028
{'C': 100000.0}


In [11]:
# Regularisation strength for the final model. In the grid-search report the
# values from 1.0 up to 100000.0 tie for the best mean accuracy; this is the
# largest of them.
C = 10 ** 5

In [21]:
#Classification
# Refit a linear SVM on the whole data set. The regularisation strength comes
# from the `C` variable defined in the preceding cell instead of repeating the
# literal, so the two cannot drift apart.
classifier = SVC(C=C, random_state=241, kernel='linear')
classifier.fit(x,y)

SVC(C=100000, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=241, shrinking=True,
  tol=0.001, verbose=False)

In [269]:
#Showing answer

# Pair each vocabulary term with the absolute value of its weight in the fitted
# linear model, order the pairs from largest to smallest weight, and keep the
# ten strongest as a {term: weight} dict.
features = classifier.coef_.toarray().tolist()
ans_full = sorted(
    zip(vector.get_feature_names(), [abs(weight) for weight in features[0]]),
    key=operator.itemgetter(1),
    reverse=True
)
ans = dict(ans_full[:10])

In [283]:
# Write the selected words to the answer file: alphabetical order, separated by
# single spaces, no trailing space or newline.
# `ans` is a dict, so iterating it yields the words themselves. Indexing each
# word with [0] (as the previous version did) kept only its first character.
line = ' '.join(sorted(ans))
with open('ans2', 'w') as f:
    f.write(line)

In [284]:
# Shell escape: show the contents of the answer file `ans2`.
!less ans2

atheism atheists bible god keith moon nick religion sky space


In [252]:
# Inspect the vocabulary learned by the vectorizer.
# NOTE(review): this displays the complete feature list (the matrix repr above
# reports 28382 columns); consider slicing it when only a sample is needed.
vector.get_feature_names()

[u'00',
 u'000',
 u'0000',
 u'00000',
 u'000000',
 u'000021',
 u'000050',
 u'000062david42',
 u'000406',
 u'000410',
 u'00041032',
 u'0004136',
 u'00041555',
 u'0004244402',
 u'0004246',
 u'00043819',
 u'0004422',
 u'00044513',
 u'00044808',
 u'00044939',
 u'0004651657',
 u'0004847546',
 u'0004988',
 u'0005',
 u'0005169',
 u'0008512',
 u'00090711',
 u'000th',
 u'001125',
 u'0011265',
 u'0012',
 u'001326',
 u'00140',
 u'001428',
 u'001442',
 u'001555',
 u'001718',
 u'001757',
 u'0018',
 u'0020',
 u'0022',
 u'002214',
 u'002341',
 u'0028',
 u'0029',
 u'0033',
 u'0034',
 u'003719',
 u'004119',
 u'004311',
 u'004405',
 u'006',
 u'0065',
 u'0098',
 u'00pm',
 u'01',
 u'010',
 u'0100',
 u'0100lines',
 u'010116',
 u'010326',
 u'010821',
 u'010847',
 u'011255',
 u'012536',
 u'013034',
 u'013423tan102',
 u'013657',
 u'014305',
 u'014506',
 u'014554',
 u'01463',
 u'015',
 u'015922',
 u'015931',
 u'01609',
 u'01776',
 u'01826',
 u'018b',
 u'0192',
 u'0195',
 u'01wb',
 u'02',
 u'020021',
 u'020259'

In [241]:
# Show the selected (word, weight) pairs in alphabetical order of the word.
sorted(ans.items())

[(u'dc', 0.8887984098553399),
 (u'ether', 0.9515260812810916),
 (u'gamma', 0.9057433029593873),
 (u'launch', 0.85431284822333),
 (u'moon', 1.2483784040233228),
 (u'nasa', 1.0459696724252585),
 (u'nick', 1.0899236751513084),
 (u'sci', 1.009669264882222),
 (u'sky', 1.1959538612919782),
 (u'space', 2.7203044633899114)]

In [243]:
# Shell escape: show the contents of the answer file `ans2`.
!less ans2

dc ether gamma launch moon nasa nick sci sky space
