In [1]:
from newsgac import Pipeline, DataSource, TFIDF, LearnerSVC
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.decomposition import PCA

In [2]:
# List the display title of every stored DataSource to see which datasets are available.
[ds.display_title for ds in DataSource.objects.all()]

[u'BGS ', u'Hold-out-test ']

In [3]:
# Use the first DataSource returned by the query for the rest of the notebook.
# NOTE(review): presumably this is the first title listed above — confirm the query ordering.
ds = DataSource.objects.first()

In [4]:
# Start from an empty pipeline. `Pipeline` here is the class imported from newsgac,
# not sklearn.pipeline.Pipeline.
p = Pipeline()

In [5]:
# Attach an SVC learner built by LearnerSVC.create() (no arguments are passed here,
# so presumably it carries the project's default parameters — TODO confirm).
p.learner = LearnerSVC.create()

In [6]:
# Use TF-IDF as the pipeline's NLP tool and switch its `scaling` parameter off.
p.nlp_tool = TFIDF.create()
p.nlp_tool.parameters.scaling = False

In [7]:
# Ask the configured pipeline for its scikit-learn counterpart; the following cells
# work on this object directly.
skp = p.get_sk_pipeline()

In [8]:
# Inspect the (name, estimator) steps of the generated pipeline.
skp.steps

[('CleanOCR', <newsgac.nlp_tools.transformers.CleanOCR at 0x7fe3d11b11d0>),
 ('StopWordRemoval',
  <newsgac.nlp_tools.transformers.StopWordRemoval at 0x7fe3d11b1890>),
 ('FeatureExtraction', FeatureUnion(n_jobs=None,
         transformer_list=[('TF-IDF', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
          dtype=<type 'numpy.float64'>, encoding=u'utf-8', input=u'content',
          lowercase=True, max_df=1.0, max_features=None, min_df=5,
          ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
          stop_words=None, strip_accents=None, sublinear_tf=True,
          token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
          vocabulary=None))],
         transformer_weights=None)),
 ('Classifier', SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.1, kernel=u'linear',
    max_iter=-1, probability=True, random_state=42, shrinking=True,
    tol=0.001, verbose=False))]

In [9]:
# Split the scikit-learn pipeline into its final classifier (`model`) and the
# remaining preprocessing / feature-extraction steps (`feature_extractor`).
# `steps.pop()` mutates `skp` in place, so only pop while the last step is still
# the one named 'Classifier'. Without this guard, re-running the cell would strip
# the feature-extraction step and bind `model` to a non-classifier.
if skp.steps[-1][0] == 'Classifier':
    model = skp.steps.pop()[1]
# Alternative classifier, kept for reference (not active):
# model = OneVsRestClassifier(SVC(kernel='linear', probability=True, decision_function_shape='ovr'))
feature_extractor = skp

In [10]:
# Training sample: the first 100 articles of the data source (indices 0-99).
articles = ds.articles[0:100]

In [11]:
# Fit the feature-extraction steps on the raw article texts and vectorise them.
features = feature_extractor.fit_transform([a.raw_text for a in articles])
# Training targets: each article's `label` attribute.
labels = [a.label for a in articles]
# Fit a LabelEncoder on the same labels and keep the integer-encoded version.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)
# Display the feature row of the first article as a quick sanity check.
features[0]

<1x1546 sparse matrix of type '<type 'numpy.float64'>'
	with 67 stored elements in Compressed Sparse Row format>

In [54]:
# Show the distinct labels learned by the encoder.
label_encoder.classes_

array([u'ACH', u'COL', u'INT', u'NIE', u'OPI', u'REC', u'REP', u'VER'],
      dtype='<U3')

In [12]:
# Train the classifier on the densified feature matrix (`toarray()` converts the
# sparse matrix returned by the feature extractor). Note that the targets passed
# here are the raw `labels`, not `encoded_labels`.
model.fit(features.toarray(), labels)

SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel=u'linear',
  max_iter=-1, probability=True, random_state=42, shrinking=True,
  tol=0.001, verbose=False)

In [13]:
# Small hold-out sample: the five articles right after the training slice (indices 100-104).
test_articles = ds.articles[100:105]
# Vectorise them with the already-fitted extractor (transform only, no refit) and densify.
test_features = feature_extractor.transform([a.raw_text for a in test_articles]).toarray()
# Index, within `test_articles`, of the article inspected in the cells below.
test_n = 2

In [14]:
# Check the length of a single test feature vector.
test_features[0].shape

(1546,)

In [15]:
# Predicted label for the selected test article (passed as a one-row batch).
model.predict([test_features[test_n]])[0]

u'OPI'

In [16]:
# Per-class probability estimates for the same article.
# NOTE(review): in the recorded outputs the highest probability (u'COL') differs
# from the label returned by predict (u'OPI'); scikit-learn documents that SVC's
# predict_proba can be inconsistent with predict — confirm before relying on either.
model.predict_proba([test_features[test_n]])[0]

array([0.17252935, 0.22346681, 0.03312437, 0.04915946, 0.21057002,
       0.12561491, 0.04790224, 0.13763283])

In [17]:
# Pair every class with its estimated probability for the selected test article
# and list the pairs from most to least probable.
sorted(
    zip(model.classes_, model.predict_proba([test_features[test_n]])[0]),
    key=lambda label_and_prob: label_and_prob[1],
    reverse=True
)

[(u'COL', 0.22346680618065554),
 (u'OPI', 0.21057002490796678),
 (u'ACH', 0.1725293464849411),
 (u'VER', 0.13763283276106553),
 (u'REC', 0.12561491393427038),
 (u'NIE', 0.04915946307575586),
 (u'REP', 0.04790224096852394),
 (u'INT', 0.033124371686820114)]

In [33]:
# Label stored on the selected test article, for comparison with the prediction.
test_articles[test_n].label

u'OPI'

In [29]:
# Class labels known to the fitted model; this is the sequence zipped with the
# predict_proba values in the ranking cell.
model.classes_

array([u'ACH', u'COL', u'INT', u'NIE', u'OPI', u'REC', u'REP', u'VER'],
      dtype='<U3')