* Load the corpus from `title_sample.txt` and use `TfidfVectorizer` to generate TF-IDF features for its n-grams
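* Change `token_pattern` so that single-character tokens are kept (the default pattern only matches tokens of two or more characters), which is needed to handle Chinese data properly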

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Read the corpus, one title per line. The context manager closes the file
# handle (the original left it open). The encoding is stated explicitly so the
# Chinese text is not decoded with a platform-dependent default.
# NOTE(review): assumes the file is UTF-8 encoded — confirm.
with open('/home/crazyplum/mining-news/title_sample.txt', 'r', encoding='utf-8') as corpus_f:
    corpus = corpus_f.readlines()

# Unigram + bigram TF-IDF features. The token pattern accepts tokens of a
# single word character; max_df=0.8 drops terms that appear in more than 80%
# of the documents.
vectorizer = TfidfVectorizer(min_df=1, max_df=0.8, token_pattern=r"(?u)\b\w+\b", ngram_range=(1, 2))

In [2]:
# Fit the vectorizer on the corpus and build the document-term TF-IDF matrix in one step.
X = vectorizer.fit_transform(corpus)

* You can tune the TF-IDF features with the `min_df`, `max_df` and `ngram_range` parameters

In [3]:
# Terms the fitted vectorizer discarded. With min_df=1 and no max_features set,
# these can only come from the max_df=0.8 cutoff.
vectorizer.stop_words_

{'新聞'}

In [4]:
# Binarise the targets: negative values become class 0, everything else class 1.
# The context manager closes the target file, which the original left open.
with open('/home/crazyplum/mining-news/title_sample_target.txt', 'r') as target_f:
    y = [0 if int(x) < 0 else 1 for x in target_f]

In [6]:
# Sanity check: number of labels read (must equal the number of rows in X for the split below to work).
len(y)

10000

In [7]:
import numpy as np
from sklearn.model_selection import train_test_split

In [8]:
from sklearn import svm

In [10]:
# Hold out 30% of the samples; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=10
)

# Baseline model: linear-kernel SVM with C=1, fitted on the training split.
clf = svm.SVC(kernel='linear', C=1)
clf.fit(X_train, y_train)

# Print accuracy on the held-out split first, then on the training split.
for features, labels in ((X_test, y_test), (X_train, y_train)):
    print(clf.score(features, labels))

0.815666666667
0.974428571429


[accuracy score](http://scikit-learn.org/stable/modules/model_evaluation.html#accuracy-score)

* observe features 
* `coef_`: the larger the absolute value of a coefficient, the more important the corresponding feature

In [None]:
# Feature indices ordered by ascending (signed) coefficient value.
# The .toarray() call implies coef_ is a sparse matrix here; the result is then
# 2-D with a single row, which is why the following cells index [0].
# NOTE(review): `clf` must still be the SVC fitted above — a later cell rebinds
# `clf` to a GridSearchCV object, so run this before that cell.
important_features = np.argsort(clf.coef_[0].toarray())

In [None]:
# Show the 100 features with the smallest (most negative) coefficients.
# The feature-name list is built once instead of on every iteration: the
# original called get_feature_names() inside the loop. That method was removed
# in scikit-learn 1.2, so its replacement is preferred when available.
get_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
feature_names = get_names()
for f in important_features[0][:100]:
    print(feature_names[f], clf.coef_[0, f])

In [None]:
# Show the 100 features with the largest coefficients.
# The feature-name list is built once instead of on every iteration: the
# original called get_feature_names() inside the loop. That method was removed
# in scikit-learn 1.2, so its replacement is preferred when available.
get_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
feature_names = get_names()
for f in important_features[0][-100:]:
    print(feature_names[f], clf.coef_[0, f])

* Use GridSearchCV to tune model parameters

In [11]:
from sklearn.model_selection import GridSearchCV

In [37]:
# Search space: linear kernel only, with C swept from 1 to 1000 in powers of ten.
tuned_parameters = {
    'kernel': ['linear'],
    'C': [1, 10, 100, 1000],
}

# 5-fold cross-validated grid search over the space above.
# Note that this rebinds `clf`, which previously held the baseline SVC.
clf = GridSearchCV(
    svm.SVC(C=1),
    tuned_parameters,
    cv=5,
)

In [38]:
# Run the grid search on the training split only; the test split is not used here.
clf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [1, 10, 100, 1000], 'kernel': ['linear']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [43]:
# Mean cross-validated score achieved by the best parameter combination.
clf.best_score_

0.81328571428571428

* Use the parameters to train the model again

In [40]:
# Build a fresh SVC from the parameters the grid search selected and fit it
# on the whole training split.
best_clf = svm.SVC(**clf.best_params_)
best_clf.fit(X_train, y_train)

# Print accuracy on the held-out split first, then on the training split.
for features, labels in ((X_test, y_test), (X_train, y_train)):
    print(best_clf.score(features, labels))

0.811666666667
0.999


* Use `LinearSVC` with an `l1` penalty to reduce the number of features.

In [29]:
from sklearn.feature_selection import SelectFromModel
# Fit an L1-penalised linear SVM on the training split; it serves as the
# feature selector below.
lsvc = svm.LinearSVC(C=10, penalty="l1", dual=False)
lsvc.fit(X_train, y_train)

# prefit=True: wrap the already-fitted estimator instead of refitting it.
model = SelectFromModel(lsvc, prefit=True)

In [30]:
# (n_samples, n_features) of the training matrix before feature selection.
X_train.shape

(7000, 100237)

In [31]:
# Keep only the columns chosen by the selector.
X_new = model.transform(X_train)

In [32]:
# (n_samples, n_features) after feature selection, for comparison with X_train.shape.
X_new.shape

(7000, 9287)

* train model with reduced features and the picked parameters

In [41]:
# Same grid-search-selected parameters as before, but fitted on the reduced feature matrix.
best_reduced_clf = svm.SVC(**clf.best_params_)
best_reduced_clf.fit(X_new, y_train)

In [42]:
# Apply the same feature selection to the held-out split before scoring on it.
X_test_new = model.transform(X_test)
best_reduced_clf.score(X_test_new, y_test)

0.81599999999999995