In [1]:
import numpy as np
from xgboost import XGBClassifier
import pickle
import nltk
from nltk.stem.snowball import SnowballStemmer

from sklearn.datasets import fetch_20newsgroups
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import GridSearchCV

In [12]:
def load_model(file_name):
    """Load a pickled model from ``file_name``.

    The file may hold several objects pickled back to back. Every object is
    read in turn (a confirmation line is printed for each) and the last one
    is returned.

    Parameters
    ----------
    file_name : str or path-like
        Path of the pickle file; it is opened in binary mode.

    Returns
    -------
    object
        The last object stored in the file.

    Raises
    ------
    EOFError
        If the file does not contain a single pickled object.
    """
    # SECURITY: unpickling can execute arbitrary code -- only load files you
    # produced yourself or otherwise trust.
    nothing_loaded = object()  # sentinel: distinguishes "no object" from a pickled None
    model = nothing_loaded
    with open(file_name, "rb") as f:
        while True:
            try:
                model = pickle.load(f)
            except EOFError:
                # End of the pickle stream reached.
                break
            else:
                print('Model loaded successfully')
    if model is nothing_loaded:
        # Previously this fell through to `return model` with the name unbound.
        raise EOFError("No pickled model found in %r" % (file_name,))
    return model

def test_model(model, dataset=test_set):
    predicted = model.predict(dataset.data)
    return np.mean(predicted == dataset.target)

## Download datasets

In [3]:
# Fetch the 'train' and 'test' subsets of 20 Newsgroups with identical options.
train_set, test_set = (
    fetch_20newsgroups(subset=split_name, shuffle=True)
    for split_name in ('train', 'test')
)

## Download data for stemming

In [4]:
# Fetch the NLTK resources for the stemming step that follows.
nltk.download('snowball_data')
# The stemmer further down is created with ignore_stopwords=True, hence the
# stopword corpus. This call is the cell's last expression, so its return
# value is what the cell displays.
nltk.download('stopwords')

[nltk_data] Downloading package snowball_data to
[nltk_data]     /home/pietrek/nltk_data...
[nltk_data]   Package snowball_data is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/pietrek/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Prepare stemming-based CountVectorizer

In [5]:
class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer variant whose analyzer output is run through a stemmer.

    Depends on a module-level ``stemmer`` object exposing ``stem(str)``. The
    name is resolved when a document is analyzed, not when the class is
    defined.
    """

    def build_analyzer(self):
        """Return a callable mapping a document to a list of stemmed items."""
        base_analyzer = super(StemmedCountVectorizer, self).build_analyzer()

        def stemmed_analyzer(doc):
            # NOTE(review): every item produced by the parent analyzer is
            # stemmed as one string; with ngram_range above (1, 1) those items
            # are presumably whole n-grams rather than single words -- confirm
            # that this is the intended behaviour.
            return [stemmer.stem(item) for item in base_analyzer(doc)]

        return stemmed_analyzer

# Module-level stemmer looked up by StemmedCountVectorizer's analyzer; it has
# to exist before any vectorizer is fitted or applied.
stemmer = SnowballStemmer(language="english", ignore_stopwords=True)

# Single vectorizer instance that the pipelines in this notebook share.
stemmed_count_vect = StemmedCountVectorizer(stop_words="english")

## Prepare parameters and GridSearch pipeline

In [None]:
# Search space; each key is "<pipeline step>__<parameter name>".
parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2)],
    tfidf__use_idf=(True, False),
    dtc__max_depth=[1, 6, 8, 10],
    dtc__reg_lambda=[1, 1.2, 3],
)

# Stemmed counts -> TF-IDF -> XGBoost. The classifier step is named 'dtc',
# which is the prefix the grid keys above refer to.
pipe_clf = Pipeline(steps=[
    ('vect', stemmed_count_vect),
    ('tfidf', TfidfTransformer()),
    ('dtc', XGBClassifier(seed=13)),
])

# n_jobs=-1: run the search on all available cores.
gs_clf = GridSearchCV(estimator=pipe_clf, param_grid=parameters, n_jobs=-1)

## Run GridSearch

In [None]:
# fit() hands back the fitted search object itself, so rebinding the name is
# not needed.
gs_clf.fit(train_set.data, train_set.target)

# Pipeline refitted with the best parameter combination found.
model = gs_clf.best_estimator_

print("Best score: %s" % (gs_clf.best_score_,))
print("Best param: %s" % (gs_clf.best_params_,))

## Save the best model to file

In [None]:
# Persist the tuned XGBoost pipeline. The file name matches the one the
# evaluation cell reads ('xgboost.pckl'); writing to "model.pckl" left that
# cell without a file to load.
with open("xgboost.pckl", "wb") as f:
    pickle.dump(model, f)

## Load the best model and run test set evaluation

In [None]:
# Read the pickled pipeline back from disk instead of reusing the in-memory
# `model`, so the saved artifact itself is what gets evaluated.
xgboost = load_model('xgboost.pckl')
# Last expression of the cell: accuracy on test_model's default dataset.
test_model(xgboost)

# SVM

In [None]:
from sklearn.svm import SVC

# Vectorizer and TF-IDF options are pinned to a single value each; only the
# SVC hyper-parameters (kernel, C, gamma) are actually searched.
parameters = dict(
    vect__ngram_range=[(1, 2)],
    tfidf__use_idf=[True],
    dtc__kernel=["linear", "poly", "rbf", "sigmoid"],
    dtc__C=[1, 10, 100, 1000],
    dtc__gamma=[0.1, 0.001, 0.0001],
)

# Same front end as the other pipelines; the classifier step keeps the name
# 'dtc' that the grid keys refer to.
pipe_clf = Pipeline(steps=[
    ('vect', stemmed_count_vect),
    ('tfidf', TfidfTransformer()),
    ('dtc', SVC()),
])

gs_clf = GridSearchCV(estimator=pipe_clf, param_grid=parameters, n_jobs=-1)
gs_clf.fit(train_set.data, train_set.target)
model = gs_clf.best_estimator_

print("Best score: %s" % (gs_clf.best_score_,))
print("Best param: %s" % (gs_clf.best_params_,))

# Persist the best SVM pipeline.
with open("svm_model.pckl", "wb") as f:
    pickle.dump(model, f)

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Vectorizer and TF-IDF options are pinned; only the forest's size, depth and
# class weighting are searched.
parameters = {
    'vect__ngram_range': [(1, 2)],
    'tfidf__use_idf': [True],
    'dtc__n_estimators': [10, 100, 1000],
    'dtc__max_depth': [10, 100, None],
    'dtc__class_weight': [None, "balanced"],
}
pipe_clf = Pipeline([
    ('vect', stemmed_count_vect),
    ('tfidf', TfidfTransformer()),
    # Fixed seed so repeated runs of the search are comparable; same value as
    # the one given to the XGBoost classifier earlier in the notebook.
    ('dtc', RandomForestClassifier(random_state=13)),
])

gs_clf = GridSearchCV(pipe_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(train_set.data, train_set.target)
model = gs_clf.best_estimator_

print("Best score: %s" % gs_clf.best_score_)
print("Best param: %s" % gs_clf.best_params_)

# Dedicated file for the forest: this cell used to write "svm_model.pckl",
# which overwrote the model saved by the SVM cell.
with open("rf_model.pckl", "wb") as f:
    pickle.dump(model, f)

In [7]:
# NOTE(review): the variable is called `svm_model`, but the file loaded here is
# the XGBoost pickle -- the accuracy displayed by this cell belongs to that
# model, not to an SVM or a random forest.
svm_model = load_model('models/xgboost.pckl')
# Last expression of the cell: accuracy on test_model's default dataset.
test_model(svm_model)

Model loaded successfully


0.7679235262878386

# SGDClassifier

In [11]:
from sklearn.linear_model import SGDClassifier

# Search over n-gram range plus the SGD regularisation strength and intercept.
parameters = {
    # Was `[(1, 2)], (1, 1)],` -- a stray bracket that made the cell a
    # SyntaxError.
    'vect__ngram_range': [(1, 2), (1, 1)],
    'tfidf__use_idf': [True],
    'dtc__alpha': [1e-2, 1e-3, 1e-1],
    'dtc__fit_intercept': [True, False]
}

# NOTE(review): SGDClassifier is created without a random_state, so scores may
# differ slightly between runs.
pipe_clf = Pipeline([
    ('vect', stemmed_count_vect),
    ('tfidf', TfidfTransformer()),
    ('dtc', SGDClassifier())
])

gs_clf = GridSearchCV(pipe_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(train_set.data, train_set.target)
model = gs_clf.best_estimator_

print("Best score: %s" % gs_clf.best_score_)
print("Best param: %s" % gs_clf.best_params_)

# File name fixed from "sgd_modelz.pckl" so it matches the 'sgd_model.pckl'
# that the evaluation cell loads. That cell reads it from a 'models/'
# directory -- presumably the file is moved there by hand; confirm.
with open("sgd_model.pckl", "wb") as f:
    pickle.dump(model, f)

Best score: 0.8987101931545529
Best param: {'dtc__alpha': 0.01, 'dtc__fit_intercept': True, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}


In [13]:
# NOTE(review): despite the name `svm_model`, this is the SGD pipeline pickle.
svm_model = load_model('models/sgd_model.pckl')
# First line: accuracy on the training split (passed explicitly).
print(test_model(svm_model, train_set))
# Second line: accuracy on test_model's default dataset.
print(test_model(svm_model))

Model loaded successfully
0.8207530493194273
0.9883165161975571
