In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Two-sentence toy corpus: one sentence contains "snow", the other "snowing".
corpus = [
    'This is a snow leopard',
    'It is not snowing today',
]

In [3]:
# Baseline: count raw tokens without stemming, so "snow" and "snowing"
# end up in separate columns.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
print(X.toarray())
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns an ndarray, so
# .tolist() keeps the printed vocabulary a plain list of str.
print(vectorizer.get_feature_names_out().tolist())

[[1 0 1 0 1 0 1 0]
 [1 1 0 1 0 1 0 1]]
['is', 'it', 'leopard', 'not', 'snow', 'snowing', 'this', 'today']


In [4]:
# Stem-aware counting: every token is reduced to its stem before it is counted.
stemmer = SnowballStemmer("english", ignore_stopwords=True)


class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer stems each token with the module-level ``stemmer``."""

    def build_analyzer(self):
        """Return a callable that maps a document to its list of stemmed tokens."""
        base_analyzer = super().build_analyzer()

        def stemmed_analyzer(doc):
            # Tokenize with the stock analyzer first, then stem token by token.
            return [stemmer.stem(token) for token in base_analyzer(doc)]

        return stemmed_analyzer
# Fit the stemming vectorizer on the same corpus and show the resulting
# count matrix and vocabulary.
stemmed_count_vect = StemmedCountVectorizer()
print(stemmed_count_vect.fit_transform(corpus).toarray())
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns an ndarray, so
# .tolist() keeps the printed vocabulary a plain list of str.
print(stemmed_count_vect.get_feature_names_out().tolist())

[[1 0 1 0 1 1 0]
 [1 1 0 1 1 0 1]]
['is', 'it', 'leopard', 'not', 'snow', 'this', 'today']


In [5]:
# Chain the stemmed counts into a TF-IDF re-weighting step.
pipe_line = Pipeline([
    ('vect', stemmed_count_vect),
    ('tfidf', TfidfTransformer()),
])
tfidf_matrix = pipe_line.fit_transform(corpus)
print(tfidf_matrix.toarray())

[[0.40993715 0.         0.57615236 0.         0.40993715 0.57615236
  0.        ]
 [0.35520009 0.49922133 0.         0.49922133 0.35520009 0.
  0.49922133]]


In [6]:
# Linear SVM (hinge loss, trained with SGD) on top of the stemmed TF-IDF features.
# NOTE: despite its name, `test` holds the *training* labels, one per corpus sentence.
test = ['A', 'W']
pipe_line = Pipeline([
    ('vect', stemmed_count_vect),
    ('tfidf', TfidfTransformer()),
    # `n_iter` was removed in scikit-learn 0.21. `max_iter=5` together with
    # `tol=None` disables early stopping, so exactly five epochs are run,
    # which is what `n_iter=5` used to request.
    ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                              max_iter=5, tol=None, random_state=42)),
])
fit_text_stemmed = pipe_line.fit(corpus, test)
pred = fit_text_stemmed.predict(['This is snow', 'It is snowy today'])
print(pred)
# Expected labels for the two sentences passed to predict() above.
y_test = ['A', 'W']
# accuracy_score takes (y_true, y_pred) in that order.
print("accuracy score: " + str(accuracy_score(y_test, pred)))



['A' 'W']
accuracy score: 1.0


In [159]:
# Predict confidence scores for samples.
# The confidence score for a sample is the signed distance of that sample to the hyperplane.
# print(fit_text_stemmed.decision_function(corpus))