# Text Analysis - with Pipeline Cross Validation 

For this demo, we’ll create four different pipelines using TF-IDF and CountVectorizer for vectorization and SGDClassifier and SVC (support vector classifier). Then using cross_val_score function, we’ll train the each model two times and record their mean accuracy. We’ll choose the highest performing model and train it and then evaluate it in the test set.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import cross_val_score
DATA_DIR = "./bbc/"

print('done')

done


## Get and set up our data 

In [2]:
data = load_files(DATA_DIR, encoding="utf-8", decode_error="replace")
# calculate count of each category
labels, counts = np.unique(data.target, return_counts=True)

# convert data.target_names to np array for fancy indexing
labels_str = np.array(data.target_names)[labels]
print(dict(zip(labels_str, counts)))

X_train, X_test, y_train, y_test = train_test_split(data.data, data.target)
list(t[:80] for t in X_train[:10])

#vectorize the data
vectorizer = TfidfVectorizer(stop_words="english", max_features=1000, decode_error="ignore")
vectorizer.fit(X_train)

vectorizer.fit(X_train)
X_train_vectorized = vectorizer.transform(X_train)
X_train_vectorized

{'business': 510, 'entertainment': 386, 'politics': 417, 'sport': 511, 'tech': 401}


<1668x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 112802 stored elements in Compressed Sparse Row format>

## Build the starting pipeline

In [3]:
# start with the classic
# with either pure counts or tfidf features
sgd = Pipeline([
        ("count vectorizer", CountVectorizer(stop_words="english", max_features=3000)),
        ("sgd", SGDClassifier(loss="modified_huber"))
    ])
sgd_tfidf = Pipeline([
        ("tfidf_vectorizer", TfidfVectorizer(stop_words="english", max_features=3000)),
        ("sgd", SGDClassifier(loss="modified_huber"))
    ])
 
svc = Pipeline([
        ("count_vectorizer", CountVectorizer(stop_words="english", max_features=3000)),
        ("linear svc", SVC(kernel="linear"))
    ])
svc_tfidf = Pipeline([
        ("tfidf_vectorizer", TfidfVectorizer(stop_words="english", max_features=3000)),
        ("linear svc", SVC(kernel="linear"))
    ])

sgd, SVC, svc_tfidf

(Pipeline(memory=None,
          steps=[('count vectorizer',
                  CountVectorizer(analyzer='word', binary=False,
                                  decode_error='strict',
                                  dtype=<class 'numpy.int64'>, encoding='utf-8',
                                  input='content', lowercase=True, max_df=1.0,
                                  max_features=3000, min_df=1,
                                  ngram_range=(1, 1), preprocessor=None,
                                  stop_words='english', strip_accents=None,
                                  token_pattern='(?u)\\b\\w\\w+\\b',
                                  tokenizer=Non...
                  SGDClassifier(alpha=0.0001, average=False, class_weight=None,
                                early_stopping=False, epsilon=0.1, eta0=0.0,
                                fit_intercept=True, l1_ratio=0.15,
                                learning_rate='optimal', loss='modified_huber',
                     

## Configure all the pipelines 

In [4]:
all_models = [
    ("sgd", sgd),
    ("sgd_tfidf", sgd_tfidf),
    ("svc", svc),
    ("svc_tfidf", svc_tfidf),
    ]
all_models

[('sgd',
  Pipeline(memory=None,
           steps=[('count vectorizer',
                   CountVectorizer(analyzer='word', binary=False,
                                   decode_error='strict',
                                   dtype=<class 'numpy.int64'>, encoding='utf-8',
                                   input='content', lowercase=True, max_df=1.0,
                                   max_features=3000, min_df=1,
                                   ngram_range=(1, 1), preprocessor=None,
                                   stop_words='english', strip_accents=None,
                                   token_pattern='(?u)\\b\\w\\w+\\b',
                                   tokenizer=Non...
                   SGDClassifier(alpha=0.0001, average=False, class_weight=None,
                                 early_stopping=False, epsilon=0.1, eta0=0.0,
                                 fit_intercept=True, l1_ratio=0.15,
                                 learning_rate='optimal', loss='modified_huber

## set up the scores 

In [5]:
unsorted_scores = [(name, cross_val_score(model, X_train, y_train, cv=2).mean()) for name, model in all_models]
scores = sorted(unsorted_scores, key=lambda x: -x[1])
print(scores)  

[('svc_tfidf', 0.974220623501199), ('sgd_tfidf', 0.9610311750599521), ('svc', 0.959832134292566), ('sgd', 0.9568345323741008)]


## Identify the best of the best

Support Vector Machine with tf-idf features scored the highest accuracy of 97%. Lets train it and evaluate it in the test dataset.

In [6]:
model = svc_tfidf
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.9730700179533214
              precision    recall  f1-score   support

           0       0.97      0.95      0.96       138
           1       0.98      0.97      0.97        94
           2       0.96      0.97      0.97       104
           3       0.99      0.99      0.99       131
           4       0.96      0.99      0.97        90

    accuracy                           0.97       557
   macro avg       0.97      0.97      0.97       557
weighted avg       0.97      0.97      0.97       557

