# Lab 7: Text Classification with SVM

In [1]:
from sklearn.datasets import load_files
movie_reviews_data_folder = 'lab7.data/movie_reviews/txt_sentoken'
dataset = load_files(movie_reviews_data_folder, shuffle=False)

Labels

In [2]:
dataset.target_names

['neg', 'pos']

Training data

In [3]:
dataset.filenames

array(['lab7.data/movie_reviews/txt_sentoken/neg/cv000_29416.txt',
       'lab7.data/movie_reviews/txt_sentoken/neg/cv001_19502.txt',
       'lab7.data/movie_reviews/txt_sentoken/neg/cv002_17424.txt', ...,
       'lab7.data/movie_reviews/txt_sentoken/pos/cv997_5046.txt',
       'lab7.data/movie_reviews/txt_sentoken/pos/cv998_14111.txt',
       'lab7.data/movie_reviews/txt_sentoken/pos/cv999_13106.txt'], 
      dtype='<U56')

In [4]:
len(dataset.data)

2000

Split data into train data and test data

In [5]:
from sklearn.model_selection import train_test_split
docs_train, docs_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.25, random_state=None)


In [35]:
# docs_train[:2]
# y_train[:2]
dataset.target

array([0, 0, 0, ..., 1, 1, 1])

TASK: Build a vectorizer / classifier pipeline that filters out tokens that are too rare or too frequent

In [28]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC, LinearSVC, NuSVC

# your code here ...
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(docs_train)
X_train_counts.shape

(1500, 35381)

In [29]:
count_vect.vocabulary_.get('for')

12241

In [30]:
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

(1500, 35381)

In [31]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(1500, 35381)

In [32]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

text_clf = Pipeline([('vect', TfidfVectorizer()),
                     ('clf', LinearSVC()),
])
text_clf.fit(docs_train, y_train)

Pipeline(steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
  ...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [33]:
predicted = text_clf.predict(docs_test)

In [39]:
import numpy as np
np.mean(predicted == y_test)  

0.83399999999999996

In [41]:
from sklearn import metrics
print(metrics.classification_report(y_test, predicted,
    target_names=dataset.target_names))

             precision    recall  f1-score   support

        neg       0.82      0.85      0.83       240
        pos       0.85      0.82      0.84       260

avg / total       0.83      0.83      0.83       500



TASK: Build a grid search to find out whether unigrams or bigrams are more useful.

Fit the pipeline on the training set using grid search for the parameters

In [42]:
from sklearn.model_selection import GridSearchCV

parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'vect__use_idf': (True, False),
              'clf__C': (1.0, 0.1, 1e-2, 1e-3),
}

In [43]:
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)

In [45]:
gs_clf = gs_clf.fit(docs_train, y_train)

In [46]:
gs_clf.best_score_

0.82599999999999996

In [47]:
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

clf__C: 1.0
vect__ngram_range: (1, 1)
vect__use_idf: True


In [50]:
clf = gs_clf.best_estimator_
predicted = clf.predict(docs_test)
import numpy as np
np.mean(predicted == y_test) 

0.83399999999999996

Print the mean and std for each candidate along with the parameter settings for all the candidates explored by grid search.

In [53]:
n_candidates = len(gs_clf.cv_results_['params'])
for i in range(n_candidates):
    print(i, 'params - %s; mean - %0.2f; std - %0.2f'
             % (gs_clf.cv_results_['params'][i],
                gs_clf.cv_results_['mean_test_score'][i],
                gs_clf.cv_results_['std_test_score'][i]))

0 params - {'clf__C': 1.0, 'vect__ngram_range': (1, 1), 'vect__use_idf': True}; mean - 0.83; std - 0.00
1 params - {'clf__C': 1.0, 'vect__ngram_range': (1, 1), 'vect__use_idf': False}; mean - 0.81; std - 0.00
2 params - {'clf__C': 1.0, 'vect__ngram_range': (1, 2), 'vect__use_idf': True}; mean - 0.82; std - 0.02
3 params - {'clf__C': 1.0, 'vect__ngram_range': (1, 2), 'vect__use_idf': False}; mean - 0.80; std - 0.02
4 params - {'clf__C': 0.1, 'vect__ngram_range': (1, 1), 'vect__use_idf': True}; mean - 0.80; std - 0.00
5 params - {'clf__C': 0.1, 'vect__ngram_range': (1, 1), 'vect__use_idf': False}; mean - 0.73; std - 0.01
6 params - {'clf__C': 0.1, 'vect__ngram_range': (1, 2), 'vect__use_idf': True}; mean - 0.79; std - 0.02
7 params - {'clf__C': 0.1, 'vect__ngram_range': (1, 2), 'vect__use_idf': False}; mean - 0.74; std - 0.01
8 params - {'clf__C': 0.01, 'vect__ngram_range': (1, 1), 'vect__use_idf': True}; mean - 0.77; std - 0.01
9 params - {'clf__C': 0.01, 'vect__ngram_range': (1, 1), 'v

Predict the outcome on the testing set and store it in a variable named y_predicted

In [54]:
y_predicted = gs_clf.predict(docs_test)

#### Print the classification report

In [55]:
from sklearn import metrics
print(metrics.classification_report(y_test, y_predicted, target_names=dataset.target_names))

             precision    recall  f1-score   support

        neg       0.82      0.85      0.83       240
        pos       0.85      0.82      0.84       260

avg / total       0.83      0.83      0.83       500



#### Print and plot the confusion matrix

In [56]:
cm = metrics.confusion_matrix(y_test, y_predicted)
print(cm)

[[203  37]
 [ 46 214]]
