# Using Scikit-Learn To Classify Your Own Text Data (The Short Version)

http://carrefax.com/articles-blog/2018/3/11/using-scikit-learn-to-classify-your-own-text-data-the-short-version

## Load data

In [1]:
import pandas as pd

colunas = ['ROTULO_MANUAL', 'EMENTA_NORM']

df = pd.read_csv('../../data/ementas_pre-processadas.csv', header=0, sep=',', quotechar='"', usecols=colunas)

## Split the dataset into training and testing sets

In [2]:
from sklearn.model_selection import train_test_split

data = df['EMENTA_NORM'].values.astype('U')
target = df['ROTULO_MANUAL']

X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.4)

## Transform the training data into tfidf vectors

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

print ('\nTransforming the training data...\n')

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(raw_documents=X_train)

tfidf_transformer = TfidfTransformer(use_idf=False)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print (X_train_tfidf.shape)


Transforming the training data...

(4870, 16372)


## Transform the test data into tfidf vectors

In [4]:
print ('\nTransforming the test data...\n')

count_vect = CountVectorizer()
X_test_counts = count_vect.fit_transform(raw_documents=X_test)

tfidf_transformer = TfidfTransformer(use_idf=False)
X_test_tfidf = tfidf_transformer.fit_transform(X_test_counts)
print (X_test_tfidf.shape)

print (X_test_tfidf)
print (y_train.shape)

docs_test = X_test


Transforming the test data...

(3247, 14223)
  (0, 13417)	0.0663723311599972
  (0, 13329)	0.0663723311599972
  (0, 12329)	0.0663723311599972
  (0, 185)	0.0663723311599972
  (0, 11142)	0.0663723311599972
  (0, 11578)	0.0663723311599972
  (0, 8233)	0.0663723311599972
  (0, 10898)	0.0663723311599972
  (0, 10379)	0.0663723311599972
  (0, 11366)	0.0663723311599972
  (0, 1860)	0.0663723311599972
  (0, 2183)	0.0663723311599972
  (0, 1719)	0.0663723311599972
  (0, 6317)	0.0663723311599972
  (0, 7313)	0.0663723311599972
  (0, 12615)	0.0663723311599972
  (0, 6280)	0.0663723311599972
  (0, 7766)	0.0663723311599972
  (0, 1514)	0.0663723311599972
  (0, 9946)	0.0663723311599972
  (0, 6281)	0.0663723311599972
  (0, 9225)	0.0663723311599972
  (0, 6306)	0.0663723311599972
  (0, 2579)	0.0663723311599972
  (0, 6181)	0.0663723311599972
  :	:
  (3246, 11606)	0.16666666666666666
  (3246, 8840)	0.16666666666666666
  (3246, 9502)	0.16666666666666666
  (3246, 3240)	0.16666666666666666
  (3246, 11064)	0.166666

## Construct the classifier pipeline using a SGDClassifier algorithm

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier

print ('\nApplying the classifier...\n')
text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')),
                     ('tfidf', TfidfTransformer(use_idf=True)),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                      alpha=1e-3, random_state=42, verbose=1)),
])


Applying the classifier...



## Fit the model to the training data

In [8]:
text_clf.fit(X_train, y_train)



-- Epoch 1
Norm: 2.32, NNZs: 3012, Bias: -0.851659, T: 4870, Avg. loss: 0.008491
Total training time: 0.00 seconds.
-- Epoch 2
Norm: 1.78, NNZs: 3815, Bias: -0.890915, T: 9740, Avg. loss: 0.006714
Total training time: 0.01 seconds.
-- Epoch 3
Norm: 1.55, NNZs: 4143, Bias: -0.906050, T: 14610, Avg. loss: 0.006425
Total training time: 0.01 seconds.
-- Epoch 4
Norm: 1.43, NNZs: 4520, Bias: -0.914702, T: 19480, Avg. loss: 0.006361
Total training time: 0.01 seconds.
-- Epoch 5
Norm: 1.37, NNZs: 4805, Bias: -0.921856, T: 24350, Avg. loss: 0.006334
Total training time: 0.01 seconds.
-- Epoch 1
Norm: 7.84, NNZs: 8580, Bias: -0.935586, T: 4870, Avg. loss: 0.162653
Total training time: 0.00 seconds.
-- Epoch 2
Norm: 7.35, NNZs: 9608, Bias: -0.967003, T: 9740, Avg. loss: 0.137458
Total training time: 0.01 seconds.
-- Epoch 3
Norm: 7.00, NNZs: 9913, Bias: -0.980665, T: 14610, Avg. loss: 0.133056
Total training time: 0.01 seconds.
-- Epoch 4
Norm: 7.11, NNZs: 10218, Bias: -0.988118, T: 19480, Avg. 

-- Epoch 1
Norm: 7.37, NNZs: 6499, Bias: -0.870644, T: 4870, Avg. loss: 0.094260
Total training time: 0.00 seconds.
-- Epoch 2
Norm: 6.77, NNZs: 7203, Bias: -0.902528, T: 9740, Avg. loss: 0.075937
Total training time: 0.01 seconds.
-- Epoch 3
Norm: 6.66, NNZs: 7610, Bias: -0.923421, T: 14610, Avg. loss: 0.073900
Total training time: 0.01 seconds.
-- Epoch 4
Norm: 6.55, NNZs: 7902, Bias: -0.933628, T: 19480, Avg. loss: 0.073684
Total training time: 0.01 seconds.
-- Epoch 5
Norm: 6.49, NNZs: 7981, Bias: -0.938484, T: 24350, Avg. loss: 0.073147
Total training time: 0.02 seconds.
-- Epoch 1
Norm: 4.56, NNZs: 4696, Bias: -0.890474, T: 4870, Avg. loss: 0.032700
Total training time: 0.00 seconds.
-- Epoch 2
Norm: 4.21, NNZs: 5540, Bias: -0.928482, T: 9740, Avg. loss: 0.025219
Total training time: 0.01 seconds.
-- Epoch 3
Norm: 4.12, NNZs: 5952, Bias: -0.943872, T: 14610, Avg. loss: 0.024633
Total training time: 0.01 seconds.
-- Epoch 4
Norm: 4.05, NNZs: 6198, Bias: -0.956616, T: 19480, Avg. l

[Parallel(n_jobs=1)]: Done  28 out of  28 | elapsed:    0.5s finished


Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        ...ty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=None, verbose=1, warm_start=False))])

## Run the test data into the model

In [9]:
predicted = text_clf.predict(docs_test)

## Calculate mean accuracy of predictions

In [10]:
import numpy as np

print (np.mean(predicted == y_test))

0.72405297197413


## Generate labelled performance metrics

In [11]:
from sklearn import metrics

print(metrics.classification_report(y_test, predicted))

             precision    recall  f1-score   support

        AUT       0.80      0.33      0.47        12
        BAN       0.62      0.76      0.68       322
        BUS       0.76      0.65      0.70       110
        CDC       0.82      0.42      0.56        78
        CIV       0.00      0.00      0.00        16
        COM       0.60      0.06      0.10        53
        CON       0.65      0.58      0.62       277
        DAN       0.62      0.74      0.68       283
        DMI       0.57      0.56      0.56        89
        DPV       0.87      0.77      0.81        43
        EXP       0.81      0.95      0.87       560
        FAL       0.86      0.63      0.73        30
        FAM       0.69      0.62      0.65        73
        INF       0.75      0.86      0.80         7
        MAR       0.86      0.27      0.41        22
        OIE       0.71      0.78      0.74       226
        OIG       0.89      0.84      0.87       286
        POS       0.72      0.84      0.77   