# Using Scikit-Learn To Classify Your Own Text Data (The Short Version)

http://carrefax.com/articles-blog/2018/3/11/using-scikit-learn-to-classify-your-own-text-data-the-short-version

## Load data

In [4]:
import pandas as pd

colunas = ['ROTULO_MANUAL', 'EMENTA_NORM']

df = pd.read_csv('../../data/ementas_pre-processadas.csv', header=0, sep=',', quotechar='"', usecols=colunas)

## Split the dataset into training and testing sets

In [12]:
from sklearn.model_selection import train_test_split

data = df['EMENTA_NORM'].values.astype('U')
target = df['ROTULO_MANUAL']

X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.4)

## Transform the training data into tfidf vectors

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

print ('\nTransforming the training data...\n')

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(raw_documents=X_train)

tfidf_transformer = TfidfTransformer(use_idf=False)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print (X_train_tfidf.shape)


Transforming the training data...

(4870, 16309)


## Transform the test data into tfidf vectors

In [14]:
print ('\nTransforming the test data...\n')

count_vect = CountVectorizer()
X_test_counts = count_vect.fit_transform(raw_documents=X_test)

tfidf_transformer = TfidfTransformer(use_idf=False)
X_test_tfidf = tfidf_transformer.fit_transform(X_test_counts)
print (X_test_tfidf.shape)

print (X_test_tfidf)
print (y_train.shape)

docs_test = X_test


Transforming the test data...

(3247, 14303)
  (0, 7184)	0.06495698024616309
  (0, 3507)	0.06495698024616309
  (0, 8886)	0.06495698024616309
  (0, 6751)	0.06495698024616309
  (0, 6377)	0.06495698024616309
  (0, 12751)	0.06495698024616309
  (0, 13388)	0.06495698024616309
  (0, 1172)	0.06495698024616309
  (0, 7142)	0.06495698024616309
  (0, 2259)	0.06495698024616309
  (0, 3483)	0.06495698024616309
  (0, 12622)	0.06495698024616309
  (0, 5324)	0.06495698024616309
  (0, 11613)	0.06495698024616309
  (0, 6197)	0.06495698024616309
  (0, 5619)	0.12991396049232617
  (0, 6374)	0.06495698024616309
  (0, 7051)	0.06495698024616309
  (0, 11628)	0.06495698024616309
  (0, 7186)	0.06495698024616309
  (0, 6615)	0.06495698024616309
  (0, 12962)	0.06495698024616309
  (0, 7880)	0.06495698024616309
  (0, 11626)	0.06495698024616309
  (0, 6905)	0.06495698024616309
  :	:
  (3246, 11657)	0.10721125348377948
  (3246, 224)	0.10721125348377948
  (3246, 10887)	0.10721125348377948
  (3246, 11218)	0.10721125348377948

## Construct the classifier pipeline using a SGDClassifier algorithm

In [19]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier

print ('\nApplying the classifier...\n')

text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer(use_idf=True)),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                      alpha=1e-3, random_state=42, verbose=1)),
])


Applying the classifier...



## Fit the model to the training data

In [20]:
text_clf.fit(X_train, y_train)



-- Epoch 1
Norm: 2.31, NNZs: 2759, Bias: -0.851990, T: 4870, Avg. loss: 0.007635
Total training time: 0.00 seconds.
-- Epoch 2
Norm: 1.70, NNZs: 3514, Bias: -0.892791, T: 9740, Avg. loss: 0.006059
Total training time: 0.00 seconds.
-- Epoch 3
Norm: 1.45, NNZs: 3898, Bias: -0.908031, T: 14610, Avg. loss: 0.005939
Total training time: 0.01 seconds.
-- Epoch 4
Norm: 1.33, NNZs: 4262, Bias: -0.917604, T: 19480, Avg. loss: 0.005864
Total training time: 0.01 seconds.
-- Epoch 5
Norm: 1.25, NNZs: 4569, Bias: -0.923024, T: 24350, Avg. loss: 0.005817
Total training time: 0.01 seconds.
-- Epoch 1
Norm: 7.46, NNZs: 8607, Bias: -0.950215, T: 4870, Avg. loss: 0.170526
Total training time: 0.00 seconds.
-- Epoch 2
Norm: 7.02, NNZs: 9585, Bias: -0.980967, T: 9740, Avg. loss: 0.143398
Total training time: 0.01 seconds.
-- Epoch 3
Norm: 6.94, NNZs: 10009, Bias: -0.988459, T: 14610, Avg. loss: 0.140561
Total training time: 0.01 seconds.
-- Epoch 4
Norm: 6.78, NNZs: 10138, Bias: -0.994628, T: 19480, Avg.

Norm: 1.50, NNZs: 5252, Bias: -0.909105, T: 24350, Avg. loss: 0.009344
Total training time: 0.01 seconds.
-- Epoch 1
Norm: 7.94, NNZs: 6113, Bias: -0.907777, T: 4870, Avg. loss: 0.102697
Total training time: 0.00 seconds.
-- Epoch 2
Norm: 7.62, NNZs: 6673, Bias: -0.932918, T: 9740, Avg. loss: 0.085403
Total training time: 0.01 seconds.
-- Epoch 3
Norm: 7.54, NNZs: 7047, Bias: -0.960784, T: 14610, Avg. loss: 0.081696
Total training time: 0.01 seconds.
-- Epoch 4
Norm: 7.56, NNZs: 7123, Bias: -0.964785, T: 19480, Avg. loss: 0.080996
Total training time: 0.01 seconds.
-- Epoch 5
Norm: 7.53, NNZs: 7205, Bias: -0.977663, T: 24350, Avg. loss: 0.079480
Total training time: 0.02 seconds.
-- Epoch 1
Norm: 6.62, NNZs: 5759, Bias: -1.026674, T: 4870, Avg. loss: 0.073468
Total training time: 0.00 seconds.
-- Epoch 2
Norm: 6.17, NNZs: 6871, Bias: -1.067427, T: 9740, Avg. loss: 0.062956
Total training time: 0.01 seconds.
-- Epoch 3
Norm: 6.10, NNZs: 7134, Bias: -1.083460, T: 14610, Avg. loss: 0.0608

[Parallel(n_jobs=1)]: Done  28 out of  28 | elapsed:    0.4s finished


Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...ty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=None, verbose=1, warm_start=False))])

## Run the test data into the model

In [21]:
predicted = text_clf.predict(docs_test)

## Calculate mean accuracy of predictions

In [23]:
import numpy as np

print (np.mean(predicted == y_test))

0.7222051124114567


## Generate labelled performance metrics

In [29]:
from sklearn import metrics

print(metrics.classification_report(y_test, predicted))

             precision    recall  f1-score   support

        AUT       1.00      0.36      0.53        14
        BAN       0.69      0.79      0.73       330
        BUS       0.86      0.57      0.69       100
        CDC       0.81      0.39      0.53        66
        CIV       0.67      0.20      0.31        10
        COM       0.55      0.12      0.20        49
        CON       0.64      0.59      0.61       273
        DAN       0.59      0.71      0.64       305
        DMI       0.70      0.54      0.61        92
        DPV       0.95      0.85      0.90        46
        EXP       0.82      0.97      0.89       553
        FAL       0.62      0.57      0.59        23
        FAM       0.59      0.68      0.63        81
        INF       0.86      0.75      0.80         8
        MAR       0.67      0.56      0.61        18
        OIE       0.76      0.78      0.77       228
        OIG       0.76      0.89      0.82       242
        POS       0.75      0.67      0.71   

  'precision', 'predicted', average, warn_for)
