# Using Scikit-Learn To Classify Your Own Text Data (The Short Version)

http://carrefax.com/articles-blog/2018/3/11/using-scikit-learn-to-classify-your-own-text-data-the-short-version

## Load data

In [30]:
import pandas as pd

colunas = ['ROTULO_MANUAL', 'EMENTA_NORM']

df = pd.read_csv('../../data/ementas_pre-processadas.csv', header=0, sep=',', quotechar='"', usecols=colunas)

## Split the dataset into training and testing sets

In [31]:
from sklearn.model_selection import train_test_split

data = df['EMENTA_NORM'].values.astype('U')
target = df['ROTULO_MANUAL']

X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.4)

## Transform the training data into tfidf vectors

In [32]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

print ('\nTransforming the training data...\n')

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(raw_documents=X_train)

tfidf_transformer = TfidfTransformer(use_idf=False)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print (X_train_tfidf.shape)


Transforming the training data...

(4870, 16346)


## Transform the test data into tfidf vectors

In [33]:
print ('\nTransforming the test data...\n')

count_vect = CountVectorizer()
X_test_counts = count_vect.fit_transform(raw_documents=X_test)

tfidf_transformer = TfidfTransformer(use_idf=False)
X_test_tfidf = tfidf_transformer.fit_transform(X_test_counts)
print (X_test_tfidf.shape)

print (X_test_tfidf)
print (y_train.shape)

docs_test = X_test


Transforming the test data...

(3247, 14269)
  (0, 7148)	0.052414241836095915
  (0, 9983)	0.052414241836095915
  (0, 2939)	0.052414241836095915
  (0, 11448)	0.052414241836095915
  (0, 2275)	0.052414241836095915
  (0, 3478)	0.052414241836095915
  (0, 329)	0.052414241836095915
  (0, 5336)	0.052414241836095915
  (0, 9537)	0.052414241836095915
  (0, 5643)	0.052414241836095915
  (0, 9438)	0.052414241836095915
  (0, 14083)	0.052414241836095915
  (0, 10781)	0.052414241836095915
  (0, 9666)	0.052414241836095915
  (0, 6356)	0.052414241836095915
  (0, 7105)	0.052414241836095915
  (0, 6577)	0.052414241836095915
  (0, 12927)	0.052414241836095915
  (0, 7861)	0.052414241836095915
  (0, 11586)	0.052414241836095915
  (0, 13065)	0.052414241836095915
  (0, 6866)	0.052414241836095915
  (0, 7800)	0.052414241836095915
  (0, 13479)	0.052414241836095915
  (0, 8210)	0.052414241836095915
  :	:
  (3245, 10108)	0.13933280001515577
  (3245, 3312)	0.03980937143290165
  (3245, 107)	0.03980937143290165
  (3246, 201

## Fit the model to the training data

In [34]:
text_clf.fit(X_train, y_train)



-- Epoch 1
Norm: 2.24, NNZs: 2766, Bias: -0.878712, T: 4870, Avg. loss: 0.009225
Total training time: 0.00 seconds.
-- Epoch 2
Norm: 1.61, NNZs: 3425, Bias: -0.908247, T: 9740, Avg. loss: 0.006958
Total training time: 0.00 seconds.
-- Epoch 3
Norm: 1.38, NNZs: 3749, Bias: -0.921783, T: 14610, Avg. loss: 0.006760
Total training time: 0.01 seconds.
-- Epoch 4
Norm: 1.28, NNZs: 4249, Bias: -0.931023, T: 19480, Avg. loss: 0.006630
Total training time: 0.01 seconds.
-- Epoch 5
Norm: 1.22, NNZs: 4560, Bias: -0.937749, T: 24350, Avg. loss: 0.006597
Total training time: 0.01 seconds.
-- Epoch 1
Norm: 8.01, NNZs: 8940, Bias: -0.960089, T: 4870, Avg. loss: 0.165634
Total training time: 0.00 seconds.
-- Epoch 2
Norm: 7.69, NNZs: 9748, Bias: -0.989170, T: 9740, Avg. loss: 0.141044
Total training time: 0.01 seconds.
-- Epoch 3
Norm: 7.59, NNZs: 10070, Bias: -1.002012, T: 14610, Avg. loss: 0.136748
Total training time: 0.01 seconds.
-- Epoch 4
Norm: 7.50, NNZs: 10305, Bias: -1.008369, T: 19480, Avg.

-- Epoch 1
Norm: 6.85, NNZs: 5991, Bias: -1.006824, T: 4870, Avg. loss: 0.075423
Total training time: 0.00 seconds.
-- Epoch 2
Norm: 6.43, NNZs: 6884, Bias: -1.055452, T: 9740, Avg. loss: 0.058763
Total training time: 0.01 seconds.
-- Epoch 3
Norm: 6.21, NNZs: 7317, Bias: -1.072165, T: 14610, Avg. loss: 0.057385
Total training time: 0.01 seconds.
-- Epoch 4
Norm: 6.24, NNZs: 7484, Bias: -1.083911, T: 19480, Avg. loss: 0.056160
Total training time: 0.02 seconds.
-- Epoch 5
Norm: 6.18, NNZs: 7553, Bias: -1.095045, T: 24350, Avg. loss: 0.055854
Total training time: 0.02 seconds.
-- Epoch 1
Norm: 4.33, NNZs: 4238, Bias: -0.848835, T: 4870, Avg. loss: 0.032016
Total training time: 0.00 seconds.
-- Epoch 2
Norm: 3.97, NNZs: 5286, Bias: -0.887235, T: 9740, Avg. loss: 0.024840
Total training time: 0.01 seconds.
-- Epoch 3
Norm: 3.87, NNZs: 5713, Bias: -0.901138, T: 14610, Avg. loss: 0.023886
Total training time: 0.01 seconds.
-- Epoch 4
Norm: 3.82, NNZs: 5957, Bias: -0.906700, T: 19480, Avg. l

[Parallel(n_jobs=1)]: Done  28 out of  28 | elapsed:    0.4s finished


Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...ty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=None, verbose=1, warm_start=False))])

## Run the test data into the model

In [35]:
predicted = text_clf.predict(docs_test)

## Calculate mean accuracy of predictions

In [36]:
import numpy as np

print (np.mean(predicted == y_test))

0.7302125038497074


## Generate labelled performance metrics

In [37]:
from sklearn import metrics

print(metrics.classification_report(y_test, predicted))

             precision    recall  f1-score   support

        AUT       0.83      0.42      0.56        12
        BAN       0.56      0.78      0.65       300
        BUS       0.72      0.59      0.65       101
        CDC       0.76      0.43      0.55        74
        CIV       0.00      0.00      0.00        16
        COM       0.21      0.13      0.16        46
        CON       0.74      0.60      0.66       289
        DAN       0.64      0.72      0.68       306
        DMI       0.69      0.56      0.62        96
        DPV       0.81      0.81      0.81        37
        EXP       0.83      0.96      0.89       577
        FAL       0.80      0.55      0.65        29
        FAM       0.71      0.61      0.66        74
        INF       0.64      0.78      0.70         9
        MAR       0.75      0.47      0.58        19
        OIE       0.76      0.79      0.78       217
        OIG       0.87      0.88      0.88       241
        POS       0.64      0.79      0.71   

  'precision', 'predicted', average, warn_for)
