# Using Scikit-Learn To Classify Your Own Text Data (The Short Version)

http://carrefax.com/articles-blog/2018/3/11/using-scikit-learn-to-classify-your-own-text-data-the-short-version

## Load data

In [1]:
import pandas as pd

colunas = ['ROTULO_MANUAL', 'EMENTA_NORM']

df = pd.read_csv('../../data/ementas_pre-processadas.csv', header=0, sep=',', quotechar='"', usecols=colunas)

## Split the dataset into training and testing sets

In [2]:
from sklearn.model_selection import train_test_split

data = df['EMENTA_NORM'].values.astype('U')
target = df['ROTULO_MANUAL']

X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.4)

## Transform the training data into tfidf vectors

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

print ('\nTransforming the training data...\n')

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(raw_documents=X_train)

tfidf_transformer = TfidfTransformer(use_idf=False)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print (X_train_tfidf.shape)


Transforming the training data...

(4870, 16316)


<0x16316 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in Compressed Sparse Row format>

## Transform the test data into tfidf vectors

In [4]:
print ('\nTransforming the test data...\n')

count_vect = CountVectorizer()
X_test_counts = count_vect.fit_transform(raw_documents=X_test)

tfidf_transformer = TfidfTransformer(use_idf=False)
X_test_tfidf = tfidf_transformer.fit_transform(X_test_counts)
print (X_test_tfidf.shape)

print (X_test_tfidf)
print (y_train.shape)

docs_test = X_test


Transforming the test data...

(3247, 14414)
  (0, 4400)	0.045407660918649985
  (0, 2956)	0.045407660918649985
  (0, 11729)	0.045407660918649985
  (0, 8873)	0.045407660918649985
  (0, 9382)	0.045407660918649985
  (0, 1266)	0.045407660918649985
  (0, 6806)	0.045407660918649985
  (0, 8391)	0.045407660918649985
  (0, 1775)	0.045407660918649985
  (0, 990)	0.045407660918649985
  (0, 14113)	0.045407660918649985
  (0, 5937)	0.045407660918649985
  (0, 10529)	0.045407660918649985
  (0, 13348)	0.045407660918649985
  (0, 1911)	0.045407660918649985
  (0, 11047)	0.045407660918649985
  (0, 10845)	0.045407660918649985
  (0, 10822)	0.045407660918649985
  (0, 11727)	0.045407660918649985
  (0, 6971)	0.045407660918649985
  (0, 12636)	0.045407660918649985
  (0, 6293)	0.045407660918649985
  (0, 8749)	0.045407660918649985
  (0, 13059)	0.045407660918649985
  (0, 12256)	0.045407660918649985
  :	:
  (3246, 14009)	0.11704114719613057
  (3246, 5109)	0.11704114719613057
  (3246, 2037)	0.11704114719613057
  (3246

## Construct the classifier pipeline using a SGDClassifier algorithm

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC

print ('\nApplying the classifier...\n')
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer(use_idf=True)),
    #('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, verbose=1)),
    ('clf', LinearSVC(random_state=0, tol=1e-5))
])


Applying the classifier...



## Fit the model to the training data

In [6]:
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=0, tol=1e-05, verbose=0))])

## Run the test data into the model

In [7]:
predicted = text_clf.predict(docs_test)

## Calculate mean accuracy of predictions

In [8]:
import numpy as np

print (np.mean(predicted == y_test))

0.745611333538651


## Generate labelled performance metrics

In [9]:
from sklearn import metrics

print(metrics.classification_report(y_test, predicted))

             precision    recall  f1-score   support

        AUT       1.00      0.56      0.72        16
        BAN       0.68      0.73      0.71       341
        BUS       0.72      0.65      0.69        89
        CDC       0.71      0.49      0.58        70
        CIV       0.60      0.25      0.35        12
        COM       0.45      0.11      0.18        44
        CON       0.59      0.63      0.61       259
        DAN       0.60      0.74      0.66       294
        DMI       0.68      0.53      0.60        95
        DPV       0.91      0.71      0.80        45
        EXP       0.87      0.97      0.92       561
        FAL       0.95      0.70      0.81        30
        FAM       0.77      0.63      0.69        79
        INF       1.00      0.86      0.92         7
        MAR       0.60      0.80      0.69        15
        OIE       0.76      0.82      0.79       223
        OIG       0.91      0.84      0.87       270
        POS       0.81      0.68      0.74   