# Chuẩn bị dữ liệu 

In [1]:
import codecs
def _generate_examples(filepath):
    examples = []
    with codecs.open(filepath, "rb") as f:
        for id_, row in enumerate(f):
            # One non-ASCII byte: sisterBADBYTEcity. We replace it with a space
            label, _, text = row.replace(b"\xf0",
                                         b" ").strip().decode().partition(" ")
            coarse_label, _, fine_label = label.partition(":")
            examples.append((id_, {
                "label-coarse": coarse_label,
                "label-fine": fine_label,
                "text": text,
            }))
    return examples 

In [2]:
train = _generate_examples("train_5500.label")
test = _generate_examples("TREC_10.label")

In [3]:
# lấy danh sách các nhãn trong dữ liệu huấn luyện
labels = [x['label-coarse'] for _, x in train]
set_labels = list(set(labels))
label2id = {x: i for i, x in enumerate(set_labels)}
id2label = {i: x for i, x in enumerate(set_labels)}

print("------")
print(len(labels))
print("------")
print(set_labels)
print("------")
print(label2id)
print("------")
print(id2label)

------
5452
------
['ENTY', 'LOC', 'HUM', 'NUM', 'DESC', 'ABBR']
------
{'ENTY': 0, 'LOC': 1, 'HUM': 2, 'NUM': 3, 'DESC': 4, 'ABBR': 5}
------
{0: 'ENTY', 1: 'LOC', 2: 'HUM', 3: 'NUM', 4: 'DESC', 5: 'ABBR'}


In [4]:
train_target = [label2id[x['label-coarse']] for _, x in train]
train_data = [x['text'] for _, x in train]

test_data = [x['text'] for _, x in test]
test_target = [label2id[x['label-coarse']] for _, x in test]


print("#training size", len(train))
print("#testing size", len(test))
print(train[0])
print(train[1])
print(test[0])
print(test[1])
print(train_data[0], train_target[0])
print(train_data[1], train_target[1])

#training size 5452
#testing size 500
(0, {'label-coarse': 'DESC', 'label-fine': 'manner', 'text': 'How did serfdom develop in and then leave Russia ?'})
(1, {'label-coarse': 'ENTY', 'label-fine': 'cremat', 'text': 'What films featured the character Popeye Doyle ?'})
(0, {'label-coarse': 'NUM', 'label-fine': 'dist', 'text': 'How far is it from Denver to Aspen ?'})
(1, {'label-coarse': 'LOC', 'label-fine': 'city', 'text': 'What county is Modesto , California in ?'})
How did serfdom develop in and then leave Russia ? 4
What films featured the character Popeye Doyle ? 0


# Pipeline 

In [5]:
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
ngram_range = (1,2)
text_clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=ngram_range)),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', svm.LinearSVC()),
])

In [6]:
text_clf.fit(train_data, train_target)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
                     

In [7]:
text_clf.get_params()

{'memory': None,
 'steps': [('vect',
   CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                   dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                   lowercase=True, max_df=1.0, max_features=None, min_df=1,
                   ngram_range=(1, 2), preprocessor=None, stop_words=None,
                   strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                   tokenizer=None, vocabulary=None)),
  ('tfidf',
   TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)),
  ('clf',
   LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
             intercept_scaling=1, loss='squared_hinge', max_iter=1000,
             multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
             verbose=0))],
 'verbose': False,
 'vect': CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                 dtype=<class 'numpy.int64'>, encoding='utf-8', input='content'

# Predict 

In [8]:
docs_new = ['what is computer', 
            'who is Newton', 
            'when is the Tet holiday ?']

predicted = text_clf.predict(docs_new)
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, id2label[category]))

'what is computer' => DESC
'who is Newton' => HUM
'when is the Tet holiday ?' => NUM


In [9]:
# LinearSVC (1, 2) tfidf (5452, 32693)  ==> accurracy 0.9
predicted = text_clf.predict(test_data)
ncorrect = sum([y_pred == y for y_pred, y in zip (predicted, test_target)])
accurracy = ncorrect / len(test_target)
accurracy

0.886

In [10]:
y_pred = text_clf['clf'].predict(text_clf['tfidf'].transform(text_clf['vect'].transform(test_data)))
ncorrect = sum([y_pred == y for y_pred, y in zip (predicted, test_target)])
accurracy = ncorrect / len(test_target)
accurracy

0.886

# Separated

In [11]:
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

ngram_range  = (1,1)
use_idf = False
# step 1
count_vect = CountVectorizer(ngram_range=ngram_range)
X_train_counts = count_vect.fit_transform(train_data)        
transformer = TfidfTransformer(use_idf=use_idf).fit(X_train_counts)
X_train = transformer.transform(X_train_counts) 
print(X_train.shape)
clf = svm.LinearSVC()

# step2: evaluation
clf.fit(X_train, train_target)

# step3: evaluation

print("Gold/Ground Truth Label:")
print(test_target[:30], "...")
print([id2label[x] for x in test_target[:10]], "...")

X_new_counts = count_vect.transform(test_data)
X_new = transformer.transform(X_new_counts)
predicted = clf.predict(X_new)
print("\nNumber Item Predicted:", len(predicted))
print("System / Predicted Label:")
print(list(predicted[:30]), "...")
ncorrect = sum([y_pred == y for y_pred, y in zip (predicted, test_target)])
accurracy = ncorrect / len(test_target)

print("\nResult:")
print(" ==> accurracy", accurracy)      

(5452, 8410)
Gold/Ground Truth Label:
[3, 1, 2, 4, 3, 3, 2, 0, 4, 4, 1, 2, 3, 2, 3, 3, 0, 2, 4, 3, 2, 4, 1, 4, 4, 2, 4, 1, 1, 1] ...
['NUM', 'LOC', 'HUM', 'DESC', 'NUM', 'NUM', 'HUM', 'ENTY', 'DESC', 'DESC'] ...

Number Item Predicted: 500
System / Predicted Label:
[3, 1, 2, 4, 3, 3, 2, 4, 4, 4, 1, 4, 3, 2, 3, 3, 1, 2, 4, 3, 2, 4, 1, 4, 4, 2, 4, 4, 1, 4] ...

Result:
 ==> accurracy 0.87
