# Занятие 6.1: Введение в обработку текста

Максим Ионов, [max.ionov@gmail.com](mailto:max.ionov@gmail.com)

## 1. Чтение текстов, нужных для упражнений

In [1]:
import sklearn.datasets

In [2]:
# Training split of 20 Newsgroups restricted to two categories; the fixed
# random_state keeps the shuffled order the same between runs.
news = sklearn.datasets.fetch_20newsgroups(
    subset='train',
    categories=['comp.graphics', 'sci.med'],
    shuffle=True,
    random_state=42,
)

In [3]:
news.target_names

['comp.graphics', 'sci.med']

In [4]:
news.target

array([0, 1, 0, ..., 1, 0, 0])

In [5]:
news.data[0]

u'From: zyeh@caspian.usc.edu (zhenghao yeh)\nSubject: Re: Need polygon splitting algo...\nOrganization: University of Southern California, Los Angeles, CA\nLines: 25\nDistribution: world\nNNTP-Posting-Host: caspian.usc.edu\nKeywords: polygons, splitting, clipping\n\n\nIn article <1qvq4b$r4t@wampyr.cc.uow.edu.au>, g9134255@wampyr.cc.uow.edu.au (Coronado Emmanuel Abad) writes:\n|> \n|> The idea is to clip one polygon using another polygon (not\n|> necessarily rectangular) as a window.  My problem then is in\n|> finding out all the new vertices of the resulting "subpolygons"\n|> from the first one.  Is this simply a matter of extending the\n|> usual algorithm whereby each of the edges of one polygon is checked\n|> against another polygon???  Is there a simpler way??\n|> \n|> Comments welcome.\n|> \n|> Noel.\n\n\tIt depends on what kind of the polygons. \n\tConvex - simple, concave - trouble, concave with loop(s)\n\tinside - big trouble.\n\n\tOf cause, you can use the box test to avoid che

## 2. Строим частотный список

In [None]:
import re
import collections

def get_freq_list(texts):
    r"""Build a frequency list of the word tokens found in ``texts``.

    A token is a maximal run of ``\w`` characters, i.e. one of the non-empty
    pieces obtained by splitting a text on ``\W``. Matching is case-sensitive.

    Args:
        texts: Iterable of strings to count tokens in.

    Returns:
        A list of ``(word, count)`` tuples sorted by descending count.
        An empty list is returned when no tokens are found.
    """
    counts = collections.Counter()
    for text in texts:
        # findall() never produces empty tokens, so there is no '' entry to
        # delete afterwards (an unconditional delete raises KeyError whenever
        # the input happens to contain no empty split piece).
        counts.update(re.findall(r'\w+', text))

    return counts.most_common()

## 3. Bag of words

### 3.1. CountVectorizer

In [None]:
import sklearn.feature_extraction.text

In [None]:
vctr = sklearn.feature_extraction.text.CountVectorizer()

In [None]:
X_train_counts = vctr.fit_transform(news.data)

In [None]:
X_train_counts.shape

In [None]:
vctr

In [None]:
vctr.vocabulary_

In [None]:
vctr.vocabulary_.get('the')

### 3.2. TF-IDF transformer

In [None]:
vctr_tfidf = sklearn.feature_extraction.text.TfidfTransformer()

In [None]:
X_train_counts_tfidf = vctr_tfidf.fit_transform(X_train_counts)

In [None]:
vctr_tfidf.idf_

In [None]:
# TfidfTransformer has no vocabulary_ attribute; the word -> column mapping
# lives on the CountVectorizer, so look the column up there and use it to
# read the IDF weight of 'the'.
vctr_tfidf.idf_[vctr.vocabulary_['the']]

In [None]:
import sklearn.pipeline
import sklearn.naive_bayes

# Chain the count vectorizer and a multinomial Naive Bayes classifier into a
# single estimator that is fitted and queried with raw documents.
text_clf = sklearn.pipeline.Pipeline(
    steps=[
        ('vect', sklearn.feature_extraction.text.CountVectorizer()),
        ('clf', sklearn.naive_bayes.MultinomialNB()),
    ],
)

## 4. Classifying

In [None]:
# Test split of 20 Newsgroups with the same two categories and seed as the
# training fetch.
news_test = sklearn.datasets.fetch_20newsgroups(
    subset='test',
    categories=['comp.graphics', 'sci.med'],
    shuffle=True,
    random_state=42,
)

In [None]:
text_clf.fit(news.data, news.target)

In [None]:
predicted = text_clf.predict(news_test.data)

In [None]:
import sklearn.metrics
# Print the classification report for the test split, labelling each class
# with its newsgroup name instead of its integer index.
print(
    sklearn.metrics.classification_report(
        news_test.target,
        predicted,
        target_names=news_test.target_names,
    )
)

In [None]:
sklearn.metrics.confusion_matrix(news_test.target, predicted)