**извлечение признаков из текста на естественном языке**

частотный анализ, Term Frequency (TF)

_Евгений Борисов <esborisov@sevsu.ru>_

## библиотеки

In [1]:
import numpy as np
import pandas as pd

# Allow up to 200 characters per cell so the sentences are shown in full.
pd.set_option('display.max_colwidth', 200)

# Print numbers with two digits of precision, in pandas and in numpy alike.
pd.set_option('display.precision', 2)
np.set_printoptions(precision=2)

## тексты

In [2]:
# Toy corpus: four short English sentences shared by the vectorizer examples.
docs = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

In [3]:
def tf_table(x, v, d=None):
    """Show a document-term matrix as a DataFrame labelled by sentence.

    Prints the vocabulary size, then returns a table with one row per
    document and one column per vocabulary entry.

    Parameters
    ----------
    x : array-like, shape (len(d), len(v))
        Dense matrix of term weights (counts, binary flags, TF, TF-IDF).
    v : sequence of str
        Column labels, in the same order as the columns of ``x``.
    d : sequence of str, optional
        Row labels (the source texts). When omitted, the module-level
        ``docs`` is looked up at call time, so re-running the cell that
        defines ``docs`` is picked up without redefining this function.

    Returns
    -------
    pandas.DataFrame
        ``x`` with columns ``v`` and an index named ``'Sentence'`` holding ``d``.

    Raises
    ------
    ValueError
        If the shape of ``x`` does not match ``len(d)`` rows and ``len(v)``
        columns (raised by the DataFrame constructor).
    """
    if d is None:
        d = docs
    print('словарь:', len(v), 'слов\n')
    # Attach the sentences as the index directly instead of concatenating a
    # helper column and calling set_index: a vocabulary term that happens to
    # be called 'Sentence' then stays an ordinary column, and a row-count
    # mismatch raises instead of being silently padded with NaN.
    return pd.DataFrame(x, columns=v, index=pd.Index(d, name='Sentence'))

## простая векторизация ( CountVectorizer )

In [4]:
# CountVectorizer?

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag of words: each cell holds the number of times a term occurs in a sentence.
tf_model = CountVectorizer().fit(docs)

tf_table(
    # toarray() gives a plain ndarray; todense() returns the legacy numpy.matrix
    x=tf_model.transform(docs).toarray(),
    # label columns by the index the vectorizer assigned to each term,
    # not by an independent alphabetical sort of the vocabulary keys
    v=sorted(tf_model.vocabulary_, key=tf_model.vocabulary_.get),
    d=docs,
)

словарь: 9 слов



Sentence,and,document,first,is,one,second,the,third,this
This is the first document.,0,1,1,1,0,0,1,0,1
This document is the second document.,0,2,0,1,0,1,1,0,1
And this is the third one.,1,0,0,1,1,0,1,1,1
Is this the first document?,0,1,1,1,0,0,1,0,1


## бинарная векторизация ( CountVectorizer )

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# binary=True records only presence (1) or absence (0) of a term,
# so repeated occurrences within one sentence are not counted.
tf_model = CountVectorizer(binary=True).fit(docs)

tf_table(
    # toarray() gives a plain ndarray; todense() returns the legacy numpy.matrix
    x=tf_model.transform(docs).toarray(),
    # label columns by the index the vectorizer assigned to each term,
    # not by an independent alphabetical sort of the vocabulary keys
    v=sorted(tf_model.vocabulary_, key=tf_model.vocabulary_.get),
    d=docs,
)

словарь: 9 слов



Sentence,and,document,first,is,one,second,the,third,this
This is the first document.,0,1,1,1,0,0,1,0,1
This document is the second document.,0,1,0,1,0,1,1,0,1
And this is the third one.,1,0,0,1,1,0,1,1,1
Is this the first document?,0,1,1,1,0,0,1,0,1


## векторизация словосочетаний ( CountVectorizer )

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# ngram_range=(2, 2) keeps only bigrams (pairs of adjacent words) as features.
tf_model = CountVectorizer(ngram_range=(2, 2)).fit(docs)

tf_table(
    # toarray() gives a plain ndarray; todense() returns the legacy numpy.matrix
    x=tf_model.transform(docs).toarray(),
    # label columns by the index the vectorizer assigned to each bigram,
    # not by an independent alphabetical sort of the vocabulary keys
    v=sorted(tf_model.vocabulary_, key=tf_model.vocabulary_.get),
    d=docs,
)

словарь: 13 слов



Sentence,and this,document is,first document,is the,is this,second document,the first,the second,the third,third one,this document,this is,this the
This is the first document.,0,0,1,1,0,0,1,0,0,0,0,1,0
This document is the second document.,0,1,0,1,0,1,0,1,0,0,1,0,0
And this is the third one.,1,0,0,1,0,0,0,0,1,1,0,1,0
Is this the first document?,0,0,1,0,1,0,1,0,0,0,0,0,1


##  CountVectorizer + TF

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
tf_model = CountVectorizer().fit(docs)

from sklearn.feature_extraction.text import TfidfTransformer

# Raw term counts, computed once and reused for both fit and transform.
counts = tf_model.transform(docs)

# use_idf=False switches inverse-document-frequency weighting off, so despite
# its name this transformer only L2-normalises each row of counts (plain TF);
# frequent words are NOT down-weighted here.
idf_model = TfidfTransformer(norm='l2', use_idf=False).fit(counts)

tf_table(
    # toarray() gives a plain ndarray; todense() returns the legacy numpy.matrix
    x=idf_model.transform(counts).toarray(),
    # label columns by the index the vectorizer assigned to each term,
    # not by an independent alphabetical sort of the vocabulary keys
    v=sorted(tf_model.vocabulary_, key=tf_model.vocabulary_.get),
    d=docs,
)

словарь: 9 слов



Sentence,and,document,first,is,one,second,the,third,this
This is the first document.,0.0,0.45,0.45,0.45,0.0,0.0,0.45,0.0,0.45
This document is the second document.,0.0,0.71,0.0,0.35,0.0,0.35,0.35,0.0,0.35
And this is the third one.,0.41,0.0,0.0,0.41,0.41,0.0,0.41,0.41,0.41
Is this the first document?,0.0,0.45,0.45,0.45,0.0,0.0,0.45,0.0,0.45


### TfidfVectorizer

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# All in one: TfidfVectorizer = CountVectorizer + TfidfTransformer.
# use_idf=True applies inverse-document-frequency weighting before the
# L2 row normalisation.
tf_model = TfidfVectorizer(use_idf=True, norm='l2').fit(docs)

tf_table(
    # toarray() gives a plain ndarray; todense() returns the legacy numpy.matrix
    x=tf_model.transform(docs).toarray(),
    # label columns by the index the vectorizer assigned to each term,
    # not by an independent alphabetical sort of the vocabulary keys
    v=sorted(tf_model.vocabulary_, key=tf_model.vocabulary_.get),
    d=docs,
)

словарь: 9 слов



Sentence,and,document,first,is,one,second,the,third,this
This is the first document.,0.0,0.47,0.58,0.38,0.0,0.0,0.38,0.0,0.38
This document is the second document.,0.0,0.69,0.0,0.28,0.0,0.54,0.28,0.0,0.28
And this is the third one.,0.51,0.0,0.0,0.27,0.51,0.0,0.27,0.51,0.27
Is this the first document?,0.0,0.47,0.58,0.38,0.0,0.0,0.38,0.0,0.38
