# Module 7: Text Processing

Python's scikit-learn library offers a set of tools for extracting features: http://scikit-learn.org/stable/modules/feature_extraction.html

## Text Processing

In [5]:
# Toy corpus of four short documents; each string is treated as one document.
corpus = [
    'This is the first document.',
    'this is the second second document.',
    'And the third one.',
    'Is this the first first first document?',
]

### CountVectorizer 

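CountVectorizer transforms a collection of text documents into a sparse matrix in which each row is a document, each column is a word from the learned vocabulary, and each value is the number of times that word occurs in that document.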

In [31]:
import sklearn.feature_extraction.text as sk_text

# Build a bag-of-words vectorizer.
# min_df: ignore terms that have a document frequency < min_df
#         (min_df=1 keeps every term that occurs in at least one document).
vectorizer = sk_text.CountVectorizer(min_df=1)
# Alternative: drop common English stop words instead.
#vectorizer = sk_text.CountVectorizer(stop_words = 'english')

# Learn the vocabulary from the corpus and count word occurrences per document.
matrix = vectorizer.fit_transform(corpus)

print(type(matrix))          # Compressed Sparse Row matrix
print(matrix.toarray())      # convert it to a dense numpy array

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (requires scikit-learn >= 1.0).
# It returns a numpy array, so .tolist() keeps the output a plain Python list.
print(vectorizer.get_feature_names_out().tolist())

<class 'scipy.sparse.csr.csr_matrix'>
[[0 1 1 1 0 0 1 0 1]
 [0 1 0 1 0 2 1 0 1]
 [1 0 0 0 1 0 1 1 0]
 [0 1 3 1 0 0 1 0 1]]
['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


### TfidfVectorizer

TfidfVectorizer transforms a collection of text documents into a sparse matrix in which each row is a document, each column is a word from the learned vocabulary, and each value is the tf-idf (term frequency–inverse document frequency) weight of that word in that document. More here: http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html#sklearn.feature_extraction.text.TfidfVectorizer

In [28]:
# Build a tf-idf vectorizer. The commented-out arguments are optional knobs:
#   stop_words:   drop common English stop words.
#   max_features: build a vocabulary that only considers the top max_features
#                 features ordered by term frequency across the corpus.
vectorizer = sk_text.TfidfVectorizer(
    #stop_words='english',
    #max_features = 1000,
    min_df=1,
)

# Learn the vocabulary from the corpus and compute the tf-idf value of every
# word in every document.
matrix = vectorizer.fit_transform(corpus)

print(type(matrix))          # Compressed Sparse Row matrix
print(matrix.toarray())      # convert it to a dense numpy array

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (requires scikit-learn >= 1.0).
# It returns a numpy array, so .tolist() keeps the output a plain Python list.
print(vectorizer.get_feature_names_out().tolist())

<class 'scipy.sparse.csr.csr_matrix'>
[[0.         0.43877674 0.54197657 0.43877674 0.         0.
  0.35872874 0.         0.43877674]
 [0.         0.27230147 0.         0.27230147 0.         0.85322574
  0.22262429 0.         0.27230147]
 [0.55280532 0.         0.         0.         0.55280532 0.
  0.28847675 0.55280532 0.        ]
 [0.         0.23973261 0.88835239 0.23973261 0.         0.
  0.19599711 0.         0.23973261]]
['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


In [3]:
# Same tf-idf vectorizer, but with a stricter document-frequency filter.
#   min_df: ignore terms that have a document frequency < min_df.
#   max_df: ignore terms that have a document frequency > max_df.
# With min_df=2, a word must appear in at least two documents to be kept.
vectorizer = sk_text.TfidfVectorizer(
    #stop_words='english',
    #max_features = 1000,
    min_df=2,
    #max_df=5
)

# Learn the (filtered) vocabulary and compute tf-idf values for the corpus.
matrix = vectorizer.fit_transform(corpus)
print(type(matrix))               # Compressed Sparse Row matrix

tfidf_data = matrix.toarray()     # convert it to a dense numpy array

print(tfidf_data)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (requires scikit-learn >= 1.0).
# It returns a numpy array, so .tolist() keeps the output a plain Python list.
print(vectorizer.get_feature_names_out().tolist())

<class 'scipy.sparse.csr.csr_matrix'>
[[0.43877674 0.54197657 0.43877674 0.35872874 0.43877674]
 [0.52210862 0.         0.52210862 0.42685801 0.52210862]
 [0.         0.         0.         1.         0.        ]
 [0.23973261 0.88835239 0.23973261 0.19599711 0.23973261]]
['document', 'first', 'is', 'the', 'this']
