# TF-IDF Example

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from scipy.sparse.csr import csr_matrix 

In [85]:
# Toy corpus: one string per document.
Doc_List = [
    "This is document1, Connection Broken Error",
    "This is document2, Connection could not be established",
    "This is document3, Test Bed Error",
]

Initialize the model with the corpus
<ol>
  <li><b>stop_words</b>:  Words to exclude from the feature set; here we use the built-in <code>'english'</code> list, but a custom list suited to our context can be passed instead.</li>
  <li><b>max_df</b>:  Drop words that occur in more than this fraction of the documents (0.8 = 80%).</li>
  <li><b>min_df</b>:  Drop words that occur in fewer than this fraction of the documents (0.1 = 10%).</li>
</ol>  

In [86]:
# Build the TF-IDF model and fit it on the corpus.
# input='content' tells the vectorizer that each item of Doc_List is the raw
# text itself (not a filename or file object). The original passed the bare
# name `text`, which is undefined and raises NameError on a fresh kernel.
tf = TfidfVectorizer(
    input='content',
    analyzer='word',
    max_df=0.8,            # drop terms present in more than 80% of the documents
    min_df=0.1,            # drop terms present in fewer than 10% of the documents
    stop_words='english',  # remove scikit-learn's built-in English stop words
)
# Sparse matrix with one row per document and one column per retained term.
tfidf_matrix = tf.fit_transform(Doc_List)

<i><code>max_df=0.8</code> means: remove words that occur in more than 80% of the documents.</i>

## Feature Set

In [87]:
# Vocabulary terms, indexed by their column position in tfidf_matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() when it exists. It returns a NumPy array;
# .tolist() keeps feature_names a plain list of str as before.
if hasattr(tf, "get_feature_names_out"):
    feature_names = tf.get_feature_names_out().tolist()
else:
    feature_names = tf.get_feature_names()
feature_names

['bed',
 'broken',
 'connection',
 'document1',
 'document2',
 'document3',
 'error',
 'established',
 'test']

In [88]:
# For every document, collect the (term, tf-idf score) pairs of the terms that
# are present in it. Result: one list of tuples per document, in corpus order.
transformed_features = []
for doc in range(tfidf_matrix.shape[0]):
    # A COO view of the row exposes the column indices and their scores side by
    # side, so no separate sparse lookup per term is needed.
    row = tfidf_matrix[doc, :].tocoo()
    transformed_features.append(
        [
            (feature_names[col], score)
            for col, score in zip(row.col, row.data)
            if score != 0  # skip explicitly stored zeros, as nonzero() would
        ]
    )

### Remaining words in each document after transformation

In [89]:
# Show the per-document lists of (term, tf-idf score) tuples built in the previous cell.
print(transformed_features)

[[('document1', 0.5628290964997665), ('connection', 0.4280460350631185), ('broken', 0.5628290964997665), ('error', 0.4280460350631185)], [('connection', 0.4736296010332684), ('document2', 0.6227660078332259), ('established', 0.6227660078332259)], [('error', 0.4020402441612698), ('document3', 0.5286346066596935), ('test', 0.5286346066596935), ('bed', 0.5286346066596935)]]
