## Extract/Transform continued

In [1]:
# import dependencies
import pandas as pd
import numpy as np

# from sklearn.model_selection import train_test_split
# from sklearn.naive_bayes import MultinomialNB

### Bag Of Words

In [2]:
docs = ['today is monday', 'the weather is very sunny', 'i need more coffee']
docs2 = ['i have a doctor appointment on tuesday', 'lets meet at the library', 'i love to read books about space']

In [3]:
# instantiate CountVectorizer (vectorizer)
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()

# tokenize and build text
vect.fit(docs)
vect.get_feature_names()

['coffee',
 'is',
 'monday',
 'more',
 'need',
 'sunny',
 'the',
 'today',
 'very',
 'weather']

In [4]:
print(vect.vocabulary_)

{'today': 7, 'is': 1, 'monday': 2, 'the': 6, 'weather': 9, 'very': 8, 'sunny': 5, 'need': 4, 'more': 3, 'coffee': 0}


### Transform 

In [5]:
# encode document
docs_transformed = vect.transform(docs)
print(docs)
print(vect.get_feature_names())

# summarize encoded vector
docs_transformed.toarray()

['today is monday', 'the weather is very sunny', 'i need more coffee']
['coffee', 'is', 'monday', 'more', 'need', 'sunny', 'the', 'today', 'very', 'weather']


array([[0, 1, 1, 0, 0, 0, 0, 1, 0, 0],
       [0, 1, 0, 0, 0, 1, 1, 0, 1, 1],
       [1, 0, 0, 1, 1, 0, 0, 0, 0, 0]], dtype=int64)

In [6]:
data = pd.DataFrame(docs_transformed.toarray())
data.columns = vect.get_feature_names()
data.head()

Unnamed: 0,coffee,is,monday,more,need,sunny,the,today,very,weather
0,0,1,1,0,0,0,0,1,0,0
1,0,1,0,0,0,1,1,0,1,1
2,1,0,0,1,1,0,0,0,0,0


In [7]:
# passing input_text through vector does not take new words into account:
newvector = vect.transform(docs2)
print(newvector.toarray())
# data2 = pd.DataFrame(newvector.toarray())
# data2.columns = vect.get_feature_names()
# data2.head()

[[0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]]


### Term Frequency - Inverse Document Frequency (TF-IDF)
highlighting unique words across all documents

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
# transform text to vectors
tfvect  = TfidfVectorizer(stop_words={'english'})
# trans = tfvect.fit_transform(docs)
# pd.DataFrame(trans.toarray(), columns=tfvect.get_feature_names())

In [12]:
tfvect.fit(docs)

TfidfVectorizer(stop_words={'english'})

In [None]:
print(tfvect.idf_)

In [None]:
tfvect.get_feature_names()

In [None]:
columns=tfvect.get_feature_names()
data2 = [tfvect.idf_]
pd.DataFrame(data2, columns=columns)

In [None]:
newtfvect = tfvect.transform([docs2])
newtfvect()