# Terms
- Document: one row of data. It can be a text message, a tweet, or a book.
- Corpus: a dataset of documents.
- Token: the output of tokenization. It can be a word, a symbol, or a phrase.

In [1]:
# The corpus: a list of documents, one string per document.
messages = [
    "Hey hey hey lets go get lunch today :)",
    "Did you go home?",
    "Hey!!! I need a favor",
]

# CountVectorizer 
It takes a bag-of-words approach and counts the occurrences of each word, storing the counts in a matrix.

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# Build the vectorizer with its default settings (lowercasing, word-level
# tokens of two or more characters, unigrams only).
vect = CountVectorizer()

In [5]:
# Learn the vocabulary (the set of tokens) from the corpus.
vect.fit(messages)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [6]:
# tokens: the vocabulary learned by fit(), sorted alphabetically.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it and returns an array, so .tolist()
# keeps the plain-list display.
vect.get_feature_names_out().tolist()

['did',
 'favor',
 'get',
 'go',
 'hey',
 'home',
 'lets',
 'lunch',
 'need',
 'today',
 'you']

The vectorizer has already:
- Lowercased the text
- Removed punctuation
- Dropped tokens shorter than two characters (e.g. "I" and "a")

In [7]:
# document term matrix: one row per document, one column per vocabulary token
dtm = vect.transform(messages)
dtm

<3x11 sparse matrix of type '<class 'numpy.int64'>'
	with 13 stored elements in Compressed Sparse Row format>

In [8]:
# repr() returns the sparse-matrix summary (shape, dtype, stored-element count) as a string.
repr(dtm)

"<3x11 sparse matrix of type '<class 'numpy.int64'>'\n\twith 13 stored elements in Compressed Sparse Row format>"

In [9]:
# print() lists every stored entry as (row, column) followed by its count.
print(dtm)

  (0, 2)	1
  (0, 3)	1
  (0, 4)	3
  (0, 6)	1
  (0, 7)	1
  (0, 9)	1
  (1, 0)	1
  (1, 3)	1
  (1, 5)	1
  (1, 10)	1
  (2, 1)	1
  (2, 4)	1
  (2, 8)	1


In [10]:
import pandas as pd

# Show the document-term matrix as a dense table with one column per token.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is the supported replacement.
pd.DataFrame(dtm.toarray(), columns=vect.get_feature_names_out())

  return f(*args, **kwds)


Unnamed: 0,did,favor,get,go,hey,home,lets,lunch,need,today,you
0,0,0,1,1,3,0,1,1,0,1,0
1,1,0,0,1,0,1,0,0,0,0,1
2,0,1,0,0,1,0,0,0,1,0,0


In [12]:
# Use the already-fitted vect object to transform a new message. Only
# transform() is called, so the vocabulary is not updated here.
new_message = ['Hey lets go get a drink tonight']
new_dtm = vect.transform(new_message)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is the supported replacement.
pd.DataFrame(new_dtm.toarray(), columns=vect.get_feature_names_out())

Unnamed: 0,did,favor,get,go,hey,home,lets,lunch,need,today,you
0,0,0,1,1,1,0,1,0,0,0,0


Some tokens, such as "drink" and "tonight", did not appear because they were not in the vocabulary the vectorizer learned during fitting.

In [13]:
# Add the new message to the corpus so it can be included in a refit.
# NOTE: this appends in place, so re-running this cell adds the message again.
messages.append(new_message[0])
messages

['Hey hey hey lets go get lunch today :)',
 'Did you go home?',
 'Hey!!! I need a favor',
 'Hey lets go get a drink tonight']

In [14]:
# Refit on the extended corpus so the vocabulary also covers the new message,
# then transform in the same step.
dtm = vect.fit_transform(messages)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is the supported replacement.
pd.DataFrame(dtm.toarray(), columns=vect.get_feature_names_out())

Unnamed: 0,did,drink,favor,get,go,hey,home,lets,lunch,need,today,tonight,you
0,0,0,0,1,1,3,0,1,1,0,1,0,0
1,1,0,0,0,1,0,1,0,0,0,0,0,1
2,0,0,1,0,0,1,0,0,0,1,0,0,0
3,0,1,0,1,1,1,0,1,0,0,0,1,0


# TfidfVectorizer 
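Term frequency–inverse document frequency (TF-IDF) weight for each word.
TF-IDF is the product of two weights: the term frequency and the inverse document frequency.
- `TF-IDF = term frequency * (1 / document frequency)`
- or `TF-IDF = term frequency * inverse document frequency`

Note: scikit-learn's `TfidfVectorizer` uses a smoothed, logarithmic inverse document frequency, `idf = ln((1 + n) / (1 + df)) + 1`, and then L2-normalizes each row, so the values below will not match the simplified formula exactly.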

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

def createDTM(messages):
    """Build a TF-IDF document-term matrix for a corpus and return it as a table.

    A fresh TfidfVectorizer is fitted on every call, so the vocabulary always
    comes from the given messages alone.

    Args:
        messages: the corpus, a list of strings with one string per document.

    Returns:
        A pandas DataFrame with one row per document and one column per
        vocabulary token, holding the TF-IDF weights.
    """
    vect = TfidfVectorizer()
    dtm = vect.fit_transform(messages)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
    # Prefer get_feature_names_out() and fall back only on older installs.
    if hasattr(vect, "get_feature_names_out"):
        columns = vect.get_feature_names_out()
    else:
        columns = vect.get_feature_names()

    return pd.DataFrame(dtm.toarray(), columns=columns)

In [16]:
# Baseline example: "hey" occurs in both documents, so it gets a lower weight
# than the words that occur in only one of them.
messages = ["Hey lets get lunch :)",
           "Hey!!! I need a favor"]

createDTM(messages)

Unnamed: 0,favor,get,hey,lets,lunch,need
0,0.0,0.534046,0.379978,0.534046,0.534046,0.0
1,0.631667,0.0,0.449436,0.0,0.0,0.631667


In [17]:
# "hey" is now repeated three times in the first document: its weight in that
# row rises, while the second row is unaffected.
messages = ["Hey hey hey lets get lunch :)",
           "Hey!!! I need a favor"]
createDTM(messages)

Unnamed: 0,favor,get,hey,lets,lunch,need
0,0.0,0.363788,0.776515,0.363788,0.363788,0.0
1,0.631667,0.0,0.449436,0.0,0.0,0.631667


In [20]:
# "Hey" is dropped from the second document, so it now occurs in only one
# document and its weight in the first row rises further.
messages = ["Hey hey hey lets get lunch :)",
           "I need a favor"]
createDTM(messages)

Unnamed: 0,favor,get,hey,lets,lunch,need
0,0.0,0.288675,0.866025,0.288675,0.288675,0.0
1,0.707107,0.0,0.0,0.0,0.0,0.707107
