In [1]:
import numpy as np
import pandas as pd

In [2]:
# Example sentence that the tokenization and vectorization cells below operate on.
sentence = "Thomas Jefferson began building Monticello at the age of 26."

In [3]:
# Naive tokenization: split on whitespace only, so the final period stays attached ('26.').
sentence.split()

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26.']

In [4]:
# Whitespace tokenization; keep the token list and its length for the one-hot matrix.
tokens = sentence.split()
num_tokens = len(tokens)
tokens

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26.']

## by hand

In [5]:
# Vocabulary = the distinct tokens in sorted order; its size is the one-hot vector width.
vocab = list(set(tokens))
vocab.sort()
num_vocab = len(vocab)

In [6]:
# One row per token, one column per vocabulary word; all zeros until the next cell fills it.
onehot_vectors = np.zeros(shape=(num_tokens, num_vocab))
onehot_vectors

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [7]:
# Fill in the one-hot encoding: row i gets a single 1 in the column of token i.
# The result is a dense NumPy array that is mostly zeros (sparse in content only).
# Columns are looked up through a dict rather than vocab.index(), which would
# rescan the whole vocabulary list for every token.
column_of = {word: col for col, word in enumerate(vocab)}
for i, word in enumerate(tokens):
    onehot_vectors[i, column_of[word]] = 1

In [8]:
onehot_vectors

array([[0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

#### into pandas dataframe

In [9]:
# Label every one-hot column with its vocabulary word so the rows are readable.
df = pd.DataFrame(data=onehot_vectors, columns=vocab)
df

Unnamed: 0,26.,Jefferson,Monticello,Thomas,age,at,began,building,of,the
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
9,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## using nltk for tokenization
See https://www.nltk.org/api/nltk.tokenize.html#module-nltk.tokenize for all the available ways to tokenize words and sentences.

In [10]:
from nltk.tokenize import word_tokenize

In [11]:
# Unlike str.split(), NLTK's tokenizer separates the trailing period into its own token.
nltk_tokens = word_tokenize(text=sentence)
nltk_tokens

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26',
 '.']

## using sklearn for vectorization
Uses a list of strings as input; each list element is treated as one document.

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

In [13]:
# Bag-of-words vectorizer with default settings (no arguments passed).
vectorizer = CountVectorizer()

In [14]:
# fit       -> learn a vocabulary dictionary of all tokens in the raw documents
# transform -> turn the documents into a document-term matrix
#
# Fit ONE shared vocabulary and then transform each token list with it.
# Calling fit_transform() twice on the same vectorizer re-fits it, so afterwards
# vectorizer.vocabulary_ and the feature names would describe only the last input,
# although they are used to label the columns of both X and X_nltk.
vectorizer.fit(list(tokens) + list(nltk_tokens))
X = vectorizer.transform(tokens)
X_nltk = vectorizer.transform(nltk_tokens)

In [16]:
# Feature names in column order. get_feature_names() was deprecated in scikit-learn 1.0
# and removed in 1.2; use get_feature_names_out() when it exists and fall back to the
# old method on older installations. Either way a plain list of strings is displayed.
(
    vectorizer.get_feature_names_out().tolist()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)

['26',
 'age',
 'at',
 'began',
 'building',
 'jefferson',
 'monticello',
 'of',
 'the',
 'thomas']

In [17]:
# Mapping from each learned term to its column index in the document-term matrix.
vectorizer.vocabulary_

{'thomas': 9,
 'jefferson': 5,
 'began': 3,
 'building': 4,
 'monticello': 6,
 'at': 2,
 'the': 8,
 'age': 1,
 'of': 7,
 '26': 0}

#### into pandas dataframe

In [18]:
# Column labels come from vocabulary_ ordered by column index. This works on every
# scikit-learn version, whereas get_feature_names() was removed in 1.2 in favour of
# get_feature_names_out().
# toarray() returns a plain ndarray; todense() would return the discouraged np.matrix.
feature_names = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)
df = pd.DataFrame(X.toarray(), columns=feature_names)
df

Unnamed: 0,26,age,at,began,building,jefferson,monticello,of,the,thomas
0,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,1,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0
5,0,0,1,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,1,0
7,0,1,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,1,0,0
9,1,0,0,0,0,0,0,0,0,0


In [19]:
# Same table for the NLTK tokens; the sorted vocabulary keys serve as column labels.
df_nltk = pd.DataFrame(
    data=X_nltk.toarray(),
    columns=sorted(vectorizer.vocabulary_.keys()),
)
df_nltk

Unnamed: 0,26,age,at,began,building,jefferson,monticello,of,the,thomas
0,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,1,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0
5,0,0,1,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,1,0
7,0,1,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,1,0,0
9,1,0,0,0,0,0,0,0,0,0


## using keras api
use list as input

In [20]:
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [21]:
# Create a Keras tokenizer with default settings; it is fitted in a later cell.
t=Tokenizer()

#### fit tokenizer on docs
4 attributes: word_counts, word_docs, word_index, document_counts

In [22]:
# Fit the tokenizer on the whitespace tokens; each list element is passed as its own text.
t.fit_on_texts(tokens)

In [23]:
# Dictionary mapping each word to its occurrence count.
t.word_counts

OrderedDict([('thomas', 1),
             ('jefferson', 1),
             ('began', 1),
             ('building', 1),
             ('monticello', 1),
             ('at', 1),
             ('the', 1),
             ('age', 1),
             ('of', 1),
             ('26', 1)])

In [24]:
# Dictionary mapping each word to the number of documents it appears in.
t.word_docs

defaultdict(int,
            {'thomas': 1,
             'jefferson': 1,
             'began': 1,
             'building': 1,
             'monticello': 1,
             'at': 1,
             'the': 1,
             'age': 1,
             'of': 1,
             '26': 1})

In [25]:
# Dictionary mapping each word to its assigned integer index (starting at 1 here).
t.word_index

{'thomas': 1,
 'jefferson': 2,
 'began': 3,
 'building': 4,
 'monticello': 5,
 'at': 6,
 'the': 7,
 'age': 8,
 'of': 9,
 '26': 10}

#### creating one vector per document using texts_to_matrix
modes:  
- mode='count' count of word in each doc
- mode='binary' default, whether word is present
- mode='tfidf' tfidf-scoring for each word
- mode='freq' frequency of each word as a ratio of words within each document

In [26]:
# One count vector per input text; the columns are addressed by the tokenizer's word index.
encoded_docs = t.texts_to_matrix(texts=tokens, mode="count")
encoded_docs

array([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]])

#### into pandas dataframe

In [27]:
# Wrap the raw matrix in a DataFrame; columns are still plain integer positions here.
df = pd.DataFrame(data=encoded_docs)

In [28]:
# Column 0 is dropped because no word is assigned index 0 (word_index starts at 1).
# The remaining integer column labels are renamed through an explicit
# index -> word mapping, so the labels never depend on the iteration order of
# t.word_index. drop() and rename() both return new frames, so `df` is untouched
# and no extra .copy() is needed.
index_to_word = {index: word for word, index in t.word_index.items()}
df1 = df.drop(columns=0).rename(columns=index_to_word)
df1

Unnamed: 0,thomas,jefferson,began,building,monticello,at,the,age,of,26
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
