# Very simple text processing example

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

### Instantiate a CountVectorizer that processes an in-memory corpus (the default `input` setting) and removes English stop words

In [17]:
# Token-count vectorizer; stop_words="english" selects scikit-learn's built-in
# English stop-word list, so those words never become features.
vectorizer = CountVectorizer(stop_words="english")

### Provide a corpus and fit_transform the data

In [18]:
# Two tiny documents that share most of their vocabulary.
# Each list element is treated as one document by the vectorizers below.
corpus = [
    "This is the first document. Second sentence.",
    "The second document looks different from the first.  Second sentence.",
]

In [19]:
# Learn the vocabulary from the corpus and count each vocabulary word per document.
X = (
    vectorizer
    .fit_transform(corpus)  # document-term count matrix (one row per document)
    .toarray()              # converted to a dense array so it can be displayed
)
X

array([[0, 1, 0, 1, 1],
       [1, 1, 1, 2, 1]], dtype=int64)

### Create a DataFrame for simpler processing of data

In [20]:
# Label each count column with the vocabulary word it belongs to.
df = pd.DataFrame(
    data=X,
    columns=vectorizer.get_feature_names_out(),
)
df

Unnamed: 0,different,document,looks,second,sentence
0,0,1,0,1,1
1,1,1,1,2,1


### Display Word frequencies

In [21]:
# Summing down the rows gives one total per word (column);
# summing across the columns gives one total per document (row).
numOccurencesPerWord = df.sum(axis="index")
numWordsPerDoc = df.sum(axis="columns")

# The grand total is simply the sum of the per-word totals.
numWordsTotal = numOccurencesPerWord.sum()

display(
    f"Total: {numWordsTotal}",
    "Per Word",
    numOccurencesPerWord,
    "Per Doc",
    numWordsPerDoc,
)

'Total: 9'

'Per Word'

different    1
document     2
looks        1
second       3
sentence     2
dtype: int64

'Per Doc'

0    3
1    6
dtype: int64

### Find Highest Occurrences

In [22]:
# Select every word whose count equals the maximum count;
# if several words tie for the maximum, all of them are returned.
numOccurencesPerWord.loc[numOccurencesPerWord.eq(numOccurencesPerWord.max())].index

Index(['second'], dtype='object')

## Using a tf-idf Vectorizer

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigrams and bigrams, with English stop words removed.
# NOTE(review): max_df=1 is an *integer*, which scikit-learn interprets as an
# absolute document count, so every term that occurs in more than one document
# is dropped (here: all terms shared by both documents). The float 1.0 would
# instead mean "100% of documents" and keep them - confirm which one is intended.
tfidf_vect = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    min_df=1,
    max_df=1,
)

# Fit on the corpus first (data=...), then read the learned feature names.
X_tfidf = pd.DataFrame(
    data=tfidf_vect.fit_transform(corpus).toarray(),
    columns=tfidf_vect.get_feature_names_out(),
)

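In the DataFrame below we can see that the stop words are removed before the n-grams are built: for example, the bigram `different second` only exists because the stop words `from the first` that stood between the two words were dropped first.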

In [24]:
# One row per document, one column per unigram/bigram feature kept by the vectorizer.
X_tfidf

Unnamed: 0,different,different second,document looks,document second,looks,looks different,second document
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.408248,0.408248,0.408248,0.0,0.408248,0.408248,0.408248


#### Show the vector-wise normalization

In [25]:
from math import sqrt

# Print the Euclidean (L2) length of every document vector.
# TfidfVectorizer normalizes each row with norm="l2" by default, so every value
# should be 1.0 up to floating-point rounding.
for row_pos in range(len(X_tfidf)):
    doc_vector = X_tfidf.iloc[row_pos, :]
    squared_weights = doc_vector.map(lambda weight: weight * weight)
    print(sqrt(squared_weights.sum()))

1.0
0.9999999999999999
