## Importing libraries

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

## Creating dataframe

In [2]:
# Toy corpus: three short sentences stored under the column name used later.
dataset = {
    'Original Document': [
        'I like eggs',
        'I hate cats',
        'I like eggs and I like cats',
    ],
}

In [3]:
# One row per document; the dict key becomes the single column label.
df = pd.DataFrame(data=dataset)
df

Unnamed: 0,Original Document
0,I like eggs
1,I hate cats
2,I like eggs and I like cats


## Using only Python

In [4]:
# Gather every distinct whitespace-separated token across all documents and
# sort them so the vocabulary (and therefore the column order) is stable.
voc = sorted(
    {token for document in df['Original Document'] for token in document.split()}
)

print(f"vocabulary = {voc}")

def count_vec(doc, vocab):
    """Return the count of each vocabulary term in ``doc``.

    The document is split on whitespace and tokens are matched exactly
    (case-sensitive, no punctuation stripping).

    Parameters
    ----------
    doc : str
        The text to vectorise.
    vocab : sequence of str
        The vocabulary terms; the output is aligned with this sequence.

    Returns
    -------
    list of int
        One count per entry of ``vocab``, in the same order, so
        ``len(result) == len(vocab)`` always holds.

    Notes
    -----
    Tokens that are not in ``vocab`` are ignored instead of raising
    ``KeyError``, so documents containing unseen words can be vectorised
    against an existing vocabulary.
    """
    counts = dict.fromkeys(vocab, 0)

    for token in doc.split():
        # Skip out-of-vocabulary tokens; they have no column to be counted in.
        if token in counts:
            counts[token] += 1

    # Index by vocab (not dict order) so the result stays aligned with the
    # caller's columns even if vocab happens to contain repeated terms.
    return [counts[term] for term in vocab]

# Vectorise each document against the shared vocabulary: one count row per document.
arr = [count_vec(text, voc) for text in df['Original Document']]

# Every row is aligned with the vocabulary, so it doubles as the column labels.
final_df = pd.DataFrame(data=arr, columns=voc)

vocabulary = ['I', 'and', 'cats', 'eggs', 'hate', 'like']


In [5]:
# Show the hand-built count table: one row per document, one column per vocabulary term.
final_df

Unnamed: 0,I,and,cats,eggs,hate,like
0,1,0,0,1,0,1
1,1,0,1,0,1,0
2,2,1,1,1,0,2


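## Using libraries

Note: with its default settings, `CountVectorizer` lowercases the text and keeps only tokens of two or more alphanumeric characters. The single-letter word `I` is therefore dropped, which is why the table below has no `I` column, unlike the pure-Python version above.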

In [6]:
# Learn the vocabulary from the corpus and count term occurrences per document
# in one step; the result is a sparse document-term matrix.
vectorizer = CountVectorizer()
count_vector = vectorizer.fit_transform(df['Original Document'])

# Column labels, in the same order as the matrix columns.
feature_names = vectorizer.get_feature_names_out()

# Densify the sparse matrix so pandas can show it as an ordinary table.
count_vector_array = count_vector.toarray()

df_count_vector = pd.DataFrame(data=count_vector_array, columns=feature_names)

In [7]:
# Show the CountVectorizer result: one row per document, one column per learned feature.
df_count_vector

Unnamed: 0,and,cats,eggs,hate,like
0,0,0,1,0,1
1,0,1,0,1,0
2,1,1,1,0,2
