In [2]:
# Bag of words could be defined as a matrix where each row represents a document and columns representing the individual token. One more thing, 
# the sequential order of text is not maintained. Building a "Bag of Words" involves 3 steps

# tokenizing
# counting
# normalizing
# Limitations to keep in mind: 1. Cannot capture phrases or multi-word expressions 2. Sensitive to misspellings, 
#     possible to work around that using a spell corrector or character representation,

In [3]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from nltk.tokenize import sent_tokenize
import collections, re

In [4]:
# Read the csv file into a pandas DataFrame
examp = pd.read_csv('convertcsv.csv')
examp.head()

Unnamed: 0,title,category,id,sourceurl,content,publishdate
0,Molly forgot her charger,Technology,kdfdhf333,https://google.com,This is the content right here. It's arguably ...,20190628
1,Connie Also forgot charter,Technology,kdfdhf3dfdf33,https://google.com/charger,This is the content right here and it was here...,20190627


In [5]:
content_ = []
for item in examp["content"]:
    if (item in content_):
        continue
    else:
        content_.append(item)

In [6]:
print(content_)

["This is the content right here. It's arguably the most viral moment of the US star forward's historic World Cup run. And of course, the internet immediately adopted the photo as a meme and symbol of patriotism for an American athlete and activist in her prime.", 'This is the content right here and it was here. Her epic performance led some to suggest that the United States tear down the Confederate monuments and put up Megan Rapinoe monuments. Others pledged their allegiance to Rapinoe as their new President.']


In [7]:
bagsofwords = [ collections.Counter(re.findall(r'\w+', txt))
            for txt in content_]
bagsofwords[0]

Counter({'This': 1,
         'is': 1,
         'the': 5,
         'content': 1,
         'right': 1,
         'here': 1,
         'It': 1,
         's': 2,
         'arguably': 1,
         'most': 1,
         'viral': 1,
         'moment': 1,
         'of': 3,
         'US': 1,
         'star': 1,
         'forward': 1,
         'historic': 1,
         'World': 1,
         'Cup': 1,
         'run': 1,
         'And': 1,
         'course': 1,
         'internet': 1,
         'immediately': 1,
         'adopted': 1,
         'photo': 1,
         'as': 1,
         'a': 1,
         'meme': 1,
         'and': 2,
         'symbol': 1,
         'patriotism': 1,
         'for': 1,
         'an': 1,
         'American': 1,
         'athlete': 1,
         'activist': 1,
         'in': 1,
         'her': 1,
         'prime': 1})

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(content_) 
print(X.toarray())
print(vectorizer.get_feature_names())

[[1 1 0 1 1 3 1 1 1 0 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 0 0 1 1 0 1 0 3 0 1 0
  1 0 0 1 0 0 1 1 0 1 0 0 1 0 0 5 0 1 0 0 0 1 1 0 1]
 [0 0 1 0 0 2 0 1 0 1 1 0 0 1 1 0 0 1 2 0 0 0 0 1 1 1 1 0 0 2 0 1 0 1 0 1
  0 1 1 0 1 2 1 0 1 0 1 1 0 1 1 3 2 1 2 1 1 0 0 1 0]]
['activist', 'adopted', 'allegiance', 'american', 'an', 'and', 'arguably', 'as', 'athlete', 'confederate', 'content', 'course', 'cup', 'down', 'epic', 'for', 'forward', 'her', 'here', 'historic', 'immediately', 'in', 'internet', 'is', 'it', 'led', 'megan', 'meme', 'moment', 'monuments', 'most', 'new', 'of', 'others', 'patriotism', 'performance', 'photo', 'pledged', 'president', 'prime', 'put', 'rapinoe', 'right', 'run', 'some', 'star', 'states', 'suggest', 'symbol', 'tear', 'that', 'the', 'their', 'this', 'to', 'united', 'up', 'us', 'viral', 'was', 'world']
