# COUNT THE OCCURRENCE OF A PARTICULAR WORD IN EACH AND ALL REVIEWS

In [1]:
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
import pandas as pd

In [40]:
# Input: dataset that is loaded into `df` with pd.read_csv
file_dataset = "./CSV/df_processed_polarity_and_satisfaction.csv"
# Output: review Ids alongside their preprocessed text
file_processed_text = "./CSV/processed_text.csv"
# Output: vocabulary (one term per row) learned from the preprocessed corpus
file_vocabulary_processed = "./CSV/vocabulary_processed.csv"
# Output: per-review term counts, written by save_term_counts_csv()
file_term_counts_per_review = "./CSV/term_counts_per_review.csv"
# file_out = "df_processed_sentimentValue.csv"

In [4]:
# Load the review dataset and preview its first rows
df = pd.read_csv(file_dataset)
df.head()

Unnamed: 0,_id,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,ProductURL,Year,Month,Day,Polarity,Customer_Satisfied
0,0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,2011-04-27,Good Quality Dog Food,I have bought several of the Vitality canned d...,https://www.amazon.com/dp/B001E4KFG0,2011,4,27,0.9441,1
1,2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,2008-08-18,"""Delight"" says it all",This is a confection that has been around a fe...,https://www.amazon.com/dp/B000LQOCH0,2008,8,18,0.8265,1
2,3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,2011-06-13,Cough Medicine,If you are looking for the secret ingredient i...,https://www.amazon.com/dp/B000UA0QIQ,2011,6,13,0.0,0
3,1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,2012-09-07,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,https://www.amazon.com/dp/B00813GRG4,2012,9,7,-0.5664,0
4,4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,2012-10-21,Great taffy,Great taffy at a great price. There was a wid...,https://www.amazon.com/dp/B006K2ZZ7K,2012,10,21,0.9468,1


In [5]:
# Reviews: keep only the review identifier and the raw review text
reviews = df[['Id', 'Text']]

In [6]:
#Corpus of original reviews: plain Python list of the 'Text' values, in row order
reviews_corpus = reviews['Text'].tolist()
#Display the first review as a quick sanity check
reviews_corpus[0]

'I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.'

## Raw Count

In [7]:
%%time
#Raw word count: fit a default CountVectorizer on the unprocessed reviews and
#build the review-by-term count matrix (cell is timed by %%time)
vectorizer_raw = CountVectorizer()
raw_counts = vectorizer_raw.fit_transform(reviews_corpus)

CPU times: user 29 s, sys: 520 ms, total: 29.5 s
Wall time: 29.5 s


In [9]:
feature_names = list(vectorizer_raw.get_feature_names_out())

In [10]:
len(feature_names)

120246

## Count with Preprocessing

### Preprocessing Functions

In [11]:
!pip install html5lib
!pip install lxml
!pip install bs4



In [12]:
import string
import re
import nltk

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from urllib.parse import urlparse
from bs4 import BeautifulSoup

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mjubuntu18/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/mjubuntu18/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [13]:
# English stop-word list from NLTK; remove_stopwords() filters tokens against it
stop_words = stopwords.words('english')

In [14]:
#Remove Stop Words
def remove_stopwords(review):
  """Drop English stop words from a review.

  The review is tokenized with NLTK's word_tokenize; every token whose
  lowercase form appears in the module-level `stop_words` collection is
  discarded and the remaining tokens are joined with single spaces.

  Args:
    review: Text of one review.

  Returns:
    The review text without stop words (original casing of kept tokens
    is preserved).
  """
  # Build a set once per call so each membership test is a hash lookup
  # instead of a scan over the whole stop-word collection.
  stop_set = set(stop_words)

  kept_words = [word for word in word_tokenize(review) if word.lower() not in stop_set]

  return ' '.join(kept_words)

In [15]:
#Remove Punctuation
def remove_punctuation(review):
  """Replace every character of string.punctuation in `review` with one space.

  Each punctuation character becomes exactly one space, so the length of the
  text is unchanged and adjacent words are never glued together.
  """
  # Translation table: code point of each punctuation character -> ' '
  blank_for = dict.fromkeys(map(ord, string.punctuation), ' ')
  return review.translate(blank_for)

In [16]:
#Remove numbers
def remove_numbers(review):
  """Return `review` with every digit character (per str.isdigit) deleted.

  Digits are removed outright, not replaced by spaces, so text on both sides
  of a digit run is joined together.
  """
  kept = []
  for symbol in review:
    if symbol.isdigit():
      continue
    kept.append(symbol)
  return ''.join(kept)

In [17]:
#Lowercase sentence
def lowercase_text(text):
    """Return a lowercase copy of `text`."""
    lowered = text.lower()
    return lowered

In [18]:
#Remove Links
def remove_links(review):
  """Remove URLs from a review.

  The review is split on whitespace; every token that urlparse recognises as
  having both a scheme and a network location (e.g. "https://example.com/x")
  is dropped. The remaining tokens are joined with single spaces.

  Args:
    review: Text of one review.

  Returns:
    The review text without link tokens and without the doubled spaces that
    blanking a token in place would leave behind.
  """
  kept_words = []

  for word in review.split():
    try:
      parsed_url = urlparse(word)
    except ValueError:
      # urlparse rejects some malformed tokens (for example an unbalanced '['
      # in the host part). Such a token is not a usable link, so keep it as
      # ordinary text instead of letting one odd review abort the whole run.
      kept_words.append(word)
      continue

    if parsed_url.scheme and parsed_url.netloc:
      continue

    kept_words.append(word)

  return ' '.join(kept_words)

In [22]:
#Remove html expressions
def remove_html(review):
  """Strip HTML markup from a review and return only its text content.

  The review is parsed with BeautifulSoup using the 'html5lib' parser
  ('html.parser' is the alternative that was tried here), and the text
  fragments are joined with a single space as separator.
  """
  soup = BeautifulSoup(review, 'html5lib')
  return soup.get_text(separator=" ")

In [25]:
def preprocess_review(review):
  """Clean one review by running it through the preprocessing steps in order.

  Steps: strip HTML, replace punctuation with spaces, drop stop words,
  delete digits, lowercase. Link removal (remove_links) is currently
  disabled and therefore not part of the pipeline.

  Args:
    review: Raw text of one review.

  Returns:
    The cleaned review text.
  """
  steps = (
    remove_html,
    remove_punctuation,
    remove_stopwords,
    remove_numbers,
    lowercase_text,
  )

  cleaned = review
  for step in steps:
    cleaned = step(cleaned)

  return cleaned

### Preprocess the Corpus

In [26]:
%%time
#Preprocess the whole corpus
#Note: 10+ minutes on Colab
preprocessed_corpus = [preprocess_review(review) for review in reviews_corpus]

  process = BeautifulSoup(review, 'html5lib')


CPU times: user 6min 9s, sys: 746 ms, total: 6min 9s
Wall time: 6min 10s


In [27]:
#Pair each cleaned review with its review Id: 'Id' column first, text second
processed_text_dataframe = pd.concat(
    [reviews["Id"], pd.DataFrame({'Processed Text': preprocessed_corpus})],
    axis=1,
)

In [29]:
processed_text_dataframe.to_csv(file_processed_text, index=False)

In [32]:
processed_text_dataframe.head(10)

Unnamed: 0,Id,Processed Text
0,1,bought several vitality canned dog food produc...
1,3,confection around centuries light pillowy citr...
2,4,looking secret ingredient robitussin believe f...
3,2,product arrived labeled jumbo salted peanuts p...
4,5,great taffy great price wide assortment yummy ...
5,6,got wild hair taffy ordered five pound bag taf...
6,7,saltwater taffy great flavors soft chewy candy...
7,8,taffy good soft chewy flavors amazing would de...
8,9,right mostly sprouting cats eat grass love rot...
9,10,healthy dog food good digestion also good smal...


### Count the occurrences of each term for each and all reviews

In [33]:
%%time
#Processed Word Count: learn the vocabulary of the preprocessed corpus and
#build the review-by-term count matrix in one pass (cell is timed by %%time)
vectorizer_processed = CountVectorizer()
processed_counts = vectorizer_processed.fit_transform(preprocessed_corpus)

CPU times: user 16.7 s, sys: 264 ms, total: 17 s
Wall time: 17 s


In [34]:
#Processed Vocabulary: terms learned by vectorizer_processed, as a Python list
feature_names_processed = list(vectorizer_processed.get_feature_names_out())

In [37]:
#Save processed vocabulary as a one-column ('Term') CSV, without the index
vocab_df = pd.DataFrame(feature_names_processed, columns=['Term'])
vocab_df.to_csv(file_vocabulary_processed, index=False)

In [38]:
len(feature_names_processed)

107110

# Saving the Term Count Per Review

In [39]:
import numpy as np

## Saving the counts for all reviews

In [41]:
#This code should work if there is enough RAM available
def save_term_counts_csv():
  """Write the term counts of every review to `file_term_counts_per_review`.

  The count matrix `processed_counts` is converted to a dense array first,
  so this needs enough memory to hold one cell per (review, term) pair.
  The CSV has the review 'Id' as first column followed by one column per
  term of `feature_names_processed`; the DataFrame index is not written.
  """
  #Dense counts with one named column per vocabulary term
  dense_counts = pd.DataFrame(processed_counts.toarray(), columns=feature_names_processed)

  #Review Ids go in front so rows can be cross referenced with the dataset
  counts_with_ids = pd.concat([reviews["Id"], dense_counts], axis=1)

  #Save the table as CSV
  counts_with_ids.to_csv(file_term_counts_per_review, index=False)

## Save the term counts of a particular set of reviews

In [44]:
def get_review_term_counts(review_ids, filename, source_df=None, counts=None, vocabulary=None):
  """Build, save and return the term counts of a selected set of reviews.

  Only terms that occur in at least one of the selected reviews get a column.
  Columns are ordered by first appearance while walking the reviews in the
  order given; rows follow the order of `review_ids`.

  Args:
    review_ids: Iterable of values from the 'Id' column identifying the reviews.
    filename: Output path without extension; '.csv' is appended.
    source_df: DataFrame with an 'Id' column whose row order matches the rows
      of `counts`. Defaults to the notebook-level `df`.
    counts: Sparse review-by-term count matrix. Defaults to the notebook-level
      `processed_counts`.
    vocabulary: Sequence mapping a column number of `counts` to its term.
      Defaults to the notebook-level `feature_names_processed`.

  Returns:
    A DataFrame with an 'Id' column followed by one count column per term.
    The same table is written to '<filename>.csv' without the index.

  Raises:
    IndexError: If one of `review_ids` is not present in the 'Id' column.
  """
  #Fall back to the notebook-level objects when no explicit inputs are given
  source_df = df if source_df is None else source_df
  counts = processed_counts if counts is None else counts
  vocabulary = feature_names_processed if vocabulary is None else vocabulary

  #Materialise once so a generator can be both iterated and stored
  review_ids = list(review_ids)

  #Locate each review by row POSITION (not index label): the count matrix is
  #addressed positionally, so labels would be wrong for a non-default index
  row_positions = []
  for review_id in review_ids:
    hits = (source_df['Id'] == review_id).to_numpy().nonzero()[0]
    if hits.size == 0:
      raise IndexError(f"Review Id {review_id!r} not found in the 'Id' column")
    row_positions.append(int(hits[0]))

  #Read the non-zero counts of each selected review once, and remember the
  #order in which terms are first seen (dict keys keep insertion order)
  term_order = {}
  counts_per_review = []
  for position in row_positions:
    review_row = counts[position, :]
    nonzero_counts = {}
    for term_id in review_row.nonzero()[1]:
      term_id = int(term_id)
      nonzero_counts[term_id] = review_row[0, term_id]
      term_order.setdefault(term_id, None)
    counts_per_review.append(nonzero_counts)

  #First put the review Ids, then one column per term (0 where a review lacks it)
  id_term_counts = {'Id': review_ids}
  for term_id in term_order:
    id_term_counts[vocabulary[term_id]] = [
      nonzero_counts.get(term_id, 0) for nonzero_counts in counts_per_review
    ]

  #Turn into a pandas DataFrame
  counts_dataframe = pd.DataFrame.from_dict(id_term_counts)

  counts_dataframe.to_csv(f'{filename}.csv', index=False)

  return counts_dataframe


In [45]:
#Test: term counts of the single review with Id 69726, also saved as review_69726.csv
get_review_term_counts([69726], 'review_69726')

Unnamed: 0,Id,product,highly,recommend,flavor,bag,black,one,really,never,...,herbal,result,brew,traditional,stated,darjeeling,tolerate,peach,genuine,fruitty
0,69726,1,1,1,2,1,3,1,1,1,...,1,1,1,1,1,1,1,4,1,1
