# 2.2.0 Numeric Features from Reviews

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer

import warnings

# Lift pandas' display limits: a value of None means "no limit", so frames are
# rendered with every column and every row.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Suppress all warnings for the rest of the session.
warnings.filterwarnings("ignore")

In [3]:
import sys
# Add the parent directory (relative to the working directory) to the import
# path; presumably this is where the project-local `utils` package lives.
sys.path.append("../") 

# Project-local helpers. `annak` is the corpus vectorized in section 2.2.3.
from utils.info2 import annak
from utils.paths import make_dir_line

modality = 'c'
project = 'sentiment_analysis_in_python'
# `make_dir_line` returns a callable that is invoked with a folder name below.
# NOTE(review): its exact contract is defined in utils.paths — not visible here.
data = make_dir_line(modality, project)

# Handles for the raw and processed data folders. `raw` is later combined with
# file names via the `/` operator, so it is presumably a pathlib.Path — TODO confirm.
raw = data('raw')
processed = data('processed')

## 2.2.2 Which statement about BOW is true?

You were introduced to bag-of-words (BOW) and some of its characteristics in the video. Which of the following statements about BOW is true?

**Answer:** Bag-of-words is a simple but effective method to build a vocabulary of all the words occurring in a document.

## 2.2.3 Your first BOW

In [4]:
# Build the vectorizer, then learn the vocabulary and encode `annak` in one
# pass. fit_transform() is equivalent to fit() followed by transform() on the
# same data, but it tokenizes the corpus only once.
anna_vect = CountVectorizer()
anna_bow = anna_vect.fit_transform(annak)

# Print the bag-of-words result as a dense array
# (one row per item of `annak`, one column per vocabulary term).
print(anna_bow.toarray())

[[1 1 1 0 1 0 1 0 0 0 0 0 0]
 [0 0 0 1 0 1 0 1 1 1 1 2 1]]


## 2.2.4 BOW using product reviews

In [5]:
# Load the Amazon review sample and keep only the score and review columns,
# in that order, as a single chained expression.
reviews = (
    pd.read_csv(raw / "amazon_reviews_sample.csv", sep=',')
    .loc[:, ['score', 'review']]
)
reviews.head()

Unnamed: 0,score,review
0,1,Stuning even for the non-gamer: This sound tr...
1,1,The best soundtrack ever to anything.: I'm re...
2,1,Amazing!: This soundtrack is my favorite musi...
3,1,Excellent Soundtrack: I truly like this sound...
4,1,"Remember, Pull Your Jaw Off The Floor After H..."


In [6]:
# Build the vectorizer, limiting the vocabulary to the 100 most frequent terms
vect = CountVectorizer(max_features=100)

# Learn the vocabulary and transform the review column in a single pass.
# fit_transform() gives the same matrix as fit() followed by transform(),
# without tokenizing every review twice.
X_review = vect.fit_transform(reviews.review)

# Create the bow representation: one column per vocabulary term
X_df = pd.DataFrame(X_review.toarray(), columns=vect.get_feature_names_out())
print(X_df.head())

   about  after  all  also  am  an  and  any  are  as  at  be  because  been   
0      0      0    1     0   0   0    2    0    0   0   0   0        0     0  \
1      0      0    0     0   0   0    3    1    1   0   0   1        0     1   
2      0      0    3     0   0   1    4    0    1   1   0   1        0     1   
3      0      0    0     0   0   0    9    0    1   0   0   1        0     0   
4      0      1    0     0   0   0    3    0    1   0   0   1        0     0   

   best  better  book  books  but  buy  by  can  could  did  do  don  even   
0     1       0     0      0    1    0   0    0      0    0   0    0     2  \
1     2       0     0      0    1    1   0    0      0    0   0    0     0   
2     1       0     0      0    0    1   0    1      1    0   0    0     1   
3     1       0     0      0    0    0   1    0      0    0   0    0     0   
4     1       0     0      0    0    0   0    1      0    0   0    0     0   

   first  for  from  get  good  great  had  has  h

## 2.2.6 Specify token sequence length with BOW

In [7]:
# NOTE(review): this cell is intentionally disabled. In the recorded run,
# ngram_range=(1, 2) produced a 10000 x 326726 matrix, and converting it to a
# dense int64 array with toarray() needed 24.3 GiB and raised MemoryError.
# Suggestion (not verified here): to re-enable it, avoid densifying the whole
# matrix, e.g. cap the vocabulary with max_features or keep X_review sparse.

# # Build the vectorizer, specify token sequence and fit
# vect = CountVectorizer(ngram_range=(1, 2))
# vect.fit(reviews.review)

# # Transform the review column
# X_review = vect.transform(reviews.review)

# # Create the bow representation
# X_df = pd.DataFrame(X_review.toarray(), columns=vect.get_feature_names_out())
# print(X_df.head())

MemoryError: Unable to allocate 24.3 GiB for an array with shape (10000, 326726) and data type int64

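**MemoryError:** with unigrams and bigrams the vocabulary grows to 326,726 terms, so the dense `toarray()` conversion of the 10,000 × 326,726 `int64` matrix would need 24.3 GiB. That is why the dense conversion cannot be used for this exercise as written.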

## 2.2.7 Size of vocabulary of movies reviews

In [11]:
# Load the movie reviews dataset from the raw data folder and preview it
movies_csv = raw / "movies.csv"
movies = pd.read_csv(movies_csv, sep=',')
movies.head()


Unnamed: 0,review,label
0,This short spoof can be found on Elite's Mille...,0
1,A singularly unfunny musical comedy that artif...,0
2,"An excellent series, masterfully acted and dir...",1
3,The master of movie spectacle Cecil B. De Mill...,1
4,I was gifted with this movie as it had such a ...,0


In [16]:
# Build the vectorizer, limiting the vocabulary to the 100 most frequent terms
vect = CountVectorizer(max_features=100)

# Learn the vocabulary and transform the review column in a single pass.
# fit_transform() gives the same matrix as fit() followed by transform(),
# without tokenizing every review twice.
X_review = vect.fit_transform(movies.review)

# Create the bow representation: one column per vocabulary term
X_df = pd.DataFrame(X_review.toarray(), columns=vect.get_feature_names_out())
print(X_df.head())

   about  after  all  am  an  and  another  are  as  at  bad  be  been  best   
0      0      0    0   0   0    0        0    0   0   0    0   1     0     0  \
1      0      0    0   0   0    0        0    0   0   0    0   0     0     0   
2      0      0    0   0   1    1        0    0   0   0    0   0     0     0   
3      0      0    0   0   0    0        0    0   0   0    0   0     0     0   
4      0      0    0   0   0    0        0    0   1   0    0   0     0     0   

   br  but  by  can  do  don  ever  fan  film  films  first  for  from  good   
0   0    0   0    1   0    0     0    0     0      0      0    0     0     0  \
1   0    0   0    0   0    0     0    0     0      0      0    0     0     0   
2   0    0   0    0   0    0     0    0     0      0      0    0     0     0   
3   0    0   0    0   0    0     0    0     0      0      0    0     0     0   
4   0    0   0    0   0    0     0    0     0      0      0    0     0     0   

   great  had  has  have  having  horr

In [18]:
# Build the vectorizer, ignoring terms that appear in more than 200 documents
# (an integer max_df is an absolute document count, not a proportion)
vect = CountVectorizer(max_df=200)

# Learn the vocabulary and transform the review column in a single pass.
# fit_transform() gives the same matrix as fit() followed by transform(),
# without tokenizing every review twice.
X_review = vect.fit_transform(movies.review)

# Create the bow representation: one column per vocabulary term
X_df = pd.DataFrame(X_review.toarray(), columns=vect.get_feature_names_out())
print(X_df.head())

   about  absolutely  after  all  am  an  and  another  are  as  at  awful   
0      0           0      0    0   0   0    0        0    0   0   0      0  \
1      0           0      0    0   0   0    0        0    0   0   0      0   
2      0           0      0    0   0   1    1        0    0   0   0      0   
3      0           0      0    0   0   0    0        0    0   0   0      0   
4      0           0      0    0   0   0    0        0    0   1   0      0   

   back  bad  based  be  because  been  before  believe  best  better  bit   
0     0    0      0   1        0     0       0        0     0       0    0  \
1     0    0      0   0        0     0       0        0     0       0    0   
2     0    0      0   0        0     0       0        0     0       0    0   
3     0    0      0   0        0     0       0        0     0       0    0   
4     0    0      0   0        0     0       0        0     0       0    0   

   br  but  by  call  came  can  cast  comedy  could  crap  di

In [20]:
# Build the vectorizer, ignoring terms that appear in fewer than 50 documents
# (an integer min_df is an absolute document count, not a proportion)
vect = CountVectorizer(min_df=50)

# Learn the vocabulary and transform the review column in a single pass.
# fit_transform() gives the same matrix as fit() followed by transform(),
# without tokenizing every review twice.
X_review = vect.fit_transform(movies.review)

# Create the bow representation: one column per vocabulary term
X_df = pd.DataFrame(X_review.toarray(), columns=vect.get_feature_names_out())
print(X_df.head())

   and  film  for  have  in  is  it  movie  of  on  one  that  the  this  to   
0    0     0    0     0   0   0   0      0   0   1    0     0    0     1   0  \
1    0     0    0     0   0   0   0      0   0   0    0     1    0     0   0   
2    1     0    0     0   0   0   0      0   0   0    0     0    0     0   0   
3    0     0    0     0   0   0   0      1   1   0    0     0    1     0   0   
4    0     0    0     0   0   0   1      1   0   0    0     0    0     1   0   

   was  
0    0  
1    0  
2    0  
3    0  
4    1  


## 2.2.8 BOW with n-grams and vocabulary size

In [None]:
# Placeholder cell: it only prints a marker string; no exercise code for
# section 2.2.8 is present in this cell.
print('ok_')

ok_
