# Bag of Words
Creates a bag of words and applies TF or TF-IDF.



In [1]:
# Mounts Google Drive at /content/drive (Colab-only API; prompts for an
# authorization code when run). NOTE(review): a later cell reads the data via
# the relative path './drive/My Drive/...', which presumably relies on the
# working directory being /content -- confirm.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


## Create feature set

In [2]:
import pandas as pd
import pickle
import time
from sklearn.feature_extraction.text import TfidfVectorizer

def bagOfWords(reviews, useTfidf = True, min_df = 0.01, max_df = 1.0):
    """Vectorize documents into a dense bag-of-words DataFrame.

    Parameters
    ----------
    reviews : iterable of str
        One string per document, passed straight to
        ``TfidfVectorizer.fit_transform``.
    useTfidf : bool, default True
        When truthy the vectorizer keeps IDF reweighting (TF-IDF); when falsy
        only the IDF reweighting is disabled (``use_idf=False``) and every
        other ``TfidfVectorizer`` default stays in effect.
    min_df, max_df : float or int
        Document-frequency cut-offs forwarded unchanged to ``TfidfVectorizer``.

    Returns
    -------
    pandas.DataFrame
        One row per document and one column per vocabulary term.
    """
    vectorizer = TfidfVectorizer(use_idf = bool(useTfidf), min_df = min_df, max_df = max_df)
    matrix = vectorizer.fit_transform(reviews)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer the replacement and fall back only on old installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        featureNames = vectorizer.get_feature_names_out()
    else:
        featureNames = vectorizer.get_feature_names()

    # toarray() materialises the full documents x terms matrix in memory, so
    # the min_df/max_df cut-offs directly control how large this frame gets.
    return pd.DataFrame(matrix.toarray(), columns=featureNames)

# Load the preprocessed review set (produced by NB-preprocessing.ipynb) and
# report how long the read took.
# NOTE(review): read_pickle can execute arbitrary code -- only load trusted files.
firstTime = time.time()
hotelData = pd.read_pickle('./drive/My Drive/for_all.pkl')
loadSeconds = round(time.time() - firstTime, 4)
print("Time to read preprocessed set: %s seconds" % loadSeconds)

reviews = hotelData['Review']

# Prepare reviews for TF/TF-IDF: the vectorizer takes one string per document,
# so the items of each review are joined with single spaces.
# (Each review appears to be a sequence of word tokens -- confirm against the
# preprocessing notebook.)
textCorpus = [" ".join(review) for review in reviews]

firstTime = time.time()
#vectors = bagOfWords(textCorpus, False, 0.005, 1.0) # apply TF
vectors = bagOfWords(textCorpus, True, 0.005, 1.0) # apply TF-IDF

# Attach the labels by position rather than by index label: `vectors` always has
# a fresh 0..n-1 index, while the pickled frame may carry a different one.
# Index-aligned assignment would then silently misplace scores or insert NaN.
# The rows of `vectors` follow the order of hotelData['Review'], so positional
# assignment matches each score to its own review.
vectors["Reviewer_Score"] = hotelData["Reviewer_Score"].values
print ("Create compatible dataframe: %s seconds" % round(time.time()-firstTime,4))

print(vectors)

Time to read preprocessed set: 3.3406 seconds
Create compatible dataframe: 7.9055 seconds
           locat      room     staff  Reviewer_Score
0       0.000000  1.000000  0.000000               2
1       0.423895  0.372209  0.825696               1
2       0.335435  0.883604  0.326693               1
3       0.264585  0.929296  0.257690               2
4       0.335435  0.883604  0.326693               2
...          ...       ...       ...             ...
515733  0.606388  0.532450  0.590585               1
515734  0.000000  0.000000  0.000000               2
515735  0.000000  0.000000  0.000000               2
515736  0.000000  1.000000  0.000000               0
515737  0.000000  0.000000  1.000000               1

[515738 rows x 4 columns]


## Save feature set

In [3]:
# Write the feature frame (term weights plus Reviewer_Score) to a pickle in
# the current working directory and report how long the write took.
firstTime = time.time()
vectors.to_pickle("tfidf.pkl")
saveSeconds = round(time.time() - firstTime, 4)
print("Time to save file: %s seconds" % saveSeconds)

Time to save file: 0.024 seconds


## Determine word frequency
Calculates the absolute frequency of each word across all reviews.

In [0]:
# Count how many times each word occurs across all reviews. A plain dict is
# kept so the keys stay in first-seen order and the printed form is unchanged.
wordFrequency = {}

for tokens in reviews:
    for token in tokens:
        wordFrequency[token] = wordFrequency.get(token, 0) + 1

print(len(wordFrequency))  # number of distinct words
print(wordFrequency)

60860
{'park': 25390, 'outsid': 12620, 'hotel': 210637, 'beauti': 16928, 'angri': 195, 'made': 17350, 'post': 504, 'avail': 11131, 'via': 1949, 'possibl': 3825, 'site': 4549, 'use': 23037, 'plane': 259, 'trip': 5838, 'one': 40979, 'make': 17599, 'mistak': 1227, 'book': 33306, 'place': 23516, 'com': 5496, 'stay': 72334, 'night': 39932, '11': 1666, '17': 785, 'juli': 277, 'upon': 2949, 'arriv': 20525, 'small': 61584, 'room': 391974, '2nd': 1319, 'floor': 23693, 'turn': 5307, 'special': 6938, 'reserv': 3675, 'level': 3762, 'duplex': 152, 'would': 50472, 'big': 17053, 'window': 19075, 'high': 10022, 'ceil': 2001, 'ok': 8472, 'mind': 1784, 'broken': 4335, 'close': 41974, 'hello': 226, 'rain': 1254, 'mini': 5552, 'fridg': 5402, 'contain': 498, 'sort': 2766, 'bio': 30, 'weapon': 3, 'least': 2735, 'guess': 1455, 'smell': 7459, 'intim': 277, 'ask': 21983, 'chang': 10260, 'explain': 1858, 'time': 36035, 'btw': 105, 'cost': 7069, 'simpl': 1873, 'doubl': 11989, 'got': 15616, 'way': 11647, 'volum':