Implementing tf-idf in Amazon Fine Food Review Database from Scratch

In [107]:
import sqlite3
import numpy as np
import pandas as pd
import re
import math

from bs4 import BeautifulSoup
from tqdm import tqdm

Importing Database

In [108]:
# Open the local SQLite file and pull a 5000-row sample of reviews.
# Rows with Score == 3 are excluded by the query itself.
con = sqlite3.connect('./database.sqlite')
rawData = pd.read_sql_query(
    sql="""SELECT * FROM Reviews WHERE Score != 3 LIMIT 5000""",
    con=con,
)

In [109]:
# Summary statistics of the numeric columns in the loaded sample.
rawData.describe()

Unnamed: 0,Id,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time
count,5000.0,5000.0,5000.0,5000.0,5000.0
mean,2715.981,1.5738,2.0178,4.258,1294917000.0
std,1567.91492,5.051184,5.616956,1.331236,47833290.0
min,1.0,0.0,0.0,1.0,961718400.0
25%,1350.75,0.0,0.0,4.0,1270685000.0
50%,2720.5,0.0,1.0,5.0,1309306000.0
75%,4069.25,2.0,2.0,5.0,1330387000.0
max,5427.0,165.0,168.0,5.0,1351210000.0


In [110]:
# Column names, non-null counts and dtypes of the loaded sample.
rawData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Id                      5000 non-null   int64 
 1   ProductId               5000 non-null   object
 2   UserId                  5000 non-null   object
 3   ProfileName             5000 non-null   object
 4   HelpfulnessNumerator    5000 non-null   int64 
 5   HelpfulnessDenominator  5000 non-null   int64 
 6   Score                   5000 non-null   int64 
 7   Time                    5000 non-null   int64 
 8   Summary                 5000 non-null   object
 9   Text                    5000 non-null   object
dtypes: int64(5), object(5)
memory usage: 390.8+ KB


Next, we relabel the Score column as Positive or Negative (scores above 3 are Positive; reviews with a score of 3 were already excluded by the query) and separate it from the training DataFrame.
The modified Score column will be used later to check our output.

In [111]:
def partition(score):
    """Map a numeric review score to a sentiment label.

    Returns 'Positive' when ``score`` is greater than 3, otherwise 'Negative'.
    """
    return 'Positive' if score > 3 else 'Negative'

# Bind the original Score column before it is overwritten below.
actualScore = rawData['Score']
# Replace the Score column with the 'Positive'/'Negative' labels from partition().
# NOTE(review): this mutates rawData in place, so re-running this cell feeds the
# string labels back into partition(); re-run from the data-loading cell instead.
rawData['Score'] = actualScore.map(partition)
# Feature frame: rawData with the Score column removed.
modifiedData = rawData.drop('Score', axis=1)

Now that we have the required data in place,
let's figure out how to clean the Text and Summary columns.

Using regular expressions to expand contractions (negations such as "can't", and others such as "'re" or "'ll") into their full forms

In [112]:
def decontracting(str):
    """Expand common English contractions in a piece of text.

    Parameters
    ----------
    str : str
        Text to process. (The parameter name shadows the builtin ``str``; it is
        kept unchanged so existing callers keep working.)

    Returns
    -------
    str
        The text with contractions replaced by their expanded forms, e.g.
        "won't" -> "will not", "they're" -> "they are".

    Notes
    -----
    Matching is case-sensitive, and every "'s" is expanded to " is"
    (possessives are not distinguished from contractions).
    """
    # Irregular forms must be handled before the generic "n't" rule, otherwise
    # "won't" would become "wo not" and "can't" would become "ca not".
    str = re.sub(r"won\'t", 'will not', str)
    str = re.sub(r"can\'t", 'cannot', str)

    # Generic suffix rules.
    str = re.sub(r"n\'t", " not", str)
    str = re.sub(r"\'re", " are", str)
    str = re.sub(r"\'s", " is", str)
    str = re.sub(r"\'d", " would", str)
    str = re.sub(r"\'ll", " will", str)
    str = re.sub(r"\'t", " not", str)
    str = re.sub(r"\'ve", " have", str)
    str = re.sub(r"\'m", " am", str)
    return str

Declaring our own stopwords

In [113]:
# Hand-maintained stop-word set used by the cleaning cells.
# 'br' is included alongside the usual English function words; negation words
# such as 'not' are not part of the set.
stopwords = {
    'br', 'the',
    # pronouns
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
    'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves',
    'he', 'him', 'his', 'himself',
    'she', "she's", 'her', 'hers', 'herself',
    'it', "it's", 'its', 'itself',
    'they', 'them', 'their', 'theirs', 'themselves',
    'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those',
    # auxiliary verbs
    'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing',
    # articles, conjunctions, prepositions
    'a', 'an', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of',
    'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through',
    'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
    'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',
    # adverbs and quantifiers
    'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how',
    'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such',
    'only', 'own', 'same', 'so', 'than', 'too', 'very',
    # contraction fragments and modal leftovers
    's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now',
    'd', 'll', 'm', 'o', 're', 've', 'y',
    'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't",
    'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't",
    'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't",
    'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't",
    'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't",
}

Cleaning data

In [114]:
# Cleaned review bodies, one string of space-separated lower-case tokens per review.
processedText = []

for sentence in tqdm(modifiedData['Text'].values):
    # Remove URLs.
    sentence = re.sub(r"http\S+", "", sentence)
    # Strip HTML markup, keeping only the text content.
    sentence = BeautifulSoup(sentence, 'lxml').get_text()
    # Expand contractions while the apostrophes are still present.
    sentence = decontracting(sentence)
    # Drop every whitespace-delimited token that contains a digit.
    # Raw string: "\S" and "\d" are invalid escape sequences in a plain literal.
    sentence = re.sub(r"\S*\d\S*", "", sentence).strip()
    # Replace each run of non-letter characters with a single space.
    sentence = re.sub('[^A-Za-z]+', ' ', sentence)
    # Lower-case the tokens and discard stop words.
    sentence = ' '.join(e.lower() for e in sentence.split() if e.lower() not in stopwords)
    processedText.append(sentence.strip())

100%|██████████| 5000/5000 [00:04<00:00, 1144.58it/s]


Now applying the same cleaning rules to the Summary column as well

In [115]:
# Cleaned review summaries, one string of space-separated lower-case tokens per review.
processedSummary = []

for sentence in tqdm(modifiedData['Summary'].values):
    # Remove URLs.
    sentence = re.sub(r"http\S+", "", sentence)
    # Strip HTML markup, keeping only the text content.
    sentence = BeautifulSoup(sentence, 'lxml').get_text()
    # Expand contractions while the apostrophes are still present.
    sentence = decontracting(sentence)
    # Drop every whitespace-delimited token that contains a digit.
    # Raw string: "\S" and "\d" are invalid escape sequences in a plain literal.
    sentence = re.sub(r"\S*\d\S*", "", sentence).strip()
    # Replace each run of non-letter characters with a single space.
    sentence = re.sub('[^A-Za-z]+', ' ', sentence)
    # Lower-case the tokens and discard stop words.
    sentence = ' '.join(e.lower() for e in sentence.split() if e.lower() not in stopwords)
    processedSummary.append(sentence.strip())

100%|██████████| 5000/5000 [00:02<00:00, 1727.67it/s]


Now let's begin with what we are here for: TF-IDF featurization

In [116]:
# Vocabulary: every distinct token across all cleaned reviews, in sorted order.
# Splitting on a single space means an empty review contributes the token '',
# which stays in the vocabulary.
listOfDistinctWords = sorted(
    {eachWord for sent in processedText for eachWord in sent.split(' ')}
)

In [117]:
tfIdfVect = []

# Sanity check: show the first cleaned review and the vocabulary position of one word.
print(processedText[0])
print(listOfDistinctWords.index('bought'))

# Word -> column lookup built once, replacing a linear list.index() scan per token.
# setdefault keeps the first position of a word, matching list.index().
wordToColumn = {}
for column, word in enumerate(listOfDistinctWords):
    wordToColumn.setdefault(word, column)

# Term-frequency matrix: one row per review, one column per vocabulary word.
tfDict = np.zeros((len(processedText), len(listOfDistinctWords)))

for i, review in enumerate(processedText):
    tempArr = review.split(' ')

    # Raw occurrence counts for this review.
    for word in tempArr:
        tfDict[i, wordToColumn[word]] += 1

    # Normalise the whole row by the review's token count in one vectorised step.
    tfDict[i] /= len(tempArr)

print(tfDict)


bought several vitality canned dog food products found good quality product looks like stew processed meat smells better labrador finicky appreciates product better
1249
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [118]:
# Dimensions of the term-frequency matrix.
print(tfDict.shape)

(5000, 13016)


In [119]:
# Document frequency: for each vocabulary word, the number of reviews containing it.
wordFreqInDc = dict.fromkeys(listOfDistinctWords, 0)

for eachSent in processedText:
    # set() so that a word is counted at most once per review.
    for eachWord in set(eachSent.split(' ')):
        wordFreqInDc[eachWord] += 1

numberOfReviews = len(processedText)

# idf(w) = ln(N / (df(w) + 1)). With this formula a word that occurs in every
# review gets a slightly negative idf, because N / (N + 1) < 1.
idfOfEachWord = {
    word: math.log(numberOfReviews / (docCount + 1))
    for word, docCount in wordFreqInDc.items()
}




In [120]:
# idf values laid out in the same column order as tfDict (the order of listOfDistinctWords).
idfVector = np.array([idfOfEachWord[word] for word in listOfDistinctWords], dtype=float)

print(tfDict.shape[0], tfDict.shape[1])

# tf-idf = tf * idf. Broadcasting multiplies every row of tfDict by idfVector
# element-wise, replacing the per-element Python double loop with the same result.
finalTfIdf = tfDict * idfVector

print(finalTfIdf)


5000 13016
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
